#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#  Script for illustrating the importance of VISUAL INSPECTION of a dataset.
#
#  The example is called "Anscombe's Quartet" (F.J. Anscombe, "Graphs in Statistical
#  Analysis," American Statistician, 27 [February 1973], 17-21) and consists of four
#  datasets, which have the same:
#   - Mean of each x variable                   9.0
#   - Variance of each x variable              10.0
#   - Mean of each y variable                   7.5
#   - Variance of each y variable               3.75
#   - Correlation between each x and y variable 0.816
#   - Linear regression line                    y = 3 + 0.5x
#
#  However, they are very different! For more information on Anscombe's Quartet, see:
#    http://en.wikipedia.org/wiki/Anscombe's_quartet
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   11th of November 2017
#
# ----------------------------------------------------------------------------------- #
from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
from iminuit import Minuit
from probfit import Chi2Regression #, BinnedChi2, UnbinnedLH, Extended
plt.close('all')

#---------------------------------------------------------------------------------- 
# Run by  ./AnscombesQuartet.py
# Output: plot_AnscombesQuartet.pdf
#---------------------------------------------------------------------------------- 

SavePlots = True

# ------------------------------------------------------------------ #
# Get data in arrays:
# ------------------------------------------------------------------ #

Ndatasets = 4
Npoints   = 11

# Define data samples
x = np.array([ [ 10.0,  8.0, 13.0,  9.0, 11.0, 14.0,  6.0,  4.0, 12.0,  7.0,  5.0 ] ,
               [ 10.0,  8.0, 13.0,  9.0, 11.0, 14.0,  6.0,  4.0, 12.0,  7.0,  5.0 ] ,
               [ 10.0,  8.0, 13.0,  9.0, 11.0, 14.0,  6.0,  4.0, 12.0,  7.0,  5.0 ] ,
               [  8.0,  8.0,  8.0,  8.0,  8.0,  8.0,  8.0, 19.0,  8.0,  8.0,  8.0 ] ])

y = np.array([ [ 8.04,  6.95,  7.58,  8.81,  8.33,  9.96,  7.24,  4.26, 10.84,  4.82,  5.68 ]  ,
               [ 9.14,  8.14,  8.74,  8.77,  9.26,  8.10,  6.13,  3.10,  9.13,  7.26,  4.74 ]  ,
               [ 7.46,  6.77, 12.74,  7.11,  7.81,  8.84,  6.08,  5.39,  8.15,  6.42,  5.73 ]  ,
               [ 6.58,  5.76,  7.71,  8.84,  8.47,  7.04,  5.25, 12.50,  5.56,  7.91,  6.89 ] ])


# ------------------------------------------------------------------ #
# Fit data:
# ------------------------------------------------------------------ #


def linear_fit(x, p0, p1):
    return p0 + p1*x

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(8,8))
ax = ax.flatten() # go from 2d list to 1d list
for idataset in range(Ndatasets):
    
    x_i, y_i = x[idataset], y[idataset] # get the i'th data set
    ax_i = ax[idataset] # get the i'th axis (i.e. subplot i)
    
    ax_i.scatter(x_i, y_i, marker='.', color='blue', s=40)  # make a scatter plot of the i'th data set as blue dots 
    ax_i.set_title('Graph') 

    chi2_object_lin = Chi2Regression(linear_fit, x_i, y_i) # chi2-regression object
    minuit_lin = Minuit(chi2_object_lin, pedantic=False, p0=1., p1=1.) # sets the initial parameters of the fit
    minuit_lin.migrad(); # fit the function to the data

    x_fit = np.linspace(0.9*x_i.min(), 1.1*x_i.max()) # Create the x-axis for the plot of the fitted function
    y_fit = linear_fit(x_fit, *minuit_lin.args) # the fitted function, where we have unpacked the fitted values
    ax_i.plot(x_fit, y_fit, '-r') # plot the fit with a red ("r") line ("-")
    
    string = "p0 = {:.3f} +/- {:.3f}\n".format(minuit_lin.values['p0'], minuit_lin.errors['p0'])
    string += "p1 = {:.3f} +/- {:.3f}".format(minuit_lin.values['p1'], minuit_lin.errors['p1'])
    ax_i.text(0.05, 0.95, string, family='monospace', 
        transform=ax_i.transAxes, fontsize=10, verticalalignment='top')

plt.tight_layout() 
plt.show(block=False)

if (SavePlots) :
    fig.savefig('plot_AnscombesQuartet.pdf', dpi=600)


# Finally, ensure that the program does not termine (and the plot disappears), before you press enter:
try:
    __IPYTHON__
except:
    raw_input('Press Enter to exit')


#---------------------------------------------------------------------------------- 
#
# First acquaint yourself with the program, and get yourself a "free" (hopefully not first!)
# look at how Python works. Understand that each of the four distributions are being fitted
# with a linear function (here called "fit_p1") and the results plottet. There are comments
# for most lines in the macro!
#
# Run the macro, and then take a close look at each of the four results.
#
#
# Questions:
# ----------
# 1) Looking closely at each of the four fits, determine which points gives the largest
#    contribution to the "mismatch" (that is chi-square) between the data and the fit,
#    if any single.
#
# 2) Which scenario looks most like real data?
#
# 3) Consider how YOU would treat each of the four datasets and fit them!
#
#
# Advanced questions:
# -------------------
# 1) How would you with (smarter) statistical techniques detect that something was
#    not right without looking?
#
#----------------------------------------------------------------------------------