#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # Script for illustrating the importance of VISUAL INSPECTION of a dataset. # # The example is called "Anscombe's Quartet" (F.J. Anscombe, "Graphs in Statistical # Analysis," American Statistician, 27 [February 1973], 17-21) and consists of four # datasets, which have the same: # - Mean of each x variable 9.0 # - Variance of each x variable 10.0 # - Mean of each y variable 7.5 # - Variance of each y variable 3.75 # - Correlation between each x and y variable 0.816 # - Linear regression line y = 3 + 0.5x # # However, they are very different! For more information on Anscombe's Quartet, see: # http://en.wikipedia.org/wiki/Anscombe's_quartet # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 11th of November 2017 # # ----------------------------------------------------------------------------------- # from __future__ import division, print_function import numpy as np import matplotlib.pyplot as plt from iminuit import Minuit from probfit import Chi2Regression #, BinnedChi2, UnbinnedLH, Extended plt.close('all') #---------------------------------------------------------------------------------- # Run by ./AnscombesQuartet.py # Output: plot_AnscombesQuartet.pdf #---------------------------------------------------------------------------------- SavePlots = True # ------------------------------------------------------------------ # # Get data in arrays: # ------------------------------------------------------------------ # Ndatasets = 4 Npoints = 11 # Define data samples x = np.array([ [ 10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0 ] , [ 10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0 ] , [ 10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0 ] , [ 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 19.0, 8.0, 8.0, 8.0 ] ]) y = np.array([ [ 8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68 ] , [ 9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74 ] , [ 7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73 ] , [ 6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89 ] ]) # ------------------------------------------------------------------ # # Fit data: # ------------------------------------------------------------------ # def linear_fit(x, p0, p1): return p0 + p1*x fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(8,8)) ax = ax.flatten() # go from 2d list to 1d list for idataset in range(Ndatasets): x_i, y_i = x[idataset], y[idataset] # get the i'th data set ax_i = ax[idataset] # get the i'th axis (i.e. subplot i) ax_i.scatter(x_i, y_i, marker='.', color='blue', s=40) # make a scatter plot of the i'th data set as blue dots ax_i.set_title('Graph') chi2_object_lin = Chi2Regression(linear_fit, x_i, y_i) # chi2-regression object minuit_lin = Minuit(chi2_object_lin, pedantic=False, p0=1., p1=1.) # sets the initial parameters of the fit minuit_lin.migrad(); # fit the function to the data x_fit = np.linspace(0.9*x_i.min(), 1.1*x_i.max()) # Create the x-axis for the plot of the fitted function y_fit = linear_fit(x_fit, *minuit_lin.args) # the fitted function, where we have unpacked the fitted values ax_i.plot(x_fit, y_fit, '-r') # plot the fit with a red ("r") line ("-") string = "p0 = {:.3f} +/- {:.3f}\n".format(minuit_lin.values['p0'], minuit_lin.errors['p0']) string += "p1 = {:.3f} +/- {:.3f}".format(minuit_lin.values['p1'], minuit_lin.errors['p1']) ax_i.text(0.05, 0.95, string, family='monospace', transform=ax_i.transAxes, fontsize=10, verticalalignment='top') plt.tight_layout() plt.show(block=False) if (SavePlots) : fig.savefig('plot_AnscombesQuartet.pdf', dpi=600) # Finally, ensure that the program does not termine (and the plot disappears), before you press enter: try: __IPYTHON__ except: raw_input('Press Enter to exit') #---------------------------------------------------------------------------------- # # First acquaint yourself with the program, and get yourself a "free" (hopefully not first!) # look at how Python works. Understand that each of the four distributions are being fitted # with a linear function (here called "fit_p1") and the results plottet. There are comments # for most lines in the macro! # # Run the macro, and then take a close look at each of the four results. # # # Questions: # ---------- # 1) Looking closely at each of the four fits, determine which points gives the largest # contribution to the "mismatch" (that is chi-square) between the data and the fit, # if any single. # # 2) Which scenario looks most like real data? # # 3) Consider how YOU would treat each of the four datasets and fit them! # # # Advanced questions: # ------------------- # 1) How would you with (smarter) statistical techniques detect that something was # not right without looking? # #----------------------------------------------------------------------------------