#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
#  This Python macro illustrates the use of ChiSquare as a goodness-of-fit measure,
#  how this distribution comes about, and that it actually works, here with three
#  different examples!
#
#  References:
#    Barlow: Chapter 6
#    Cowan: Chapter 2.7, Chapter 7
#    Bevington: Chapter 6
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   12th of November 2017
#
# ----------------------------------------------------------------------------------- #

from __future__ import division, print_function

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from iminuit import Minuit
from probfit import Chi2Regression

plt.close('all')


# function to create a nice string output
def nice_string_output(names, values, extra_spacing=2):
    max_values = len(max(values, key=len))
    max_names = len(max(names, key=len))
    string = ""
    for name, value in zip(names, values):
        string += "{0:s} {1:>{spacing}} \n".format(name, value,
                          spacing=extra_spacing + max_values + max_names - len(name))
    return string[:-2]


# ----------------------------------------------------------------------------------- #
# Chi Squared Test
# ----------------------------------------------------------------------------------- #

SavePlots = True

r = np.random        # Random generator
r.seed(42)           # Set a random seed (but a fixed one - more on that later.)


# ------------------------------------------------------------------ #
# Generate and fit LINEAR data:
# ------------------------------------------------------------------ #

Nexp = 1000
NpointsLin = 17

alpha0 = 3.6
alpha1 = 0.3
sigmay = 0.4

array_Chi2_Lin = np.zeros(Nexp)
array_Prob_Lin = np.zeros(Nexp)

# Loop over number of experiments to generate data and subsequent Chi2 values:
for iexp in range(Nexp):

    # Generate points:
    xLin = np.arange(NpointsLin)+1
    exLin = np.zeros_like(xLin)
    yLin = alpha0 + alpha1*xLin + r.normal(0, sigmay, NpointsLin)
    eyLin = sigmay*np.ones_like(xLin)

    def fit_function_Lin(x, alpha0, alpha1):
        return alpha0 + alpha1*x

    chi2_object = Chi2Regression(fit_function_Lin, xLin, yLin, eyLin)
    minuitLin = Minuit(chi2_object, pedantic=False, alpha0=1, alpha1=1, print_level=0)
    minuitLin.migrad()                            # perform the actual fit

    Chi2Lin = minuitLin.fval                      # the chi2 value
    NvarLin = 2                                   # Number of variables (alpha0 and alpha1)
    NdofLin = NpointsLin - NvarLin                # Number of degrees of freedom

    ProbLin = stats.chi2.sf(Chi2Lin, NdofLin)     # The chi2 probability given N_DOF degrees of freedom

    array_Chi2_Lin[iexp] = Chi2Lin
    array_Prob_Lin[iexp] = ProbLin


fig, ax = plt.subplots(ncols=3, figsize=(16, 6))

ax[0].errorbar(xLin, yLin, eyLin, fmt='ro', ecolor='k', elinewidth=1, capsize=2, capthick=1)
ax[0].plot(xLin, fit_function_Lin(xLin, *minuitLin.args), '-r')

names = ['Chi2/ndf', 'Prob', 'p0', 'p1']
values = ["{:.3f} / {:d}".format(Chi2Lin, NdofLin),
          "{:.3f}".format(ProbLin),
          "{:.3f} +/- {:.3f}".format(minuitLin.values['alpha0'], minuitLin.errors['alpha0']),
          "{:.3f} +/- {:.3f}".format(minuitLin.values['alpha1'], minuitLin.errors['alpha1']),
          ]
ax[0].text(0.05, 0.95, nice_string_output(names, values), family='monospace',
           transform=ax[0].transAxes, fontsize=10, verticalalignment='top')
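# ------------------------------------------------------------------ #
# A minimal illustrative cross-check (a sketch, assuming the same iminuit/probfit
# interface as used above): the Chi2 that Minuit minimises is just the sum of
# squared pulls,
#   Chi2 = sum_i ( (y_i - f(x_i)) / sigma_i )^2,
# so for the last generated linear dataset it can be recomputed by hand and
# compared to the value returned by minuitLin.fval.
# ------------------------------------------------------------------ #
residualLin = yLin - fit_function_Lin(xLin, minuitLin.values['alpha0'], minuitLin.values['alpha1'])
Chi2Lin_byhand = np.sum((residualLin / eyLin)**2)
print("Linear fit: Chi2 from Minuit = {:.3f},  Chi2 by hand = {:.3f}".format(Chi2Lin, Chi2Lin_byhand))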
# ------------------------------------------------------------------ #
# Generate and fit OSCILLATING data:
# ------------------------------------------------------------------ #

NpointsOsc = 19

mean  = 1.6
amp   = 3.3
omega = 0.7
phase = 0.3
sigmay = 0.5

array_Chi2_Osc = np.zeros(Nexp)
array_Prob_Osc = np.zeros(Nexp)

# Loop over number of experiments to generate data and subsequent Chi2 values:
for iexp in range(Nexp):

    # Generate points:
    xOsc = np.arange(NpointsOsc)+1
    exOsc = np.zeros_like(xOsc)
    yOsc = mean + amp*np.cos(omega*xOsc + phase) + r.normal(0, sigmay, NpointsOsc)
    eyOsc = sigmay*np.ones_like(xOsc)

    # Fit points:
    def fit_function_Osc(x, mean, amp, omega, phase):
        return mean + amp*np.cos(omega*x + phase)

    chi2_object = Chi2Regression(fit_function_Osc, xOsc, yOsc, eyOsc)
    minuitOsc = Minuit(chi2_object, pedantic=False, mean=mean, amp=amp, omega=omega, phase=phase, print_level=0)
    # minuitOsc = Minuit(chi2_object, pedantic=False, mean=0.0, amp=0.0, omega=0.0, phase=0.0, print_level=0)
    minuitOsc.migrad()                            # Perform the actual fit

    Chi2Osc = minuitOsc.fval                      # Get the chi2 value
    NvarOsc = 4                                   # Number of variables (mean, amp, omega and phase)
    NdofOsc = NpointsOsc - NvarOsc                # Number of degrees of freedom

    ProbOsc = stats.chi2.sf(Chi2Osc, NdofOsc)     # The chi2 probability given Ndof degrees of freedom

    array_Chi2_Osc[iexp] = Chi2Osc
    array_Prob_Osc[iexp] = ProbOsc


ax[1].errorbar(xOsc, yOsc, eyOsc, fmt='ro', ecolor='k', elinewidth=1, capsize=2, capthick=1)
x_axis = np.linspace(1, NpointsOsc, 1000)
ax[1].plot(x_axis, fit_function_Osc(x_axis, *minuitOsc.args), '-r')
ax[1].set_ylim(-3.5, 8.5)

# If you want to show uncertainties in y:
#   y, bin_edges = np.histogram(x_all, bins=Nbins, range=(xmin, xmax), normed=False)
#   x = 0.5*(bin_edges[1:] + bin_edges[:-1])
#   sy = np.sqrt(y)

names = ['Chi2/ndf', 'Prob', 'mean', 'amp', 'omega', 'phase']
values = ["{:.3f} / {:d}".format(Chi2Osc, NdofOsc),
          "{:.3f}".format(ProbOsc),
          "{:.3f} +/- {:.3f}".format(minuitOsc.values['mean'], minuitOsc.errors['mean']),
          "{:.3f} +/- {:.3f}".format(minuitOsc.values['amp'], minuitOsc.errors['amp']),
          "{:.3f} +/- {:.3f}".format(minuitOsc.values['omega'], minuitOsc.errors['omega']),
          "{:.3f} +/- {:.3f}".format(minuitOsc.values['phase'], minuitOsc.errors['phase']),
          ]
ax[1].text(0.05, 0.95, nice_string_output(names, values), family='monospace',
           transform=ax[1].transAxes, fontsize=10, verticalalignment='top')
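# ------------------------------------------------------------------ #
# Illustrative sketch (assuming the same iminuit/probfit interface as above):
# a non-linear model like this oscillation is sensitive to its starting values.
# Refitting the last generated oscillating dataset with the poor starting values
# from the commented-out "minuitOsc" line typically ends up in a local minimum
# with a far worse Chi2 probability (cf. question 3 at the bottom of this file).
# ------------------------------------------------------------------ #
chi2_object_poor = Chi2Regression(fit_function_Osc, xOsc, yOsc, eyOsc)
minuitOsc_poor = Minuit(chi2_object_poor, pedantic=False, mean=0.0, amp=0.0, omega=0.0, phase=0.0, print_level=0)
minuitOsc_poor.migrad()        # Same fit, but started far from the true parameter values
print("Oscillation fit:  good start: Chi2 = {:8.1f} (Prob = {:.3f})".format(Chi2Osc, ProbOsc))
print("                  poor start: Chi2 = {:8.1f} (Prob = {:.3f})".format(minuitOsc_poor.fval,
      stats.chi2.sf(minuitOsc_poor.fval, NdofOsc)))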
# ------------------------------------------------------------------ #
# Generate and fit EXPONENTIAL binned data:
# ------------------------------------------------------------------ #

NpointsExp = 1000    # Put this number of points (exponentially distributed) in a histogram.
NbinsExp = 17
tau = 3.1

array_Chi2_Exp = np.zeros(Nexp)
array_Prob_Exp = np.zeros(Nexp)

def fit_function_Exp(x, tau):
    return 1.0 / tau * np.exp(-x / tau)

def fit_function_Exp_Ext(x, tau, N):
    return N * fit_function_Exp(x, tau)

# Loop over number of experiments to generate data and subsequent Chi2 values:
for iexp in range(Nexp):

    dataExp = r.exponential(tau, NpointsExp)
    yExp, xExp_edges = np.histogram(dataExp, bins=NbinsExp, range=(0, NbinsExp))
    xExp = (xExp_edges[1:] + xExp_edges[:-1])/2
    syExp = np.sqrt(yExp)

    Chi2_object = Chi2Regression(fit_function_Exp_Ext, xExp[yExp>0], yExp[yExp>0], syExp[yExp>0])
    minuitExp = Minuit(Chi2_object, pedantic=False, tau=tau, N=NpointsExp, print_level=0)
    minuitExp.migrad()                            # perform the actual fit

    Chi2Exp = minuitExp.fval
    NvarExp = 2                                   # Number of variables (tau and N)
    NdofExp = NbinsExp - NvarExp                  # Number of degrees of freedom

    ProbExp = stats.chi2.sf(Chi2Exp, NdofExp)     # The chi2 probability given N_DOF degrees of freedom

    array_Chi2_Exp[iexp] = Chi2Exp
    array_Prob_Exp[iexp] = ProbExp


ax[2].hist(dataExp, NbinsExp, range=(0, NbinsExp), histtype='step')
ax[2].set_xlim(0, NbinsExp)
x_axis = np.linspace(0, NbinsExp, 1000)
ax[2].plot(x_axis, fit_function_Exp_Ext(x_axis, *minuitExp.args), '-r')

names = ['Entries', 'Mean', 'STD', 'Chi2/ndf', 'Prob', 'N', 'tau']
values = ["{:d}".format(len(dataExp)),
          "{:.3f}".format(dataExp.mean()),
          "{:.3f}".format(dataExp.std(ddof=1)),
          "{:.3f} / {:d}".format(Chi2Exp, NdofExp),
          "{:.3f}".format(ProbExp),
          "{:.3f} +/- {:.3f}".format(minuitExp.values['N'], minuitExp.errors['N']),
          "{:.3f} +/- {:.3f}".format(minuitExp.values['tau'], minuitExp.errors['tau']),
          ]
ax[2].text(0.3, 0.95, nice_string_output(names, values), family='monospace',
           transform=ax[2].transAxes, fontsize=10, verticalalignment='top')

plt.show(block=False)

if (SavePlots):
    fig.savefig("Chi2Dist_SeveralExamples.pdf")


# ------------------------------------------------------------------ #
# Plot the Chi2 distributions from all the experiments:
# ------------------------------------------------------------------ #

# If there has been more than one experiment, then make another white canvas:
if (Nexp > 1):

    xaxis = np.linspace(0, 50, 1000)
    yaxis = stats.chi2.pdf(xaxis, 15)

    array_Chi2 = [array_Chi2_Lin, array_Chi2_Osc, array_Chi2_Exp]

    fig2, ax2 = plt.subplots(ncols=3, figsize=(16, 6))
    for i in range(3):
        ax2[i].hist(array_Chi2[i], 50, range=(0, 50), histtype='step', normed=True)
        ax2[i].plot(xaxis, yaxis)

        string  = "Entries {:>7d}\n".format(len(array_Chi2[i]))
        string += "Mean {:>10.3f}\n".format(array_Chi2[i].mean())
        string += "STD {:>11.4}".format(array_Chi2[i].std(ddof=1))
        ax2[i].text(0.6, 0.95, string, family='monospace',
                    transform=ax2[i].transAxes, fontsize=10, verticalalignment='top')

    plt.show(block=False)
    if (SavePlots):
        fig2.savefig("Chi2Dist_SeveralCases.pdf")


# Finally, ensure that the program does not terminate (and the plot disappears), before you press enter:
try:
    __IPYTHON__
except NameError:
    try:
        raw_input('Press Enter to exit')    # Python 2
    except NameError:
        input('Press Enter to exit')        # Python 3


# ----------------------------------------------------------------------------------- #
#
# Make sure you've read the relevant references and that you understand not only what
# the ChiSquare is, but also that it follows the ChiSquare distribution, and that the
# probability of obtaining such a ChiSquare or worse can be calculated from it.
#
# The program generates a certain number of datasets in three different ways, from
# which the Chi2 of the fit is recorded.
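# ------------------------------------------------------------------ #
# Illustrative check (a minimal sketch, using only scipy as imported above):
# if the recorded Chi2 values really follow a Chi2 distribution with Ndof = 15,
# then the fit probabilities stored above should be uniformly distributed
# between 0 and 1. A Kolmogorov-Smirnov test against a uniform distribution
# gives a quick, quantitative version of that statement.
# ------------------------------------------------------------------ #
for label, probs in zip(['Linear', 'Oscillating', 'Exponential'],
                        [array_Prob_Lin, array_Prob_Osc, array_Prob_Exp]):
    ks_stat, ks_pval = stats.kstest(probs, 'uniform')
    print("{:12s}: KS-test of fit probabilities against a uniform distribution gives p = {:.3f}".format(label, ks_pval))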
#
#
# Questions:
# ----------
#  1) Why have I chosen the three examples to have 17 points, 19 points and 17 bins?
#     Hint: What is the number of degrees of freedom in each of the three cases?
#
#  2) In the example of the linear fit, how many points lie outside +-1 sigma?
#     Is that a reasonable number? (A small counting sketch is given at the very
#     end of this file.)
#
#  3) In the oscillatory case, try to drop the good starting values given to the fit
#     (i.e. use the commented-out line defining "minuitOsc" instead of the one above
#     it), and see how well the fit goes when it does not have good starting values.
#
#  4) In the exponential fit, where do the uncertainties come from? And is the fit
#     reasonable?
#
#
# Advanced questions:
# -------------------
#  1) Why does the last of the three Chi2 distributions not fit quite as well as
#     the other two? Try to change the number of generated points to 100 instead
#     of 1000, and/or change the lifetime to tau=2.1. Does this increase the
#     mismatch of the Chi2 distribution? Does that give you a hint why?
#
# ----------------------------------------------------------------------------------- #
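# ------------------------------------------------------------------ #
# Small counting sketch for question 2 (illustrative only, using the last
# generated linear dataset still in memory): count how many of the 17 points
# lie more than one standard deviation from the fitted line. For Gaussian
# uncertainties roughly 32% of the points (about 5-6 out of 17) are expected
# to do so.
# ------------------------------------------------------------------ #
pullsLin = (yLin - fit_function_Lin(xLin, *minuitLin.args)) / eyLin
print("Linear fit: {:d} of {:d} points lie outside +-1 sigma".format(int(np.sum(np.abs(pullsLin) > 1.0)), NpointsLin))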