#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
# This Python macro illustrates the use of ChiSquare as a goodness-of-fit measure,
# how this distribution comes about, and that it actually works! At the same time, it
# is also an illustration of how to do a linear fit analytically, that is without
# running a minimisation algorithm (in our case called Minuit).
#
# The program generates a certain number of datasets, each consisting of 9 points along
# a line. These are then fitted (both analytically and numerically), and the result and
# the Chi2 of the fit are recorded along with the probability of the fit.
#
# References:
#   Barlow: Chapter 6
#   Cowan: Chapter 2.7, Chapter 7
#   Bevington: Chapter 6
#
# Author: Troels C. Petersen (NBI)
# Email:  petersen@nbi.dk
# Date:   12th of November 2017
#
# ----------------------------------------------------------------------------------- #

from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
from iminuit import Minuit
from probfit import Chi2Regression

plt.close('all')

# Function to create a nice string output:
def nice_string_output(names, values, extra_spacing=2):
    max_values = len(max(values, key=len))
    max_names = len(max(names, key=len))
    string = ""
    for name, value in zip(names, values):
        string += "{0:s} {1:>{spacing}} \n".format(name, value,
                                                   spacing=extra_spacing + max_values + max_names - len(name))
    return string[:-2]


# ----------------------------------------------------------------------------------- #
# Chi Squared Test
# ----------------------------------------------------------------------------------- #

SavePlots = True

r = np.random    # Random generator
r.seed(42)       # Set a random seed (but a fixed one - more on that later.)


# ------------------------------------------------------------------ #
# Loop over generating and fitting data:
# ------------------------------------------------------------------ #

Nexp = 1
Npoints = 9

# Parameters used in this problem:
alpha0 = 3.6
alpha1 = 0.3
alpha2 = 0.1
sigmay = 1.0

array_alpha0 = np.zeros(Nexp)
array_alpha1 = np.zeros(Nexp)
array_Chi2 = np.zeros(Nexp)
array_Prob = np.zeros(Nexp)

for iexp in range(Nexp):

    # Generate data:
    # --------------
    x = np.arange(Npoints) + 1
    ex = np.zeros_like(x)
    y = alpha0 + alpha1*x + r.normal(0, sigmay, Npoints)
    ey = sigmay * np.ones_like(x)

    # ----------------------
    # Fit data analytically:
    # ----------------------
    # The linear fit can be done analytically, as is outlined here:
    # http://www.nbi.dk/~petersen/Teaching/Stat2017/StraightLineFit.pdf

    # Calculate the relevant sums:
    s = len(x)
    sx = np.sum(x)
    sxx = np.sum(x**2)
    sy = np.sum(y)
    sxy = np.sum(x*y)

    # Use the sums in the calculations:
    Delta = sxx * s - sx**2
    alpha0_calc = (sy * sxx - sxy * sx) / Delta
    alpha1_calc = (sxy * s - sy * sx) / Delta
    sigma_alpha0_calc = sigmay * np.sqrt(sxx / Delta)
    sigma_alpha1_calc = sigmay * np.sqrt(s / Delta)
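    # Cross-check (a minimal sketch, not part of the original macro): since all
    # points here have the same uncertainty, the Chi2 minimum coincides with the
    # ordinary least-squares solution, so NumPy's polyfit should reproduce the
    # analytic result above (polyfit returns the highest power first):
    alpha1_np, alpha0_np = np.polyfit(x, y, 1)
    assert np.isclose(alpha0_np, alpha0_calc) and np.isclose(alpha1_np, alpha1_calc)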
    # So now you have data points and a fit/theory. How to get the fit quality?
    # The answer is to calculate the Chi2 and Ndof, and from them get their
    # probability using a function (e.g. from scipy):
    Chi2_calc = 0.0
    for i in range(Npoints):
        fit_value = alpha0_calc + alpha1_calc * x[i]
        Chi2_calc += ((y[i] - fit_value) / ey[i])**2

    Nvar = 2                      # Number of variables (alpha0 and alpha1)
    Ndof_calc = Npoints - Nvar    # Number of degrees of freedom

    # From Chi2 and Ndof, one can calculate the probability of obtaining this
    # or something worse (i.e. a higher Chi2). This is a good function to have!
    from scipy import stats
    Prob_calc = stats.chi2.sf(Chi2_calc, Ndof_calc)    # The chi2 probability given Ndof degrees of freedom

    # ------------------------------------------
    # Fit data numerically (i.e. using Minuit):
    # ------------------------------------------
    # This is the same fit, but now done numerically and iteratively by Minuit.
    # It is shorter to write, but a lot slower to run!
    def fit_function(x, alpha0, alpha1):
        return alpha0 + alpha1*x

    chi2_object = Chi2Regression(fit_function, x, y, ey)
    minuit = Minuit(chi2_object, pedantic=False, alpha0=1, alpha1=1, print_level=0)
    minuit.migrad()                                                   # Perform the actual fit
    minuit_output = [minuit.get_fmin(), minuit.get_param_states()]    # Save the output parameters in case needed

    alpha0_fit = minuit.values['alpha0']
    alpha1_fit = minuit.values['alpha1']
    sigma_alpha0_fit = minuit.errors['alpha0']
    sigma_alpha1_fit = minuit.errors['alpha1']

    # In Minuit, you can just ask the fit object for the Chi2:
    Chi2_fit = minuit.fval                           # The chi2 value
    Prob_fit = stats.chi2.sf(Chi2_fit, Ndof_calc)    # The chi2 probability given Ndof degrees of freedom

    # Let us see if the calculated and the fitted values agree:
    if (iexp < 25):
        print(" Calc:{:6.3f}+-{:5.3f} {:5.3f}+-{:5.3f} p={:6.4f} Fit:{:6.3f}+-{:5.3f} {:5.3f}+-{:5.3f} p={:6.4f}".format(
            alpha0_calc, sigma_alpha0_calc, alpha1_calc, sigma_alpha1_calc, Prob_calc,
            alpha0_fit, sigma_alpha0_fit, alpha1_fit, sigma_alpha1_fit, Prob_fit))

    # Fill histograms with fit results:
    array_alpha0[iexp] = alpha0_calc
    array_alpha1[iexp] = alpha1_calc
    array_Chi2[iexp] = Chi2_fit
    array_Prob[iexp] = Prob_fit


# ------------------------------------------------------------------ #
# Fit the data and plot the result:
# ------------------------------------------------------------------ #

fig, ax = plt.subplots()
ax.errorbar(x, y, ey, fmt='ro', ecolor='k', elinewidth=1, capsize=2, capthick=1)
ax.plot(x, fit_function(x, *minuit.args), '-r')

names = ['Chi2/ndf', 'Prob', 'p0', 'p1']
values = ["{:.3f} / {:d}".format(Chi2_fit, Ndof_calc),
          "{:.3f}".format(Prob_fit),
          "{:.3f} +/- {:.3f}".format(alpha0_fit, sigma_alpha0_fit),
          "{:.3f} +/- {:.3f}".format(alpha1_fit, sigma_alpha1_fit),
          ]
ax.text(0.55, 0.2, nice_string_output(names, values), family='monospace',
        transform=ax.transAxes, fontsize=10, verticalalignment='top')

plt.tight_layout()
plt.show(block=False)

if (SavePlots):
    fig.savefig("FitToLine.pdf")
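# Side note (a sketch, not part of the original macro): stats.chi2.sf is the
# "survival function", i.e. the integral of the Chi2 PDF from Chi2_fit to
# infinity. It equals 1 - stats.chi2.cdf, but is numerically more precise far
# out in the tail:
assert np.isclose(stats.chi2.sf(Chi2_fit, Ndof_calc),
                  1.0 - stats.chi2.cdf(Chi2_fit, Ndof_calc))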
# ------------------------------------------------------------------ #
# In case we have more than one "experiment" (i.e. fit), we would like to see
# how the fit results and the Chi2 values distribute themselves:
# ------------------------------------------------------------------ #

if (Nexp > 1):

    fig2, ax2 = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
    ax2 = ax2.flatten()

    ax2[0].hist(array_alpha0, bins=80, histtype='step')
    ax2[0].set_title('Histogram of alpha0')
    string = " Entries {:>9} \n Mean {:>12.3f} \n RMS {:>13.4f}".format(len(array_alpha0), array_alpha0.mean(), array_alpha0.std(ddof=1))
    ax2[0].text(0.62, 0.95, string, family='monospace', transform=ax2[0].transAxes, fontsize=10, verticalalignment='top')

    ax2[1].hist(array_alpha1, bins=80, histtype='step')
    ax2[1].set_title('Histogram of alpha1')
    string = " Entries {:>9} \n Mean {:>12.3f} \n RMS {:>13.4f}".format(len(array_alpha1), array_alpha1.mean(), array_alpha1.std(ddof=1))
    ax2[1].text(0.05, 0.95, string, family='monospace', transform=ax2[1].transAxes, fontsize=10, verticalalignment='top')

    ax2[2].hist(array_Chi2, bins=20, histtype='step', density=True)    # 'density' replaces the removed 'normed' keyword
    ax2[2].set_title('Histogram of Chi2')
    string = " Entries {:>9} \n Mean {:>12.3f} \n RMS {:>13.4f}".format(len(array_Chi2), array_Chi2.mean(), array_Chi2.std(ddof=1))
    ax2[2].text(0.62, 0.95, string, family='monospace', transform=ax2[2].transAxes, fontsize=10, verticalalignment='top')

    # Overlay the Chi2 PDF with Ndof_calc = 7 degrees of freedom (9 points minus 2 fit parameters):
    x_axis = np.linspace(0, 20, 1000)
    y_axis = stats.chi2.pdf(x_axis, df=Ndof_calc)
    ax2[2].plot(x_axis, y_axis, 'r-')
    ax2[2].text(0.15, 0.05, "Note: This is not a fit!", transform=ax2[2].transAxes, fontsize=10, verticalalignment='top')

    ax2[3].hist(array_Prob, bins=20, histtype='step')
    ax2[3].set_title('Histogram of Prob')
    string = " Entries {:>9} \n Mean {:>12.3f} \n RMS {:>13.4f}".format(len(array_Prob), array_Prob.mean(), array_Prob.std(ddof=1))
    ax2[3].text(0.3, 0.2, string, family='monospace', transform=ax2[3].transAxes, fontsize=10, verticalalignment='top')

    plt.tight_layout()
    plt.show(block=False)

    if (SavePlots):
        fig2.savefig('FitChi2dist.pdf')


# Numbers to illustrate the point that the ratio Chi2/Ndof is only a rule of thumb:
print("\n Considering the relation between the Chi2/Ndof ratio and Prob(Chi2,Ndof) vs. Ndof:")
print(" High number of degrees of freedom: Prob(240.0, 200) = %7.5f" % (stats.chi2.sf(240.0, 200)))
print(" Low number of degrees of freedom:  Prob(  2.4,   2) = %7.5f" % (stats.chi2.sf(2.4, 2)))
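# Side remark (a sketch, not part of the original macro): the Chi2 distribution
# has mean Ndof and variance 2*Ndof, so Chi2/Ndof scatters around 1 with a
# relative spread of sqrt(2/Ndof). This spread shrinks with Ndof, which is why
# the same ratio of 1.2 is unremarkable for Ndof = 2 but unlikely for Ndof = 200:
for ndof in (2, 200):
    print(" Ndof = {:3d}: mean = {:5.1f}, RMS = {:5.2f}".format(ndof, stats.chi2.mean(ndof), stats.chi2.std(ndof)))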
# Finally, ensure that the program does not terminate (and the plots disappear) before you press enter:
try:
    __IPYTHON__
except NameError:
    input('Press Enter to exit')    # In Python 2, use raw_input instead


# ----------------------------------------------------------------------------------- #
#
# Make sure you've read the relevant references and that you understand not only what
# the ChiSquare is, but also that it follows the ChiSquare distribution, and that the
# probability of obtaining such a ChiSquare or worse (given that the hypothesis/PDF
# is the correct one!) can be calculated from it.
#
# Executive Summary: The ChiSquare method is used both for getting the best fit,
# and for testing the quality/probability of this fit.
#
# Questions:
# ----------
# 1) Run the code such that it does exactly one fit, and take a look at the fitted line.
#    Does it look reasonable, and what is the chance that the input for the data could
#    actually be from a flat distribution, i.e. without the slope?
#
# 2) Does the fit reproduce the input numbers well (include the uncertainties in your
#    answer)? And do the analytical results (Calc) agree with the numerical ones (Fit)?
#
# 3) Now increase the number of experiments to e.g. 1000, and rerun the macro. Figure
#    out what you see in the window split in 2-by-2, and go through each of these plots
#    to see if you understand every feature of the distributions shown, and if you are
#    happy with them! Is this what you expected?
#    Try to make some fits of these output distributions to check if you're satisfied!
#
#    This somehow makes this the "long" question without any leading questions.
#    But you should try to be statistically minded enough, and have an idea of what to
#    look for, at least to some degree :-)
#
# 4) Investigate whether the distribution of probabilities is flat, or whether it has
#    some slope to it. Do you understand why the distribution of probabilities is flat?
#    (This is typically conceptually hard to begin with, but don't worry!)
#
# 5) Find the line where the random points for the line are generated, and add a
#    quadratic term in x with the coefficient 0.1 (see the sketch below these questions).
#    Run the program again with Nexp=1, and start by looking at the single fit in the
#    graph. Can you see this change? Now run 1000 experiments again, and see what has
#    changed in the distributions in the 2-by-2 window when including such a term, and
#    think about what consequences it might have for an experiment.
#
#    NOTE: The reason why the quadratic term is clearer in the latter case is NOT
#    that it is a better method. It is simply because there is 1000 times more statistics!
#
#
# Advanced questions:
# -------------------
# 1) Change the coefficient from question 5) to -0.2. Of course the linear fit then does
#    not do very well. What changes are needed for the fit to be good again? Make these
#    changes, and check that the condition in question 4) is again met.
#
# 2) On page 104 of Glen Cowan's book, he lists the conditions under which the ChiSquare
#    distribution is obtained (the hypothesis must be linear in the fit parameters). Try
#    to make a test where this is not the case (e.g. f(x) = cos(a*x)), and see to what
#    degree this requirement is actually needed.
#
# ----------------------------------------------------------------------------------- #
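# A minimal sketch for question 5 (not part of the original macro), reusing the
# alpha2 = 0.1 parameter already defined at the top. The data generation becomes:
#
#     y = alpha0 + alpha1*x + alpha2*x**2 + r.normal(0, sigmay, Npoints)
#
# And for the advanced questions, the fit function is extended correspondingly
# (the names fit_function2 etc. are illustrative, not from the original macro):
#
#     def fit_function2(x, alpha0, alpha1, alpha2):
#         return alpha0 + alpha1*x + alpha2*x**2
#
#     chi2_object2 = Chi2Regression(fit_function2, x, y, ey)
#     minuit2 = Minuit(chi2_object2, pedantic=False, alpha0=1, alpha1=1, alpha2=0, print_level=0)
#     minuit2.migrad()
#
# Remember that Ndof then becomes Npoints - 3, since one more parameter is fitted.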