#!/usr/bin/env python # ---------------------------------------------------------------------------------- # # "Fitting is an art!" # ---------------------------------------------------------------------------------- # # # Python script for testing which fitting proceedure is likely to give the "best" # results. The three cases in question are: # * Gaussian distribution on constant background (peak searching) # * Double exponential distribution (high correlations) # * Polynomial fit with possible discontinuity # # Consider each case and argue/discuss which fitting function and method should be used, # and if the fits come out correctly. # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 6th of January 2018 # # ---------------------------------------------------------------------------------- # from __future__ import division, print_function import numpy as np import matplotlib.pyplot as plt from scipy import stats from iminuit import Minuit from probfit import Chi2Regression plt.close('all') SavePlots = False # Determining if plots are saved or not r = np.random r.seed(5) # function to create a nice string output def nice_string_output(names, values, extra_spacing = 2): max_values = len(max(values, key=len)) max_names = len(max(names, key=len)) string = "" for name, value in zip(names, values): string += "{0:s} {1:>{spacing}} \n".format(name, value, spacing = extra_spacing + max_values + max_names - len(name)) return string[:-2] # function to draw parameters from a fit on the figure in one line def draw_fit_parameters(minuit_obj, axis, x, y, title=None, color='k', chi2=None, ndof=None, prob=None) : names = [] values = [] if title is not None : names += [title] values += [''] if chi2 is not None and ndof is not None and prob is not None : names += ['Chi2 / ndof','Prob'] values += ["{:.3f} / {:d}".format(chi2, ndof), "{:.3f}".format(prob)] names += minuit_obj.values.keys() values += ["{:.3f} +/- {:.3f}".format(minuit_obj.values[val], minuit_obj.errors[val]) for val in minuit_obj.values.keys()] axis.text(x, y, nice_string_output(names, values), family='monospace', transform=axis.transAxes, fontsize=10, verticalalignment='top', color=color) # function to print correlation matrix with parameter names def pretty_print_correlation_matrix(minuit_obj) : corr = minuit_obj.matrix(correlation=True) names = minuit_obj.values.keys() string = ' | ' string += ' | '.join('{:^7s}'.format(k) for k in names) + ' |\n' string += '---------' + ''.join(np.repeat('-----------',len(corr))) + '-\n' for i in range(len(corr)) : string += ' {:>7s} | '.format(names[i]) + ' | '.join('{: 6.5f}'.format(k) for k in corr[i]) + ' |\n' print(string) #----------------------------------------------------------------------------------- # # CASE 1: Gaussian distribution on a constant background: # f1(x) = N / sqrt(2pi) / sigma * exp(-0.5 * sqr((x-mu)/sigma)) for x in [-inf,inf] #----------------------------------------------------------------------------------- # Npoints_gauss = 200 mux = 3.50 sigmax = 0.25 Npoints_pol0 = 1000 minx = 0.0 maxx = 10.0 Nbins = 100 binwidth_gauss = (maxx-minx) / Nbins # Fill histogram with signal and background events: Array_gauss = np.concatenate(( r.normal(mux,sigmax,Npoints_gauss), r.uniform(minx,maxx,Npoints_pol0), r.normal(mux+2.0,sigmax,int(Npoints_gauss/3)) )) # Make histogram of randomly generated data Hist_gauss, Edges_gauss = np.histogram(Array_gauss, bins=Nbins, range=(minx, maxx)) Centers_gauss = (Edges_gauss[1:] + Edges_gauss[:-1])/2 Errors_gauss = np.sqrt(Hist_gauss) Indexes_gauss = Hist_gauss > 0 # Plot the randomly generated data fig2, ax2 = plt.subplots(figsize=(10, 6)) ax2.set_ylabel("Frequency") ax2.errorbar(Centers_gauss[Indexes_gauss], Hist_gauss[Indexes_gauss], Errors_gauss[Indexes_gauss], fmt='k_', label='randomly generated exponential', ecolor='k', elinewidth=1, capsize=1, capthick=1) ax2.set_ylim(0,ax2.get_ylim()[1]*1.3) ax2.set_xlim(left=0) # Define function (including bin width to get normalisation right): def f_gauss(x, N, mu, sigma, background) : return binwidth_gauss*N / np.sqrt(2.0*np.pi) / sigma * np.exp(-0.5*(x-mu)**2/sigma**2) + background # Perform first fit FitObject_gauss = Chi2Regression(f_gauss, Centers_gauss[Indexes_gauss], Hist_gauss[Indexes_gauss], Errors_gauss[Indexes_gauss]) Minuit_gauss = Minuit(FitObject_gauss, pedantic=False, N=Npoints_gauss, mu=mux, sigma=sigmax, background=Npoints_pol0/Nbins, print_level=0) Minuit_gauss.migrad() chi2_gauss = Minuit_gauss.fval ndof_gauss = FitObject_gauss.ndof prob_gauss = stats.chi2.sf(chi2_gauss, ndof_gauss) # Plot the first fit x_fit = np.linspace(minx, maxx, 1000) y_fit = f_gauss(x_fit,*Minuit_gauss.args) ax2.plot(x_fit, y_fit, 'r-', label='one peak fit') draw_fit_parameters(Minuit_gauss, ax2, 0.05, 0.95, 'One peak fit:','r',chi2_gauss,ndof_gauss,prob_gauss) # NOTE: If one wanted to test the G+pol0 vs. the G+G+pol0 models against each other, # which would of course be very relevant, then a likelihood ratio test would be # obvious. # f_gauss would be your simple (null) hypothesis, G+pol0 # f_gauss2 would be your advanced (alternative) hypothesis, G+G+pol0 # Now, the test statistic D = LLH1 - LLH2 will be Chi2 distributed with Ndof = 3, # as the advanced model has three more parameters (from the second Gaussian). # Finalize the figure plt.legend(loc='best') plt.tight_layout() plt.show(block=False) if (SavePlots) : fig2.savefig("Hist_gauss.pdf", dpi=600) """ """ # ----------------------------------------------------------------------------------- # # # Looking at the first case, but make sure that you also do at least the second case, # as this is at least as important as the first one! Third case is a revisit. # # Questions on CASE 1: # -------------------- # 1) Look at the data plot and the corresponding fit. What type of fit is it? Does it # run well (or at all) without some non-zero input parameters? And once it runs, does # it seem to be reasonable? Why? And does the fitting function include all features # of the data? Why/why not? Try for 10-15 minutes and discuss it with your neighbor, # before reading on! # # - - - - - - 10-15 minutes later - - - - - - # # 2) As it happens, there seem to be a suspectable bump around 5 < x < 6. Try to write # an additional fitting function, which includes this bump in the model, and get the # fit to run. How significant is the bump, based on significance of the amplitude of # the second Gaussian peak? And what test would you apply to this if you wanted to # make a full-fledged hypothesis test between the two models? Are they nested? Can # you actually get a number out? # # - - - - - - 20-30 minutes (success or failure) later - - - - - - # # 3) Imagine that you concluded that there was a new peak, and that you were sure that # it had the same width as the original peak. Does that help you in the fit, and if # so, how? Does the significance of the peak increase? Would it always do that? # Also imagine, that the parameter of interest is the distance between the peaks. How # would you now write your fitting function? #----------------------------------------------------------------------------------- # #----------------------------------------------------------------------------------- # # CASE 2: Double exponential distribution: # # The "bad" fitting function: # f2_bad(x) = N1*exp(-t/r1) + N2*exp(-t/r2) for t in [0,inf] # # The "good" fitting function: # f2_good(x) = N * (f/r1*exp(-t/r1) + (1-f)/r2*exp(-t/r2)) for t in [0,inf] # #----------------------------------------------------------------------------------- # # NOTE: The parameters r1 and r2 need to be positive, and f in [0,1], # in order for this to be a PDF. #----------------------------------------------------------------------------------- # # The whole case have been commented out, just to ensure smooth running of CASE 1. """ Npoints_2exp = 4000 frac = 0.9 # Fraction that "belongs to" first exponential r1 = 9.0 r2 = 3.0 minx_2exp = 0.0 maxx_2exp = 20.0 Nbins_2exp = 200 binwidth_2exp = (maxx_2exp - minx_2exp) / Nbins_2exp # Transformation method: Array_2exp = np.zeros(Npoints_2exp) for i in xrange(Npoints_2exp) : if (r.uniform() < frac) : t = r.exponential(r1) else : t = r.exponential(r2) Array_2exp[i] = t # Make histogram of randomly generated data Hist_2exp, Edges_2exp = np.histogram(Array_2exp, bins=Nbins_2exp, range=(minx_2exp, maxx_2exp)) Centers_2exp = (Edges_2exp[1:] + Edges_2exp[:-1])/2 Errors_2exp = np.sqrt(Hist_2exp) Indexes_2exp = Hist_2exp > 0 # Plot the randomly generated data fig, ax = plt.subplots(figsize=(10, 6)) ax.set_ylabel("Frequency") ax.errorbar(Centers_2exp[Indexes_2exp], Hist_2exp[Indexes_2exp], Errors_2exp[Indexes_2exp], fmt='k_', label='randomly generated exponential', ecolor='k', elinewidth=1, capsize=1, capthick=1) ax.set_ylim(bottom=0) ax.set_xlim(left=0) # The BAD fitting function and fit: # ---------------------------------- def BadFitFunc_2exp(x, N1, N2, tau1, tau2) : return N1*np.exp(-x/tau1) + N2*np.exp(-x/tau2) BadFitObject_2exp = Chi2Regression(BadFitFunc_2exp, Centers_2exp[Indexes_2exp], Hist_2exp[Indexes_2exp], Errors_2exp[Indexes_2exp]) BadMinuit_2exp = Minuit(BadFitObject_2exp, pedantic=False, N1=0.0, N2=0.0, tau1=0.0, tau2=0.0, print_level=0) BadMinuit_2exp.migrad() # Plot the bad fit x_fit = np.linspace(minx_2exp, maxx_2exp, 1000) y_fit = BadFitFunc_2exp(x_fit,*BadMinuit_2exp.args) ax.plot(x_fit, y_fit, 'r-', label='the bad fit') draw_fit_parameters(BadMinuit_2exp, ax, 0.55, 0.80, 'Bad fit:', 'r') # Print the correlations for the bad fit print("Correlation matrix for the bad fit - Everything is heavily correlated:") pretty_print_correlation_matrix(BadMinuit_2exp) # Finalize the figure plt.legend(loc='best') plt.tight_layout() plt.show(block=False) if (SavePlots) : fig.savefig("Hist_2exp.pdf", dpi=600) """ # ----------------------------------------------------------------------------------- # # Questions on CASE 2: # -------------------- # 1) OK, the "bad" fit doesn't work to begin with. Can you see what is missing? There # are in fact several things, but one is very simple to remedy. Think and discuss... # # - - - - - - 5-10 minutes later - - - - - - # # Of course you need to give the fit good initial values! Do this (for example those # the data was produced with!), and run it again. It might work now, but actually that # is not always the case. # The reason is that the "bad" fitting function has two flaws: # * It does not have a correct normalisation, thus making N(1/2) and r(1/2) correlated. # * It does not have one overall normalisation, thus making N1 and N2 correlated. # This gives very high correlations between the parameters, as can be seen from the # correlation matrix printed. # # 2) Both of these problems can be avoided by rewriting the fitting function to include # the correct normalisation (i.e. dividing by the lifetime) and by putting only one # overall normalisation and then dividing the two lifetimes with a fraction (i.e. use # "frac" and "(1.0-frac)" as parameters in front of each exponential term). # Try this (define a "good" function), and see if your fit improves. The way to see # this would in general be to try a lot of different data, but here we will simply see # that the correlations are smaller (especially for the overall normalisation). # # - - - - - - 20-30 minutes (success or failure) later - - - - - - # # If you didn't manage to get this fit going, I've included a "good" fitting function # at the bottom of the program. # # 3) The two lifetimes are naturally very correlated with each other (and the fraction), # when they are very alike. The only thing one can do about this is to fix one parameter. # This is of course not desirable, but one can be forced to do it, if the fit does not # converge otherwise. Note that since the correlation is very high, it is not a great # loss of freedom in the fitting function. # A very common similar example is fitting a "Gaussian-like" peak, which happens to have # more than one width, for example if the data is obtained from two or more sources with # different resolutions. Here, one may choose to let the two (or more) Gaussians have # the same mean, but two different widths (the "precise" and the "poor" measurements). # Typically, the parameter to fix (if any) is the fraction, but never fix a parameter # without first having tried to let it "float". # ----------------------------------------------------------------------------------- # #----------------------------------------------------------------------------------- # # CASE 3: Fitting (possible) discontinuity in graph # f3(x) = for x in [-1.25,1.25] modulo an offset #----------------------------------------------------------------------------------- # """ # Define points for a graph with errors and a small discontinuity: Ndata = 51 offset = 0.0 x = np.array([-1.15+offset + 0.05*i for i in xrange(Ndata)]) y = np.array([(x[i]-offset)**3 - (x[i]-offset) + r.normal(0.0,0.1) for i in xrange(Ndata)]) y[x>0.375+offset] += 0.4 # Adding a small hike ex = np.zeros(Ndata) # Deciding that x is precise. ey = np.ones_like(ex)*0.1 # Stating the uncertainty in # Plot the randomly generated data fig3, ax3 = plt.subplots(figsize=(10, 6)) ax3.set_ylabel("Y") ax3.set_xlabel("X") ax3.errorbar(x,y,xerr=ex,yerr=ey, fmt='k.', label='randomly generated data', ecolor='k', elinewidth=1, capsize=2, capthick=1) ax3.set_xlim(-1.3+offset, 1.5+offset) # Define fitting function (with parameter for the size of the discontinuity): def func_discfit(x, p0, p1, p2, p3, p4) : if (x > 0.375+offset) : return p0 + p1*x + p2*x**2 + p3*x**3 + p4 else : return p0 + p1*x + p2*x**2 + p3*x**3 func_discfit_vec = np.vectorize(func_discfit) # Here we perform the fit # NOTE that p5 is now fixed to 0.0 # Unfix p5 to take the discontinuity into account FitObject = Chi2Regression(func_discfit, x, y, ey) MinuitObj = Minuit(FitObject, pedantic=False, p0=0.0, p1=-1.0, p2=0.0, p3=1.0, p4=0.0, fix_p4=True, print_level=0) MinuitObj.migrad() chi2 = MinuitObj.fval ndof = FitObject.ndof prob = stats.chi2.sf(chi2, ndof) # Draw the fit x_fit = np.linspace(-1.3+offset, 1.5+offset, 1000) y_fit = func_discfit_vec(x_fit,*MinuitObj.args) ax3.plot(x_fit, y_fit, 'r-', label='the fit') draw_fit_parameters(MinuitObj, ax3, 0.55, 0.80, chi2=chi2, ndof=ndof, prob=prob) ax3.text(0.15, 2.0, "Set fix_p4=False to take the discontinuity into account!", fontsize=10, verticalalignment='top') # Finalize the figure plt.legend(loc='best') plt.tight_layout() plt.show(block=False) if (SavePlots) : fig3.savefig("Graph_DiscotinuousFit.pdf", dpi=600) """ # ----------------------------------------------------------------------------------- # # Questions on CASE 3: # -------------------- # 1) The fit works, but gives a poor model of the data. There seem to be a discontinuity # around 0.375, so put this into the fit function (check that it is there), and allow # the parameter controlling this discontinuity to float (i.e. set fix_p4 = False). # Did this make the fit work better? Hopefully. # # 2) The "offset" is set to 0.0 to begin with. Try to increase it to 3.0, and see how # it is simply a translation in x. One would think that since it is a 3rd degree # polynomial fit, it should work nicely for ANY value of the offset. # However, now try to set the offset to 5.0, and 10.0. Does the fit converge nicely # as before? What would it require to get it to converge? Try to see, if you can make # it converge by doing what you suggest. # # - - - - - - 10-15 minutes (success or failure) later - - - - - - # # 3) The problem is both in terms of the initial values of the fit, but also in the # intercorrelations that a significant offset introduces. With a very shifted # x-axis, any change in say p3 (the term in front of x**3) requires huge changes # in the other fitting parameters (too see this, image that the offset is 1000.0). # You may think that such an offset is rare, but it is in fact not, and the classic # case is when fitting data as a function of YEAR, e.g. 1970-2016. Always shift the # x-axis towards zero, in this case fx. by -2000. # # ----------------------------------------------------------------------------------- # # Finally, ensure that the program does not termine (and the plot disappears), before you press enter: try: __IPYTHON__ except: raw_input('Press Enter to exit') # The GOOD fitting function and fit (CASE 2): # ------------------------------------------- #def GoodFitFunc_2exp(x, N, frac, tau1, tau2) : # return N*binwidth_2exp * ( frac/tau1*np.exp(-x/tau1) + (1-frac)/tau2*np.exp(-x/tau2) ) # #GoodFitObject_2exp = Chi2Regression(GoodFitFunc_2exp, Centers_2exp[Indexes_2exp], Hist_2exp[Indexes_2exp], Errors_2exp[Indexes_2exp]) #GoodMinuit_2exp = Minuit(GoodFitObject_2exp, pedantic=False, N=Npoints_2exp, frac=frac, tau1=r1, tau2=r2, limit_frac=(0.0,1.0)) #GoodMinuit_2exp.migrad() #x_fit = np.linspace(minx_2exp, maxx_2exp, 1000) #y_fit = GoodFitFunc_2exp(x_fit,*GoodMinuit_2exp.args) #ax.plot(x_fit, y_fit, 'b--', label='the good fit') #draw_fit_parameters(GoodMinuit_2exp, ax, 0.55, 0.60, 'Good fit:', 'b') #pretty_print_correlation_matrix(GoodMinuit_2exp)