#!/usr/bin/env python # ---------------------------------------------------------------------------------- # # "Fitting is an art!" # ---------------------------------------------------------------------------------- # # # Python/ROOT macro for testing which fitting proceedure is likely to give the "best" # results. The two cases in question are: # * Gaussian distribution on constant background (peak searching) # * Double exponential distribution (high correlations) # # Consider each case and argue/discuss which fitting function and method should be used. # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 6th of January 2017 # # ---------------------------------------------------------------------------------- # from ROOT import * import math from array import array gStyle.SetOptStat("emr") gStyle.SetOptFit(1111) SavePlots = False # Determining if plots are saved or not pi = TMath.Pi() r = TRandom3() r.SetSeed(1) #----------------------------------------------------------------------------------- # # CASE 1: Gaussian distribution on a constant background: # f1(x) = N / sqrt(2pi) / sigma * exp(-0.5 * sqr((x-mu)/sigma)) for x in [-inf,inf] #----------------------------------------------------------------------------------- # Npoints_gauss = 200 mux = 3.50 sigmax = 0.25 Npoints_pol0 = 1000 minx = 0.0 maxx = 10.0 Nbins = 100 binwidth_gauss = (maxx-minx) / float(Nbins) print " The bin width is: %5.2f"%(binwidth_gauss) # Fill histogram with signal and background events: Hist_gauss = TH1F("Hist_gauss", ";x;Frequency", Nbins, minx, maxx) for i in xrange(Npoints_gauss) : Hist_gauss.Fill(r.Gaus(mux,sigmax)) for i in xrange(Npoints_pol0) : Hist_gauss.Fill(r.Uniform(minx,maxx)) for i in xrange(Npoints_gauss/3) : Hist_gauss.Fill(r.Gaus(mux+2.0,sigmax)) canvas_gauss = TCanvas("canvas_gauss","", 50, 50, 1000, 600) Hist_gauss.SetLineColor(kBlue) Hist_gauss.SetLineWidth(2) Hist_gauss.SetMinimum(0.0) # Define function (including bin width to get normalisation right): def func_gpol0(x, par) : norm = binwidth_gauss * par[0] / sqrt(2.0*pi) / par[2] z = (x[0]-par[1])/par[2] return norm * exp(-0.5*z*z) + par[3] f_gauss = TF1("f_gauss", func_gpol0, 0.0, 10.0, 4) f_gauss.SetParNames("N", "#mu", "#sigma", "Background") f_gauss.SetLineColor(kRed) Hist_gauss.Fit("f_gauss", "R") Hist_gauss.Draw("e") canvas_gauss.Update() if (SavePlots) : canvas_gauss.SaveAs("Hist_gauss.pdf") #----------------------------------------------------------------------------------- # # CASE 2: Double exponential distribution: # # The "bad" fitting function: # f2_bad(x) = N1*exp(-t/r1) + N2*exp(-t/r2) for t in [0,inf] # # The "good" fitting function: # f2_good(x) = N * (f/r1*exp(-t/r1) + (1-f)/r2*exp(-t/r2)) for x in [0,inf] # #----------------------------------------------------------------------------------- # # NOTE: The parameters r1 and r2 need to be positive, and f in [0,1], # in order for this to be a PDF. #----------------------------------------------------------------------------------- # Npoints_2exp = 2000 frac = 0.9 # Fraction that "belows to" first exponential r1 = 3.0 r2 = 2.0 Hist_2exp = TH1F("Hist_2exp", ";x;Frequency", 200, 0.0, 20.0) binwidth_2exp = (20.0 - 0.0) / 200.0 # Transformation method: for i in xrange(Npoints_2exp) : if (r.Uniform() < frac) : t = r.Exp(r1) else : t = r.Exp(r2) Hist_2exp.Fill(t) canvas_2exp = TCanvas("canvas_2exp","", 80, 80, 1000, 600) Hist_2exp.SetLineColor(kBlue) Hist_2exp.SetLineWidth(2) Hist_2exp.SetMinimum(0.0) # The BAD fitting function and fit: # ---------------------------------- # I include the binwidth (here 0.1) in the fit to ensure that the normalisations are (or could be) right! f_2expBad = TF1("f_2expBad", "0.1*[0] * exp(-x/[2]) + 0.1*[1]*exp(-x/[3])", 0.0, 20.0) f_2expBad.SetParameters(frac*Npoints_2exp, (1.0-frac)*Npoints_2exp, r1, r2) f_2expBad.SetLineColor(kRed) FitResBad = Hist_2exp.Fit("f_2expBad", "RLS") corrBad = FitResBad.GetCorrelationMatrix() # Access the covariance matrix corrBad.Print() Hist_2exp.Draw("e") canvas_2exp.Update() if (SavePlots) : canvas_2exp.SaveAs("Hist_2exp.pdf") raw_input( ' ... Press enter to exit ... ' ) # ----------------------------------------------------------------------------------- # # # Looking at the first case, but make sure that you also do the second case, as this is # at least as important as the first one! # # Questions on CASE 1: # -------------------- # 1) Look at the data plot and the corresponding fit. What type of fit is it? Does it # run well (or at all) without some non-zero input parameters? And once it runs, does # it seem to be reasonable? Why? And does the fitting function include all features # of the data? Why/why not? Try for 10-15 minutes and discuss it with your neighbor, # before reading on! # # - - - - - - 10-15 minutes later - - - - - - # # 2) As it happens, there seem to be a suspectable bump around 5 < x < 6. Try to write # an additional fitting function, which includes this bump in the model, and get the # fit to run. How significant is the bump, based on significance of the amplitude of # the second Gaussian peak? And what test would you apply to this, if you wanted to # make a full-fledged hypothesis test between the two models? Are they nested? Can # you actually get a number out? # # - - - - - - 20-30 minutes (success or failure) later - - - - - - # # 3) Imagine that you concluded, that there was a new peak, and that you were sure that # it had the same width as the original peak. Does that help you in the fit, and if # so, how? Does the significance of the peak increase? Would it always do that? # Also imagine, that the parameter of interest is the distance between the peaks. How # would you now write your fitting function? # # # Questions on CASE 2: # -------------------- # 1) OK, the "bad" fit doesn't work to begin with. Can you see what is missing? There # are in fact several things, but one is very simple to remedy. Think and discuss... # # - - - - - - 5-10 minutes later - - - - - - # # Of course you need to give the fit good initial values! Do this (for example those # the data was produced with!), and run it again. It might work now, but actually that # is not always the case. # The reason is that the "bad" fitting function has two flaws: # * It does not have a correct normalisation, thus making N(1/2) and r(1/2) correlated. # * It does not have one overall normalisation, thus making N1 and N2 correlated. # This gives very high correlations between the parameters, as can be seen from the # correlation matrix printed. # # 2) Both of these problems can be avoided by rewriting the fitting function to include # the correct normalisation (i.e. dividing by the lifetime) and by putting only one # overall normalisation and then dividing the two lifetimes with a fraction (i.e. use # "frac" and "(1.0*frac)" as a parameter in front of each exponential term). # Try this (define a "good" function), and see if your fit improves. The way to see # this would in general be to try a lot of different data, but here we will simply see # that the correlations a smaller (especially for the overall normalisation). # # - - - - - - 20-30 minutes (success or failure) later - - - - - - # # If you didn't manage to get this fit going, I've included a "good" fitting function below! # # 3) The two lifetimes are naturally very correlated with each other (and the fraction), # when they are very alike. The only thing one can do about this is to fix one parameter. # This is of course not desirable, but one can be forced to do it, if the fit does not # converge otherwise. Not that since the correlation is very high, it is not a great # loss of freedom in the fitting function. # A very common similar example is fitting a "Gaussian-like" peak, which happens to have # more than one width, for example if the data is obtained from two or more sources with # different resolutions. Here, one may choose to let the two (or more) Gaussians have # the same mean, but two different widths (the "good" and the "bad" measurements). # Typically, the parameter to fix (if any) is the fraction, but never fix a parameter # without first having tried to let it "float". # # ----------------------------------------------------------------------------------- # # The GOOD fitting function and fit: # ---------------------------------- #f_2expGood = TF1("f_2expGood", "0.1*[0] * ([1]/[2]*exp(-x/[2]) + (1.0-[1])/[3]*exp(-x/[3]))", 0.0, 20.0) #f_2expGood.SetParameters(Npoints_2exp, frac, r1, r2) #f_2expGood.SetLineColor(kBlue) #FitResGood = Hist_2exp.Fit("f_2expGood", "RLS+") #corrGood = FitResGood.GetCorrelationMatrix() # Access the covariance matrix #corrGood.Print()