#!/usr/bin/env python # ---------------------------------------------------------------------------------- # # "Fitting is an art!" # ---------------------------------------------------------------------------------- # # # Python macro for testing likelihood fit vs. ChiSquare fits, and get a feel for # which fitting proceedure is likely to give the "best" results. # # This program is CENTRAL in that it contains all the necessary code for fitting. # The case has been chosen carefully, and should illustrate many points. Please, # "play around" with it as much as you can. # # IMPORTANT NOTE: We have calculated the Chi2 value (and associated probability) also # for the likelihood cases. But this is only for illustration, and not to be taken # literally, as the Chi2 value does not "make sense" for these, which minimize with # respect to the (-2*log) likelihood. # # The case in question is: # * (possibly) Gaussian peak distribution on constant background (peak searching/fitting) # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 1st of December 2017 # # ---------------------------------------------------------------------------------- # from __future__ import division, print_function import numpy as np import matplotlib.pyplot as plt from iminuit import Minuit from probfit import Chi2Regression, BinnedLH, UnbinnedLH from scipy import stats plt.close('all') # Ignore LogWarning # (Python may complain, if it takes the log of too small numbers, e.g. log-likelihood) import warnings warnings.filterwarnings("ignore") r = np.random # Random generator r.seed(42) # Set a random seed (but a fixed one) Verbose = False Saveplots = True # These are switches to decide, if signal should be included in the data, and also fitted for: IncludeSignalInData = False IncludeSignalInFit = False # Assert that we do not try to fit any signal that isn't there! # However, in reality this is often what we actually do, to assert what the largest # deviation from background is. 
# But this type of "bump hunting" is challenging, and left for later in the
# course/statistics education.
assert not ((IncludeSignalInData == False) and (IncludeSignalInFit == True))


# Function to create a nice string output:
def nice_string_output(names, values, extra_spacing = 2):
    """Format name/value pairs into aligned lines, values right-justified."""
    widest_name = len(max(names, key=len))
    widest_value = len(max(values, key=len))
    rows = []
    for label, text in zip(names, values):
        width = extra_spacing + widest_value + widest_name - len(label)
        rows.append("{0:s} {1:>{spacing}} \n".format(label, text, spacing = width))
    # Drop the trailing " \n" of the last row:
    return "".join(rows)[:-2]


# Function to calculate the ChiSquare value (also used for the LLH fits, where we
# would like the Chi2 purely for illustration):
def calc_chi2(x, y, sy, function, func_args, scale = 1):
    """Return (chi2, Ndof) of scale*function(x, *func_args) against the points
    (x, y +- sy), using only the points with y > 0."""
    filled = y > 0
    ndof = filled.sum() - len(func_args)
    model = scale * function(x[filled], *func_args)
    chi2 = np.sum((y[filled] - model)**2 / sy[filled]**2)
    return chi2, ndof


# ----------------------------------------------------------------------------------- #
# CASE: Gaussian distribution on an exponential background:
#   sig(x) = Nsig / sqrt(2pi) / sigma * exp(-0.5 * sqr((x-mu)/sigma))  for x in [0,200]
#   bkg(x) = Nbkg * c * exp(-c * x)                                    for x in [0,200]
# ----------------------------------------------------------------------------------- #

# Produce signal (Gaussian), if requested:
if IncludeSignalInData:
    Nsigx = 100
    mux = 125.50
    sigmax = 3.75
    x_signal = r.normal(mux, sigmax, Nsigx)
    if Verbose:
        print(" Signal: ", x_signal)
else:
    x_signal = []

# Produce background (exponential):
Nbkgx = 1000
taux = 80.0                       # "Lifetime" of the background
x_background = r.exponential(taux, Nbkgx)
if Verbose:
    print(" Background: ", x_background)

# Histogram specifications:
xmin = 0.0
xmax = 200.0
Nbins = 200
binwidth = (xmax - xmin) / Nbins

# Note: x_signal and x_background need to be wrapped in a tuple inside the call,
# because np.concatenate takes several other (optional) arguments.
x_data = np.concatenate((x_signal, x_background))     # Combine the two numpy arrays
if Verbose:
    print(" Data: ", x_data)

# ----------------------------------------------------------------------------------- #
# Define PDFs:
# ----------------------------------------------------------------------------------- #

# Normalized Gaussian:
def gauss_pdf(x, mu, sigma):
    """Unit-normalised Gaussian PDF evaluated at x."""
    return 1.0 / np.sqrt(2 * np.pi) / np.abs(sigma) * np.exp(-(x - mu)**2 / 2.0 / sigma**2)

# Normalized Exponential:
def exponential_pdf(x, tau):
    """Unit-normalised exponential PDF (mean lifetime tau) evaluated at x."""
    return 1.0 / tau * np.exp(-x / tau)

# Full model PDFs: one for the ChiSquare fit [CS] (scaled by binwidth, since it is
# compared directly to bin counts), one for the Binned Likelihood [BL], and one for
# the Unbinned Likelihood [UL]:
if IncludeSignalInFit:
    def PDFmodel_CS(x, Nsig, mu, sigma, Nbkg, tau):
        return Nbkg * binwidth * exponential_pdf(x, tau) + Nsig * binwidth * gauss_pdf(x, mu, sigma)
    def PDFmodel_BL(x, Nsig, mu, sigma, Nbkg, tau):
        return Nbkg * exponential_pdf(x, tau) + Nsig * gauss_pdf(x, mu, sigma)
    def PDFmodel_UL(x, Nsig, mu, sigma, Nbkg, tau):
        return Nbkg * exponential_pdf(x, tau) + Nsig * gauss_pdf(x, mu, sigma)
else:
    def PDFmodel_CS(x, Nbkg, tau):
        return Nbkg * binwidth * exponential_pdf(x, tau)
    def PDFmodel_BL(x, Nbkg, tau):
        return Nbkg * exponential_pdf(x, tau)
    def PDFmodel_UL(x, Nbkg, tau):
        return Nbkg * exponential_pdf(x, tau)

# ----------------------------------------------------------------------------------- #
# Plot the data:
# ----------------------------------------------------------------------------------- #
fig, ax = plt.subplots(figsize=(15, 6))

# For a normal histogram (without error bars) one would use:
#   hist_data = ax.hist(x_data, bins=Nbins, range=(xmin, xmax), histtype='step', linewidth=2, label='Data', color='blue')

# We want to plot the histogram with error bars (also used for the Chi2 below).
# NOTE: the 'normed' keyword was removed from np.histogram in NumPy 1.24;
# 'density=False' is the equivalent (and default) behaviour.
y, bin_edges = np.histogram(x_data, bins=Nbins, range=(xmin, xmax), density=False)
x = 0.5 * (bin_edges[1:] + bin_edges[:-1])    # Bin centres
sy = np.sqrt(y)                               # Poisson errors on the bin counts
hist_data = ax.errorbar(x, y, sy, fmt='.', linewidth=2, label="Data")
ax.set(xlabel="Photon invariant mass [GeV]",
       ylabel="Frequency / {0:1.0f} GeV".format(binwidth),
       title="Distribution of diphoton invariant masses",
       xlim=(xmin, xmax))

# ----------------------------------------------------------------------------------- #
# Fit the data:
# ----------------------------------------------------------------------------------- #
# NOTE, that for the starting values of the fit, we here (to begin with) use the values
# the data was generated with! That is of course a great advantage, and one of the main
# challenges in general fitting is to find these good starting values.

# ChiSquare fit:
# --------------
# Do the fit with a Chi2 minimisation (only using bins with entries):
chi2reg = Chi2Regression(PDFmodel_CS, x[y > 0], y[y > 0], sy[y > 0])
if IncludeSignalInFit:
    minuit_cs = Minuit(chi2reg, pedantic=False, print_level=0, Nsig=Nsigx, mu=mux, sigma=sigmax, Nbkg=Nbkgx, tau=taux)
else:
    minuit_cs = Minuit(chi2reg, pedantic=False, print_level=0, Nbkg=Nbkgx, tau=taux)
minuit_cs.migrad()                            # Perform the actual fit
if not minuit_cs.get_fmin().is_valid:         # Check if the fit converged
    print(" WARNING: The ChiSquare fit DID NOT converge!!!")
minuit_output = [minuit_cs.get_fmin(), minuit_cs.get_param_states()]   # Saved in case needed

if IncludeSignalInFit:
    csfit_Nsig, csfit_mu, csfit_sigma, csfit_Nbkg, csfit_tau = minuit_cs.args   # Fitted parameter values
else:
    csfit_Nbkg, csfit_tau = minuit_cs.args

Chi2_value_cs = minuit_cs.fval                # The Chi2 value at the minimum
NvarModel_cs = len(minuit_cs.args)
Ndof_cs = len(y[y > 0]) - NvarModel_cs
ProbChi2_cs = stats.chi2.sf(Chi2_value_cs, Ndof_cs)

for name in minuit_cs.parameters:
    print(" ChiSquare Fit result: {0} = {1:.5f} +/- {2:.5f}".format(name, minuit_cs.values[name], minuit_cs.errors[name]))

# Plot fit result on top of the histogram:
x_csfit = np.linspace(xmin, xmax, 1000)       # x-axis for the plot of the fitted function
# NOTE: PDFmodel_CS already contains the binwidth factor (it is fitted directly to bin
# counts), so unlike the BL/UL curves below it must NOT be multiplied by binwidth again.
# (The old extra factor was invisible here only because binwidth happened to be 1.0.)
if IncludeSignalInFit:
    y_csfit = PDFmodel_CS(x_csfit, csfit_Nsig, csfit_mu, csfit_sigma, csfit_Nbkg, csfit_tau)
else:
    y_csfit = PDFmodel_CS(x_csfit, csfit_Nbkg, csfit_tau)
# y_csfit = PDFmodel_CS(x_csfit, *minuit_cs.args)      # Equivalent one-liner
ax.plot(x_csfit, y_csfit, '-', color='red', linewidth=2, label='ChiSquare fit')

# Show the useful histogram and fit information:
if IncludeSignalInFit:
    names = ['Entries', 'Chi2/ndf', 'Prob', 'Nsig', 'mu', 'sigma', 'Nbkg', 'tau']
    values = ["{:d}".format(len(x_data)),
              "{:.3f} / {:d}".format(Chi2_value_cs, Ndof_cs),
              "{:.3f}".format(ProbChi2_cs),
              "{:.3f} +/- {:.3f}".format(minuit_cs.values['Nsig'], minuit_cs.errors['Nsig']),
              "{:.3f} +/- {:.3f}".format(minuit_cs.values['mu'], minuit_cs.errors['mu']),
              "{:.3f} +/- {:.3f}".format(minuit_cs.values['sigma'], minuit_cs.errors['sigma']),
              "{:.3f} +/- {:.3f}".format(minuit_cs.values['Nbkg'], minuit_cs.errors['Nbkg']),
              "{:.3f} +/- {:.3f}".format(minuit_cs.values['tau'], minuit_cs.errors['tau']),
              ]
else:
    names = ['Entries', 'Chi2/ndf', 'Prob', 'Nbkg', 'tau']
    values = ["{:d}".format(len(x_data)),
              "{:.3f} / {:d}".format(Chi2_value_cs, Ndof_cs),
              "{:.3f}".format(ProbChi2_cs),
              "{:.3f} +/- {:.3f}".format(minuit_cs.values['Nbkg'], minuit_cs.errors['Nbkg']),
              "{:.3f} +/- {:.3f}".format(minuit_cs.values['tau'], minuit_cs.errors['tau']),
              ]
ax.text(0.02, 0.95, nice_string_output(names, values, 0), family='monospace',
        transform=ax.transAxes, fontsize=9, color='red', verticalalignment='top')

# Binned Likelihood fit:
# ----------------------
binned_likelihood = BinnedLH(PDFmodel_BL, x_data, bins=Nbins, bound=(xmin, xmax), extended=True)
if IncludeSignalInFit:
    minuit_bl = Minuit(binned_likelihood, pedantic=False, print_level=0, Nsig=Nsigx, mu=mux, sigma=sigmax, Nbkg=Nbkgx, tau=taux)
else:
    minuit_bl = Minuit(binned_likelihood, pedantic=False, print_level=0, Nbkg=Nbkgx, tau=taux)
minuit_bl.migrad()                            # Perform the actual fit
if not minuit_bl.get_fmin().is_valid:         # Check if the fit converged
    print(" WARNING: The binned likelihood fit DID NOT converge!!!")
minuit_output = [minuit_bl.get_fmin(), minuit_bl.get_param_states()]   # Saved in case needed

BLLH_value = minuit_bl.fval                   # The (binned) LLH value at the minimum
for name in minuit_bl.parameters:
    print(" Binned LLH Fit result: {0} = {1:.5f} +/- {2:.5f}".format(name, minuit_bl.values[name], minuit_bl.errors[name]))

# Plot fit result on top of the histogram (BL model is a density: scale by binwidth):
x_blfit = np.linspace(xmin, xmax, 1000)
y_blfit = binwidth * PDFmodel_BL(x_blfit, *minuit_bl.args)
ax.plot(x_blfit, y_blfit, '-', color='green', linewidth=2, label='Binned LLH fit')

# Chi2 of the LLH result -- for illustration only (see header note):
Chi2_value_bl, Ndof_bl = calc_chi2(x, y, sy, PDFmodel_BL, minuit_bl.args, binwidth)
ProbChi2_bl = stats.chi2.sf(Chi2_value_bl, Ndof_bl)

# Show the useful histogram and fit information:
if IncludeSignalInFit:
    names = ['Entries', 'Chi2/ndf', 'Prob', 'Nsig', 'mu', 'sigma', 'Nbkg', 'tau']
    values = ["{:d}".format(len(x_data)),
              "{:.3f} / {:d}".format(Chi2_value_bl, Ndof_bl),
              "{:.3f}".format(ProbChi2_bl),
              "{:.3f} +/- {:.3f}".format(minuit_bl.values['Nsig'], minuit_bl.errors['Nsig']),
              "{:.3f} +/- {:.3f}".format(minuit_bl.values['mu'], minuit_bl.errors['mu']),
              "{:.3f} +/- {:.3f}".format(minuit_bl.values['sigma'], minuit_bl.errors['sigma']),
              "{:.3f} +/- {:.3f}".format(minuit_bl.values['Nbkg'], minuit_bl.errors['Nbkg']),
              "{:.3f} +/- {:.3f}".format(minuit_bl.values['tau'], minuit_bl.errors['tau']),
              ]
else:
    names = ['Entries', 'Chi2/ndf', 'Prob', 'Nbkg', 'tau']
    values = ["{:d}".format(len(x_data)),
              "{:.3f} / {:d}".format(Chi2_value_bl, Ndof_bl),
              "{:.3f}".format(ProbChi2_bl),
              "{:.3f} +/- {:.3f}".format(minuit_bl.values['Nbkg'], minuit_bl.errors['Nbkg']),
              "{:.3f} +/- {:.3f}".format(minuit_bl.values['tau'], minuit_bl.errors['tau']),
              ]
ax.text(0.2, 0.95, nice_string_output(names, values, 0), family='monospace',
        transform=ax.transAxes, fontsize=9, color='green', verticalalignment='top')

# Unbinned Likelihood fit:
# ----------------------
unbinned_likelihood = UnbinnedLH(PDFmodel_UL, x_data, extended=True)
if IncludeSignalInFit:
    minuit_ul = Minuit(unbinned_likelihood, pedantic=False, print_level=0, Nsig=Nsigx, mu=mux, sigma=sigmax, Nbkg=Nbkgx, tau=taux)
else:
    minuit_ul = Minuit(unbinned_likelihood, pedantic=False, print_level=0, Nbkg=Nbkgx, tau=taux)
minuit_ul.migrad()                            # Perform the actual fit
if not minuit_ul.get_fmin().is_valid:         # Check if the fit converged
    print(" WARNING: The unbinned likelihood fit DID NOT converge!!!")
minuit_output = [minuit_ul.get_fmin(), minuit_ul.get_param_states()]   # Saved in case needed

ULLH_value = minuit_ul.fval                   # The (unbinned) LLH value at the minimum
for name in minuit_ul.parameters:
    print(" Unbinned LLH Fit result: {0} = {1:.5f} +/- {2:.5f}".format(name, minuit_ul.values[name], minuit_ul.errors[name]))

# Plot fit result on top of the histogram (UL model is a density: scale by binwidth):
x_ulfit = np.linspace(xmin, xmax, 1000)
y_ulfit = binwidth * PDFmodel_UL(x_ulfit, *minuit_ul.args)
ax.plot(x_ulfit, y_ulfit, '--', color='black', linewidth=2, label='Unbinned LLH fit')

# Chi2 of the LLH result -- for illustration only (see header note):
Chi2_value_ul, Ndof_ul = calc_chi2(x, y, sy, PDFmodel_UL, minuit_ul.args, binwidth)
ProbChi2_ul = stats.chi2.sf(Chi2_value_ul, Ndof_ul)

# Show the useful histogram and fit information:
if IncludeSignalInFit:
    names = ['Entries', 'Chi2/ndf', 'Prob', 'Nsig', 'mu', 'sigma', 'Nbkg', 'tau']
    values = ["{:d}".format(len(x_data)),
              "{:.3f} / {:d}".format(Chi2_value_ul, Ndof_ul),
              "{:.3f}".format(ProbChi2_ul),
              "{:.3f} +/- {:.3f}".format(minuit_ul.values['Nsig'], minuit_ul.errors['Nsig']),
              "{:.3f} +/- {:.3f}".format(minuit_ul.values['mu'], minuit_ul.errors['mu']),
              "{:.3f} +/- {:.3f}".format(minuit_ul.values['sigma'], minuit_ul.errors['sigma']),
              "{:.3f} +/- {:.3f}".format(minuit_ul.values['Nbkg'], minuit_ul.errors['Nbkg']),
              "{:.3f} +/- {:.3f}".format(minuit_ul.values['tau'], minuit_ul.errors['tau']),
              ]
else:
    names = ['Entries', 'Chi2/ndf', 'Prob', 'Nbkg', 'tau']
    values = ["{:d}".format(len(x_data)),
              "{:.3f} / {:d}".format(Chi2_value_ul, Ndof_ul),
              "{:.3f}".format(ProbChi2_ul),
              "{:.3f} +/- {:.3f}".format(minuit_ul.values['Nbkg'], minuit_ul.errors['Nbkg']),
              "{:.3f} +/- {:.3f}".format(minuit_ul.values['tau'], minuit_ul.errors['tau']),
              ]
ax.text(0.4, 0.95, nice_string_output(names, values, 0), family='monospace',
        transform=ax.transAxes, fontsize=9, color='black', verticalalignment='top')

fig.tight_layout()
ax.legend()
plt.show(block=False)
if Saveplots:
    fig.savefig("ExampleLikelihoodFit", dpi=600)

# Finally, ensure that the program does not terminate (and the plot disappears),
# before you press enter.
# NOTE: the original used a bare `raw_input`, which raises NameError on Python 3
# (this file explicitly supports both 2 and 3 via the __future__ imports above).
try:
    __IPYTHON__
except NameError:
    try:
        raw_input('Press Enter to exit')      # Python 2
    except NameError:
        input('Press Enter to exit')          # Python 3

# ---------------------------------------------------------------------------------- #
#
# This exercise is meant to improve your understanding but also your experience with
# ChiSquare and Likelihood fits. The below are indicative questions, but you should
# also "play around" yourself to see, if you understand how the three behave, when you
# change conditions.
#
# Questions:
# ----------
# 1) At first run the program without a signal peak. What is the main difference between
#    the three fits? Do they give the same values? The same errors? And is the model good?
#    Also, try to lower the "background lifetime" (taux) to get more empty bins.
#
# 2) Try to give "mu" an initial starting value in the fit away from the true one, and
#    see if the fit can find the position of the signal peak and hence converge.
#    In general, how far can the initial parameter values stray from the true ones,
#    and still find the true minimum? When does it still converge?
#
# 3) Try to put fewer events in the signal peak, and see at what point you can no
#    longer see the peak by eye. Then consider the significance (i.e. the number of
#    sigma's) of Nsig. Does the size of this correspond well with your observation?
#
# 4) What helps the most to discover a signal peak:
#    - Removing half the background below the peak.
# - Improving the resolution (sigma) by a factor two. # Discuss first what you think (and why), and then try it out with this fit. # # Advanced questions: # ------------------- # 1) Make a loop to repeat the two fits many times, and see how the result of the # fit compares to the true values. Which fit is least biased? And does this depend # on the value of the input parameters? # # 2) The likelihood value does not in itself tell you much. But if repeated many times # (as above) it can be used to evaluate the fit at hand. Try to produce new data and # fit it 1000 times, and see the distribution of likelihood values obtained... and # then compare the "original data LLH value" with this distribution! From this, one # can actually obtain a probability for the LLH in the same way as for the ChiSquare. # #----------------------------------------------------------------------------------