#!/usr/bin/env python # ---------------------------------------------------------------------------------- # # # Python script for illustrating the principle of maximum likelihood and a # likelihood fit. # # This is both an exercise, but also an attempt to illustrate four things: # - How to make a (binned and unbinned) Likelihood function/fit. # - The difference and a comparison between a Chi-square and a (binned) Likelihood. # - The difference and a comparison between a binned and unbinned Likelihood. # - What goes on behind the scenes in Minuit, when it is asked to fit something. # # In this respect, the exercise is more of an illustration rather than something to # be used directly, which is why it is followed by another exercise, where you can # test if you have understood the differences, and when to apply which fit method. # # The example uses 50 exponentially distributed random times, with the goal of # finding the best estimate of the lifetime (data is generated with lifetime, tau = 1). # Three estimates are considered: # - Chi-square fit (chi2) # - Binned Likelihood fit (bllh) # - Unbinned Likelihood fit (ullh) # # While the mean can be simply calculated, the three other methods are based on a scan # of values for tau in the range [0.5, 2.0]. For each value of tau, the chi2, bllh, and # llh are calculated (in the two likelihood cases, it is actually -2*log(likelihood) # which is calculated). # # Note that the unbinned likelihood is in principle the "optimal" fit, but also the # most difficult, since you are required to write the likelihood function yourself! # However, the two other constructions may be essentially the same, when there is # enough statistics (i.e. errors are Gaussian). # # The problem is explicitly chosen to have only one fit parameter, such that simple # 1D graphs can show what goes on. Also, it is meant more for illustration, as in # reality, one tends to simply do the minimization using an algorithm (Minuit) to # do the hard work. 
# # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 27th of November 2017 # # ---------------------------------------------------------------------------------- # from __future__ import division, print_function import numpy as np import matplotlib.pyplot as plt from iminuit import Minuit from probfit import Chi2Regression, BinnedLH from scipy import stats plt.close('all') # Function to create a nice string output def nice_string_output(names, values, extra_spacing = 2): max_values = len(max(values, key=len)) max_names = len(max(names, key=len)) string = "" for name, value in zip(names, values): string += "{0:s} {1:>{spacing}} \n".format(name, value, spacing = extra_spacing + max_values + max_names - len(name)) return string[:-2] # ---------------------------------------------------------------------------------- # # Parabola: # Note that the parabola is defined differently than normally. The Parameters are: # minval: Minimum value (i.e. constant) # minpos: Minimum position (i.e. x of minimum) # quadratic: Quadratic term. # ---------------------------------------------------------------------------------- # def func_para(x, minval, minpos, quadratic) : return minval + quadratic*(x-minpos)**2 # ---------------------------------------------------------------------------------- # # Double parabola with different slopes on each side of the minimum: # Parameters are as follows: # minval: Minimum value (i.e. constant) # minpos: Minimum position (i.e. 
# x of minimum)
#   quadlow:  Quadratic term on lower side
#   quadhigh: Quadratic term on higher side
# ---------------------------------------------------------------------------------- #
def func_asympara(x, minval, minpos, quadlow, quadhigh):
    """Asymmetric parabola: curvature quadlow left of the minimum, quadhigh at/right of it.

    Scalar-only (plain Python branch); use func_asympara_vec for array input.
    """
    curvature = quadlow if x < minpos else quadhigh
    return minval + curvature * (x - minpos)**2

# Element-wise version of the scalar function above, for plotting over arrays:
func_asympara_vec = np.vectorize(func_asympara)


# ---------------------------------------------------------------------------------- #
# Likelihood program:
# ---------------------------------------------------------------------------------- #

SavePlots = True       # Determining if plots are saved or not
verbose = True         # Should the program print or not?
veryverbose = False    # Should the program print a lot or not?
ScanChi2 = True        # In addition to fit for minimum, do a scan...

# Parameters of the problem:
Ntimes = 100           # Number of time measurements.
tau_truth = 1.0        # We choose (like Gods!) the lifetime.

r = np.random
r.seed(42)             # Random numbers, but identical from run to run.

Nbins = 50             # Number of bins in histogram
tmax = 10.0            # Maximum time in histogram
dt = tmax / Nbins      # Time step

# ------------------------------------
# Generate data:
# ------------------------------------

# List of times, exponentially distributed with lifetime tau:
t = r.exponential(tau_truth, Ntimes)
yExp, xExp_edges = np.histogram(t, bins=Nbins, range=(0, tmax))

if veryverbose:
    for idx, t_value in enumerate(t):
        print(" {0:2d}: t = {1:5.3f}".format(idx, t_value))
        if idx > 100:
            break    # let's restrain ourselves

# ------------------------------------
# Analyse data:
# ------------------------------------
# Define the number of tau values and their range to test in Chi2 and LLH:
# (As we know the "truth", namely tau = 1, the range [0.5, 1.5] seems fitting
# for the mean. The number of bins can be increased at will...)
Ntau_steps = 50 min_tau = 0.5 max_tau = 1.5 delta_tau = (max_tau-min_tau) / Ntau_steps # Loop over hypothesis for the value of tau and calculate Chi2 and LLH: # --------------------------------------------------------------------- chi2_minval = 999999.9 # Minimal Chi2 value found chi2_minpos = 0.0 # Position (i.e. time) of minimal Chi2 value bllh_minval = 999999.9 bllh_minpos = 0.0 ullh_minval = 999999.9 ullh_minpos = 0.0 tau = np.zeros(Ntau_steps+1) chi2 = np.zeros(Ntau_steps+1) bllh = np.zeros(Ntau_steps+1) ullh = np.zeros(Ntau_steps+1) # Now loop of POSSIBLE tau estimates: for itau in range(Ntau_steps+1): tau_hypo = min_tau + itau*delta_tau # Scan in values of tau tau[itau] = tau_hypo # Calculate Chi2 and binned likelihood (from loop over bins in histogram): chi2[itau] = 0.0 bllh[itau] = 0.0 for ibin in range (Nbins) : # Note: The number of EXPECTED events is the intergral over the bin! xlow_bin = xExp_edges[ibin] xhigh_bin = xExp_edges[ibin+1] # Given the start and end of the bin, we calculate the INTEGRAL over the bin, # to get the expected number of events in that bin: nexp = Ntimes * (np.exp(-xlow_bin/tau_hypo) - np.exp(-xhigh_bin/tau_hypo)) # The observed number of events... that is just the data! 
nobs = yExp[ibin] if (nobs > 0): # For ChiSquare but not LLH, we need to require Nobs > 0 chi2[itau] += (nobs-nexp)**2 / nobs # Chi2 summation/function bllh[itau] += -2.0*np.log(stats.poisson.pmf(int(nobs), nexp)) # Binned LLH function if (veryverbose and itau == 0) : print(" Nexp: {0:10.7f} Nobs: {1:3.0f} Chi2: {2:5.1f} BLLH: {3:5.1f}".format(nexp, nobs, chi2[itau], bllh[itau])) # Calculate Unbinned likelihood (from loop over events): ullh[itau] = 0.0 for time in t : ullh[itau] += -2.0*np.log(1.0/tau_hypo*np.exp(-time/tau_hypo)) # Unbinned LLH function if (verbose) : print(" {0:3d}: tau = {1:4.2f} chi2 = {2:6.2f} log(bllh) = {3:6.2f} log(ullh) = {4:6.2f}".format(itau, tau_hypo, chi2[itau], bllh[itau], ullh[itau])) # Search for minimum values of chi2, bllh, and ullh: if (chi2[itau] < chi2_minval) : chi2_minval = chi2[itau] chi2_minpos = tau_hypo if (bllh[itau] < bllh_minval) : bllh_minval = bllh[itau] bllh_minpos = tau_hypo if (ullh[itau] < ullh_minval) : ullh_minval = ullh[itau] ullh_minpos = tau_hypo print(" Decay time of minimum found: chi2 = {0:7.4f}s bllh = {1:7.4f}s ullh = {2:7.4f}s".format(chi2_minpos, bllh_minpos, ullh_minpos)) # ------------------------------------ # Plot and fit results: # ------------------------------------ # Define range around minimum to be fitted: min_fit = 0.15 max_fit = 0.20 # ChiSquare: # ---------- # Prepare figure fig_chi2, ax_chi2 = plt.subplots(figsize=(8, 6)) ax_chi2.set_title("ChiSquare") ax_chi2.set_xlabel(r"Value of $\tau$") ax_chi2.set_ylabel("Value of ChiSquare") # Plot chi2 values ax_chi2.plot(tau, chi2, 'k.', ) ax_chi2.set_xlim(chi2_minpos-min_fit, chi2_minpos+2*max_fit) # Fit chi2 values with our function indexes = (tau>chi2_minpos-min_fit) & (tau 1.0) and ((chi2[Ntau_steps] - chi2_minval) > 1.0)) : found_lower = False found_upper = False for itau in range (Ntau_steps+1) : if ((not found_lower) and ((chi2[itau] - chi2_minval) < 1.0)) : tau_lower = tau[itau] found_lower = True if ((found_lower) and (not found_upper) 
and ((chi2[itau] - chi2_minval) > 1.0)) : tau_upper = tau[itau] found_upper = True print(" Chi2 scan gives: tau = {0:6.4f} + {1:6.4f} - {2:6.4f}".format(chi2_minpos, chi2_minpos-tau_lower, tau_upper-chi2_minpos)) else : print("Error: Chi2 values do not fulfill requirements for finding minimum and errors!") # Binned Likelihood: # ------------------ # Prepare figure fig_bllh, ax_bllh = plt.subplots(figsize=(8, 6)) ax_bllh.set_title("Binned Likelihood") ax_bllh.set_xlabel(r"Value of $\tau$") ax_bllh.set_ylabel("Value of Binned Likelihood") # Plot binned log likelihood values ax_bllh.plot(tau, bllh, 'k.', ) ax_bllh.set_xlim(chi2_minpos-2*min_fit, chi2_minpos+2*max_fit) # Fit binned log likelihood values with our function indexes = (tau>bllh_minpos-min_fit) & (tau 0) : xlow_bin = xExp_edges[ibin] xhigh_bin = xExp_edges[ibin+1] nexp = Ntimes * (np.exp(-xlow_bin/tau_fitted) - np.exp(-xhigh_bin/tau_fitted)) chi2 += (yExp[ibin]-nexp)**2 / yExp[ibin] return chi2 fval_fit = chisquare(minpos) ndof_fit = chi2_object_bllh.ndof # or sum(indexes)-4 prob_fit = stats.chi2.sf(fval_fit, ndof_fit) names = ['Chi2/ndf', 'Prob', 'minval', 'minpos', 'quadlow', 'quadhigh'] values = ["{:.3f} / {:d}".format(fval_fit, ndof_fit), "{:.3f}".format(prob_fit), "{:.3f} +/- {:.3f}".format(minuit_bllh.values['minval'], minuit_bllh.errors['minval']), "{:.3f} +/- {:.3f}".format(minuit_bllh.values['minpos'], minuit_bllh.errors['minpos']), "{:.2f} +/- {:.2f}".format(minuit_bllh.values['quadlow'], minuit_bllh.errors['quadlow']), "{:.2f} +/- {:.2f}".format(minuit_bllh.values['quadhigh'], minuit_bllh.errors['quadhigh']), ] ax_bllh.text(0.55, 0.95, nice_string_output(names, values), family='monospace', transform=ax_chi2.transAxes, fontsize=10, verticalalignment='top') # Define the range in the y-axis to draw in: ax_bllh.set_ylim(bllh_minval-2.0, bllh_minval+15.0) plt.tight_layout() plt.show(block=False) if SavePlots: fig_bllh.savefig("FitMinimum_bllh.pdf", dpi=600) # Unbinned Likelihood: # 
-------------------- # Prepare figure fig_ullh, ax_ullh = plt.subplots(figsize=(8, 6)) ax_ullh.set_title("Unbinned Likelihood") ax_ullh.set_xlabel(r"Value of $\tau$") ax_ullh.set_ylabel("Value of Unbinned Likelihood") # Plot unbinned log likelihood values ax_ullh.plot(tau, ullh, 'k.', ) ax_ullh.set_xlim(chi2_minpos-min_fit, chi2_minpos+2*max_fit) # Fit unbinned log likelihood values with our function indexes = (tau>ullh_minpos-min_fit) & (tau0 # only bins with values! xExp = (xExp_edges[1:] + xExp_edges[:-1])/2 # move from bins edges to bin centers syExp = np.sqrt(yExp) # uncertainties ax_fit.errorbar(xExp[indexes], yExp[indexes], syExp[indexes], fmt='k_', ecolor='k', elinewidth=1, capsize=2, capthick=1) # Chisquare-fit tau values with our function chi2_object_fit = Chi2Regression(func_exp, xExp[indexes], yExp[indexes], syExp[indexes]) # NOTE: The constant for normalization is NOT left free in order to have only ONE parameter! minuit_fit_chi2 = Minuit(chi2_object_fit, pedantic=False, limit_tau=(min_tau,max_tau), tau=tau_truth, fix_N0=True, N0=Ntimes) minuit_fit_chi2.migrad() # Plot fit N0, tau_fit_chi2 = minuit_fit_chi2.args x_fit = np.linspace(0, 10, 1000) y_fit_simple = func_exp(x_fit, N0, tau_fit_chi2) ax_fit.plot(x_fit, y_fit_simple, 'b-', label="ChiSquare fit") # Binned likelihood-fit tau values with our function # extended=True because we have our own normalization in our fit function bllh_object_fit = BinnedLH(func_exp, t, bins=Nbins, bound=(0, tmax), extended=True) minuit_fit_bllh = Minuit(bllh_object_fit, pedantic=False, limit_tau=(min_tau,max_tau), tau=tau_truth, fix_N0=True, N0=Ntimes) minuit_fit_bllh.migrad() # Plot fit N0, tau_fit_bllh = minuit_fit_bllh.args x_fit = np.linspace(0, 10, 1000) y_fit_simple = func_exp(x_fit, N0, tau_fit_bllh) ax_fit.plot(x_fit, y_fit_simple, 'r-', label="Binned Likelihood fit") # Show fit statistics in figure fval_fit_chi2 = minuit_fit_chi2.fval # chi2_object_fit.ndof does not take into account that N0 is fixed. 
# It incorrectly gives us one degree less.
ndof_fit_chi2 = sum(indexes) - 1
prob_fit_chi2 = stats.chi2.sf(fval_fit_chi2, ndof_fit_chi2)

# Evaluate the BLLH fit with the same chi2 measure, so the two are comparable:
fval_fit_bllh = chisquare(tau_fit_bllh)
# We can not use bllh_object_fit.ndof either because it uses all bins in the
# BLLH fit, while chi2 still cannot use empty bins.
ndof_fit_bllh = sum(indexes) - 1
prob_fit_bllh = stats.chi2.sf(fval_fit_bllh, ndof_fit_bllh)

# Compose the fit-statistics text box (Chi2 fit on top, BLLH fit below):
string = "Chi2 fit: \n"
names = ['Chi2/ndf', 'Prob', 'tau']
values = ["{:.4f} / {:d}".format(fval_fit_chi2, ndof_fit_chi2),
          "{:.3f}".format(prob_fit_chi2),
          "{:.3f} +/- {:.3f}".format(minuit_fit_chi2.values['tau'],
                                     minuit_fit_chi2.errors['tau'])
          ]
string += nice_string_output(names, values)

string += "\n\nBLLH fit: \n"
names = ['Chi2/ndf', 'Prob', 'tau']
values = ["{:.4f} / {:d}".format(fval_fit_bllh, ndof_fit_bllh),
          "{:.3f}".format(prob_fit_bllh),
          "{:.3f} +/- {:.3f}".format(minuit_fit_bllh.values['tau'],
                                     minuit_fit_bllh.errors['tau'])
          ]
string += nice_string_output(names, values)

ax_fit.text(0.55, 0.95, string, family='monospace',
            transform=ax_fit.transAxes, fontsize=10, verticalalignment='top')

# Define the ranges:
ax_fit.set_xlim(0, 10)
ax_fit.set_ylim(bottom=0)

plt.legend(loc='lower right')
plt.tight_layout()
plt.show(block=False)

if (SavePlots):
    fig_fit.savefig("ExponentialDist_Fitted.pdf", dpi=600)

# Finally, ensure that the program does not terminate (and the plot disappears),
# before you press enter.
# BUGFIX: raw_input() exists only in Python 2; under Python 3 it raised an
# uncaught NameError here. Fall back to input() so the pause works on both.
# Also catch NameError explicitly instead of a bare except.
try:
    __IPYTHON__          # Defined only when running inside IPython — no pause needed.
except NameError:
    try:
        pause = raw_input    # Python 2
    except NameError:
        pause = input        # Python 3
    pause('Press Enter to exit')

# ---------------------------------------------------------------------------------- #
#
# Make sure that you understand how the likelihood is different from the ChiSquare,
# and how the binned likelihood is different from the unbinned. If you don't do it,
# this exercise, and much of the course and statistics in general will be a bit lost
# :-)
#
# The binned likelihood resembles the ChiSquare a bit, only the evaluation in each bin
# is different, especially if the number of events in the bin is low, as the PDF
# considered (Poisson for the LLH, Gaussian for the ChiSquare) is then different.
#
# The unbinned likelihood uses each single event, and is thus different at its core.
# This can make a difference, if there are only few events and/or if each event has
# several attributes, which can't be summarized in a simple histogram with bins.
#
#
# Questions:
# ----------
# 1) Consider the four plots produced by running the program, starting with the one
#    showing the data and two fits (Chi2 and BLLH) to it. Do the two methods give
#    similar results, or are they different? Consider both central value and
#    uncertainty.
#    Now consider the three curves showing the chi2, bllh, and ullh as a function
#    of lifetime. Do you see the relation between the curves and the fit result?
#    Again, consider both the central values and the uncertainty.
#
# 2) Try to decrease the number of exponential numbers you consider to say 10,
#    and see how things change. Does the difference between Chi2, BLLH, and ULLH
#    get bigger or not?
#
# 3) Try to increase the number of exponential numbers you consider to say 5000,
#    and see what happens to the difference between Chi2 and BLLH?
#    Also, do the errors become more symmetric? Perhaps you will need to consider
#    a shorter range of the fit around the minimal value, and have to also
#    increase the number of points you calculate the chi2/bllh/ullh (or decrease
#    the range you search!), and possibly change the ranges of your plotting.
#
# Advanced Questions:
#
# 4) Make (perhaps in a new program) a loop over the production of random data,
#    and try to see, if you can print (or plot) the Chi2 and BLLH results for each
#    turn. Can you spot any general trends? I.e. is the Chi2 uncertainty always
#    lower or higher than the (B/U)LLH? And are any of the estimators biased?
#
# 5) Make a copy of the program and put in a different PDF (i.e. not the exponential).
#    Run it, and see if the errors are still asymmetric. For the function, try either
#    e.g. a Uniform or a Gaussian.
#
# ---------------------------------------------------------------------------------- #