#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
# Python program for analysing measurements of the length of the lecture table in
# Auditorium A at NBI.
#
# There are two measurements of the table length, each with an estimated error:
#   1: Measurement with a 30cm ruler.
#   2: Measurement with a 2m folding ruler.
#
# Each person was asked not only to state the measurement, but also their estimated
# uncertainty. None of the persons could see the others' measurements, in order to get
# the largest degree of independence. Also, the 30cm ruler measurement was to be done
# first. Last but not least, everyone was encouraged to include millimeters, even if
# the precision was significantly less (for binning reasons).
#
# Author: Troels C. Petersen (NBI)
# Email:  petersen@nbi.dk
# Date:   24th of November 2017
#
# ----------------------------------------------------------------------------------- #

# First, import the modules you want to use:
from __future__ import print_function, division    # Python 2/3 compatibility
import numpy as np                     # Matlab-like syntax for linear algebra and functions
import matplotlib.pyplot as plt       # Plots and figures like you know them from Matlab
from iminuit import Minuit            # The actual fitting tool, better than scipy's
from probfit import BinnedLH, Chi2Regression, Extended, UnbinnedLH   # Helper tools for fitting
from scipy import stats
from scipy.special import erfc

plt.close('all')                      # Close all open figures

# Define if things are printed to the screen, and written to files:
verbose = True
SavePlots = True

# Define the function "sqr", which squares whatever you give it:
def sqr(a):
    return a*a

# Define your PDF / model:
def gauss_extended(x, N, mu, sigma):
    """Gaussian scaled to integrate to N (an 'extended' normalized Gaussian)."""
    return N / np.sqrt(2 * np.pi) / sigma * np.exp(-(x - mu)**2 / 2. / sigma**2)

# Function to create a nice string output for figures:
def nice_string_output(names, values, extra_spacing=2):
    max_values = len(max(values, key=len))
    max_names = len(max(names, key=len))
    string = ""
    for name, value in zip(names, values):
        string += "{0:s} {1:>{spacing}} \n".format(name, value,
                      spacing=extra_spacing + max_values + max_names - len(name))
    return string[:-2]                # Strip the trailing " \n"
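
# A quick sanity check of the model (a minimal sketch, not part of the analysis
# itself): gauss_extended should integrate to N, so a numerical integral over a
# wide range should return approximately the N that was put in.
if verbose:
    x_check = np.linspace(-10.0, 10.0, 10001)
    integral = np.trapz(gauss_extended(x_check, N=100.0, mu=0.0, sigma=1.0), x_check)
    print(" Sanity check: integral of gauss_extended = {:.3f} (should be ~100)".format(integral))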

# ----------------------------------------------------------------------------------- #

# Define lists to contain the measurements, such that you can easily loop over them later!
L30cm = []
eL30cm = []
L2m = []
eL2m = []

# Data files:
infiles = ["data_TableMeasurements2009.txt",
           "data_TableMeasurements2010.txt",
           "data_TableMeasurements2011.txt",
           "data_TableMeasurements2012.txt",
           "data_TableMeasurements2013.txt",
           "data_TableMeasurements2014.txt",
           "data_TableMeasurements2015.txt",
           "data_TableMeasurements2016.txt",
           "data_TableMeasurements2017.txt"]

# Loop over files and open them:
for infile in infiles:
    with open(infile, 'r') as f:

        # Read and ignore the header lines (the ones stating what the numbers represent):
        header1 = f.readline()
        header2 = f.readline()

        # Loop over lines in the files and extract the variables of interest:
        for line in f:
            line = line.strip()       # Strip leading/trailing whitespace (incl. newline)
            columns = line.split()    # Split the line into its parts (columns is now a list)

            if (len(columns) == 4):   # Check that there are four numbers!

                # Record (and print) the measurements:
                L30cm.append(float(columns[0]))
                eL30cm.append(float(columns[1]))
                L2m.append(float(columns[2]))
                eL2m.append(float(columns[3]))

                if (verbose):
                    print(" Measurement {:3d}: L(30cm): {:6.3f} +- {:6.3f} m   L(2m): {:6.3f} +- {:6.3f} m".format(
                          len(L30cm), L30cm[-1], eL30cm[-1], L2m[-1], eL2m[-1]))

            # Make sure that you are warned about data lines NOT used!
            else:
                print("Warning: Line read did not have 4 entries but ", len(columns))

    # Note: no f.close() is needed - the "with" statement closes the file automatically.

# ----------------------------------------------------------------
# Numpy version of the data reading (however, without the checks):
# ----------------------------------------------------------------
L30cm = np.array([])
eL30cm = np.array([])
L2m = np.array([])
eL2m = np.array([])

# Loop over files and open them:
for infile in infiles:
    tmp_L30cm, tmp_eL30cm, tmp_L2m, tmp_eL2m = np.loadtxt(infile, skiprows=2, unpack=True)
    L30cm = np.append(L30cm, tmp_L30cm)
    eL30cm = np.append(eL30cm, tmp_eL30cm)
    L2m = np.append(L2m, tmp_L2m)
    eL2m = np.append(eL2m, tmp_eL2m)

Nread = len(L30cm)        # Number of measurements read in total

# ----------------------------------------------------------------------------------- #
#
# Calculate raw/naive/uncorrected/unweighted mean, RMS and from that the error on the mean.
# This should in your code be a call to your own function!

if (len(L30cm) > 1):      # Check that there is more than one measurement (as we divide by N-1)!

    # Calculate the (unweighted) mean:
    sum1_30raw = 0.0
    for i30 in range(len(L30cm)):
        sum1_30raw += L30cm[i30]
    mean_30raw = sum1_30raw / len(L30cm)

    # Calculate the RMS:
    sum2_30raw = 0.0
    for i30 in range(len(L30cm)):
        sum2_30raw += np.square(L30cm[i30] - mean_30raw)
    RMS_30raw = np.sqrt(sum2_30raw / (len(L30cm)-1))   # RMS with N-1 degrees of freedom
    emean_30raw = RMS_30raw / np.sqrt(len(L30cm))      # Uncertainty on the mean is RMS / sqrt(N)

    # This is the "naive" mean, in that it simply uses all measurements without any inspection.
    print("\n  RAW:  Mean = {:6.4f} +- {:6.4f} m      RMS = {:6.4f} m    (N = {:3d})\n".format(
          mean_30raw, emean_30raw, RMS_30raw, len(L30cm)))
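
# As noted above, the mean/RMS calculation should really be a call to your own
# function. A minimal sketch of such a helper (the function name is my own choice,
# not prescribed by the exercise):
def mean_rms_emean(data):
    """Return (mean, RMS, error on the mean) for a sequence of measurements."""
    data = np.asarray(data, dtype=float)
    N = len(data)
    mean = data.sum() / N
    rms = np.sqrt(np.square(data - mean).sum() / (N - 1))   # N-1 degrees of freedom
    return mean, rms, rms / np.sqrt(N)

# Usage (should reproduce the RAW numbers printed above):
#   mean_30raw, RMS_30raw, emean_30raw = mean_rms_emean(L30cm)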

# ----------------------------------------------------------------------------------- #

# Write down the number of bins and the range used once and for all - then it is easy to change!
Nbins = 1000
Lmin = 0.0
Lmax = 5.0
binwidth = (Lmax - Lmin) / Nbins

# Define a histogram with the lengths recorded:
fig, ax = plt.subplots(ncols=1, figsize=(12, 6))
hist_L30cm = ax.hist(L30cm, bins=Nbins, range=(Lmin, Lmax), histtype='step')

# -------------------------------------- IMPORTANT NOTE ----------------------------------------- #
# Always think a bit about the binning! It should match the purpose of the plot, i.e. include
# all measurements (at least to begin with), and have a "regular" size (so that it can be read
# easily).
# In order to do this, I decided that since one of course should include ALL measurements,
# and as one would also like to be able to note features of less than 1cm, I chose 1000 bins
# over the range 0-5m, giving a bin size of 5mm. This has also been put on the y-axis, to help
# the reader know the bin size.
# However, additional plots might want to choose differently for both range and binning.

ax.set_title("Length estimates by 30cm ruler")
ax.set_xlabel("Table length (m)")
ax.set_ylabel("Frequency / 5mm")
ax.set_xlim(Lmin, Lmax)

# Fit the 30cm measurements:
# NOTE: In the line starting with "minuit = ...", it is VERY IMPORTANT to give the fit good
# starting values! This is the greatest art in fitting. For a Gaussian, it is fortunately
# not very hard.
LLH_object = BinnedLH(gauss_extended, L30cm, bound=(Lmin, Lmax), extended=True)
minuit = Minuit(LLH_object, pedantic=False, mu=L30cm.mean(), sigma=L30cm.std(ddof=1),
                N=len(L30cm)*binwidth, print_level=0)
minuit.migrad()                            # Perform the actual fit
fit_N, fit_mu, fit_sigma = minuit.args     # Get the fitted values of the parameters

print("\n  L30cm raw fit:")
for name in minuit.parameters:
    print("  Fit value: {0} = {1:.5f} +/- {2:.5f}".format(name, minuit.values[name], minuit.errors[name]))

x_fit = np.linspace(Lmin, Lmax, 2*Nbins)                           # Create the x-axis for the plot of the fitted function
y_fit = binwidth*gauss_extended(x_fit, fit_N, fit_mu, fit_sigma)   # Evaluate the fitted function
ax.plot(x_fit, y_fit, '-', color='red')                            # Plot the fitted function

# A plt-histogram returns three things (counts, bins, and patches/drawing objects), where we are
# only interested in the first two. The third object is put into the variable "_", which is just
# used as a dummy (and not used further). In other programming languages, the variable name
# "dummy" is often used.
counts, bin_edges, _ = hist_L30cm
bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])

L30cm_x = bin_centers
L30cm_y = counts
L30cm_sy = np.sqrt(L30cm_y)

Nentries = 0
Chi2_val = 0

# A smart way of looping over variables in several lists of the same length is "zip":
for x, y, sy in zip(L30cm_x, L30cm_y, L30cm_sy):
    if (y > 0):
        f = binwidth * gauss_extended(x, fit_N, fit_mu, fit_sigma)   # Calculate the model value
        residual = (y - f) / sy       # Find the uncertainty-weighted residual
        Chi2_val += residual**2       # The chi2-value is the sum of squared residuals
        Nentries += 1                 # Count the bin as used, since y > 0 (and thus sy > 0)

Ndof = Nentries - len(minuit.args)
Prob = stats.chi2.sf(Chi2_val, Ndof)  # The chi2 probability given Ndof degrees of freedom

names = ['Entries', 'Mean', 'Std Dev', 'Chi2/ndf', 'Prob', 'N', 'mu', 'sigma']
values = ["{:d}".format(len(L30cm)),
          "{:.3f}".format(L30cm.mean()),
          "{:.3f}".format(L30cm.std(ddof=1)),
          "{:.3f} / {:d}".format(Chi2_val, Ndof),
          "{:.3f}".format(Prob),
          "{:.1f} +/- {:.1f}".format(minuit.values['N'], minuit.errors['N']),
          "{:.3f} +/- {:.3f}".format(minuit.values['mu'], minuit.errors['mu']),
          "{:.3f} +/- {:.3f}".format(minuit.values['sigma'], minuit.errors['sigma']),
          ]
ax.text(0.02, 0.95, nice_string_output(names, values), family='monospace',
        transform=ax.transAxes, fontsize=12, verticalalignment='top')

plt.tight_layout()
plt.show(block=False)

if SavePlots:
    fig.savefig("TableMeasurements_30cmRuler_Raw.pdf", dpi=600)
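
# ----------------------------------------------------------------------------------- #
# The chi2 above was calculated by hand, but probfit's Chi2Regression (imported at the
# top, though not used so far) can do the equivalent binned chi2 fit directly. A minimal
# sketch, assuming the probfit/iminuit versions used above: the mask mirrors the y > 0
# requirement in the loop, and counts are converted to a density (divided by the bin
# width), since gauss_extended is a density:
mask = L30cm_y > 0
chi2_object = Chi2Regression(gauss_extended, L30cm_x[mask], L30cm_y[mask]/binwidth,
                             error=L30cm_sy[mask]/binwidth)
minuit_chi2 = Minuit(chi2_object, pedantic=False, mu=fit_mu, sigma=fit_sigma,
                     N=fit_N, print_level=0)
minuit_chi2.migrad()
print("  Chi2 fit cross-check: Chi2 = {:.3f} (by hand above: {:.3f})".format(
      chi2_object(*minuit_chi2.args), Chi2_val))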
# See "Common definition" in: http://en.wikipedia.org/wiki/Accuracy_and_precision # # 2) Do any of the measurements looks wrong/bad/suspecious? Do you see any repeated # mistakes done? Would you correct or exclude any of the measurements and how would # you justify this? # Also, fit the length measurements with a Gaussian distribution in a small range # around what you believe is the true value. Should the binning be changed for these # fits? And is the Gaussian distribution justified? Also, do you see any "human" # effects of rounding? Did any of your class mates (or you?) not read to mm precision? # # 3) Once you have selected the measurements you want to use, calculate the mean, RMS # and uncertainty on the mean (using sums). How much did your result improve in # precision? # # ------------------- Now repeat the above for the 2m folding rule ---------------------- # # 4) How much better/worse is the single measurement uncertainty from the 30cm ruler case # to the 2m folding rule? # # 5) The "Pull" distribution is defined as the plot of z = (x - ) / sigma(x). If the # measurements and uncertainties are good, then it should give a unit Gaussian. Is # that the case? And thus, were the uncertainty estimates/guesses reasonable? # If not, then the pull RMS is often used to remove overly precise measurements, and # scale the errors on the remaining measurements to a reasonable level. # # 6) Try to calculate the weighted mean. Do you need to discard more dubious measurements? # Did the result improve further in precision? # # 7) Does the length of the table seems to be different when measured with a 30cm and a 2m # ruler? Quantify this statement! I.e. what is the difference, and what is the # uncertainty on that difference? Is this significantly away from 0? # # 8) If you were asked for the best estimate of the length of the table, what would # you do? (If posssible, read Bevington page 58 bottom!) # # # Advanced questions: # ------------------- # 9) Is there any correlation between the errors on the measurements and the actual # distance to the mean (i.e. the "residual")? That is, do the errors contain useful # information, that should be used in the calculation of the most accurate estimate of # the mean? # In all simplicity - did those who had a measurement close to the mean also put down a # small uncertainty? What is the correlation between residuals and the uncertainties? # Yes, calculate the correlation yourself! # #----------------------------------------------------------------------------------