#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
# Python program for analysing measurements of the length of the lecture table in
# Auditorium A at NBI.
#
# There are two measurements of the table length, each with an estimated error:
#   1: Measurement with a 30cm ruler.
#   2: Measurement with a 2m folding ruler.
#
# Each person was asked not only to state the measurement, but also their estimated
# uncertainty. None of the persons could see the others' measurements, in order to get
# the largest degree of independence. Also, the 30cm ruler measurement was to be done
# first. Last but not least, everyone was encouraged to include millimeters, even if
# the precision was significantly less (for binning reasons).
#
# Author: Troels C. Petersen (NBI)
# Email:  petersen@nbi.dk
# Date:   24th of November 2017
#
# ----------------------------------------------------------------------------------- #

# First, import the modules you want to use:
from __future__ import print_function, division    # Python 2/3 compatibility
import numpy as np                     # Matlab-like syntax for linear algebra and functions
import matplotlib.pyplot as plt       # Plots and figures like you know them from Matlab
from iminuit import Minuit            # The actual fitting tool, better than scipy's
from probfit import BinnedLH, Chi2Regression, Extended, UnbinnedLH   # Helper tools for fitting
from scipy import stats
from scipy.special import erfc

plt.close('all')                      # Close all open figures

# Define if things are printed to the screen, and written to files:
verbose = True
SavePlots = True

# Define the function "sqr", which squares whatever you give it:
def sqr(a):
    return a*a

# Define your PDF / model:
def gauss_extended(x, N, mu, sigma):
    """Gaussian scaled to integrate to N (an 'extended' normalized Gaussian)."""
    return N / np.sqrt(2 * np.pi) / sigma * np.exp(-(x - mu)**2 / 2. / sigma**2)

# Function to create a nice string output for figures:
def nice_string_output(names, values, extra_spacing=2):
    max_values = len(max(values, key=len))
    max_names = len(max(names, key=len))
    string = ""
    for name, value in zip(names, values):
        string += "{0:s} {1:>{spacing}} \n".format(name, value,
                      spacing=extra_spacing + max_values + max_names - len(name))
    return string[:-2]                # Strip the trailing " \n"
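
# A quick sanity check of the model (a minimal sketch, not part of the analysis
# itself): gauss_extended should integrate to N, so a numerical integral over a
# wide range should return approximately the N that was put in.
if verbose:
    x_check = np.linspace(-10.0, 10.0, 10001)
    integral = np.trapz(gauss_extended(x_check, N=100.0, mu=0.0, sigma=1.0), x_check)
    print(" Sanity check: integral of gauss_extended = {:.3f} (should be ~100)".format(integral))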

# ----------------------------------------------------------------------------------- #

# Define lists to contain the measurements, such that you can easily loop over them later!
L30cm = []
eL30cm = []
L2m = []
eL2m = []

# Data files:
infiles = ["data_TableMeasurements2009.txt",
           "data_TableMeasurements2010.txt",
           "data_TableMeasurements2011.txt",
           "data_TableMeasurements2012.txt",
           "data_TableMeasurements2013.txt",
           "data_TableMeasurements2014.txt",
           "data_TableMeasurements2015.txt",
           "data_TableMeasurements2016.txt",
           "data_TableMeasurements2017.txt"]

# Loop over files and open them:
for infile in infiles:
    with open(infile, 'r') as f:

        # Read and ignore the header lines (the ones stating what the numbers represent):
        header1 = f.readline()
        header2 = f.readline()

        # Loop over lines in the files and extract the variables of interest:
        for line in f:
            line = line.strip()       # Strip leading/trailing whitespace (incl. newline)
            columns = line.split()    # Split the line into its parts (columns is now a list)

            if (len(columns) == 4):   # Check that there are four numbers!

                # Record (and print) the measurements:
                L30cm.append(float(columns[0]))
                eL30cm.append(float(columns[1]))
                L2m.append(float(columns[2]))
                eL2m.append(float(columns[3]))

                if (verbose):
                    print(" Measurement {:3d}: L(30cm): {:6.3f} +- {:6.3f} m   L(2m): {:6.3f} +- {:6.3f} m".format(
                          len(L30cm), L30cm[-1], eL30cm[-1], L2m[-1], eL2m[-1]))

            # Make sure that you are warned about data lines NOT used!
            else:
                print("Warning: Line read did not have 4 entries but ", len(columns))

    # Note: no f.close() is needed - the "with" statement closes the file automatically.

# ----------------------------------------------------------------
# Numpy version of the data reading (however, without the checks):
# ----------------------------------------------------------------
L30cm = np.array([])
eL30cm = np.array([])
L2m = np.array([])
eL2m = np.array([])

# Loop over files and open them:
for infile in infiles:
    tmp_L30cm, tmp_eL30cm, tmp_L2m, tmp_eL2m = np.loadtxt(infile, skiprows=2, unpack=True)
    L30cm = np.append(L30cm, tmp_L30cm)
    eL30cm = np.append(eL30cm, tmp_eL30cm)
    L2m = np.append(L2m, tmp_L2m)
    eL2m = np.append(eL2m, tmp_eL2m)

Nread = len(L30cm)        # Number of measurements read in total

# ----------------------------------------------------------------------------------- #
#
# Calculate raw/naive/uncorrected/unweighted mean, RMS and from that the error on the mean.
# This should in your code be a call to your own function!

if (len(L30cm) > 1):      # Check that there is more than one measurement (as we divide by N-1)!

    # Calculate the (unweighted) mean:
    sum1_30raw = 0.0
    for i30 in range(len(L30cm)):
        sum1_30raw += L30cm[i30]
    mean_30raw = sum1_30raw / len(L30cm)

    # Calculate the RMS:
    sum2_30raw = 0.0
    for i30 in range(len(L30cm)):
        sum2_30raw += np.square(L30cm[i30] - mean_30raw)
    RMS_30raw = np.sqrt(sum2_30raw / (len(L30cm)-1))   # RMS with N-1 degrees of freedom
    emean_30raw = RMS_30raw / np.sqrt(len(L30cm))      # Uncertainty on the mean is RMS / sqrt(N)

    # This is the "naive" mean, in that it simply uses all measurements without any inspection.
    print("\n  RAW:  Mean = {:6.4f} +- {:6.4f} m      RMS = {:6.4f} m    (N = {:3d})\n".format(
          mean_30raw, emean_30raw, RMS_30raw, len(L30cm)))
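
# As noted above, the mean/RMS calculation should really be a call to your own
# function. A minimal sketch of such a helper (the function name is my own choice,
# not prescribed by the exercise):
def mean_rms_emean(data):
    """Return (mean, RMS, error on the mean) for a sequence of measurements."""
    data = np.asarray(data, dtype=float)
    N = len(data)
    mean = data.sum() / N
    rms = np.sqrt(np.square(data - mean).sum() / (N - 1))   # N-1 degrees of freedom
    return mean, rms, rms / np.sqrt(N)

# Usage (should reproduce the RAW numbers printed above):
#   mean_30raw, RMS_30raw, emean_30raw = mean_rms_emean(L30cm)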

# ----------------------------------------------------------------------------------- #

# Write down the number of bins and the range used once and for all - then it is easy to change!
Nbins = 1000
Lmin = 0.0
Lmax = 5.0
binwidth = (Lmax - Lmin) / Nbins

# Define a histogram with the lengths recorded:
fig, ax = plt.subplots(ncols=1, figsize=(12, 6))
hist_L30cm = ax.hist(L30cm, bins=Nbins, range=(Lmin, Lmax), histtype='step')

# -------------------------------------- IMPORTANT NOTE ----------------------------------------- #
# Always think a bit about the binning! It should match the purpose of the plot, i.e. include
# all measurements (at least to begin with), and have a "regular" size (so that it can be read
# easily).
# In order to do this, I decided that since one of course should include ALL measurements,
# and as one would also like to be able to note features of less than 1cm, I chose 1000 bins
# over the range 0-5m, giving a bin size of 5mm. This has also been put on the y-axis, to help
# the reader know the bin size.
# However, additional plots might want to choose differently for both range and binning.

ax.set_title("Length estimates by 30cm ruler")
ax.set_xlabel("Table length (m)")
ax.set_ylabel("Frequency / 5mm")
ax.set_xlim(Lmin, Lmax)

# Fit the 30cm measurements:
# NOTE: In the line starting with "minuit = ...", it is VERY IMPORTANT to give the fit good
# starting values! This is the greatest art in fitting. For a Gaussian, it is fortunately
# not very hard.
LLH_object = BinnedLH(gauss_extended, L30cm, bound=(Lmin, Lmax), extended=True)
minuit = Minuit(LLH_object, pedantic=False, mu=L30cm.mean(), sigma=L30cm.std(ddof=1),
                N=len(L30cm)*binwidth, print_level=0)
minuit.migrad()                            # Perform the actual fit
fit_N, fit_mu, fit_sigma = minuit.args     # Get the fitted values of the parameters

print("\n  L30cm raw fit:")
for name in minuit.parameters:
    print("  Fit value: {0} = {1:.5f} +/- {2:.5f}".format(name, minuit.values[name], minuit.errors[name]))

x_fit = np.linspace(Lmin, Lmax, 2*Nbins)                           # Create the x-axis for the plot of the fitted function
y_fit = binwidth*gauss_extended(x_fit, fit_N, fit_mu, fit_sigma)   # Evaluate the fitted function
ax.plot(x_fit, y_fit, '-', color='red')                            # Plot the fitted function

# A plt-histogram returns three things (counts, bins, and patches/drawing objects), where we are
# only interested in the first two. The third object is put into the variable "_", which is just
# used as a dummy (and not used further). In other programming languages, the variable name
# "dummy" is often used.
counts, bin_edges, _ = hist_L30cm
bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])

L30cm_x = bin_centers
L30cm_y = counts
L30cm_sy = np.sqrt(L30cm_y)

Nentries = 0
Chi2_val = 0

# A smart way of looping over variables in several lists of the same length is "zip":
for x, y, sy in zip(L30cm_x, L30cm_y, L30cm_sy):
    if (y > 0):
        f = binwidth * gauss_extended(x, fit_N, fit_mu, fit_sigma)   # Calculate the model value
        residual = (y - f) / sy       # Find the uncertainty-weighted residual
        Chi2_val += residual**2       # The chi2-value is the sum of squared residuals
        Nentries += 1                 # Count the bin as used, since y > 0 (and thus sy > 0)

Ndof = Nentries - len(minuit.args)
Prob = stats.chi2.sf(Chi2_val, Ndof)  # The chi2 probability given Ndof degrees of freedom

names = ['Entries', 'Mean', 'Std Dev', 'Chi2/ndf', 'Prob', 'N', 'mu', 'sigma']
values = ["{:d}".format(len(L30cm)),
          "{:.3f}".format(L30cm.mean()),
          "{:.3f}".format(L30cm.std(ddof=1)),
          "{:.3f} / {:d}".format(Chi2_val, Ndof),
          "{:.3f}".format(Prob),
          "{:.1f} +/- {:.1f}".format(minuit.values['N'], minuit.errors['N']),
          "{:.3f} +/- {:.3f}".format(minuit.values['mu'], minuit.errors['mu']),
          "{:.3f} +/- {:.3f}".format(minuit.values['sigma'], minuit.errors['sigma']),
          ]
ax.text(0.02, 0.95, nice_string_output(names, values), family='monospace',
        transform=ax.transAxes, fontsize=12, verticalalignment='top')

plt.tight_layout()
plt.show(block=False)

if SavePlots:
    fig.savefig("TableMeasurements_30cmRuler_Raw.pdf", dpi=600)
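
# ----------------------------------------------------------------------------------- #
# The chi2 above was calculated by hand, but probfit's Chi2Regression (imported at the
# top, though not used so far) can do the equivalent binned chi2 fit directly. A minimal
# sketch, assuming the probfit/iminuit versions used above: the mask mirrors the y > 0
# requirement in the loop, and counts are converted to a density (divided by the bin
# width), since gauss_extended is a density:
mask = L30cm_y > 0
chi2_object = Chi2Regression(gauss_extended, L30cm_x[mask], L30cm_y[mask]/binwidth,
                             error=L30cm_sy[mask]/binwidth)
minuit_chi2 = Minuit(chi2_object, pedantic=False, mu=fit_mu, sigma=fit_sigma,
                     N=fit_N, print_level=0)
minuit_chi2.migrad()
print("  Chi2 fit cross-check: Chi2 = {:.3f} (by hand above: {:.3f})".format(
      chi2_object(*minuit_chi2.args), Chi2_val))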
# See "Common definition" in: http://en.wikipedia.org/wiki/Accuracy_and_precision # # 2) Do any of the measurements looks wrong/bad/suspecious? Do you see any repeated # mistakes done? Would you correct or exclude any of the measurements and how would # you justify this? # Also, fit the length measurements with a Gaussian distribution in a small range # around what you believe is the true value. Should the binning be changed for these # fits? And is the Gaussian distribution justified? Also, do you see any "human" # effects of rounding? Did any of your class mates (or you?) not read to mm precision? # # 3) Once you have selected the measurements you want to use, calculate the mean, RMS # and uncertainty on the mean (using sums). How much did your result improve in # precision? # # ------------------- Now repeat the above for the 2m folding rule ---------------------- # # 4) How much better/worse is the single measurement uncertainty from the 30cm ruler case # to the 2m folding rule? # # 5) The "Pull" distribution is defined as the plot of z = (x - ) / sigma(x). If the # measurements and uncertainties are good, then it should give a unit Gaussian. Is # that the case? And thus, were the uncertainty estimates/guesses reasonable? # If not, then the pull RMS is often used to remove overly precise measurements, and # scale the errors on the remaining measurements to a reasonable level. # # 6) Try to calculate the weighted mean. Do you need to discard more dubious measurements? # Did the result improve further in precision? # # 7) Does the length of the table seems to be different when measured with a 30cm and a 2m # ruler? Quantify this statement! I.e. what is the difference, and what is the # uncertainty on that difference? Is this significantly away from 0? # # 8) If you were asked for the best estimate of the length of the table, what would # you do? (If posssible, read Bevington page 58 bottom!) # # # Advanced questions: # ------------------- # 9) Is there any correlation between the errors on the measurements and the actual # distance to the mean (i.e. the "residual")? That is, do the errors contain useful # information, that should be used in the calculation of the most accurate estimate of # the mean? # In all simplicity - did those who had a measurement close to the mean also put down a # small uncertainty? What is the correlation between residuals and the uncertainties? # Yes, calculate the correlation yourself! # #----------------------------------------------------------------------------------