#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#   Python program for analysing measurements of the length of the lecture table in
#   Auditorium A at NBI.
# 
#   There are two measurements each with an estimated error of the table length:
#     1: Measurement with a 30cm ruler.
#     2: Measurement with a 2m folding ruler.
# 
#   Each person was asked not only to state the measurement, but also their estimated
#   uncertainty. None of the persons could see others measurements in order to get the
#   largest degree of independence. Also, the 30cm ruler measurement was asked to be
#   done first. Last but not least, it was encouraged to include millimeters, even if
#   the precision was significant less (for binning reasons).
# 
#   Author: Troels C. Petersen (NBI)
#   Email:  petersen@nbi.dk
#   Date:   19th of November 2016
#
# ----------------------------------------------------------------------------------- #

from ROOT import *
import os, math

# Set the showing of statistics and fitting results (0 means off, 1 means on):
gStyle.SetOptStat("emr");   # See http://root.cern.ch/root/html/TStyle.html#TStyle:SetOptStat
gStyle.SetOptFit(1111);     # See http://root.cern.ch/root/html/TStyle.html#TStyle:SetOptFit

# Define if things are printed to the screen, and written to files:
verbose = True
SavePlots = True

# Define the function "sqr", which squares whatever you give it:
def sqr(a) : return a*a


# ----------------------------------------------------------------------------------- #

# Define a histogram with the lengths recorded:
Hist_L30cm     = TH1F("Hist_L30cm",    "Lengths estimates by 30cm ruler", 200, 0.0, 5.0);

# NOTE: I thought a bit about the binning, and decided that since one of course should include
#       ALL measurements (at least in the first plot), then as one would also like to be able
#       to note features of less than 1cm, I choose 1000 bins, giving a bin size of 5mm.
#       However, additional plots might want to choose differently for both range and binning.

# Define lists to contain the measurements, such that you can easily loop over them later!
L30cm = []
eL30cm = []
L2m = []
eL2m = []

# Data files:
infiles = ["data_TableMeasurements2009.txt", "data_TableMeasurements2010.txt", "data_TableMeasurements2011.txt", "data_TableMeasurements2012.txt", "data_TableMeasurements2013.txt", "data_TableMeasurements2014.txt", "data_TableMeasurements2015.txt", "data_TableMeasurements2016.txt"]

# Loop over files and open them
for infile in infiles :
    with open( infile, 'r' ) as f : 

        # Read and ignore header lines (the ones showing what the numbers represents):
        header1 = f.readline()
        header2 = f.readline()

        # Loop over lines in the files and extract variables of interest:
        for line in f:
            line = line.strip()              # Remove all "strange" characters
            columns = line.split()           # Split the line into its parts (columns is now a list)
            if (len(columns) == 4) :         # Check that there are four numbers!

                # Record (and print) the measurements:
                L30cm.append(float(columns[0]))
                eL30cm.append(float(columns[1]))
                L2m.append(float(columns[2]))
                eL2m.append(float(columns[3]))        
                if (verbose) :
                    print "  Measurement %3d:   L(30cm): %6.3f +- %6.3f m      L(2m): %6.3f +- %6.3f m"%(int(len(L30cm)), L30cm[-1], eL30cm[-1], L2m[-1], eL2m[-1])
                    # print "  %6.3f  %6.3f       %6.3f  %6.3f"%(L30cm[-1], eL30cm[-1], L2m[-1], eL2m[-1])

                # Plot measurements into histogram:
                Hist_L30cm.Fill(L30cm[-1])

            # Make sure that you are warned about data lines NOT used!
            else :
                print "Warning: Line read did not have 4 entries but ", len(columns)

    f.close()


# ----------------------------------------------------------------------------------- #
# Calculate raw/naive/uncorrected mean, RMS and from that error on the mean:

if (len(L30cm) > 1) :       # Check that there is more than one measurement (as we divide by N-1)!

    # Calculate the mean:
    sum1_30raw = 0.0
    for i30 in range( len(L30cm) ) : sum1_30raw += L30cm[i30]
    mean_30raw = sum1_30raw / len(L30cm)                  # Calculate mean

    # Calculate the RMS:
    sum2_30raw = 0.0
    for i30 in range( len(L30cm) ) : sum2_30raw += sqr(L30cm[i30] - mean_30raw)
    RMS_30raw  = sqrt(sum2_30raw / (len(L30cm)-1))        # Calculate RMS
    emean_30raw = RMS_30raw / sqrt(len(L30cm))            # Uncertainty on mean is RMS / sqrt(N)

    # This is the "naive" mean, in that it simply uses all measurements without any inspection.
    print "\n  RAW: Mean = %6.4f +- %6.4f m      RMS = %6.4f m   (N = %3d)\n"%(mean_30raw, emean_30raw, RMS_30raw, int(len(L30cm)))


# ----------------------------------------------------------------------------------- #
# Plot histogram on the screen in one canvas:

canvas = TCanvas( "canvas", "canvas", 50, 50, 1200, 600 )

# Set all the trimmings for the histogram plotting:
Hist_L30cm.GetXaxis().SetRangeUser(0.0, 5.0)       # From the histogram, we get the x-axis, and set the range.
Hist_L30cm.GetXaxis().SetTitle("Table length (m)") # We give the x-axis a label.
Hist_L30cm.GetYaxis().SetTitle("Frequency")        # We give the y-axis a label.
Hist_L30cm.SetLineColor(kBlue)                     # Set the histogram line color to blue.
Hist_L30cm.SetLineWidth(2)                         # Set the histogram line width to two.

# Define a resonable fitting function:
fit_L30cm = TF1("fit_L30cm", "gaus", 0.0, 5.0)     # Define a (predefined) Gaussian fitting function.
fit_L30cm.SetLineColor(kRed)                       # Set the line color to red.
fit_L30cm.SetLineWidth(2)                          # Set tne line width to 2.

# Fit the histogram (with a likelihood function in the range the function is defined in):
Hist_L30cm.Fit("fit_L30cm", "LR")                  # Note the two options: L for Likelihood and R for Range.
                                                   # I will soon introduce what "likelihood" means.
Hist_L30cm.Draw()                                  # Draw the histogram

canvas.Update()
if (SavePlots):
    canvas.SaveAs("TableMeasurements_30cmRuler_Raw.pdf")      # Save plot (format follow extension name)


# ----------------------------------------------------------------------------------- #
raw_input('Press Enter to exit')


# ----------------------------------------------------------------------------------- #
#
# Start by taking a close look at the data, first by inspecting the numbers in the data
# file, and then by considering the histograms produced by running the macro.
# 
# To begin with, only consider the 30cm ruler measurements, and disregard the
# estimated/guessed uncertainties. You can then expand from there, as guided below by
# questions.
# 
# Questions:
# ----------
#  1) Consider the calculated/naive mean and width, and make sure that you understand how
#     they were performed. Is the result as you would expect it? And do you think that
#     it is close to the best possible (i.e. most accurate and precise) estimate?
#     NOTE: Make sure that you know the difference between accuracy and precision!!!
#     See "Common definition" in: http://en.wikipedia.org/wiki/Accuracy_and_precision
# 
#  2) Do any of the measurements looks wrong/bad/suspecious? Do you see any repeated
#     mistakes done? Would you correct or exclude any of the measurements and how would
#     you justify this?
#     Also, fit the length measurements with a Gaussian distribution in a small range
#     around what you believe is the true value. Should the binning be changed for these
#     fits? And is the Gaussian distribution justified?
#
#  3) Once you have selected the measurements you want to use, calculate the mean, RMS
#     and uncertainty on the mean (using sums). How much did your result improve in
#     precision?
# 
#  ------------------- Now repeat the above for the 2m folding rule ----------------------
#
#  4) How much better/worse is the single measurement uncertainty from the 30cm ruler case
#     to the 2m folding rule?
# 
#  5) The "Pull" distribution is defined as the plot of z = (x - <x>) / sigma(x). If the
#     measurements and uncertainties are good, then it should give a unit Gaussian. Is
#     that the case? And thus, were the uncertainty estimates/guesses reasonable?
#     If not, then the pull RMS is often used to scale the errors to a reasonable level.
#
#  6) Try to calculate the weighted mean. Do you need to discard more dubious measurements?
#     Did the result improve further?
# 
#  7) Does the length of the table seems to be different when measured with a 30cm and a 2m
#     ruler? Quantify this statement! I.e. what is the difference, and what is the
#     uncertainty on that difference? Is this significantly away from 0?
# 
#  8) If you were asked for the best estimate of the length of the table, what would
#     you do? (If posssible, read Bevington page 58 bottom!)
#
# 
# Advanced questions:
# -------------------
#  9) Is there any correlation between the errors on the measurements and the actual
#     distance to the mean (this is called the "residual")? That is, do the errors
#     contain useful information, that should be used in the calculation of the most
#     accurate estimate of the mean? 
#     In all simplicity - did those who had a measurement close to the mean also put down a
#     small uncertainty? What is the correlation between residuals and the uncertainties?
#     Yes, define new sums as discussed in the lectures, and calculate the correlation!
#
#----------------------------------------------------------------------------------