#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#   Python program for analysing measurements of the length of the lecture table in
#   Auditorium A at NBI.
# 
#   There are two measurements each with an estimated error of the table length:
#     1: Measurement with a 30cm ruler.
#     2: Measurement with a 2m folding ruler.
# 
#   Each person was asked not only to state the measurement, but also their estimated
#   uncertainty. None of the persons could see others measurements in order to get the
#   largest degree of independence. Also, the 30cm ruler measurement was asked to be
#   done first.
# 
#   Author: Troels C. Petersen (NBI)
#   Email:  petersen@nbi.dk
#   Date:   22nd of August 2014
#
# ----------------------------------------------------------------------------------- #

from ROOT import *
import os, math

# Set the showing of statistics and fitting results (0 means off, 1 means on):
gStyle.SetOptStat("emr");   # See http://root.cern.ch/root/html/TStyle.html#TStyle:SetOptStat
gStyle.SetOptFit(1111);     # See http://root.cern.ch/root/html/TStyle.html#TStyle:SetOptFit

verbose = True
SavePlots = False


# Define two histograms with the lengths recorded and the correlation between length residual and uncertainty:
Hist_L30cm     = TH1F("Hist_L30cm",    "Lengths estimates by 30cm ruler", 250, 0.0, 5.0);


# Define lists to contain the measurements, such that you can easily loop over them later!
L30cm = []
eL30cm = []
L2m = []
eL2m = []

# Define sums, from which the mean and RMS can be calculated:
sum0_30raw = 0.0
sum1_30raw = 0.0
sum2_30raw = 0.0


# Data files:
infiles = ["data_TableMeasurements2009.txt", "data_TableMeasurements2010.txt", "data_TableMeasurements2011.txt", "data_TableMeasurements2012.txt", "data_TableMeasurements2013.txt", "data_TableMeasurements2014.txt"]

# Loop over files and open them
for infile in infiles :
    with open( infile, 'r' ) as f : 

        # Read and ignore header lines:
        header1 = f.readline()
        header2 = f.readline()

        # Loop over lines and extract variables of interest
        for line in f:
            line = line.strip()
            columns = line.split()
            if (len(columns) == 4) :         # Check that there are four numbers!

                # Record (and print) the measurements:
                L30cm.append(float(columns[0]))
                eL30cm.append(float(columns[1]))
                L2m.append(float(columns[2]))
                eL2m.append(float(columns[3]))        
                if (verbose) :
                    print "  Measurement %3d:   L(30cmR): %6.3f +- %6.3f m      L(2mFR): %6.3f +- %6.3f m"%(int(sum0_30raw), L30cm[-1], eL30cm[-1], L2m[-1], eL2m[-1])

                # Plot measurements:
                Hist_L30cm.Fill(L30cm[-1])

                # Make sums to get a quick check of result:
                sum0_30raw += 1
                sum1_30raw += L30cm[-1]
                sum2_30raw += L30cm[-1]*L30cm[-1]

            # Make sure that you are warned about data lines NOT used!
            else :
                print "Warning: Line read did not have 4 entries but ", len(columns)

    f.close()


# ----------------------------------------------------------------------------------- #
# Calculate raw/naive/uncorrected mean, RMS and from that error on the mean:

if (sum0_30raw > 1) :
    mean_30raw  = sum1_30raw/sum0_30raw                                   # Calculate the mean
    RMS_30raw   = sqrt(sum2_30raw/sum0_30raw - mean_30raw*mean_30raw)     # Calculate the RMS
    emean_30raw = RMS_30raw / sqrt(sum0_30raw)                            # Uncertainty of the mean is RMS / sqrt(N)

    # This is the "naive" mean, in that it simply uses all measurements without any inspection.
    print "  Mean = %6.4f +- %6.4f      RMS = %6.4f"%(mean_30raw, emean_30raw, RMS_30raw)


# ----------------------------------------------------------------------------------- #
# Plot histogram on the screen in one canvas:

canvas = TCanvas( "canvas", "canvas", 50, 50, 1200, 600 )

# Set all the trimmings for the histogram plotting:
Hist_L30cm.GetXaxis().SetRangeUser(0.0, 5.0)       # From the histogram, we get the x-axis, and set the range.
Hist_L30cm.GetXaxis().SetTitle("Table length (m)") # We give the x-axis a label.
Hist_L30cm.GetYaxis().SetTitle("Frequency")        # We give the y-axis a label.
Hist_L30cm.SetLineColor(kBlue)                     # Set the histogram line color to blue.
Hist_L30cm.SetLineWidth(2)                         # Set the histogram line width to two.

# Define a resonable fitting function:
fit_L30cm = TF1("fit_L30cm", "gaus", 0.0, 5.0)     # Define a (predefined) Gaussian fitting function.
fit_L30cm.SetLineColor(kRed)                       # Set the line color to red.
fit_L30cm.SetLineWidth(2)                          # Set tne line width to 2.

# Fit the histogram (with a likelihood function in the range the function is defined in):
Hist_L30cm.Fit("fit_L30cm", "LR")                  # Note the two options: L for Likelihood and R for Range.
Hist_L30cm.Draw()                                  # Draw the histogram

if (SavePlots):
    canvas.SaveAs("TableMeasurements_30cmRuler_Raw.pdf")      # Save plot (format follow extension name)


# ----------------------------------------------------------------------------------- #
raw_input('Press Enter to exit')


# ----------------------------------------------------------------------------------- #
#
# Start by taking a close look at the data, first by inspecting the numbers in the data
# file, and then by considering the histograms produced by running the macro.
# 
# To begin with, only consider the 30cm ruler measurements, and disregard the
# estimated/guessed uncertainties. You can then expand from there, as guided below by
# questions.
# 
# Questions:
# ----------
#  1) Consider the calculated/naive mean and width, and make sure that you understand how
#     they were performed. Is the result as you would expect it? And do you think that
#     it is close to the best possible (i.e. most accurate and precise) estimate?
#     NOTE: Make sure that you know the difference between accuracy and precision!!!
#     See "Common definition" in: http://en.wikipedia.org/wiki/Accuracy_and_precision
# 
#  2) Do any of the measurements looks wrong/bad/suspecious? Do you see any repeated
#     mistakes done? Would you correct or exclude any of the measurements and how would
#     you justify this?
#     Also, fit the length measurements with a Gaussian distribution in a small range.
#     Should the binning be changed for these fits? And is the Gaussian distribution
#     justified?
#
#  3) Once you have selected the measurements you want to use, calculate the mean, RMS
#     and uncertainty on the mean (using sums). How much did your result improve in
#     precision?
#
#  4) Does the length of the table seems to be different between the two sides? Quantify
#     this statement! I.e. what is the difference, and what is the uncertainty on this
#     difference?
# 
#  ------------------- Now repeat the above for the 2m folding rule ----------------------
#
#  5) How much better/worse is the single measurement uncertainty from the 30cm ruler case
#     to the 2m folding rule?
# 
#  6) The "Pull" distribution is defined as the plot of z = (x - <x>) / sigma(x). If the
#     measurements and uncertainties are good, then it should give a unit Gaussian. Is
#     that the case? And thus, were the uncertainty estimates/guesses reasonable?
#
#  7) Try to calculate the weighted mean. Do you need to discard more dubious measurements?
#     Did the result improve further?
# 
#  8) Does the length of the table seems to be different when measured with a 30cm and a 2m
#     ruler? Quantify this statement! I.e. what is the difference, and what is the
#     uncertainty on that difference? Is this significantly away from 0? How big an
#     effect on the 30cm ruler would this correspond to?
# 
#  9) If you were asked for the best estimate of the length of the table, what would
#     you do? (If posssible, read Bevington page 58 bottom!)
#
# 
# Advanced questions:
# -------------------
# 10) Is there any correlation between the errors on the measurements and the actual
#     distance to the mean (this is called the "residual")? That is, do the errors
#     contain useful information, that should be used in the calculation of the most
#     accurate estimate of the mean? 
#     In all simplicity - did those who had a measurement close to the mean also put down a
#     small uncertainty? What is the correlation between residuals and the uncertainties?
#     Yes, define new sums as discussed in the lectures, and calculate the correlation!
#
#
#---------------------------------------------------------------------------------- 
#
# Key questions:
# --------------
#   https://docs.google.com/forms/d/1SjQViqu0dPFTITnxkiMXxv1VIeYG8J8oB0fqYBvOlmA/viewform?usp=send_form
#   Please answer the following questions both for the 30cm ruler and for the 2m folding
#   rule!
#   * Write the number of measurements you accepted for the length determination along
#     with your result (mean and error on mean) into the questionaire. Do this both for
#     the unweighted and weighted result.
#   * What is the Chi2 and its probability when fitting a Gaussian to the measurements?
#
#----------------------------------------------------------------------------------