// ----------------------------------------------------------------------------------- //
/*
   ROOT macro for analysing measurements of the length of the lecture table in
   Auditorium A at NBI. There are four measurements of the table length:
     1: Estimated by eye.
     2: Estimated using human scale (Troels).
     3: Measurement with a 30cm ruler.
     4: Measurement with a 2m folding ruler.
   Each person was asked not only to state the measurement, but also their estimated
   uncertainty, though some persons did not include an uncertainty, and in some cases
   measurements are missing (indicated by the value -1). As both sides of the table
   were used for measurements, this was also indicated (0 for room side, 1 for board
   side). None of the persons could see the others' measurements, in order to get the
   largest degree of independence.

   Author: Troels C. Petersen (NBI)
   Email:  petersen@nbi.dk
   Date:   6th of September 2011
*/
// ----------------------------------------------------------------------------------- //

// Include from C++:
#include <iostream>
#include <fstream>
#include <cmath>

// Include from ROOT:
#include <TROOT.h>
#include <TStyle.h>
#include <TH1F.h>
#include <TF1.h>
#include <TCanvas.h>
#include <TFile.h>

using namespace std;

//----------------------------------------------------------------------------------
// Simple example of defining your own function. Reading it from one end: it returns a
// "double" (i.e. a number with (many) decimals), its name is "sqr", and as input
// it takes a "double". And between the {}, you define what it should do!
double sqr(double a) {
  return a*a;
}

// SolutionLevel1:
// Before the first iteration, I decide not to really consider the errors, as I will
// not start with the more advanced weighted mean, but simply start with the simplest,
// namely checking the data, and obtaining an initial result.
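// The comment above defers the more advanced weighted mean to a later solution level,
// and question 8 at the bottom of this file asks for it. As a minimal sketch of what
// that calculation could look like: each measurement l[i] with uncertainty el[i] gets
// the weight 1/el[i]^2. The helper below is hypothetical (not part of the original
// macro), and it assumes that entries without a positive uncertainty should be skipped.
double weighted_mean(const double* l, const double* el, int n, double& error) {
  double sumw  = 0.0;                      // Sum of weights 1/sigma^2.
  double sumwl = 0.0;                      // Weighted sum of measurements.
  for (int i=0; i < n; i++) {
    if (l[i] > 0.0 && el[i] > 0.0) {       // Skip missing values and missing errors.
      double w = 1.0 / sqr(el[i]);
      sumw  += w;
      sumwl += w * l[i];
    }
  }
  error = (sumw > 0.0) ? sqrt(1.0/sumw) : -1.0;   // Error on the weighted mean.
  return (sumw > 0.0) ? sumwl/sumw : -1.0;        // -1 signals "no valid entries".
}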
// ----------------------------------------------------------------------------------- //
// This is the main function. It does not return anything (void) and takes no input.
// It simply runs whatever is "inside".
void TableMeasurements_SolutionLevel1()
{
  // --------------------------------------------------------------------------------- //
  // Set the showing of statistics and fitting results (0 means off, 1111 means all on):
  gROOT->Reset();
  gStyle->SetOptStat(1111);
  gStyle->SetOptFit(1111);

  // Set the graphics:
  gStyle->SetStatBorderSize(1);
  gStyle->SetStatFontSize(0.055);
  gStyle->SetCanvasColor(0);
  gStyle->SetPalette(1);

  int NeventsMax = 100;   // Maximum number of events to be read.
  int Nevents = 0;        // Counter of events read so far.
  double x[9];            // Array of 9 numbers for storing the current measurements read.

  // Define three sums to be increased with each event for calculation of mean and width:
  // SolutionLevel1: Given that there are four types of measurements, I would start by
  // defining all the sums, means, and widths as arrays of 4:
  const int Ndata = 4;
  double sum0[Ndata] = {0.0, 0.0, 0.0, 0.0};
  double sum1[Ndata] = {0.0, 0.0, 0.0, 0.0};
  double sum2[Ndata] = {0.0, 0.0, 0.0, 0.0};
  double mean[Ndata];
  double width[Ndata];

  // Define a histogram with the name "Histogram", title "Hist of...", 50 bins,
  // starting at 0.0 and ending at 1000.0 (thus a bin width of 20.0).
  // SolutionLevel1: By hand define four different histograms:
  TH1F* Hist_ByEye   = new TH1F("Hist_ByEye",   "Length estimates by eye",         50, 0.0, 1000.0);
  TH1F* Hist_ByHuman = new TH1F("Hist_ByHuman", "Length estimates by human scale", 50, 0.0, 1000.0);
  TH1F* Hist_By30cmR = new TH1F("Hist_By30cmR", "Length measurements by 30cm ruler",      100, 0.0, 500.0);
  TH1F* Hist_By2mFR  = new TH1F("Hist_By2mFR",  "Length measurements by 2m folding rule", 100, 0.0, 500.0);

  ifstream dataset("TableMeasurements_data.csv");   // Open the data file.

  // Start the loop over data:
  // -------------------------
  if (dataset.is_open()) {   // If opening the file succeeded, then...

    // While the reading is not at the End-of-File (eof),
    // and the number of events does not exceed the maximum, do...
    while ((!dataset.eof()) && (Nevents < NeventsMax)) {

      // Read the dataset into the array "x" of length 9.
      dataset >> x[0] >> x[1] >> x[2] >> x[3] >> x[4] >> x[5] >> x[6] >> x[7] >> x[8];

      // If the read failed (e.g. a blank line at the end of the file), stop the loop,
      // so that the last good event is not counted twice:
      if (!dataset) break;

      // SolutionLevel1: As there are 4 measurements and 4 errors, put these into
      // arrays as well, so that any analysis step can be applied to all 4 data sets
      // in a loop:
      double l[Ndata]  = {x[0], x[2], x[4], x[6]};
      double el[Ndata] = {x[1], x[3], x[5], x[7]};

      // Write out the first 5 events to the screen with some values.
      // Note how the slightly cryptic "%5.2f" refers to a field width of 5 with 2 decimal
      // digits for a float (i.e. double), thus printing out the value (here of x[3]) nicely.
      if (Nevents < 5) printf(" Read: %6d %7.1f %3.0f %2.0f %5.2f \n",
                              Nevents, x[0], x[1], x[2], x[3]);

      // For every 10 events (here the "%" is the modulo operator, i.e. 12%5 = 2), print progress.
      if (Nevents%10 == 0) printf(" Read %6d events. \n", Nevents);

      // If the value read is good, use this measurement:
      // SolutionLevel1: Loop over measurements and get the sums for each of them
      // (and put them into histograms).
      for (int i=0; i < Ndata; i++) {
        if (l[i] > 0.0) {
          sum0[i] += 1.0;
          sum1[i] += l[i];
          sum2[i] += sqr(l[i]);
        }
      }

      // SolutionLevel1: Hmmm.... this doesn't look very good. Perhaps also make the
      // histograms an array or vector?!? (See the sketch after this function.)
      Hist_ByEye->Fill(l[0]);     // Simply fill the estimate-by-eye value into the histogram.
      Hist_ByHuman->Fill(l[1]);
      Hist_By30cmR->Fill(l[2]);
      Hist_By2mFR->Fill(l[3]);

      Nevents++;                  // Increase the number of events read by one.
    }
  }

  // After reading the data, finish:
  dataset.close();   // Close the data file.
  printf(" Number of entries in total: %6d \n", Nevents);   // Write the total number of events read.

  // --------------------------------------------------------------------------------- //
  // Calculate mean and width:
  // SolutionLevel1: Loop over sums to get means, widths, and errors on the means:
  for (int i=0; i < Ndata; i++) {
    if (sum0[i] > 1.0) {
      mean[i]  = sum1[i] / sum0[i];
      width[i] = sqrt(sum2[i] / sum0[i] - sqr(mean[i]));
      printf(" Data %1d. Mean = %6.1f +- %6.1f Width = %6.1f \n",
             i, mean[i], width[i]/sqrt(sum0[i]), width[i]);
    } else {
      printf(" Error: Too little data to calculate mean and width! %1d Nevents = %3d \n",
             i, int(sum0[i]));
    }
  }

  // --------------------------------------------------------------------------------- //
  // Plot histograms on the screen:

  // Make canvas:
  // SolutionLevel1: Plot all four distributions:
  TCanvas* canvas = new TCanvas("canvas","",250,20,800,600);   // Make a new window.
  canvas->Divide(2,2);

  canvas->cd(1);
  TF1* func_g1 = new TF1("func_g1", "gaus", 0.0, 1000.0);   // Define a Gaussian function.
  Hist_ByEye->Fit("func_g1","LR");                          // Fit histogram with function.

  canvas->cd(2);
  TF1* func_g2 = new TF1("func_g2", "gaus", 0.0, 1000.0);   // Define a Gaussian function.
  Hist_ByHuman->Fit("func_g2","LR");                        // Fit histogram with function.

  canvas->cd(3);
  TF1* func_g3 = new TF1("func_g3", "gaus", 320.0, 350.0);  // Define a Gaussian function.
  Hist_By30cmR->Fit("func_g3","LR");                        // Fit histogram with function.

  canvas->cd(4);
  TF1* func_g4 = new TF1("func_g4", "gaus", 320.0, 350.0);  // Define a Gaussian function.
  Hist_By2mFR->Fit("func_g4","LR");                         // Fit histogram with function.

  canvas->Update();
  // canvas->SaveAs("LengthOfTable.png");   // Alternative formats: jpg, eps, pdf, etc!

  // --------------------------------------------------------------------------------- //
  // Write histograms to file:
  TFile *file = new TFile("Histograms.root","RECREATE","Root Histograms");
  Hist_ByEye->Write();
  file->Write();
  file->Close();
}
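// SolutionLevel1 (aside): The "Perhaps also make the histograms an array or vector?!?"
// comment above suggests one obvious cleanup. Below is a minimal sketch of that idea;
// the function name, the title list, and the dummy values are hypothetical, not part
// of the original macro:
void SketchHistogramArray() {
  const int Ndata = 4;
  const char* name[Ndata]  = {"Hist_ByEye", "Hist_ByHuman", "Hist_By30cmR", "Hist_By2mFR"};
  const char* title[Ndata] = {"Length estimates by eye",
                              "Length estimates by human scale",
                              "Length measurements by 30cm ruler",
                              "Length measurements by 2m folding rule"};
  TH1F* hist[Ndata];
  for (int i=0; i < Ndata; i++)
    hist[i] = new TH1F(name[i], title[i], 100, 0.0, 500.0);

  // Filling then also becomes a loop (in the macro, l[] would come from the data file):
  double l[Ndata] = {335.0, 336.0, 335.5, 336.2};   // Dummy values for illustration only.
  for (int i=0; i < Ndata; i++)
    if (l[i] > 0.0) hist[i]->Fill(l[i]);
}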
/* SolutionLevel1: Discussion after analysis step
   ----------------------------------------------
   After the first iteration, there are several conclusions:
   1: The two estimates look quite alright - at least they are Gaussian, and have
      no clear outliers.
   2: The two measurements do NOT look very good, and have clear problems. However,
      one can assert that the true mean surely lies around 335 +- 2cm.
      30cm ruler:      There are two measurements 30cm below and 3-6 measurements 30cm
                       above the mean (counting errors). There is one measurement 2.0m
                       below the mean, and several above the mean (measurement errors).
      2m folding rule: There is one measurement 2.0m below the mean (counting error!!!).
                       There are three measurements 30cm above the mean (measurement errors).
   3: Repeating the Gaussian fits in the range 320cm-350cm yields widths of the order
      of 2cm, and so I choose only to include the above counting errors (corrected, of
      course) if they lie within +-6cm (+-3 sigma) of 335cm.

   Notes:
   Given that the 30cm data contains a measurement 2.0m below the mean, and the 2m data
   contains 3 measurements 30cm above the mean, one could suspect that some measurements
   have been interchanged!
   Looking at some of the errors (always good to just skim through), it seems that they
   are sometimes so large that the measurement cannot be taken very seriously! So
   perhaps the inclusion of errors will give some answers.

   I continue at SolutionLevel2...
*/
//----------------------------------------------------------------------------------
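// The discussion above suggests that the 30cm-ruler and 2m-ruler results may differ,
// and advanced question 1 below asks to quantify this. A minimal sketch of that
// calculation, assuming the two means (m1, m2) and their errors on the mean (em1, em2)
// are already known, e.g. from the Gaussian fits; the helper is hypothetical, not part
// of the original macro:
double difference_significance(double m1, double em1, double m2, double em2) {
  double diff  = m1 - m2;                      // Difference between the two results.
  double ediff = sqrt(sqr(em1) + sqr(em2));    // Independent errors add in quadrature.
  printf(" Difference = %6.2f +- %5.2f cm, i.e. %4.1f sigma from zero. \n",
         diff, ediff, fabs(diff)/ediff);
  return diff / ediff;                         // Signed significance in units of sigma.
}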
/* Start by taking a close look at the data, both by inspecting the numbers, and then
   by considering the histograms produced by running the macro.

   Questions:
   ----------
   First, consider the two estimates, first by eye and then by human scale (i.e. not
   the measurements).
   1) Calculate their means and widths. Did the "help" of having a human scale decrease
      the width? Assuming that the table is 336cm long, did it also give a mean closer
      to this value?
   2) Do any of the measurements look wrong/bad/suspicious? Do you find any reason to
      exclude any measurements?
   3) Were the uncertainties that people gave realistic? Try to plot the estimates with
      the mean subtracted and divided by the uncertainty. Such a plot is called a
      "residual" or "pull" plot, and should be a unit Gaussian if the measurements are
      unbiased and have correct errors. Are they? (See the sketch at the end of this file.)

   Now consider the two measurements.
   4) Ask yourself again if any of the measurements and/or errors look wrong/bad.
      Would you correct or exclude any of the measurements, and how would you justify
      this? If so, make sure that your program does this, or make a copy of the data
      file, and make the corrections there.
   5) Given the (possibly) updated dataset, fit the length measurements with a Gaussian
      distribution, both for the 30cm and the 2m ruler cases. Should the binning be
      changed for these fits? And is the Gaussian distribution justified?
   6) Given two successful fits, the widths of these Gaussians actually show the
      "correct" uncertainty in the measurements. How do they compare to the
      uncertainties estimated by the people making the measurements? How much were
      the estimated uncertainties lowered from the first to the second type of
      measurement? Was this justified?
   7) Is there any correlation between the errors on the measurements and the actual
      distance to the mean? That is, do the errors contain useful information that
      should be used in the calculation of the most accurate estimate of the mean?
   8) Try to calculate the mean and then the weighted mean including uncertainties
      (see the weighted_mean sketch near the top of this file). How much difference
      does this make?

   Advanced questions:
   -------------------
   1) The length of the table seems to be different when measured with a 30cm and a
      2m ruler - quantify this statement! I.e. what is the difference, and what is the
      uncertainty on that difference? Is this significantly away from 0? How big an
      effect on the 30cm ruler would this correspond to? And, is there any sign of a
      difference in length between the two sides of the table? (See the
      difference_significance sketch above.)
   2) In question 3) you chose a binning - does this have any influence on the result?
   3) If you were asked for the best estimate of the length of the table, what would
      you do? (If possible, read Bevington page 58!)
*/
//----------------------------------------------------------------------------------
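// Question 3 above asks for a pull plot. Below is a minimal sketch of how such a
// histogram could be filled; in the macro this filling would happen inside the event
// loop. The histogram name, the fixed reference mean, and the dummy values are
// hypothetical choices, not part of the original macro:
void SketchPullPlot() {
  const int Ndata = 4;
  TH1F* Hist_Pull = new TH1F("Hist_Pull", "Pull of estimates: (l - mean) / error", 50, -5.0, 5.0);

  double mean = 336.0;                               // Reference value (assumed table length).
  double l[Ndata]  = {340.0, 334.0, 335.5, 336.2};   // Dummy measurements for illustration only.
  double el[Ndata] = {10.0, 5.0, 0.5, 0.3};          // Dummy errors for illustration only.
  for (int i=0; i < Ndata; i++)
    if (l[i] > 0.0 && el[i] > 0.0)                   // Use only valid values with valid errors.
      Hist_Pull->Fill((l[i] - mean) / el[i]);        // A unit Gaussian if errors are realistic.
}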