#!/usr/bin/env python

# ---------------------------------------------------------------------------------- #
#                                  "Fitting is an art!"
# ---------------------------------------------------------------------------------- #
# 
#  Python/ROOT macro for testing which fitting proceedure is likely to give the "best"
#  results. The two cases in question are:
#   * Gaussian distribution on constant background (peak searching)
#   * Double exponential distribution (high correlations)
#
#  Consider each case and argue/discuss which fitting function and method should be used.
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   6th of January 2017
# 
# ---------------------------------------------------------------------------------- #

from ROOT import *
import math
from array import array

gStyle.SetOptStat("emr")
gStyle.SetOptFit(1111)

SavePlots = False         # Determining if plots are saved or not

pi = TMath.Pi()
r = TRandom3()
r.SetSeed(1)


#----------------------------------------------------------------------------------- #
# CASE 1: Gaussian distribution on a constant background:
#         f1(x) = N / sqrt(2pi) / sigma * exp(-0.5 * sqr((x-mu)/sigma)) for x in [-inf,inf]
#----------------------------------------------------------------------------------- #

Npoints_gauss = 200
mux    = 3.50
sigmax = 0.25
Npoints_pol0 = 1000
minx   =  0.0
maxx   = 10.0
Nbins = 100
binwidth_gauss = (maxx-minx) / float(Nbins)
print "  The bin width is: %5.2f"%(binwidth_gauss)

# Fill histogram with signal and background events:
Hist_gauss = TH1F("Hist_gauss", ";x;Frequency", Nbins, minx, maxx)
for i in xrange(Npoints_gauss)   : Hist_gauss.Fill(r.Gaus(mux,sigmax))
for i in xrange(Npoints_pol0)    : Hist_gauss.Fill(r.Uniform(minx,maxx))
for i in xrange(Npoints_gauss/3) : Hist_gauss.Fill(r.Gaus(mux+2.0,sigmax))

canvas_gauss = TCanvas("canvas_gauss","", 50, 50, 1000, 600)

Hist_gauss.SetLineColor(kBlue)
Hist_gauss.SetLineWidth(2)
Hist_gauss.SetMinimum(0.0)

# Define function (including bin width to get normalisation right):
def func_gpol0(x, par) :
    norm = binwidth_gauss * par[0] / sqrt(2.0*pi) / par[2]
    z = (x[0]-par[1])/par[2]
    return norm * exp(-0.5*z*z) + par[3]

f_gauss = TF1("f_gauss", func_gpol0, 0.0, 10.0, 4)
f_gauss.SetParNames("N", "#mu", "#sigma", "Background")
f_gauss.SetLineColor(kRed)
Hist_gauss.Fit("f_gauss", "R")
Hist_gauss.Draw("e")

canvas_gauss.Update()
if (SavePlots) : canvas_gauss.SaveAs("Hist_gauss.pdf")


#----------------------------------------------------------------------------------- #
# CASE 2: Double exponential distribution:
#
#   The "bad" fitting function:
#     f2_bad(x)  = N1*exp(-t/r1) + N2*exp(-t/r2) for t in [0,inf]
#
#   The "good" fitting function:
#     f2_good(x) = N * (f/r1*exp(-t/r1) + (1-f)/r2*exp(-t/r2)) for x in [0,inf]
#
#----------------------------------------------------------------------------------- #
# NOTE: The parameters r1 and r2 need to be positive, and f in [0,1],
#       in order for this to be a PDF.
#----------------------------------------------------------------------------------- #

Npoints_2exp = 2000
frac = 0.9   # Fraction that "belows to" first exponential
r1 = 3.0
r2 = 2.0

Hist_2exp = TH1F("Hist_2exp", ";x;Frequency", 200, 0.0, 20.0)
binwidth_2exp = (20.0 - 0.0) / 200.0

# Transformation method:
for i in xrange(Npoints_2exp) :
    if (r.Uniform() < frac) : t = r.Exp(r1)
    else :                    t = r.Exp(r2)
    Hist_2exp.Fill(t)

canvas_2exp = TCanvas("canvas_2exp","", 80, 80, 1000, 600)
Hist_2exp.SetLineColor(kBlue)
Hist_2exp.SetLineWidth(2)
Hist_2exp.SetMinimum(0.0)

# The BAD fitting function and fit:
# ----------------------------------
# I include the binwidth (here 0.1) in the fit to ensure that the normalisations are (or could be) right!
f_2expBad = TF1("f_2expBad", "0.1*[0] * exp(-x/[2]) + 0.1*[1]*exp(-x/[3])", 0.0, 20.0)
f_2expBad.SetParameters(frac*Npoints_2exp, (1.0-frac)*Npoints_2exp, r1, r2)
f_2expBad.SetLineColor(kRed)
FitResBad = Hist_2exp.Fit("f_2expBad", "RLS")

corrBad = FitResBad.GetCorrelationMatrix()    # Access the covariance matrix
corrBad.Print()

Hist_2exp.Draw("e")


canvas_2exp.Update()
if (SavePlots) : canvas_2exp.SaveAs("Hist_2exp.pdf")


raw_input( ' ... Press enter to exit ... ' )


# ----------------------------------------------------------------------------------- #
#
# Looking at the first case, but make sure that you also do the second case, as this is
# at least as important as the first one!
#
# Questions on CASE 1:
# --------------------
#  1) Look at the data plot and the corresponding fit. What type of fit is it? Does it
#     run well (or at all) without some non-zero input parameters? And once it runs, does
#     it seem to be reasonable? Why? And does the fitting function include all features
#     of the data? Why/why not? Try for 10-15 minutes and discuss it with your neighbor,
#     before reading on!
#
#            - - - - - -     10-15 minutes later    - - - - - -
#
#  2) As it happens, there seem to be a suspectable bump around 5 < x < 6. Try to write
#     an additional fitting function, which includes this bump in the model, and get the
#     fit to run. How significant is the bump, based on significance of the amplitude of
#     the second Gaussian peak? And what test would you apply to this, if you wanted to
#     make a full-fledged hypothesis test between the two models? Are they nested? Can
#     you actually get a number out?
#
#            - - - - - -     20-30 minutes (success or failure) later - - - - - -
#
#  3) Imagine that you concluded, that there was a new peak, and that you were sure that
#     it had the same width as the original peak. Does that help you in the fit, and if
#     so, how? Does the significance of the peak increase? Would it always do that?
#     Also imagine, that the parameter of interest is the distance between the peaks. How
#     would you now write your fitting function?
#
#
# Questions on CASE 2:
# --------------------
#  1) OK, the "bad" fit doesn't work to begin with. Can you see what is missing? There
#     are in fact several things, but one is very simple to remedy. Think and discuss...
#
#            - - - - - -     5-10 minutes later    - - - - - -
#
#     Of course you need to give the fit good initial values! Do this (for example those
#     the data was produced with!), and run it again. It might work now, but actually that
#     is not always the case.
#     The reason is that the "bad" fitting function has two flaws:
#       * It does not have a correct normalisation, thus making N(1/2) and r(1/2) correlated.
#       * It does not have one overall normalisation, thus making N1 and N2 correlated.
#     This gives very high correlations between the parameters, as can be seen from the
#     correlation matrix printed.
#
#  2) Both of these problems can be avoided by rewriting the fitting function to include
#     the correct normalisation (i.e. dividing by the lifetime) and by putting only one
#     overall normalisation and then dividing the two lifetimes with a fraction (i.e. use
#     "frac" and "(1.0*frac)" as a parameter in front of each exponential term).
#     Try this (define a "good" function), and see if your fit improves. The way to see
#     this would in general be to try a lot of different data, but here we will simply see
#     that the correlations a smaller (especially for the overall normalisation).
#
#            - - - - - -     20-30 minutes (success or failure) later - - - - - -
#
#     If you didn't manage to get this fit going, I've included a "good" fitting function below!
#
#  3) The two lifetimes are naturally very correlated with each other (and the fraction),
#     when they are very alike. The only thing one can do about this is to fix one parameter.
#     This is of course not desirable, but one can be forced to do it, if the fit does not
#     converge otherwise. Not that since the correlation is very high, it is not a great
#     loss of freedom in the fitting function.
#     A very common similar example is fitting a "Gaussian-like" peak, which happens to have
#     more than one width, for example if the data is obtained from two or more sources with
#     different resolutions. Here, one may choose to let the two (or more) Gaussians have
#     the same mean, but two different widths (the "good" and the "bad" measurements).
#     Typically, the parameter to fix (if any) is the fraction, but never fix a parameter
#     without first having tried to let it "float".
#
# ----------------------------------------------------------------------------------- #


# The GOOD fitting function and fit:
# ----------------------------------
#f_2expGood = TF1("f_2expGood", "0.1*[0] * ([1]/[2]*exp(-x/[2]) + (1.0-[1])/[3]*exp(-x/[3]))", 0.0, 20.0)
#f_2expGood.SetParameters(Npoints_2exp, frac, r1, r2)
#f_2expGood.SetLineColor(kBlue)
#FitResGood = Hist_2exp.Fit("f_2expGood", "RLS+")
#corrGood = FitResGood.GetCorrelationMatrix()    # Access the covariance matrix
#corrGood.Print()