#!/usr/bin/env python

# ------------------------------------------------------------------------ #
#  Program for illustrating the use of sWeights from multidimensional fits.
# 
#  Challenge at hand:
#    Given uncorrelated multidimensional data of two (or more) different
#    categories (e.g. signal and background), which can be fitted in some
#    dimensions (separation variables), how to best estimate the distribution
#    of other variables (control variables) of interest for a specific
#    category (typically signal)?
# 
#  This is a very general problem. Imagine you have two (or more) types of
#  mice, bacteria, cells, particles, humans, etc., and that you don't know
#  which are of which type in general, but only from a known feature, such
#  as size, shape, color, income, activity, etc. which you can fit.
#  Now you want to know the distribution of another feature for one type
#  of cells, etc. - how to get this, along with the correct uncertainties?
# 
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   18th of February 2018
# ------------------------------------------------------------------------ #

from ROOT import *
from array import array
import math
import sys
from numpy import matrix
from numpy import linalg

gStyle.SetOptStat("emr")
gStyle.SetOptFit(1111)

SavePlots = False
verbose = True
Nverbose = 10
RunFast = True    # Fix PDF shape parameters to truth values!

pi = TMath.Pi()


# ----------------------------------------------------------------------------------- #
# Functions:
# ----------------------------------------------------------------------------------- #
def sqr( a ) : return a*a



# ------------------------------------------------------------------------ #
def main() : 
# ------------------------------------------------------------------------ #

    # File with data in:
    infile = open('data_sWeights.txt', 'r')
    # infile = open('data_sWeights2.txt', 'r')

    # List (size 3 x Nevents) to put data into:
    x_all = []

    # Number of variables and definition of 1D histogram sizes:
    Nvar = 3
    Nbins = [ 100,  100,  100]
    xmin  = [ 0.0, -3.0, -1.0]
    xmax  = [ 2.0,  2.0,  1.0]
    VarName = ["Mass", "Shape", "Angle"]
    hist_all = [TH1D("hist_all_" + VarName[ivar], ";" + VarName[ivar] + ";Frequency", Nbins[ivar], xmin[ivar], xmax[ivar]) for ivar in xrange(Nvar)]

    # 2D histograms:
    hist2D_all = TH2D("hist2D_all", ";Mass;Shape", Nbins[0], xmin[0], xmax[0], Nbins[1], xmin[1], xmax[1])

    # Calculating binwidth to ensure that PDFs are normalised correctly:
    binwidth0 = (xmax[0] - xmin[0]) / Nbins[0]
    binwidth1 = (xmax[1] - xmin[1]) / Nbins[1]
    binarea = binwidth0 * binwidth1


    # ------------------------------------------------------------------------------
    # Read in the data points with 3 variables. Shapes for signal and background are:
    #  * Mass:  Gaussian peak vs. exponential.
    #  * Shape: Two Gaussian peaks.
    #  * Angle: Two Beta distributions.
    # ------------------------------------------------------------------------------

    Nevents = 0

    # Loop over the file and read variables:
    for line in infile:
        line = line.strip()              # Remove all "strange" characters
        columns = line.split()           # Split the line into its parts (columns is now a list)
        if (len(columns) == 3) :         # Check that there are exactly three numbers!

            # Record (and print) the measurements:
            x = [float(columns[0]), float(columns[1]), float(columns[2]) ]
            x_all.append(x)
            if (verbose and Nevents < Nverbose) :
                print "  %7.3f    %7.3f    %7.3f  "%(x_all[-1][0], x_all[-1][1], x_all[-1][2])

            # Fill histograms and count events:
            for i in xrange( Nvar ) : hist_all[i].Fill(x_all[-1][i])
            hist2D_all.Fill(x_all[-1][0], x_all[-1][1])
            Nevents += 1
 
    infile.close()



    # Fit the signal and background samples in 1D:
    # --------------------------------------------
    # Fit mass dimension:
    def func_GaussExp(x, p) :
        z = (x[0] - p[1]) / p[2]
        sig = p[0]*binwidth0 / sqrt(2.0*pi)/p[2] * exp(-0.5*z*z)
        bkg = p[3]*binwidth0 * p[4] * exp(-p[4]*x[0])
        return sig+bkg

    fit_mass = TF1("fit_mass", func_GaussExp, 0.0, 2.0, 5)
    fit_mass.SetParameters(4000.0, 1.0, 0.2, 11000.0, 0.5)
    fit_mass.SetLineColor(kBlack)
    hist_all[0].Fit("fit_mass", "LR")

    # Fit shape dimension:
    def func_GaussGauss(x, p) :
        zsig = (x[0] - p[1]) / p[2]
        sig = p[0]*binwidth1 / sqrt(2.0*pi)/p[2] * exp(-0.5*zsig*zsig)
        zbkg = (x[0] - p[4]) / p[5]
        bkg = p[3]*binwidth1 / sqrt(2.0*pi)/p[5] * exp(-0.5*zbkg*zbkg)
        return sig+bkg

    fit_shape = TF1("fit_shape", func_GaussGauss, -3.0, 2.0, 6)
    fit_shape.SetParameters(4000.0, 0.0, 0.4, 11000.0, -1.0, 0.6)
    fit_shape.SetLineColor(kBlack)
    hist_all[1].Fit("fit_shape", "LR")


    # Fit the signal and background samples in 2D (mass and shape):
    # ------------------------------------------------------------------------
    def func_GEGG(x, p) :
        z0sig = (x[0] - p[1]) / p[2]
        z1sig = (x[1] - p[3]) / p[4]
        sig = p[0]*binarea / sqrt(2.0*pi)/p[2] * exp(-0.5*z0sig*z0sig) / sqrt(2.0*pi)/p[4] * exp(-0.5*z1sig*z1sig)
        z1bkg = (x[1] - p[7]) / p[8]
        bkg = p[5]*binarea * p[6] * exp(-p[6]*x[0]) / sqrt(2.0*pi)/p[8] * exp(-0.5*z1bkg*z1bkg)
        return sig+bkg
 
    fit_massshape = TF2("fit_massshape", func_GEGG, 0.0, 2.0, -3.0, 2.0, 9)
    fit_massshape.SetParameters(fit_mass.GetParameter(0), fit_mass.GetParameter(1), fit_mass.GetParameter(2),
                                fit_shape.GetParameter(1), fit_mass.GetParameter(2), fit_mass.GetParameter(0),
                                fit_mass.GetParameter(1), fit_shape.GetParameter(1), fit_shape.GetParameter(2))
    if (RunFast) :     # For debugging it is worthwhile skipping a 9 parameter fit in 2D!!!
        fit_massshape.FixParameter(1, 1.0)
        fit_massshape.FixParameter(2, 0.2)
        fit_massshape.FixParameter(3, 0.0)
        fit_massshape.FixParameter(4, 0.4)
        fit_massshape.FixParameter(6, 0.5)
        fit_massshape.FixParameter(7,-1.0)
        fit_massshape.FixParameter(8, 0.6)
    fit_massshape.SetLineColor(kBlack)
    fitres = hist2D_all.Fit("fit_massshape", "LRS")   # Option "S" gives fit result as output!
    cmfit = fitres.GetCovarianceMatrix()              # Access to entries: cmfit[0][0]
    

    # Simple "cut"-based selection of signal and background:
    # ------------------------------------------------------------------------
    hist_Angle_sig = TH1D("hist_Angle_sig", ";Signal Angle;Frequency", Nbins[2], xmin[2], xmax[2])
    hist_Angle_bkg = TH1D("hist_Angle_bkg", ";Background Angle;Frequency", Nbins[2], xmin[2], xmax[2])

    for i in xrange ( Nevents ) :
        if ( abs(x_all[i][0]-0.8) < 0.25 and x_all[i][1] > -0.25 ) : 
            hist_Angle_sig.Fill( x_all[i][2] )
        if ( abs(x_all[i][0]-0.8) > 0.50 and x_all[i][1] < -0.50 ) : 
            hist_Angle_bkg.Fill( x_all[i][2] )



    # Produce plots with the variable distributions for signal and background:
    # ------------------------------------------------------------------------
    canvas1 = TCanvas("canvas1", "", 20, 20, 1200, 700)
    canvas1.Divide(2,2)

    for i in xrange( Nvar ) : 
        canvas1.cd(i+1)
        hist_all[i].SetXTitle(VarName[i])
        hist_all[i].SetYTitle("Frequency")
        hist_all[i].SetMinimum(0.0)
        hist_all[i].SetLineColor(kMagenta)
        hist_all[i].SetLineWidth(2)
        hist_all[i].Draw()

    hist_Angle_sig.SetLineColor(kBlue)
    hist_Angle_sig.SetLineWidth(2)
    hist_Angle_sig.Draw("same")

    hist_Angle_bkg.SetLineColor(kRed)
    hist_Angle_bkg.SetLineWidth(2)
    hist_Angle_bkg.Draw("same")

    canvas1.cd(4)
    hist2D_all.SetMarkerColor(kMagenta)
    hist2D_all.Draw("box")

    canvas1.Update()
    if (SavePlots) : canvas1.SaveAs("VariableDistributions.pdf")



    # -----------------------------------------------------------------------------------
    # Given succesful fit, how do you determine what the signal distribution of "angle" is?
    # -----------------------------------------------------------------------------------

    """
    # Here are empty histograms - try to fill them:
    hist_Angle_sig = TH1D("hist_Angle_sig", ";Signal Angle;Frequency", Nbins[2], xmin[2], xmax[2])
    hist_Angle_bkg = TH1D("hist_Angle_bkg", ";Background Angle;Frequency", Nbins[2], xmin[2], xmax[2])


    for i in xrange ( len(x_all) ) :
        hist_Angle_sig.Fill( SOMETHING )
        hist_Angle_bkg.Fill( SOMETHING )


    # Produce plots with the variable distributions for signal and background:
    # ------------------------------------------------------------------------
    canvas2 = TCanvas("canvas2", "", 50, 50, 1200, 700)
    canvas2.Divide(1,2)

    canvas2.cd(1)
    hist_Angle_sig.SetLineColor(kRed)
    hist_Angle_sig.SetLineWidth(2)
    hist_Angle_sig.Draw("e")

    canvas2.cd(2)
    hist_Angle_bkg.SetLineColor(kBlue)
    hist_Angle_bkg.SetLineWidth(2)
    hist_Angle_bkg.Draw("e")

    canvas2.Update()
    if (SavePlots) : canvas2.SaveAs("AngleDistributions.pdf")
    """


    raw_input( ' ... ' )
    return 


# ------------------------------------------------------------------------ #
if __name__ == '__main__':
    sys.exit( main() )



# ----------------------------------------------------------------------------------- #
#
# Start by taking a look at the data and the 1D fits to ensure that you understand
# what is going on. Then inspect the 2D fit, and see if you understand how this is
# constructed.
#
# Questions:
# ----------
#  1) Do the two 1D fits and the 2D fit converge well? Do they have good Chi2 probabilities?
#     Can you imagine what the signal and background distributions look like? Otherwise draw them!
#     Also, switch off the "RunFast" boolean, such that the full 2D fit runs, and realise how
#     tough a task this is (it takes some time, but should converge!).
#
#  2) Try to make a signal and a background selection in Mass and Shape, and plot these to get a
#     feel for the signal and background distributions of the Angle variables. Then try to subtract
#     the (correctly estimated) amount of background from the signal Angle distribution, such that
#     it yields an estimate of the true signal distribution.
#     Additionally or alternatively, calculate a weight as PDFsignal / (PDFsignal + PDFbackground)
#     for each event, and see if it compares to the background subtracted estimate from above.
#
#  3) Now try to calculate an sWeight from the PDF value and the covariance matrix. Follow the
#     recipe in the paper (http://arxiv.org/pdf/physics/0402083v3.pdf) page 12, also described in
#     the slides.