#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#  Root macro for calculating two different types of separation measures
#
#  Author: Lars Egholm Pedersen 
#  Email:  egholm@nbi.dk
#  Date:   5th of October 2013
#
#  Create som artificial random data for two different processes
#  And calculate ROC Integral + Normalized integral differences
#
# ----------------------------------------------------------------------------------- #
# Imports 

from ROOT import *
from array import array
import math

gStyle.SetOptStat(0)
gStyle.SetOptTitle(0)

def sqr( a ) : 
    return a*a


# --------------------------------------------------------------------------------
# Calculate ROC integral
# Optional : drawPlot, if set to True, function will also draw distributions
# Everything in the block opened by drawPlot (i.e. ending with return statement) can
# be removed without any consequences if copied to other analysis scripts
def calcROC( hist_a = TH1D(), hist_b = TH1D() ) : #Calculate ROC integral between hist_a and hist_b

    drawPlot = True

    hist_a.Sumw2() #Make sure errors are scaled properly
    hist_b.Sumw2()

    hist_a.Scale( 1.0 / hist_a.Integral() ) # Normalize histograms
    hist_b.Scale( 1.0 / hist_b.Integral() ) 

    #Arrays containing a efficiency and b rejection
    eff_a = array( 'f' ) # Arrays containing signal and background
    rej_b = array( 'f' ) # efficiency / rejection

    # Fill arrays with relevant efficiencies (and rej = 1-efficiency) from
    # histograms: efficiency of a, 1-efficiency of b.
    # Remember that there are a lot of different ways that the ROC can be 
    # defined, i.e. 1-alpha, 1-beta ...
    # Try here to define it in such a way that the integral > 0.5 per def.
    # I.e. make the curve bend towards the top right corner of the plot
    # Easiest to do by trial and error...

    #Create a graph containing errors
    ROC = TGraph( hist_a.GetNbinsX(), eff_a, rej_b )

    # --------------------------------------------------------------------------------
    # Draw plots here
    if drawPlot : 
        c = TCanvas("c", "c", 1200, 900)

        padROC = TPad("padROC", "padROC", 0.0, 0.0, 1.0, 1.0) #Pad for drawing ROC curve
        padROC.Draw()
        padROC.cd()

        ROC.GetXaxis().SetRangeUser(0.0, 1.0)
        ROC.GetYaxis().SetRangeUser(0.0, 1.0)
        ROC.GetXaxis().SetTitle( "Signal efficiency"    )
        ROC.GetYaxis().SetTitle( "Background rejection" )

        ROC.SetMarkerStyle(20)
        ROC.SetMarkerSize(2)

        ROC.Draw("AP")

        #Pad for drawing distributions
        padEff = TPad("padDist", "padDist", 0.13, 0.13, 0.50, 0.50)
        padEff.SetTopMargin(0.1)
        padEff.SetRightMargin(0.02)
        padEff.SetLeftMargin(0.05)
        padEff.Draw()
        padEff.cd()

        hist_a.GetXaxis().SetTitle("Some observable")

        hist_a.SetLineColor(2)
        hist_b.SetLineColor(4)
        hist_a.SetLineWidth(2)
        hist_b.SetLineWidth(2)

        hist_a.Draw()
        hist_b.Draw("same")
        
        #Draw legend and ROC pad
        padROC.cd()
        leg = TLegend( 0.70, 0.74, 0.88, 0.88 )
        leg.SetTextFont(42)
        leg.SetMargin(0.1)
        leg.SetShadowColor(0)
        leg.SetFillColor(0)
        leg.SetFillStyle(0)
        leg.SetLineColor(0)
        leg.AddEntry( hist_a, "Signal distribution"    , "l")
        leg.AddEntry( hist_b, "Background distribution", "l")
        leg.AddEntry( 0     , "#int ROC = %5.3f"%(ROC.Integral() + 0.5), "" )

        leg.Draw()

        c.Update()
        c.Draw()
        raw_input( " ... Press Enter to close ROC plot " )
        #Plot drawing ending here...
        
    return ROC.Integral() + 0.5


# --------------------------------------------------------------------------------
# Calculate normalize difference in integral
def calcIntegralDiff( hist_a = TH1D(), hist_b = TH1D() ) : 

    sep = 0.0 #Separation

    for ibin in range( hist_a.GetNbinsX() ) : # Loop over number of bins

        # Make sure you are not dividing by zero!
        if ( hist_a.GetBinContent( ibin + 1 ) < 0.00001 and 
             hist_b.GetBinContent( ibin + 1 ) < 0.00001 ) : continue 

        sep += 0.0 # ??? # 1/2 * int( sqr(pdf_sig - pdf_bkg) / (pdf_sig + pdf_bkg) )

    return 0.5 * sep


#Execute function : Main call here
if __name__ == "__main__" :

    nEvents = 10000 #Number of events to generate

    r = TRandom3()  #Random generator

    #Define parameters / gaussian histograms
    mean_sig  = 1.0
    mean_bkg  = 0.0
    sigma_sig = 1.0
    sigma_bkg = 1.0

    #Define parameters / trigonomic histograms
    sine_phase = 0.0

    #Define histograms
    histRange = [-5.0, 5.0]
#    histRange = [0.0, 10*math.pi]
    nBins     = 200
    hist_sig  = TH1D( "hist_sig", "hist_sig", nBins, histRange[0], histRange[1] )
    hist_bkg  = TH1D( "hist_bkg", "hist_bkg", nBins, histRange[0], histRange[1] )

    # --------------------------------------------------------------------------------
    # Fill histograms with random number from gaussian distributions
    # --------------------------------------------------------------------------------

    for iEvt in range( nEvents ) : 
        hist_sig.Fill( r.Gaus( mean_sig, sigma_sig ) )
        hist_bkg.Fill( r.Gaus( mean_bkg, sigma_bkg ) )

    # --------------------------------------------------------------------------------
    # Fill histograms with sine/cosine numbers (Not random!)
    # --------------------------------------------------------------------------------

    # This is a bit tricky python way of doing things. Start by : 
    # histRange[0] + ibin*histRange[1]/nBins                               # Deviding histRange[1]-histRange[0] into nBins
                                                                           # This will give the value at the ibin 
    # [histRange[0] + ibin*histRange[1]/nBins for ibin in range( nBins ) ] # Will create a list of length nBins containing the 
                                                                           # values that is calculated one by one 


#    for ibin, iangle in enumerate([histRange[0] + ibin*histRange[1]/nBins for ibin in range( nBins ) ]) : 
#
#        hist_sig.SetBinContent( ibin+1, math.cos(iangle             ) + 1.0 ) #Adding 1 since pdfs are
#        hist_bkg.SetBinContent( ibin+1, math.sin(iangle - sine_phase) + 1.0 ) #positive definite
#
#        hist_sig.SetBinError( ibin+1, 0.01 )
#        hist_bkg.SetBinError( ibin+1, 0.01 )


    print "ROC integral                   : ", calcROC( hist_sig, hist_bkg )
#    print "Normalized Integral difference : ", calcIntegralDiff( hist_sig, hist_bkg ) 


# --------------------------------------------------------------------------------
# Questions
# --------------------------------------------------------------------------------
# 
# Start by using the histograms that are filled with random Gaussian numbers in 
# the main part of the script
# 
#
# Go to the calcROC function defined in the top of the script. This takes as argument
# the two histograms you have filled with random numbers, and should end up calculating
# the ROC integral between the two (function is called in the line: print "ROC integral"....
# 
#
# Fill in the blanks...
# 
#
# What Int(ROC) do you get with the initial options (i.e. distributions have width one
# and a mean shift of one)
# 
#
# Change the mean of the signal distribution to 0.5, 2,3 sigma
# (Take care to change the histogram ranges s.t. your data doesn't end up in overflow)
# Look at the curve. Do you understand how to interpret it?
#
# What numbers do you end up getting? 
# 
# --------------------------------------------------------------------------------
# 
# Thats all fine, and seems to work fairly well...
# 
# - Change the histogram range to [0.0, 10*math.pi]
# - Outcomment the part where the histograms are filled with gaussian numbers
# - Put back in the lines where the histogram is filled with sine/cosine numbers
# 
# Calculate the ROC integral again and plot the distributions...
# 
# What int(ROC) do you get now?
# 
# Comparing these to the Gaussian distributions, does this seem reasonable? 
# (Leading question...)
# 
# Go to the function calcIntegralDiff defined in a very similar way as the calcROC
# and again fill in the blanks, s.t. you end up getting : 
# 1/2 * int( sqr(pdf_sig - pdf_bkg) / (pdf_sig + pdf_bkg) )
# 
# Put back in the line at the bottom of the script saying print "Normalized Integral diff...."
# which will print out the result of the function and see what numbers do you get out.
# 
# Finally change the phi_phase to pi/2 to see an example where the ROC integral is
# totally useless. 
#
# --------------------------------------------------------------------------------