#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # Root macro for calculating two different types of separation measures # # Author: Lars Egholm Pedersen # Email: egholm@nbi.dk # Date: 5th of October 2013 # # Create som artificial random data for two different processes # And calculate ROC Integral + Normalized integral differences # # ----------------------------------------------------------------------------------- # # Imports from ROOT import * from array import array import math gStyle.SetOptStat(0) gStyle.SetOptTitle(0) def sqr( a ) : return a*a # -------------------------------------------------------------------------------- # Calculate ROC integral # Optional : drawPlot, if set to True, function will also draw distributions # Everything in the block opened by drawPlot (i.e. ending with return statement) can # be removed without any consequences if copied to other analysis scripts def calcROC( hist_a = TH1D(), hist_b = TH1D() ) : #Calculate ROC integral between hist_a and hist_b drawPlot = True hist_a.Sumw2() #Make sure errors are scaled properly hist_b.Sumw2() hist_a.Scale( 1.0 / hist_a.Integral() ) # Normalize histograms hist_b.Scale( 1.0 / hist_b.Integral() ) #Arrays containing a efficiency and b rejection eff_a = array( 'f' ) # Arrays containing signal and background rej_b = array( 'f' ) # efficiency / rejection # Fill arrays with relevant efficiencies (and rej = 1-efficiency) from # histograms: efficiency of a, 1-efficiency of b. # Remember that there are a lot of different ways that the ROC can be # defined, i.e. 1-alpha, 1-beta ... # Try here to define it in such a way that the integral > 0.5 per def. # I.e. make the curve bend towards the top right corner of the plot # Easiest to do by trial and error... #Create a graph containing errors ROC = TGraph( hist_a.GetNbinsX(), eff_a, rej_b ) # -------------------------------------------------------------------------------- # Draw plots here if drawPlot : c = TCanvas("c", "c", 1200, 900) padROC = TPad("padROC", "padROC", 0.0, 0.0, 1.0, 1.0) #Pad for drawing ROC curve padROC.Draw() padROC.cd() ROC.GetXaxis().SetRangeUser(0.0, 1.0) ROC.GetYaxis().SetRangeUser(0.0, 1.0) ROC.GetXaxis().SetTitle( "Signal efficiency" ) ROC.GetYaxis().SetTitle( "Background rejection" ) ROC.SetMarkerStyle(20) ROC.SetMarkerSize(2) ROC.Draw("AP") #Pad for drawing distributions padEff = TPad("padDist", "padDist", 0.13, 0.13, 0.50, 0.50) padEff.SetTopMargin(0.1) padEff.SetRightMargin(0.02) padEff.SetLeftMargin(0.05) padEff.Draw() padEff.cd() hist_a.GetXaxis().SetTitle("Some observable") hist_a.SetLineColor(2) hist_b.SetLineColor(4) hist_a.SetLineWidth(2) hist_b.SetLineWidth(2) hist_a.Draw() hist_b.Draw("same") #Draw legend and ROC pad padROC.cd() leg = TLegend( 0.70, 0.74, 0.88, 0.88 ) leg.SetTextFont(42) leg.SetMargin(0.1) leg.SetShadowColor(0) leg.SetFillColor(0) leg.SetFillStyle(0) leg.SetLineColor(0) leg.AddEntry( hist_a, "Signal distribution" , "l") leg.AddEntry( hist_b, "Background distribution", "l") leg.AddEntry( 0 , "#int ROC = %5.3f"%(ROC.Integral() + 0.5), "" ) leg.Draw() c.Update() c.Draw() raw_input( " ... Press Enter to close ROC plot " ) #Plot drawing ending here... return ROC.Integral() + 0.5 # -------------------------------------------------------------------------------- # Calculate normalize difference in integral def calcIntegralDiff( hist_a = TH1D(), hist_b = TH1D() ) : sep = 0.0 #Separation for ibin in range( hist_a.GetNbinsX() ) : # Loop over number of bins # Make sure you are not dividing by zero! if ( hist_a.GetBinContent( ibin + 1 ) < 0.00001 and hist_b.GetBinContent( ibin + 1 ) < 0.00001 ) : continue sep += 0.0 # ??? # 1/2 * int( sqr(pdf_sig - pdf_bkg) / (pdf_sig + pdf_bkg) ) return 0.5 * sep #Execute function : Main call here if __name__ == "__main__" : nEvents = 10000 #Number of events to generate r = TRandom3() #Random generator #Define parameters / gaussian histograms mean_sig = 1.0 mean_bkg = 0.0 sigma_sig = 1.0 sigma_bkg = 1.0 #Define parameters / trigonomic histograms sine_phase = 0.0 #Define histograms histRange = [-5.0, 5.0] # histRange = [0.0, 10*math.pi] nBins = 200 hist_sig = TH1D( "hist_sig", "hist_sig", nBins, histRange[0], histRange[1] ) hist_bkg = TH1D( "hist_bkg", "hist_bkg", nBins, histRange[0], histRange[1] ) # -------------------------------------------------------------------------------- # Fill histograms with random number from gaussian distributions # -------------------------------------------------------------------------------- for iEvt in range( nEvents ) : hist_sig.Fill( r.Gaus( mean_sig, sigma_sig ) ) hist_bkg.Fill( r.Gaus( mean_bkg, sigma_bkg ) ) # -------------------------------------------------------------------------------- # Fill histograms with sine/cosine numbers (Not random!) # -------------------------------------------------------------------------------- # This is a bit tricky python way of doing things. Start by : # histRange[0] + ibin*histRange[1]/nBins # Deviding histRange[1]-histRange[0] into nBins # This will give the value at the ibin # [histRange[0] + ibin*histRange[1]/nBins for ibin in range( nBins ) ] # Will create a list of length nBins containing the # values that is calculated one by one # for ibin, iangle in enumerate([histRange[0] + ibin*histRange[1]/nBins for ibin in range( nBins ) ]) : # # hist_sig.SetBinContent( ibin+1, math.cos(iangle ) + 1.0 ) #Adding 1 since pdfs are # hist_bkg.SetBinContent( ibin+1, math.sin(iangle - sine_phase) + 1.0 ) #positive definite # # hist_sig.SetBinError( ibin+1, 0.01 ) # hist_bkg.SetBinError( ibin+1, 0.01 ) print "ROC integral : ", calcROC( hist_sig, hist_bkg ) # print "Normalized Integral difference : ", calcIntegralDiff( hist_sig, hist_bkg ) # -------------------------------------------------------------------------------- # Questions # -------------------------------------------------------------------------------- # # Start by using the histograms that are filled with random Gaussian numbers in # the main part of the script # # # Go to the calcROC function defined in the top of the script. This takes as argument # the two histograms you have filled with random numbers, and should end up calculating # the ROC integral between the two (function is called in the line: print "ROC integral".... # # # Fill in the blanks... # # # What Int(ROC) do you get with the initial options (i.e. distributions have width one # and a mean shift of one) # # # Change the mean of the signal distribution to 0.5, 2,3 sigma # (Take care to change the histogram ranges s.t. your data doesn't end up in overflow) # Look at the curve. Do you understand how to interpret it? # # What numbers do you end up getting? # # -------------------------------------------------------------------------------- # # Thats all fine, and seems to work fairly well... # # - Change the histogram range to [0.0, 10*math.pi] # - Outcomment the part where the histograms are filled with gaussian numbers # - Put back in the lines where the histogram is filled with sine/cosine numbers # # Calculate the ROC integral again and plot the distributions... # # What int(ROC) do you get now? # # Comparing these to the Gaussian distributions, does this seem reasonable? # (Leading question...) # # Go to the function calcIntegralDiff defined in a very similar way as the calcROC # and again fill in the blanks, s.t. you end up getting : # 1/2 * int( sqr(pdf_sig - pdf_bkg) / (pdf_sig + pdf_bkg) ) # # Put back in the line at the bottom of the script saying print "Normalized Integral diff...." # which will print out the result of the function and see what numbers do you get out. # # Finally change the phi_phase to pi/2 to see an example where the ROC integral is # totally useless. # # --------------------------------------------------------------------------------