#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
# Python/ROOT macro for training different types of TMVA's discriminants.
#
# Author: Lars Egholm Pedersen
# Email: egholm@nbi.dk
# Date: 8th of October 2014
#
# Author: Troels C. Petersen (NBI)
# Email: petersen@nbi.dk
# Date: 2nd of January 2016
#
# ----------------------------------------------------------------------------------- #

from ROOT import *
import sys

# Python 2/3 compatibility: raw_input() was renamed to input() in Python 3.
try:
    _input = raw_input
except NameError:
    _input = input

# Main call here:
if __name__ == '__main__':

    # ------------------------------------------------------------------------------- #
    # Test beam analysis:
    # ------------------------------------------------------------------------------- #

    # Define path to input files, name of ttrees and cuts to be applied on signal and
    # background. Note that we here use the same input file and tree but with the
    # "var_id" variable, but this can in general be taken from several files
    # (see the Higgs example below).
    sigfile_name = 'atlas_test_beam_data.root'
    sigtree_name = 'data_tree'
    bkgfile_name = 'atlas_test_beam_data.root'
    bkgtree_name = 'data_tree'

    # Here is just defined a cut based on a rough eye estimate from looking at data.
    # The names here should correspond to what is found in the ttrees.
    cut = {'signal'     : 'cherenkov_counts > 700 && ht_trt_hits / trt_hits > 0.16',
           'background' : 'cherenkov_counts < 590 && ht_trt_hits / trt_hits < 0.06'}

    # Define which variables we want to use for classification.
    # Use a dictionary to store both name and datatype.
    vardict = {"em_1"  : 'f',
               "em_2"  : 'f',
               "em_3"  : 'f',
               "em_4"  : 'f',
               "had_1" : 'f',
               "had_2" : 'f',
               "had_3" : 'f'}

    # ------------------------------------------------------------------------------- #
    # MVA test sample using six variables:
    # ------------------------------------------------------------------------------- #
    """
    sigfile_name = 'DataSample.root'
    sigtree_name = 'rand_tree'
    bkgfile_name = 'DataSample.root'
    bkgtree_name = 'rand_tree'

    cut = {'signal'     : 'var_id == 0',
           'background' : 'var_id == 1'}

    # Define which variables we want to use for classification:
    vardict = {"var_1" : 'f',
               "var_2" : 'f',
               "var_3" : 'f',
               "var_4" : 'f',
               "var_5" : 'f',
               "var_6" : 'f'}
    """

    # ------------------------------------------------------------------------------- #
    # Angular Higgs -> ZZ -> 4l analysis:
    # ------------------------------------------------------------------------------- #
    """
    sigfile_name = 'Higgs14TeV.root'
    sigtree_name = 'Default_M125'
    bkgfile_name = 'ZZ14TeV.root'
    bkgtree_name = 'Default'

    # Define variables and type of variable:
    vardict = {"cts"  : 'f',
               "phi1" : 'f',
               "ct1"  : 'f',
               "ct2"  : 'f',
               "phi"  : 'f',
               "mZ1"  : 'f',
               "mZ2"  : 'f'}

    # No need to cut here: Two separate files.
    cut = {'signal' : '', 'background' : ''}
    """

    # ------------------------------------------------------------------------------- #
    # TMVA is defined from here...
    # ------------------------------------------------------------------------------- #

    # Define factory options. See table 1 of the TMVA user guide for settings:
    # http://tmva.sourceforge.net/docu/TMVAUsersGuide.pdf
    # Start TMVA by defining the "overall" problem (here classification into signal
    # and background):
    factoryOption = "!V:!Silent:Transformations=I;P:AnalysisType=Classification"

    # TMVA title:
    tmvatitle = "TMVAClassifier"

    # Start training here:
    output = TFile("./tmva." + tmvatitle + ".root", "RECREATE")
    factory = TMVA.Factory(tmvatitle, output, factoryOption)

    # Retrieve signal and background data:
    file_signal = TFile(sigfile_name, "READ")
    tree_signal = file_signal.Get(sigtree_name)
    file_background = TFile(bkgfile_name, "READ")
    tree_background = file_background.Get(bkgtree_name)

    # Fail early with a clear message if a file or tree could not be read
    # (TFile.Get silently returns a null object otherwise, and TMVA would
    # later crash with a far less helpful error).
    if not tree_signal:
        sys.exit("ERROR: Could not read signal tree '%s' from file '%s'"
                 % (sigtree_name, sigfile_name))
    if not tree_background:
        sys.exit("ERROR: Could not read background tree '%s' from file '%s'"
                 % (bkgtree_name, bkgfile_name))

    # Tell the TMVA factory where it should get the signal and background data from.
    # The second argument is an event weight if you e.g. have several background models.
    # When training, it is important that their relative abundance is correctly specified.
    factory.AddSignalTree(tree_signal, 1.0)          # Tell tmva where it should find signal
    factory.AddBackgroundTree(tree_background, 1.0)  # and background trees

    # Add variables to your TMVA factory:
    for ivar in vardict:
        # This will tell the factory which variables should be used for
        # classification and their data format, e.g.
        # ivar = em_1, vardict[ivar] = 'f' (i.e. float)
        factory.AddVariable(ivar, ivar, "", vardict[ivar])

    # Tell factory how many events it should train on etc.
    # First set of arguments tell the factory which cuts are to be applied on the
    # signal and background data. Here we have specified: use all signal events, all
    # background events. To split between training and testing samples use a random
    # selection of these events, and finally normalize the signal and background
    # events by simply using the number of events.
    factory.PrepareTrainingAndTestTree(TCut(cut['signal']), TCut(cut['background']),
        "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:!V")

    # Define different types of classifiers.
    # Here are general training options, see table 2 of the TMVA user guide.
    factory.BookMethod(TMVA.Types.kFisher, "Fisher", "!H:!V:Fisher:VarTransform=None")
    # factory.BookMethod(TMVA.Types.kBDT, "BDT", "!H:!V:NTrees=10:MaxDepth=1:VarTransform=None")
    # factory.BookMethod(TMVA.Types.kCFMlpANN, "CFMlpANN", "!H:!V:NCycles=10:HiddenLayers=N")

    # Execute actual training here:
    factory.TrainAllMethods()
    factory.TestAllMethods()
    factory.EvaluateAllMethods()

    # Write and close output file:
    output.Write()
    output.Close()

    # Load GUI.
    # NOTE(review): the macro path below is machine-specific — adjust to your own
    # ROOT installation.
    gROOT.SetMacroPath("/Users/petersen/Software/root/tmva/test/")
    gROOT.Macro("./TMVAlogon.C")
    gROOT.LoadMacro("./TMVAGui.C")
    gROOT.ProcessLine("TMVAGui(\"%s\")" % ("./tmva." + tmvatitle + ".root"))

    _input(' Press enter to exit GUI ')

# -------------------------------------------------------------------------------- #
# Questions
# -------------------------------------------------------------------------------- #
#
# The script is a general setup and running of TMVA. Simply start by running it,
# here using the ATLAS TestBeam data, and trying to separate electrons from pions
# based on the calorimeter.
# The three MVA methods used are Fisher, Neural Network and Boosted Decision Trees.
# A lot of information is printed to the screen...
#
# TMVA will tell you the separation power of the individual variables:
#
# --- IdTransformation : Ranking result (top variable is best ranked)
# --- IdTransformation : -------------------------------------
# --- IdTransformation : Rank : Variable  : Separation
# --- IdTransformation : -------------------------------------
# --- IdTransformation : 2    : em_1      : ...
#
#
# And what the found Fisher coefficients are:
#
# --- Fisher : -----------------------
# --- Fisher : Variable:  Coefficient:
# --- Fisher : -----------------------
# --- Fisher : em_4:      ...
#
# Finally it will also print out the ROC integral for you:
#
# --- Factory : --------------------------------------------------------------------------------
# --- Factory : MVA    Signal efficiency at bkg eff.(error):  | Sepa-    Signifi-
# --- Factory : Method:  @B=0.01    @B=0.10   @B=0.30  ROC-integ. | ration:  cance:
# --- Factory : --------------------------------------------------------------------------------
#
# You can study the output by typing in your terminal (here it is started automatically!):
#
# root -l $ROOTSYS/tmva/test/TMVAGui.C\(\"tmva.TMVAClassifier.root\"\)
#
# This will open a GUI that can plot all input parameters, correlations, the resulting
# discriminant (Classifier Output Distributions ...) and ROC curve in an automated way.
#
# -------------------------------------------------------------------------------- #
#
# Change the script such that it uses other types of classifiers, by adding the two lines:
#
# factory.BookMethod( TMVA.Types.kBDT      , "BDT"     , "!H:!V:NTrees=10:MaxDepth=1" )
# factory.BookMethod( TMVA.Types.kCFMlpANN , "CFMlpANN", "!H:!V:NCycles=10:HiddenLayers=N" )
#
# back into the code. This will train a Boosted Decision Tree (top line) and a Neural
# Network (bottom). This will probably initially not perform that well. But do it anyway
# and use the GUI to look at distributions and linear correlation coefficients.
# Looking at the distributions, can you get an idea of which variables will give a good
# separation? Does this correspond to what TMVA writes out when running?
#
# Try to improve the BDT by modifying how many trees or how many nodes it uses.
# Does this improve the performance? Try the same also for the NN (HiddenLayers: N+1, N).
#
# Try to remove the least significant variable to see if that helps in the
# test / training comparison. Does this change the ROC integral?
#
# Which of the methods seem to be most sensitive to the amount of statistics?
#
# ---------------------------------------------------------------------------------- #
#
# Remove the setup for the test beam analysis and add the setup for the Higgs analysis.
#
# The parameters in the ATLAS test beam data are to a high degree linearly correlated.
# As you have seen this means that the Fisher discriminant performs as well as more
# computationally advanced methods.
#
# For an example where parameters have a more complex relationship you can modify
# the code to separate a Higgs signal versus its predominant background using
# angular variables; the observables found in the files are defined as seen in e.g.:
# https://muon.files.wordpress.com/2012/11/figures_angles-graviton-all.png
# The file names are:
# Higgs14TeV.root
# ZZ14TeV.root
# Train a BDT such that you feel that it is as complex as possible without being
# overtrained.
#
# Compare the resulting ROC curve to that of the Fisher discriminant, do you see an
# improvement?
#
# ----------------------------------------------------------------------------------