#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
# Python/ROOT macro for training different types of TMVA's discriminants.
#
# Author: Lars Egholm Pedersen
# Email: egholm@nbi.dk
# Date: 8th of October 2014
#
# Author: Troels C. Petersen (NBI)
# Email: petersen@nbi.dk
# Date: 2nd of January 2016
#
# ----------------------------------------------------------------------------------- #

from ROOT import *
import sys

# Python 2/3 compatibility: raw_input() was renamed to input() in Python 3.
try:
    _input = raw_input
except NameError:
    _input = input

# Main call here:
if __name__ == '__main__':

    # ------------------------------------------------------------------------------- #
    # Test beam analysis:
    # ------------------------------------------------------------------------------- #

    # Define path to input files, name of ttrees and cuts to be applied on signal and
    # background. Note that we here use the same input file and tree but with the
    # "var_id" variable, but this can in general be taken from several files
    # (see the Higgs example below).
    sigfile_name = 'atlas_test_beam_data.root'
    sigtree_name = 'data_tree'
    bkgfile_name = 'atlas_test_beam_data.root'
    bkgtree_name = 'data_tree'

    # Here is just defined a cut based on a rough eye estimate from looking at data.
    # The names here should correspond to what is found in the ttrees.
    cut = {'signal'     : 'cherenkov_counts > 700 && ht_trt_hits / trt_hits > 0.16',
           'background' : 'cherenkov_counts < 590 && ht_trt_hits / trt_hits < 0.06'}

    # Define which variables we want to use for classification.
    # Use a dictionary to store both name and datatype.
    vardict = {"em_1"  : 'f',
               "em_2"  : 'f',
               "em_3"  : 'f',
               "em_4"  : 'f',
               "had_1" : 'f',
               "had_2" : 'f',
               "had_3" : 'f'}

    # ------------------------------------------------------------------------------- #
    # MVA test sample using six variables:
    # ------------------------------------------------------------------------------- #
    """
    sigfile_name = 'DataSample.root'
    sigtree_name = 'rand_tree'
    bkgfile_name = 'DataSample.root'
    bkgtree_name = 'rand_tree'

    cut = {'signal'     : 'var_id == 0',
           'background' : 'var_id == 1'}

    # Define which variables we want to use for classification:
    vardict = {"var_1" : 'f',
               "var_2" : 'f',
               "var_3" : 'f',
               "var_4" : 'f',
               "var_5" : 'f',
               "var_6" : 'f'}
    """

    # ------------------------------------------------------------------------------- #
    # Angular Higgs -> ZZ -> 4l analysis:
    # ------------------------------------------------------------------------------- #
    """
    sigfile_name = 'Higgs14TeV.root'
    sigtree_name = 'Default_M125'
    bkgfile_name = 'ZZ14TeV.root'
    bkgtree_name = 'Default'

    # Define variables and type of variable:
    vardict = {"cts"  : 'f',
               "phi1" : 'f',
               "ct1"  : 'f',
               "ct2"  : 'f',
               "phi"  : 'f',
               "mZ1"  : 'f',
               "mZ2"  : 'f'}

    # No need to cut here: Two separate files.
    cut = {'signal' : '', 'background' : ''}
    """

    # ------------------------------------------------------------------------------- #
    # TMVA is defined from here...
    # ------------------------------------------------------------------------------- #

    # Define factory options. See table 1 of the TMVA user guide for settings:
    # http://tmva.sourceforge.net/docu/TMVAUsersGuide.pdf
    # Start TMVA by defining the "overall" problem (here classification into signal
    # and background):
    factoryOption = "!V:!Silent:Transformations=I;P:AnalysisType=Classification"

    # TMVA title:
    tmvatitle = "TMVAClassifier"

    # Start training here:
    output = TFile("./tmva." + tmvatitle + ".root", "RECREATE")
    factory = TMVA.Factory(tmvatitle, output, factoryOption)

    # Retrieve signal and background data:
    file_signal = TFile(sigfile_name, "READ")
    tree_signal = file_signal.Get(sigtree_name)
    file_background = TFile(bkgfile_name, "READ")
    tree_background = file_background.Get(bkgtree_name)

    # Fail early with a clear message if a file or tree could not be read
    # (TFile.Get silently returns a null object otherwise, and TMVA would
    # later crash with a far less helpful error).
    if not tree_signal:
        sys.exit("ERROR: Could not read signal tree '%s' from file '%s'"
                 % (sigtree_name, sigfile_name))
    if not tree_background:
        sys.exit("ERROR: Could not read background tree '%s' from file '%s'"
                 % (bkgtree_name, bkgfile_name))

    # Tell the TMVA factory where it should get the signal and background data from.
    # The second argument is an event weight if you e.g. have several background models.
    # When training, it is important that their relative abundance is correctly specified.
    factory.AddSignalTree(tree_signal, 1.0)          # Tell tmva where it should find signal
    factory.AddBackgroundTree(tree_background, 1.0)  # and background trees

    # Add variables to your TMVA factory:
    for ivar in vardict:
        # This will tell the factory which variables should be used for
        # classification and their data format, e.g.
        # ivar = em_1, vardict[ivar] = 'f' (i.e. float)
        factory.AddVariable(ivar, ivar, "", vardict[ivar])

    # Tell factory how many events it should train on etc.
    # First set of arguments tell the factory which cuts are to be applied on the
    # signal and background data. Here we have specified: use all signal events, all
    # background events. To split between training and testing samples use a random
    # selection of these events, and finally normalize the signal and background
    # events by simply using the number of events.
    factory.PrepareTrainingAndTestTree(TCut(cut['signal']), TCut(cut['background']),
        "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:!V")

    # Define different types of classifiers.
    # Here are general training options, see table 2 of the TMVA user guide.
    factory.BookMethod(TMVA.Types.kFisher, "Fisher", "!H:!V:Fisher:VarTransform=None")
    # factory.BookMethod(TMVA.Types.kBDT, "BDT", "!H:!V:NTrees=10:MaxDepth=1:VarTransform=None")
    # factory.BookMethod(TMVA.Types.kCFMlpANN, "CFMlpANN", "!H:!V:NCycles=10:HiddenLayers=N")

    # Execute actual training here:
    factory.TrainAllMethods()
    factory.TestAllMethods()
    factory.EvaluateAllMethods()

    # Write and close output file:
    output.Write()
    output.Close()

    # Load GUI.
    # NOTE(review): the macro path below is machine-specific — adjust to your own
    # ROOT installation.
    gROOT.SetMacroPath("/Users/petersen/Software/root/tmva/test/")
    gROOT.Macro("./TMVAlogon.C")
    gROOT.LoadMacro("./TMVAGui.C")
    gROOT.ProcessLine("TMVAGui(\"%s\")" % ("./tmva." + tmvatitle + ".root"))

    _input(' Press enter to exit GUI ')

# -------------------------------------------------------------------------------- #
# Questions
# -------------------------------------------------------------------------------- #
#
# The script is a general setup and running of TMVA. Simply start by running it,
# here using the ATLAS TestBeam data, and trying to separate electrons from pions
# based on the calorimeter.
# The three MVA methods used are Fisher, Neural Network and Boosted Decision Trees.
# A lot of information is printed to the screen...
#
# TMVA will tell you the separation power of the individual variables:
#
# --- IdTransformation : Ranking result (top variable is best ranked)
# --- IdTransformation : -------------------------------------
# --- IdTransformation : Rank : Variable  : Separation
# --- IdTransformation : -------------------------------------
# --- IdTransformation : 2    : em_1      : ...
#
#
# And what the found Fisher coefficients are:
#
# --- Fisher : -----------------------
# --- Fisher : Variable:  Coefficient:
# --- Fisher : -----------------------
# --- Fisher : em_4:      ...
#
# Finally it will also print out the ROC integral for you:
#
# --- Factory : --------------------------------------------------------------------------------
# --- Factory : MVA    Signal efficiency at bkg eff.(error):  | Sepa-    Signifi-
# --- Factory : Method:  @B=0.01    @B=0.10   @B=0.30  ROC-integ. | ration:  cance:
# --- Factory : --------------------------------------------------------------------------------
#
# You can study the output by typing in your terminal (here it is started automatically!):
#
# root -l $ROOTSYS/tmva/test/TMVAGui.C\(\"tmva.TMVAClassifier.root\"\)
#
# This will open a GUI that can plot all input parameters, correlations, the resulting
# discriminant (Classifier Output Distributions ...) and ROC curve in an automated way.
#
# -------------------------------------------------------------------------------- #
#
# Change the script such that it uses other types of classifiers, by adding the two lines:
#
# factory.BookMethod( TMVA.Types.kBDT      , "BDT"     , "!H:!V:NTrees=10:MaxDepth=1" )
# factory.BookMethod( TMVA.Types.kCFMlpANN , "CFMlpANN", "!H:!V:NCycles=10:HiddenLayers=N" )
#
# back into the code. This will train a Boosted Decision Tree (top line) and a Neural
# Network (bottom). This will probably initially not perform that well. But do it anyway
# and use the GUI to look at distributions and linear correlation coefficients.
# Looking at the distributions, can you get an idea of which variables will give a good
# separation? Does this correspond to what TMVA writes out when running?
#
# Try to improve the BDT by modifying how many trees or how many nodes it uses.
# Does this improve the performance? Try the same also for the NN (HiddenLayers: N+1, N).
#
# Try to remove the least significant variable to see if that helps in the
# test / training comparison. Does this change the ROC integral?
#
# Which of the methods seem to be most sensitive to the amount of statistics?
#
# ---------------------------------------------------------------------------------- #
#
# Remove the setup for the test beam analysis and add the setup for the Higgs analysis.
#
# The parameters in the ATLAS test beam data are to a high degree linearly correlated.
# As you have seen this means that the Fisher discriminant performs as well as more
# computationally advanced methods.
#
# For an example where parameters have a more complex relationship you can modify
# the code to separate a Higgs signal versus its predominant background using
# angular variables; the observables found in the files are defined as seen in e.g.:
# https://muon.files.wordpress.com/2012/11/figures_angles-graviton-all.png
# The file names are:
# Higgs14TeV.root
# ZZ14TeV.root
# Train a BDT such that you feel that it is as complex as possible without being
# overtrained.
#
# Compare the resulting ROC curve to that of the Fisher discriminant, do you see an
# improvement?
#
# ----------------------------------------------------------------------------------