#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # Python macro for reading ATLAS electron/non-electron file (for small project in # MLandBDA2019). # # Author: Troels C. Petersen (NBI) and Lukas Ehrke (NBI) # Email: petersen@nbi.dk # Date: 25th of April 2019 # # ----------------------------------------------------------------------------------- # # ----------------------------------------------------------------------------------- # # # SMALL PROJECT IN MLandBDA2019 # ----------------------------- # The files "train.h5" and "test.h5" contain data from the ATLAS experiment at CERN, more # specifically a long list of measurements (i.e. input variables) for electron candidates. # There are also two "truth" variables in "train.h5", namely if it is an electron and its energy. # # Using Machine Learning algorithm(s), try to solve at least one of the following # problems: # 1) Identify (i.e. classify) electrons compared to non-electrons based on the target variable # 'Truth': 0 for non-electrons, 1 for electrons # 2) Estimate (i.e. make regression for) the energy of electrons based on the target variable # 'p_truth_E': Energy (in MeV) of the electrons # # You should use "train.h5" to develop your algorithm, and when you feel satisfied, you should # apply it to "test.h5", where you don't know the true values. When training (on "train.h5") # remember to divide the sample into a part that you train on, and one that you validate on, # such that you don't overtrain (to be discussed in class). # # You "only" have to submit ONE solution for ONE of these problems, though it is typically not # hard to rewrite the code to solve the other problem as well. You are welcome to submit up to # three solutions for each problem using different algorithms. The solution(s) should NOT USE # MORE THAN 30 VARIABLES, but you're welcome to use different variables for each solution. 
#
# You should hand in (each of) your solution(s) as TWO separate files:
#  * A list of index/event numbers (1, 2, 3, etc.) followed by your estimate on each event, i.e.
#    (Here is shown a classification solution. For a regression solution, the last number should
#    be the energy estimate in MeV)
#      0  0.998232
#      1  0.410455
#      2  0.037859
#      3  ...
#  * A list of the variables you've used for each problem, i.e.
#      p_eta
#      p_pt_track
#      ...
#
# You should name your file as follows:
#    TypeOfProblemSolved_FirstnameLastname_SolutionName(_VariableList).txt
# three solution examples of which could for myself be:
#    Classification_TroelsPetersen_SKLearnAlgo1.txt
#    Classification_TroelsPetersen_SKLearnAlgo1_VariableList.txt
#    Classification_TroelsPetersen_XGBoost1.txt
#    Classification_TroelsPetersen_XGBoost1_VariableList.txt
#    Regression_TroelsPetersen_kNN-DidNotReallyWorkWell.txt
#    Regression_TroelsPetersen_kNN-DidNotReallyWorkWell_VariableList.txt
# ----------------------------------------------------------------------------------- #

import numpy as np
import h5py

# Path of the training file; adjust this to wherever your local copy of "train.h5" lives.
DATA_PATH = "/Users/petersen/Teaching/MachineLearningAndBigData2019/Week3/train.h5"

# Read the whole "train" dataset into memory as a NumPy structured array.
# The `with` block guarantees the HDF5 file handle is closed afterwards.
with h5py.File(DATA_PATH, "r") as hf:
    data = hf["train"][:]

# Print the first 10 entries for a quick sanity check of the file content.
# f-strings are used so that each value is printed right next to its format spec.
for i in range(10):
    row = data[i]  # hoist the repeated data[i] lookups into one local name
    print(f" {row['index']:3d}, Eta (perpend?): {row['p_eta']:6.3f} FracHad: {row['p_Rhad']:6.3f} Track Mom.: {row['p_pt_track']:8.1f} MeV Cluster2 E: {row['p_eClusterLr2']:8.1f} MeV is Elec: {row['Truth']:1d} True E: {row['p_truth_E']:8.1f} MeV")

# List of variables.
# ------------------
# The list of variables is included below just for reference, so that you can easily read/know what the data file contains.
# There are many variables, and part of the exercise is also to admit, that you don't know, what they are, and that you don't
# really need to know (*). The only thing you REALLY NEED TO KNOW is, which are INPUT variables (i.e. those you use), and
# which are the TARGET variables (i.e. those you want to learn to predict).
# In this case, all the variables are input variables, except the two target variables:
#  * 'Truth':     0 for non-electrons, 1 for electrons (in a CLASSIFICATION PROBLEM)
#  * 'p_truth_E': Energy (in MeV) of the electrons    (in a REGRESSION PROBLEM)
#
# (*) Well, if you were working alone on this, and wanted to get the absolute best result, you would probably benefit
#     from knowing, but generally one can get very far without knowing.
#
# NOTE(review): the block below is an unused module-level string literal — it is never
# assigned or executed, and is kept purely as a human-readable reference of the column
# names available in the HDF5 files.
"""
liste = ['index', 'eventNumber', 'runNumber', 'actualInteractionsPerCrossing',
         'averageInteractionsPerCrossing', 'correctedActualMu', 'correctedAverageMu',
         'correctedScaledActualMu', 'correctedScaledAverageMu', 'NvtxReco', 'p_nTracks',
         'p_pt_track', 'p_eta', 'p_phi', 'p_charge', 'p_qOverP', 'p_z0', 'p_d0',
         'p_sigmad0', 'p_d0Sig', 'p_EptRatio', 'p_dPOverP', 'p_z0theta', 'p_deltaR_tag',
         'p_etaCluster', 'p_phiCluster', 'p_eCluster', 'p_rawEtaCluster', 'p_rawPhiCluster',
         'p_rawECluster', 'p_eClusterLr0', 'p_eClusterLr1', 'p_eClusterLr2', 'p_eClusterLr3',
         'p_etaClusterLr1', 'p_etaClusterLr2', 'p_phiClusterLr2', 'p_eAccCluster',
         'p_f0Cluster', 'p_etaCalo', 'p_phiCalo', 'p_eTileGap3Cluster', 'p_cellIndexCluster',
         'p_phiModCalo', 'p_etaModCalo', 'p_dPhiTH3', 'p_R12', 'p_fTG3', 'p_weta2',
         'p_Reta', 'p_Rphi', 'p_Eratio', 'p_f1', 'p_f3', 'p_Rhad', 'p_Rhad1',
         'p_deltaEta1', 'p_deltaPhiRescaled2', 'p_TRTPID', 'p_TRTTrackOccupancy',
         'p_numberOfInnermostPixelHits', 'p_numberOfPixelHits', 'p_numberOfSCTHits',
         'p_numberOfTRTHits', 'p_numberOfTRTXenonHits', 'p_chi2', 'p_ndof',
         'p_SharedMuonTrack',
         'Truth', 'p_truth_E',
         'p_E7x7_Lr2', 'p_E7x7_Lr3', 'p_E_Lr0_HiG', 'p_E_Lr0_LowG', 'p_E_Lr0_MedG',
         'p_E_Lr1_HiG', 'p_E_Lr1_LowG', 'p_E_Lr1_MedG', 'p_E_Lr2_HiG', 'p_E_Lr2_LowG',
         'p_E_Lr2_MedG', 'p_E_Lr3_HiG', 'p_E_Lr3_LowG', 'p_E_Lr3_MedG', 'p_ambiguityType',
         'p_asy1', 'p_author', 'p_barys1', 'p_core57cellsEnergyCorrection', 'p_deltaEta0',
         'p_deltaEta2', 'p_deltaEta3', 'p_deltaPhi0', 'p_deltaPhi1', 'p_deltaPhi2',
         'p_deltaPhi3', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhiRescaled0',
         'p_deltaPhiRescaled1', 'p_deltaPhiRescaled3', 'p_e1152', 'p_e132', 'p_e235',
         'p_e255', 'p_e2ts1', 'p_ecore', 'p_emins1', 'p_etconeCorrBitset', 'p_ethad',
         'p_ethad1', 'p_f1core', 'p_f3core', 'p_maxEcell_energy', 'p_maxEcell_gain',
         'p_maxEcell_time', 'p_maxEcell_x', 'p_maxEcell_y', 'p_maxEcell_z',
         'p_nCells_Lr0_HiG', 'p_nCells_Lr0_LowG', 'p_nCells_Lr0_MedG', 'p_nCells_Lr1_HiG',
         'p_nCells_Lr1_LowG', 'p_nCells_Lr1_MedG', 'p_nCells_Lr2_HiG', 'p_nCells_Lr2_LowG',
         'p_nCells_Lr2_MedG', 'p_nCells_Lr3_HiG', 'p_nCells_Lr3_LowG', 'p_nCells_Lr3_MedG',
         'p_pos', 'p_pos7', 'p_poscs1', 'p_poscs2', 'p_ptconeCorrBitset',
         'p_ptconecoreTrackPtrCorrection', 'p_r33over37allcalo', 'p_topoetconeCorrBitset',
         'p_topoetconecoreConeEnergyCorrection', 'p_topoetconecoreConeSCEnergyCorrection',
         'p_weta1', 'p_widths1', 'p_widths2', 'p_wtots1', 'p_e233', 'p_e237', 'p_e277',
         'p_e2tsts1', 'p_ehad1', 'p_emaxs1', 'p_fracs1', 'p_DeltaE', 'p_E3x5_Lr0',
         'p_E3x5_Lr1', 'p_E3x5_Lr2', 'p_E3x5_Lr3', 'p_E5x7_Lr0', 'p_E5x7_Lr1',
         'p_E5x7_Lr2', 'p_E5x7_Lr3', 'p_E7x11_Lr0', 'p_E7x11_Lr1', 'p_E7x11_Lr2',
         'p_E7x11_Lr3', 'p_E7x7_Lr0', 'p_E7x7_Lr1']
"""