#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # Python macro for reading ATLAS electron/non-electron file (for small project in # MLandBDA2019). # # Author: Troels C. Petersen (NBI) and Lukas Ehrke (NBI) # Email: petersen@nbi.dk # Date: 25th of April 2019 # # ----------------------------------------------------------------------------------- # # ----------------------------------------------------------------------------------- # # # SMALL PROJECT IN MLandBDA2019 # ----------------------------- # The files "train.h5" and "test.h5" contain data from the ATLAS experiment at CERN, more # specifically a long list of measurements (i.e. input variables) for electron candidates. # There are also two "truth" variables in "train.h5", namely if it is an electron and its energy. # # Using Machine Learning algorithm(s), try to solve at least one of the following # problems: # 1) Identify (i.e. classify) electrons compared to non-electrons based on the target variable # 'Truth': 0 for non-electrons, 1 for electrons # 2) Estimate (i.e. make regression for) the energy of electrons based on the target variable # 'p_truth_E': Energy (in MeV) of the electrons # # You should use "train.h5" to develop your algorithm, and when you feel satisfied, you should # apply it to "test.h5", where you don't know the true values. When training (on "train.h5") # remember to divide the sample into a part that you train on, and one that you validate on, # such that you don't overtrain (to be discussed in class). # # You "only" have to submit ONE solution for ONE of these problems, though it is typically not # hard to rewrite the code to solve the other problem as well. You are welcome to submit up to # three solutions for each problem using different algorithms. The solution(s) should NOT USE # MORE THAN 30 VARIABLES, but you're welcome to use different variables for each solution. 
#
# You should hand in (each of) your solution(s) as TWO separate files:
#  * A list of index/event numbers (1, 2, 3, etc.) followed by your estimate on each event, i.e.
#    (Here is shown a classification solution. For a regression solution, the last number should
#    be the energy estimate in MeV)
#      0  0.998232
#      1  0.410455
#      2  0.037859
#      3  ...
#  * A list of the variables you've used for each problem, i.e.
#      p_eta
#      p_pt_track
#      ...
#
# You should name your file as follows:
#    TypeOfProblemSolved_FirstnameLastname_SolutionName(_VariableList).txt
# three solution examples of which could for myself be:
#    Classification_TroelsPetersen_SKLearnAlgo1.txt
#    Classification_TroelsPetersen_SKLearnAlgo1_VariableList.txt
#    Classification_TroelsPetersen_XGBoost1.txt
#    Classification_TroelsPetersen_XGBoost1_VariableList.txt
#    Regression_TroelsPetersen_kNN-DidNotReallyWorkWell.txt
#    Regression_TroelsPetersen_kNN-DidNotReallyWorkWell_VariableList.txt
# ----------------------------------------------------------------------------------- #

import numpy as np
import h5py

# Path of the training file; adjust this to wherever your local copy of "train.h5" lives.
DATA_PATH = "/Users/petersen/Teaching/MachineLearningAndBigData2019/Week3/train.h5"

# Read the whole "train" dataset into memory as a NumPy structured array.
# The `with` block guarantees the HDF5 file handle is closed afterwards.
with h5py.File(DATA_PATH, "r") as hf:
    data = hf["train"][:]

# Print the first 10 entries for a quick sanity check of the file content.
# f-strings are used so that each value is printed right next to its format spec.
for i in range(10):
    row = data[i]  # hoist the repeated data[i] lookups into one local name
    print(f" {row['index']:3d}, Eta (perpend?): {row['p_eta']:6.3f} FracHad: {row['p_Rhad']:6.3f} Track Mom.: {row['p_pt_track']:8.1f} MeV Cluster2 E: {row['p_eClusterLr2']:8.1f} MeV is Elec: {row['Truth']:1d} True E: {row['p_truth_E']:8.1f} MeV")

# List of variables.
# ------------------
# The list of variables is included below just for reference, so that you can easily read/know what the data file contains.
# There are many variables, and part of the exercise is also to admit, that you don't know, what they are, and that you don't
# really need to know (*). The only thing you REALLY NEED TO KNOW is, which are INPUT variables (i.e. those you use), and
# which are the TARGET variables (i.e. those you want to learn to predict).
# In this case, all the variables are input variables, except the two target variables:
#  * 'Truth':     0 for non-electrons, 1 for electrons (in a CLASSIFICATION PROBLEM)
#  * 'p_truth_E': Energy (in MeV) of the electrons    (in a REGRESSION PROBLEM)
#
# (*) Well, if you were working alone on this, and wanted to get the absolute best result, you would probably benefit
#     from knowing, but generally one can get very far without knowing.
#
# NOTE(review): the block below is an unused module-level string literal — it is never
# assigned or executed, and is kept purely as a human-readable reference of the column
# names available in the HDF5 files.
"""
liste = ['index', 'eventNumber', 'runNumber', 'actualInteractionsPerCrossing',
         'averageInteractionsPerCrossing', 'correctedActualMu', 'correctedAverageMu',
         'correctedScaledActualMu', 'correctedScaledAverageMu', 'NvtxReco', 'p_nTracks',
         'p_pt_track', 'p_eta', 'p_phi', 'p_charge', 'p_qOverP', 'p_z0', 'p_d0',
         'p_sigmad0', 'p_d0Sig', 'p_EptRatio', 'p_dPOverP', 'p_z0theta', 'p_deltaR_tag',
         'p_etaCluster', 'p_phiCluster', 'p_eCluster', 'p_rawEtaCluster', 'p_rawPhiCluster',
         'p_rawECluster', 'p_eClusterLr0', 'p_eClusterLr1', 'p_eClusterLr2', 'p_eClusterLr3',
         'p_etaClusterLr1', 'p_etaClusterLr2', 'p_phiClusterLr2', 'p_eAccCluster',
         'p_f0Cluster', 'p_etaCalo', 'p_phiCalo', 'p_eTileGap3Cluster', 'p_cellIndexCluster',
         'p_phiModCalo', 'p_etaModCalo', 'p_dPhiTH3', 'p_R12', 'p_fTG3', 'p_weta2',
         'p_Reta', 'p_Rphi', 'p_Eratio', 'p_f1', 'p_f3', 'p_Rhad', 'p_Rhad1',
         'p_deltaEta1', 'p_deltaPhiRescaled2', 'p_TRTPID', 'p_TRTTrackOccupancy',
         'p_numberOfInnermostPixelHits', 'p_numberOfPixelHits', 'p_numberOfSCTHits',
         'p_numberOfTRTHits', 'p_numberOfTRTXenonHits', 'p_chi2', 'p_ndof',
         'p_SharedMuonTrack',
         'Truth', 'p_truth_E',
         'p_E7x7_Lr2', 'p_E7x7_Lr3', 'p_E_Lr0_HiG', 'p_E_Lr0_LowG', 'p_E_Lr0_MedG',
         'p_E_Lr1_HiG', 'p_E_Lr1_LowG', 'p_E_Lr1_MedG', 'p_E_Lr2_HiG', 'p_E_Lr2_LowG',
         'p_E_Lr2_MedG', 'p_E_Lr3_HiG', 'p_E_Lr3_LowG', 'p_E_Lr3_MedG', 'p_ambiguityType',
         'p_asy1', 'p_author', 'p_barys1', 'p_core57cellsEnergyCorrection', 'p_deltaEta0',
         'p_deltaEta2', 'p_deltaEta3', 'p_deltaPhi0', 'p_deltaPhi1', 'p_deltaPhi2',
         'p_deltaPhi3', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhiRescaled0',
         'p_deltaPhiRescaled1', 'p_deltaPhiRescaled3', 'p_e1152', 'p_e132', 'p_e235',
         'p_e255', 'p_e2ts1', 'p_ecore', 'p_emins1', 'p_etconeCorrBitset', 'p_ethad',
         'p_ethad1', 'p_f1core', 'p_f3core', 'p_maxEcell_energy', 'p_maxEcell_gain',
         'p_maxEcell_time', 'p_maxEcell_x', 'p_maxEcell_y', 'p_maxEcell_z',
         'p_nCells_Lr0_HiG', 'p_nCells_Lr0_LowG', 'p_nCells_Lr0_MedG', 'p_nCells_Lr1_HiG',
         'p_nCells_Lr1_LowG', 'p_nCells_Lr1_MedG', 'p_nCells_Lr2_HiG', 'p_nCells_Lr2_LowG',
         'p_nCells_Lr2_MedG', 'p_nCells_Lr3_HiG', 'p_nCells_Lr3_LowG', 'p_nCells_Lr3_MedG',
         'p_pos', 'p_pos7', 'p_poscs1', 'p_poscs2', 'p_ptconeCorrBitset',
         'p_ptconecoreTrackPtrCorrection', 'p_r33over37allcalo', 'p_topoetconeCorrBitset',
         'p_topoetconecoreConeEnergyCorrection', 'p_topoetconecoreConeSCEnergyCorrection',
         'p_weta1', 'p_widths1', 'p_widths2', 'p_wtots1', 'p_e233', 'p_e237', 'p_e277',
         'p_e2tsts1', 'p_ehad1', 'p_emaxs1', 'p_fracs1', 'p_DeltaE', 'p_E3x5_Lr0',
         'p_E3x5_Lr1', 'p_E3x5_Lr2', 'p_E3x5_Lr3', 'p_E5x7_Lr0', 'p_E5x7_Lr1',
         'p_E5x7_Lr2', 'p_E5x7_Lr3', 'p_E7x11_Lr0', 'p_E7x11_Lr1', 'p_E7x11_Lr2',
         'p_E7x11_Lr3', 'p_E7x7_Lr0', 'p_E7x7_Lr1']
"""