#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
# Python script for analysis of ATLAS test beam data.
#
# The program reads an ASCII (i.e. text) data file, containting a large number of events,
# where a charged particle (electron or pion) passed through a slice of the ATLAS detector.
# Each passage is recorded by different detectors, boiling down to eleven numbers (some more
# relevant than others). The exercise is to separate electron and pion events based on these
# numbers, and in turn us this information to measure the interaction of pions and electrons
# seperately.
#
# NOTE: Though the data is from particle physics, it could in principle have been from ANY
# other source, and the eleven numbers could for example have been indicators of cancer,
# key numbers for investors, or index numbers for identifying potential costumors.
# 
# For more information on ATLAS test beam: 
#   http://www.nbi.dk/~petersen/Teaching/Stat2017/TestBeam/TestBeamDataAnalysis.html
# 
# Author: Troels C. Petersen (NBI)
# Email:  petersen@nbi.dk
# Date:   10th of December 2017
# 
# ----------------------------------------------------------------------------------- #

from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
plt.close('all')


# ----------------------------------------------------------------------------------- #
# Read the data file:
# ----------------------------------------------------------------------------------- #

# Write extensive output
verbose = True
Nverbose = 10
Nread = 0

# Lists of the eleven variables:
Cher_electrons = []
nLT_electrons  = []
nHT_electrons  = []
EM0_electrons  = []
EM1_electrons  = []
EM2_electrons  = []
EM3_electrons  = []
Had0_electrons = []
Had1_electrons = []
Had2_electrons = []
Muon_electrons = []

# Open file
f = open('DataSet_AtlasPid_ElectronPion_2GeV.txt', 'r')

# Loop over lines and extract variables of interest
header = f.readline()

for line in f:
    line = line.strip()         # Remove spaces, tabs, and other 'whitespace' from the line from both ends until a character is met
    columns = line.split()      # Split the line at whitespace character (spaces, tabs, etc.) into a list

    Cher_i = float(columns[ 0])
    nLT_i  = int  (columns[ 1])
    nHT_i  = int  (columns[ 2])
    EM0_i  = float(columns[ 3])
    EM1_i  = float(columns[ 4])
    EM2_i  = float(columns[ 5])
    EM3_i  = float(columns[ 6])
    Had0_i = float(columns[ 7])
    Had1_i = float(columns[ 8])
    Had2_i = float(columns[ 9])
    Muon_i = float(columns[10])
    
    #if ( EM1_i > 0.3 ) : 
    #    electrons?
    
    Cher_electrons.append(Cher_i)
    nLT_electrons.append(nLT_i)
    nHT_electrons.append(nHT_i)
    EM0_electrons.append(EM0_i)
    EM1_electrons.append(EM1_i)
    EM2_electrons.append(EM2_i)
    EM3_electrons.append(EM3_i)
    Had0_electrons.append(Had0_i)
    Had1_electrons.append(Had1_i)
    Had2_electrons.append(Had2_i)
    Muon_electrons.append(Muon_i)

    if (verbose and Nread < Nverbose) :
        print("Cher: {:6.1f}    nLT,nHT: {:2d} {:2d}    EM: {:5.2f} {:5.2f} {:5.2f} {:5.2f}   Had: {:5.2f} {:5.2f} {:5.2f}    Muon: {:5.1f}".format(Cher_i, nLT_i, nHT_i, EM0_i, EM1_i, EM2_i, EM3_i, Had0_i, Had1_i, Had2_i, Muon_i))
    Nread += 1

f.close()

print("  Total number of events read: {:d}".format(Nread))


Cher_electrons = np.array(Cher_electrons)
nLT_electrons  = np.array(nLT_electrons)
nHT_electrons  = np.array(nHT_electrons)
EM0_electrons  = np.array(EM0_electrons)
EM1_electrons  = np.array(EM1_electrons)
EM2_electrons  = np.array(EM2_electrons)
EM3_electrons  = np.array(EM3_electrons)
Had0_electrons = np.array(Had0_electrons)
Had1_electrons = np.array(Had1_electrons)
Had2_electrons = np.array(Had2_electrons)
Muon_electrons = np.array(Muon_electrons)


# numpy version:
# data = np.loadtxt('DataSet_AtlasPid_ElectronPion_2GeV.txt', skiprows=1, unpack=True)
# [Cher_electrons, nLT_electrons, nHT_electrons, EM0_electrons, 
# EM1_electrons, EM2_electrons, EM3_electrons, Had0_electrons, 
# Had1_electrons, Had2_electrons, Muon_electrons] = data


# ----------------------------------------------------------------------------------- #
# Your analysis:
# ----------------------------------------------------------------------------------- #

# Hist_Cher, Hist_Cher_edges = np.histogram(Cher, bins=200, range=(400, 1400))
# Hist_EM1, Hist_EM1_edges = np.histogram(EM1, bins=200, range=(-0.5, 2.0))
# Hist_CherEM1, Hist_CherEM1_xedges, Hist_CherEM1_yedges = np.histogram2d(Cher, EM1, bins=(100,100), range=[(400,1400),(-0.5,2.0)])

# Make a figure with 2*2=4 subplots
fig, ax = plt.subplots(2,2,figsize=(12, 8))

ax[0,0].hist(Cher_electrons, bins=200, range=(400,1400), histtype='step', label='Cherenkov')
ax[0,0].set_xlabel("Cherenkov")
ax[0,0].set_ylabel("Frequency")

ax[1,0].hist(EM1_electrons , bins=200, range=(-0.5,2.0), histtype='step', label='EM1')
ax[1,0].set_xlabel("EM1")
ax[1,0].set_ylabel("Frequency")

# binx, biny, x1, x2, y1, y2 or "binary"
h = ax[0,1].hist2d(Cher_electrons, EM1_electrons, (100,100), ((400,1400),(-0.5,2.0)), cmap="Reds")
plt.colorbar(h[3],ax=ax[0,1])               # z-scale on the right of the figure
ax[0,1].set_xlabel("Cherenkov")
ax[0,1].set_ylabel("EM1")

# You can add your own fourth plot
fig.delaxes(ax[1,1])

plt.tight_layout()
plt.show(block=False)


# Finally, ensure that the program does not termine (and the plot disappears), before you press enter:
try:
    __IPYTHON__
except:
    raw_input('Press Enter to exit')


# ----------------------------------------------------------------------------------- #
# Questions to be answered:
# ----------------------------------------------------------------------------------- #
# 
# Generally, this analysis is about separating electrons and pions (and determining how
# well this can be done), followed by a few questions characterizing the detector
# response to each type of particle.
# Below are questions guiding you, some/most of which your analysis should cover, but
# you do not have to follow them blindly (I've put "Optional" on those that are not
# essential).
# Start by considering the data, and get a feel for the typical range of each variable.
# Plot the variables, both in 1D and also 2D! From considering these plots,
# guess/estimate an approximate knowledge of how electrons and pions distribute
# themselves in the variables above, and how to make a selection of these.
#
# As described on the webpage introducing the data, the three detectors, Cherenkov,
# TRT and Calorimeters are each capable of separating electrons and pions. As they are
# INDEPENDENT (three separate detectors), they may be used to cross check each other,
# and this is what you should use, in fact the essential part of the analysis!
#
# 
# Questions:
# ----------
# 1) Find for each of these three detector systems one variable (i.e. a combination),
# which seem to separate electrons and pions best. For example, start with the
# Cherenkov, which is only a single number, and assume/guess that the large peak
# at low values is mainly from pions, while the upper broad peak is from electrons.
# Now plot the TRT and Calorimeter distributions when the Cherenkov selects a pion
# and afterwards an electron. This should give you a hint of how to separate pions
# and electrons using the TRT and Calorimeters.
#
# Hint: Sometimes variables from a single detector are more powerful, when they are
# combined, e.g. taken ratios of. For the TRT this may be somewhat obvious, but for
# the EMcalo, it is not as simple. While one may simply use the second layer,
# involving this layer in a ratio may enhance the separation power.
#
# 2) Next you should try to see, if you can make a selection, which gives you a
# fairly large and clean electron and pion sample, respectively. The question is, how
# can you know how clean your sample is and how efficient your selection is?
# This can actually be measured in the data itself, using the fact that there are
# three independent detectors. For example, start by making an electron and a pion
# selection using two of the three variables, and plot the third variable for each of
# these selections. Now you can directly see, how electrons and pions will distribute
# themselves in this third variable. Are you worried, that there are pions in your
# electron sample, and vice versa? Well, there will probably be, but so few, that it
# won't matter too much, at least not to begin with.
# Why? Well, let us assume that for each detector, 80% of electrons pass your
# requirement, but also 10% of pions do. Assuming an even number of electrons and
# pions (which is not really the case), then with two detector cuts, you should get
# a sample, which is: 0.8*0.8 / (0.8*0.8 + 0.1*0.1) = 98.5% pure.
#
# Now with this sample based on cuts on the two other detectors, ask what fraction
# of electrons and pions passes your electron selection. The fraction of electrons,
# that are not selected as electrons will be your TYPE I errors, denoted alpha,
# while the fraction of pions, that do pass the cut will be your TYPE II errors,
# denoted beta.
# Measure these for each of the two cuts in the three detector types, and ask yourself
# if they are "reasonable", i.e. something like in the example above. If not, then
# you should perhaps reconsider adjusting your cuts.
#
# By now, you should for each detector have 6 numbers:
# - The electron cut value above which you accept an electron.
# - The efficiency (i.e. 1-alpha) for electrons of this cut.
# - The fake rate (i.e. beta) for pions of this cut.
# - The pion cut value below which you accept a pion (may be same value as above!).
# - The efficiency (i.e. 1-alpha) for pions of this cut.
# - The fake rate (i.e. beta) for electrons of this cut.
#
# 3) Given the efficiencies and fake rates of each cut, try to combine these (again
# assuming that they are independent) into knowledge of your sample purities and
# also the total number of electrons and pions in the whole sample. Do the sum of
# estimated electrons and pions added actually match the number of particles in total?
# This is a good cross check!
#
# 4) If the number of pions was suddenly 1.000 that of elections, would you still be
# able to get a sample of fairly pure (say 90% pure) electrons? And if so, what would
# the efficiency for these electrons be?
#
# 5) One of the purposes of the testbeam what to measure the response of the TRT
# detector to exactly electrons and pions. Consider for example only events that has
# 33 TRT hits (i.e. nLT = 33). As the High-Threshold probability (i.e. probability of
# passing the High-Threshold, given that the Low-Threshold was passed), is assumed to
# be constant in the TRT detector (but quite different for electrons and pions), what
# distribution should the number of High-Threshold hits (nHT) follow? And is that
# really the case, both for electrons and pions?
# 
# 6) Expanding on problem 2), try now to calculate ROC curves for each of the three
# detectors. These are obtained by making a clean selection using the two other
# detectors for electrons and pions seperately, and then integrating over these two
# distributions, using the running (normalised) integral of each as x and y coordinate
# in a (ROC) graph.
# If you do not manage on your own, perhaps consider the ROC calculator example,
# which is posted along with this exercise.
# 
# (Optional)
# 7) Still considering nLT = 33, and given that there are both electrons and pions in
# the sample (each with a different HT probability), nHT should in the unselected data
# be a combination of two distributions. Try to fit the number of HT hits with two
# distributions combined.
# Do they fit the data? And can this fit be used to estimate the fraction of pions and
# electrons in the sample? And does that estimate match you previous estimate?
# Perhaps retry with other values for the number of TRT hits.
#
# (Optional)
# 8) Try to select pions using three different (mutually exclusive) techniques:
#    a) Passing only a hadronic calorimeter requirement (e.g. that the sum of the three
#       HCal values is above some minimum energy).
#    b) Passing only Cherenkov AND EMcalo requirements.
#    c) Passing both a) and b).
# Try to measure the HT probability (i.e. fraction of High-Threshold hits) for each of
# these three pion samples. Do they agree with each other?
# 
# ----------------------------------------------------------------------------------- #