#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#  Python macro for selecting b-jets in Aleph Z->qqbar MC:
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   25th of April 2019
#
#  Code development:
#  1: Started using only prob_b cut at 0.13:                        Gave error rate of 16%
#  2: Combined with bqvjet cuts both singularly and in combination: Gave error rate of 12%
#  3: Included also ptlrel along and in combinations:               Gave error rate of 11.2%
#  4: Looped over 100 random combinations to get best:              Gave error rate of 10.8%
#  5: Looped over 100 random combinations in tighter range:         Gave error rate of 10.6%
#  6: Looped over 1000 random combinations in yet tighter range:    Gave error rate of 10.54%
#  For comparison, the Aleph NN gives:                                                  9.90%
#  Your future ML algorithms gives:                                                     ?.???%
# ----------------------------------------------------------------------------------- #

from __future__ import print_function, division   # Ensures Python3 printing & division standard
from matplotlib import pyplot as plt
from matplotlib import colors
from matplotlib.colors import LogNorm
import numpy as np
import csv

# Possible other packages to consider:
# cornerplot, seaplot, sklearn.decomposition(PCA)

r = np.random
r.seed(42)

SavePlots = False          # For now, don't save plots (once you trust your code, switch on)
plt.close('all')           # To close all open figures


# ----------------------------------------------------------------------------------- #
# Evaluate (made into a function, as this is called many times):
# ----------------------------------------------------------------------------------- #
def evaluate(bquark) :
    N = [[0,0], [0,0]]   # Make a list of lists (i.e. matrix) for counting successes/failures.
    for i in np.arange(len(isb)):
        if (bquark[i] == 0 and isb[i] == 0) : N[0][0] += 1
        if (bquark[i] == 0 and isb[i] == 1) : N[0][1] += 1
        if (bquark[i] == 1 and isb[i] == 0) : N[1][0] += 1
        if (bquark[i] == 1 and isb[i] == 1) : N[1][1] += 1
    fracWrong = float(N[0][1]+N[1][0])/float(len(isb))
    return N, fracWrong


# ----------------------------------------------------------------------------------- #
# Main program start:
# ----------------------------------------------------------------------------------- #

# Get data:
# ----------------------------------------------------------------------------------- #
data = np.genfromtxt('AlephBtag_MC_small_v2.csv', names=True)

energy = data['energy']
cTheta = data['cTheta']
phi    = data['phi']
prob_b = data['prob_b']
spheri = data['spheri']
pt2rel = data['pt2rel']
multip = data['multip']
bqvjet = data['bqvjet']
ptlrel = data['ptlrel']
nnbjet = data['nnbjet']
isb    = data['isb']


# Produce 1D figures:
# ----------------------------------------------------------------------------------- #

# Define the histogram range and binning (important - MatPlotLib is NOT good at this):
Nbins = 100
xmin = 0.0
xmax = 1.0

# Make new lists selected based on what the jets really are (b-quark jet or light-quark jet):
prob_b_bjets = []
prob_b_ljets = []
bqvjet_bjets = []
bqvjet_ljets = []
for i in np.arange(len(isb)) :
    if (isb[i] == 1) :
        prob_b_bjets.append(prob_b[i])
        bqvjet_bjets.append(bqvjet[i])
    else             :
        prob_b_ljets.append(prob_b[i])
        bqvjet_ljets.append(bqvjet[i])

# Produce the actual figure, here with two histograms in it:
fig, ax = plt.subplots(figsize=(12, 6))      # Create just a single figure and axes (figsize is in inches!)
hist_prob_b_bjets = ax.hist(prob_b_bjets, bins=Nbins, range=(xmin, xmax), histtype='step', linewidth=2, label='prob_b_bjets', color='blue')
hist_prob_b_ljets = ax.hist(prob_b_ljets, bins=Nbins, range=(xmin, xmax), histtype='step', linewidth=2, label='prob_b_ljets', color='red')
ax.set_xlabel("Probability of b-quark based on track impact parameters")     # Label of x-axis
ax.set_ylabel("Frequency / 0.01")                                            # Label of y-axis
ax.set_title("Distribution of prob_b")                                       # Title of plot
ax.legend(loc='best')                                                        # Legend. Could also be 'upper right'
ax.grid(axis='y')

fig.tight_layout()
fig.show()

if SavePlots :
    fig.savefig('Hist_prob_b_and_bqvjet.pdf', dpi=600)


# Produce 2D figures:
# ----------------------------------------------------------------------------------- #

# First we try a scatter plot, to see how the individual events distribute themselves:
fig2, ax2 = plt.subplots(figsize=(12, 6))
scat2_prob_b_vs_bqvjet_bjets = ax2.scatter(prob_b_bjets, bqvjet_bjets, label='b-jets', color='blue')
scat2_prob_b_vs_bqvjet_ljets = ax2.scatter(prob_b_ljets, bqvjet_ljets, label='l-jets', color='red')
ax2.legend(loc='best')
fig2.tight_layout()
fig2.show()

if SavePlots :
    fig2.savefig('Scatter_prob_b_vs_bqvjet.pdf', dpi=600)


# However, as can be seen in the figure, the overlap between b-jets and light-jets is large,
# and one covers much of the other in a scatter plot, which also does not show the amount of
# statistics in the dense regions. Therefore, we try two separate 2D histograms (zoomed):
fig3, ax3 = plt.subplots(1, 2, figsize=(12, 6))
hist2_prob_b_vs_bqvjet_bjets = ax3[0].hist2d(prob_b_bjets, bqvjet_bjets, bins=[40,40], range=[[0.0, 0.4], [0.0, 0.4]])
hist2_prob_b_vs_bqvjet_ljets = ax3[1].hist2d(prob_b_ljets, bqvjet_ljets, bins=[40,40], range=[[0.0, 0.4], [0.0, 0.4]])
ax3[0].set_title("b-jets")
ax3[1].set_title("light-jets")

fig3.tight_layout()
fig3.show()

if SavePlots :
    fig3.savefig('Hist2D_prob_b_vs_bqvjet.pdf', dpi=600)


# Selection:
# ----------------------------------------------------------------------------------- #

# I give the selection cuts names, so that they only need to be changed in ONE place (also ensures consistency!):
loose_propb = 0.10
tight_propb = 0.16
loose_bqvjet = 0.12
tight_bqvjet = 0.28
loose_ptlrel = 0.40
tight_ptlrel = 0.60

bquark=[]
for i in np.arange(len(prob_b)):
    if   (prob_b[i] > tight_propb)  : bquark.append(1)
    elif (bqvjet[i] > tight_bqvjet) : bquark.append(1)
    elif (ptlrel[i] > tight_ptlrel) : bquark.append(1)
    elif ((prob_b[i] > loose_propb)  and (bqvjet[i] > loose_bqvjet)) : bquark.append(1)
    elif ((prob_b[i] > loose_propb)  and (ptlrel[i] > loose_ptlrel)) : bquark.append(1)
    elif ((bqvjet[i] > loose_bqvjet) and (ptlrel[i] > loose_ptlrel)) : bquark.append(1)
    elif ((ptlrel[i] > 0.3) and (prob_b[i] > 0.08) and (bqvjet[i] > 0.10)) : bquark.append(1)  # Adds very little!
    else : bquark.append(0)

N, fracWrong = evaluate(bquark)
print("\nRESULT OF FIRST ATTEMPT AT A GOOD SELECTION:")
print("  First number is my estimate, second is the MC truth:")
print("  True-Negative (0,0)  = ", N[0][0])
print("  False-Negative (0,1) = ", N[0][1])
print("  False-Positive (1,0) = ", N[1][0])
print("  True-Positive (1,1)  = ", N[1][1])
print("    Fraction wrong = ( (0,1) + (1,0) ) / sum = ", fracWrong)


# Compare with NN-approach from 1990'ies:
# ----------------------------------------------------------------------------------- #

bquark=[]
for i in np.arange(len(prob_b)):
    if   (nnbjet[i] > 0.82) : bquark.append(1)
    else : bquark.append(0)

N, fracWrong = evaluate(bquark)
print("\nALEPH BJET TAG:")
print("  First number is my estimate, second is the MC truth:")
print("  True-Negative (0,0)  = ", N[0][0])
print("  False-Negative (0,1) = ", N[0][1])
print("  False-Positive (1,0) = ", N[1][0])
print("  True-Positive (1,1)  = ", N[1][1])
print("    Fraction wrong = ( (0,1) + (1,0) ) / sum = ", fracWrong)


# Looping over selections:
# ----------------------------------------------------------------------------------- #

# Here I make a range of values (min, max) from which I choose (randomly - more on that) from:
range_loose_prob_b = [0.08, 0.11]
range_tight_prob_b = [0.16, 0.21]
range_loose_bqvjet = [0.13, 0.18]
range_tight_bqvjet = [0.28, 0.40]
range_loose_ptlrel = [0.20, 0.40]
range_tight_ptlrel = [0.45, 0.65]

# Set high initially, and lowered by better selections:
fracWrong_min = 0.999

# Try several times (here 100):
for Ntest in range(100) :

    # Choose random selection and test it:
    loose_prob_b = r.uniform(range_loose_prob_b[0], range_loose_prob_b[1])
    tight_prob_b = r.uniform(range_tight_prob_b[0], range_tight_prob_b[1])
    loose_bqvjet = r.uniform(range_loose_bqvjet[0], range_loose_bqvjet[1])
    tight_bqvjet = r.uniform(range_tight_bqvjet[0], range_tight_bqvjet[1])
    loose_ptlrel = r.uniform(range_loose_ptlrel[0], range_loose_ptlrel[1])
    tight_ptlrel = r.uniform(range_tight_ptlrel[0], range_tight_ptlrel[1])

    bquark=[]
    for i in np.arange(len(prob_b)):
        if   (prob_b[i] > tight_prob_b) : bquark.append(1)
        elif (bqvjet[i] > tight_bqvjet) : bquark.append(1)
        elif (ptlrel[i] > tight_ptlrel) : bquark.append(1)
        elif ((prob_b[i] > loose_prob_b) and (bqvjet[i] > loose_bqvjet)) : bquark.append(1)
        elif ((prob_b[i] > loose_prob_b) and (ptlrel[i] > loose_ptlrel)) : bquark.append(1)
        elif ((bqvjet[i] > loose_bqvjet) and (ptlrel[i] > loose_ptlrel)) : bquark.append(1)
        else : bquark.append(0)

    N, fracWrong = evaluate(bquark)

    # Flag/print best selection(s):
    if (fracWrong < fracWrong_min) :
        print("\nRESULT OF TESTING MANY SELECTION CUTS:")
        print("  First number is my estimate, second is the MC truth:", Ntest)
        print("  True-Negative (0,0)  = ", N[0][0])
        print("  False-Negative (0,1) = ", N[0][1])
        print("  False-Positive (1,0) = ", N[1][0])
        print("  True-Positive (1,1)  = ", N[1][1])
        print("    Fraction wrong = ( (0,1) + (1,0) ) / sum = ", fracWrong)

        fracWrong_min = fracWrong
        print("  GOT A NEW MINIMUM.... GREAT!")
        print("  loose_prob_b = ", loose_prob_b)
        print("  tight_prob_b = ", tight_prob_b)
        print("  loose_bqvjet = ", loose_bqvjet)
        print("  tight_bqvjet = ", tight_bqvjet)
        print("  loose_ptlrel = ", loose_ptlrel)
        print("  tight_ptlrel = ", tight_ptlrel)


# If you want to execute your scripts in Spyder and normally through the terminal,
# you can use the snippet below to pause scripts running in the terminal,
# so your figures don't immediately close.
# If you are only going to use Spyder, just remove or ignore the part below
try:
    __IPYTHON__
except:
    raw_input('Press Enter to exit')


"""
Gotten using seed 42 and 100 attempts:

  First number is my estimate, second is the MC truth:
('  True-Negative (0,0)  = ', 25289)
('  False-Negative (0,1) = ', 2563)
('  False-Positive (1,0) = ', 1019)
('  True-Positive (1,1)  = ', 5097)
('    Fraction wrong = ( (0,1) + (1,0) ) / sum = ', 0.10545219029674988)
  GOT A NEW MINIMUM.... SUPER!
('  loose_prob_b = ', 0.09655720516323457)
('  tight_prob_b = ', 0.18804689857676932)
('  loose_bqvjet = ', 0.17383268013291725)
('  tight_bqvjet = ', 0.3284179439454877)
('  loose_ptlrel = ', 0.22680304569012816)
('  tight_ptlrel = ', 0.4557565352626678)
"""