#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # Python macro for selecting b-jets in Aleph Z->qqbar MC: # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 25th of April 2019 # # Code development: # 1: Started using only prob_b cut at 0.13: Gave error rate of 16% # 2: Combined with bqvjet cuts both singularly and in combination: Gave error rate of 12% # 3: Included also ptlrel along and in combinations: Gave error rate of 11.2% # 4: Looped over 100 random combinations to get best: Gave error rate of 10.8% # 5: Looped over 100 random combinations in tighter range: Gave error rate of 10.6% # 6: Looped over 1000 random combinations in yet tighter range: Gave error rate of 10.54% # For comparison, the Aleph NN gives: 9.90% # Your future ML algorithms gives: ?.???% # ----------------------------------------------------------------------------------- # from __future__ import print_function, division # Ensures Python3 printing & division standard from matplotlib import pyplot as plt from matplotlib import colors from matplotlib.colors import LogNorm import numpy as np import csv # Possible other packages to consider: # cornerplot, seaplot, sklearn.decomposition(PCA) r = np.random r.seed(42) SavePlots = False # For now, don't save plots (once you trust your code, switch on) plt.close('all') # To close all open figures # ----------------------------------------------------------------------------------- # # Evaluate (made into a function, as this is called many times): # ----------------------------------------------------------------------------------- # def evaluate(bquark) : N = [[0,0], [0,0]] # Make a list of lists (i.e. matrix) for counting successes/failures. for i in np.arange(len(isb)): if (bquark[i] == 0 and isb[i] == 0) : N[0][0] += 1 if (bquark[i] == 0 and isb[i] == 1) : N[0][1] += 1 if (bquark[i] == 1 and isb[i] == 0) : N[1][0] += 1 if (bquark[i] == 1 and isb[i] == 1) : N[1][1] += 1 fracWrong = float(N[0][1]+N[1][0])/float(len(isb)) return N, fracWrong # ----------------------------------------------------------------------------------- # # Main program start: # ----------------------------------------------------------------------------------- # # Get data: # ----------------------------------------------------------------------------------- # data = np.genfromtxt('AlephBtag_MC_small_v2.csv', names=True) energy = data['energy'] cTheta = data['cTheta'] phi = data['phi'] prob_b = data['prob_b'] spheri = data['spheri'] pt2rel = data['pt2rel'] multip = data['multip'] bqvjet = data['bqvjet'] ptlrel = data['ptlrel'] nnbjet = data['nnbjet'] isb = data['isb'] # Produce 1D figures: # ----------------------------------------------------------------------------------- # # Define the histogram range and binning (important - MatPlotLib is NOT good at this): Nbins = 100 xmin = 0.0 xmax = 1.0 # Make new lists selected based on what the jets really are (b-quark jet or light-quark jet): prob_b_bjets = [] prob_b_ljets = [] bqvjet_bjets = [] bqvjet_ljets = [] for i in np.arange(len(isb)) : if (isb[i] == 1) : prob_b_bjets.append(prob_b[i]) bqvjet_bjets.append(bqvjet[i]) else : prob_b_ljets.append(prob_b[i]) bqvjet_ljets.append(bqvjet[i]) # Produce the actual figure, here with two histograms in it: fig, ax = plt.subplots(figsize=(12, 6)) # Create just a single figure and axes (figsize is in inches!) hist_prob_b_bjets = ax.hist(prob_b_bjets, bins=Nbins, range=(xmin, xmax), histtype='step', linewidth=2, label='prob_b_bjets', color='blue') hist_prob_b_ljets = ax.hist(prob_b_ljets, bins=Nbins, range=(xmin, xmax), histtype='step', linewidth=2, label='prob_b_ljets', color='red') ax.set_xlabel("Probability of b-quark based on track impact parameters") # Label of x-axis ax.set_ylabel("Frequency / 0.01") # Label of y-axis ax.set_title("Distribution of prob_b") # Title of plot ax.legend(loc='best') # Legend. Could also be 'upper right' ax.grid(axis='y') fig.tight_layout() fig.show() if SavePlots : fig.savefig('Hist_prob_b_and_bqvjet.pdf', dpi=600) # Produce 2D figures: # ----------------------------------------------------------------------------------- # # First we try a scatter plot, to see how the individual events distribute themselves: fig2, ax2 = plt.subplots(figsize=(12, 6)) scat2_prob_b_vs_bqvjet_bjets = ax2.scatter(prob_b_bjets, bqvjet_bjets, label='b-jets', color='blue') scat2_prob_b_vs_bqvjet_ljets = ax2.scatter(prob_b_ljets, bqvjet_ljets, label='l-jets', color='red') ax2.legend(loc='best') fig2.tight_layout() fig2.show() if SavePlots : fig2.savefig('Scatter_prob_b_vs_bqvjet.pdf', dpi=600) # However, as can be seen in the figure, the overlap between b-jets and light-jets is large, # and one covers much of the other in a scatter plot, which also does not show the amount of # statistics in the dense regions. Therefore, we try two separate 2D histograms (zoomed): fig3, ax3 = plt.subplots(1, 2, figsize=(12, 6)) hist2_prob_b_vs_bqvjet_bjets = ax3[0].hist2d(prob_b_bjets, bqvjet_bjets, bins=[40,40], range=[[0.0, 0.4], [0.0, 0.4]]) hist2_prob_b_vs_bqvjet_ljets = ax3[1].hist2d(prob_b_ljets, bqvjet_ljets, bins=[40,40], range=[[0.0, 0.4], [0.0, 0.4]]) ax3[0].set_title("b-jets") ax3[1].set_title("light-jets") fig3.tight_layout() fig3.show() if SavePlots : fig3.savefig('Hist2D_prob_b_vs_bqvjet.pdf', dpi=600) # Selection: # ----------------------------------------------------------------------------------- # # I give the selection cuts names, so that they only need to be changed in ONE place (also ensures consistency!): loose_propb = 0.10 tight_propb = 0.16 loose_bqvjet = 0.12 tight_bqvjet = 0.28 loose_ptlrel = 0.40 tight_ptlrel = 0.60 bquark=[] for i in np.arange(len(prob_b)): if (prob_b[i] > tight_propb) : bquark.append(1) elif (bqvjet[i] > tight_bqvjet) : bquark.append(1) elif (ptlrel[i] > tight_ptlrel) : bquark.append(1) elif ((prob_b[i] > loose_propb) and (bqvjet[i] > loose_bqvjet)) : bquark.append(1) elif ((prob_b[i] > loose_propb) and (ptlrel[i] > loose_ptlrel)) : bquark.append(1) elif ((bqvjet[i] > loose_bqvjet) and (ptlrel[i] > loose_ptlrel)) : bquark.append(1) elif ((ptlrel[i] > 0.3) and (prob_b[i] > 0.08) and (bqvjet[i] > 0.10)) : bquark.append(1) # Adds very little! else : bquark.append(0) N, fracWrong = evaluate(bquark) print("\nRESULT OF FIRST ATTEMPT AT A GOOD SELECTION:") print(" First number is my estimate, second is the MC truth:") print(" True-Negative (0,0) = ", N[0][0]) print(" False-Negative (0,1) = ", N[0][1]) print(" False-Positive (1,0) = ", N[1][0]) print(" True-Positive (1,1) = ", N[1][1]) print(" Fraction wrong = ( (0,1) + (1,0) ) / sum = ", fracWrong) # Compare with NN-approach from 1990'ies: # ----------------------------------------------------------------------------------- # bquark=[] for i in np.arange(len(prob_b)): if (nnbjet[i] > 0.82) : bquark.append(1) else : bquark.append(0) N, fracWrong = evaluate(bquark) print("\nALEPH BJET TAG:") print(" First number is my estimate, second is the MC truth:") print(" True-Negative (0,0) = ", N[0][0]) print(" False-Negative (0,1) = ", N[0][1]) print(" False-Positive (1,0) = ", N[1][0]) print(" True-Positive (1,1) = ", N[1][1]) print(" Fraction wrong = ( (0,1) + (1,0) ) / sum = ", fracWrong) # Looping over selections: # ----------------------------------------------------------------------------------- # # Here I make a range of values (min, max) from which I choose (randomly - more on that) from: range_loose_prob_b = [0.08, 0.11] range_tight_prob_b = [0.16, 0.21] range_loose_bqvjet = [0.13, 0.18] range_tight_bqvjet = [0.28, 0.40] range_loose_ptlrel = [0.20, 0.40] range_tight_ptlrel = [0.45, 0.65] # Set high initially, and lowered by better selections: fracWrong_min = 0.999 # Try several times (here 100): for Ntest in range(100) : # Choose random selection and test it: loose_prob_b = r.uniform(range_loose_prob_b[0], range_loose_prob_b[1]) tight_prob_b = r.uniform(range_tight_prob_b[0], range_tight_prob_b[1]) loose_bqvjet = r.uniform(range_loose_bqvjet[0], range_loose_bqvjet[1]) tight_bqvjet = r.uniform(range_tight_bqvjet[0], range_tight_bqvjet[1]) loose_ptlrel = r.uniform(range_loose_ptlrel[0], range_loose_ptlrel[1]) tight_ptlrel = r.uniform(range_tight_ptlrel[0], range_tight_ptlrel[1]) bquark=[] for i in np.arange(len(prob_b)): if (prob_b[i] > tight_prob_b) : bquark.append(1) elif (bqvjet[i] > tight_bqvjet) : bquark.append(1) elif (ptlrel[i] > tight_ptlrel) : bquark.append(1) elif ((prob_b[i] > loose_prob_b) and (bqvjet[i] > loose_bqvjet)) : bquark.append(1) elif ((prob_b[i] > loose_prob_b) and (ptlrel[i] > loose_ptlrel)) : bquark.append(1) elif ((bqvjet[i] > loose_bqvjet) and (ptlrel[i] > loose_ptlrel)) : bquark.append(1) else : bquark.append(0) N, fracWrong = evaluate(bquark) # Flag/print best selection(s): if (fracWrong < fracWrong_min) : print("\nRESULT OF TESTING MANY SELECTION CUTS:") print(" First number is my estimate, second is the MC truth:", Ntest) print(" True-Negative (0,0) = ", N[0][0]) print(" False-Negative (0,1) = ", N[0][1]) print(" False-Positive (1,0) = ", N[1][0]) print(" True-Positive (1,1) = ", N[1][1]) print(" Fraction wrong = ( (0,1) + (1,0) ) / sum = ", fracWrong) fracWrong_min = fracWrong print(" GOT A NEW MINIMUM.... GREAT!") print(" loose_prob_b = ", loose_prob_b) print(" tight_prob_b = ", tight_prob_b) print(" loose_bqvjet = ", loose_bqvjet) print(" tight_bqvjet = ", tight_bqvjet) print(" loose_ptlrel = ", loose_ptlrel) print(" tight_ptlrel = ", tight_ptlrel) # If you want to execute your scripts in Spyder and normally through the terminal, # you can use the snippet below to pause scripts running in the terminal, # so your figures don't immediately close. # If you are only going to use Spyder, just remove or ignore the part below try: __IPYTHON__ except: raw_input('Press Enter to exit') """ Gotten using seed 42 and 100 attempts: First number is my estimate, second is the MC truth: (' True-Negative (0,0) = ', 25289) (' False-Negative (0,1) = ', 2563) (' False-Positive (1,0) = ', 1019) (' True-Positive (1,1) = ', 5097) (' Fraction wrong = ( (0,1) + (1,0) ) / sum = ', 0.10545219029674988) GOT A NEW MINIMUM.... SUPER! (' loose_prob_b = ', 0.09655720516323457) (' tight_prob_b = ', 0.18804689857676932) (' loose_bqvjet = ', 0.17383268013291725) (' tight_bqvjet = ', 0.3284179439454877) (' loose_ptlrel = ', 0.22680304569012816) (' tight_ptlrel = ', 0.4557565352626678) """