#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # Python/ROOT macro for constructing a Fisher disciminant from two 2D Gaussianly # distributed correlated variables. # The macro creates artificial random data for two different types of processes, # and the goal is then to separate these by constructing a Fisher discriminant. # # References: # Glen Cowan, SDA, pages 51-57 # http://en.wikipedia.org/wiki/Linear_discriminant_analysis # http://en.wikipedia.org/wiki/Iris_flower_data_set # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 5th of October 2014 # # Author: Lars Egholm Pedersen # Email: egholm@nbi.dk # Date: 6th of October 2013 # # ----------------------------------------------------------------------------------- # from ROOT import * # ----------------------------------------------------------------------------------- # # Functions # ----------------------------------------------------------------------------------- # def sqr( a ) : return a*a # Function for generating a set of correlated random gaussian numbers. def get_corr( mu1, sig1, mu2, sig2, rho12 ) : r = TRandom3(0) # Just gonna initialize random generator in function for simplicity # Be carefull ALWAYS to use random seed that is changing faster # than you are calling the function!! theta = 0.5 * atan( 2.0 * rho12 * sig1 * sig2 / ( sqr(sig1) - sqr(sig2) ) ) sigu = sqrt( fabs( (sqr(sig1*cos(theta)) - sqr(sig2*sin(theta)) ) / ( sqr(cos(theta)) - sqr(sin(theta))) ) ) sigv = sqrt( fabs( (sqr(sig2*cos(theta)) - sqr(sig1*sin(theta)) ) / ( sqr(cos(theta)) - sqr(sin(theta))) ) ) u = r.Gaus( 0.0, sigu ) v = r.Gaus( 0.0, sigv ) x = mu1 + cos(theta)*u - sin(theta)*v y = mu2 + sin(theta)*u + cos(theta)*v return [x,y] # ----------------------------------------------------------------------------------- # # Fisher discriminant script. # ----------------------------------------------------------------------------------- # gROOT.Reset() # Setting of general plotting style: gStyle.SetCanvasColor(0) gStyle.SetFillColor(0) # Setting what to be shown in statistics box (using two different methods): gStyle.SetOptStat("emr") gStyle.SetOptFit(1111) # ----------------------------------------------------------------------------------- # # Define parameters # ----------------------------------------------------------------------------------- # nspec = 2 # Number of 'species' : signal / background nvar = 2 # Number of variables for discrimination mean_par0 = [ 15.0 , 12.0 ] # Process type 1,2 mean in x direction width_par0 = [ 2.0 , 3.0 ] # Process type 1,2 width in x direction mean_par1 = [ 50.0 , 55.0 ] # ... y width_par1 = [ 6.0 , 7.0 ] # ... y corr = [ 0.80, 0.90 ] # Coefficient of correlation ndata = 2000 # Amount of data you want to create color_index = [ 2, 4] # Make process type 1 red and two blue... marker_index = [24, 25] # Marker style for correlation plot par_name = ["Parameter_a", "Parameter_b"] #Parameter names draw_opt = ["", "same"] #Drawing option ... # ----------------------------------------------------------------------------------- # # Define all your graphics here # ----------------------------------------------------------------------------------- # hist_par0 = [ TH1D("hist_par0_spec0", "hist_par0_spec0", 50, 0.0, 25.0) , #Histograms for "par0" TH1D("hist_par0_spec1", "hist_par0_spec1", 50, 0.0, 25.0) ] #Projection hist_par1 = [ TH1D("hist_par1_spec0", "hist_par1_spec0", 50, 20.0, 80.0) , #Histograms for "par1" TH1D("hist_par1_spec1", "hist_par1_spec1", 50, 20.0, 80.0) ] #Projection hist_corr = [ TH2D("corr_par0_par1_spec0", "corr_par0_par1_spec0", 50, 0.0, 25.0, 50, 20.0, 80.0) , #Correlation TH2D("corr_par0_par1_spec1", "corr_par0_par1_spec1", 50, 0.0, 25.0, 50, 20.0, 80.0) ] #histograms hist_fisher = [ TH1D("hist_fisher_spec0", "hist_fisher_spec0", 200, -2.0, 2.0) , #Histograms for final TH1D("hist_fisher_spec1", "hist_fisher_spec1", 200, -2.0, 2.0) ] #fisher projection # ----------------------------------------------------------------------------------- # # Generate data and fill initial histograms # ----------------------------------------------------------------------------------- # for ispec in range( nspec ) : for iexp in range( ndata ) : #Get liniarly correlated random numbers... values = get_corr( mean_par0[ispec], width_par0[ispec], mean_par1[ispec], width_par1[ispec], corr[ispec] ) hist_par0[ispec].Fill( values[0] ) hist_par1[ispec].Fill( values[1] ) hist_corr[ispec].Fill( values[0], values[1] ) # ----------------------------------------------------------------------------------- # # Specify color etc of your histograms # ----------------------------------------------------------------------------------- # for ispec in range( nspec ) : hist_par0[ispec].SetLineColor( color_index[ispec] ) hist_par1[ispec].SetLineColor( color_index[ispec] ) hist_fisher[ispec].SetLineColor( color_index[ispec] ) hist_par0[ispec].GetXaxis().SetTitle( par_name[0] ) hist_par1[ispec].GetXaxis().SetTitle( par_name[1] ) hist_fisher[ispec].GetXaxis().SetTitle( "Fisher Discriminant" ) hist_corr[ispec].SetMarkerColor( color_index[ispec] ) hist_corr[ispec].SetMarkerStyle(marker_index[ispec] ) hist_corr[ispec].GetXaxis().SetTitle( par_name[0] ) hist_corr[ispec].GetYaxis().SetTitle( par_name[1] ) # ----------------------------------------------------------------------------------- # # Plot your generated data # ----------------------------------------------------------------------------------- # # Correlation plot: canvas_2D = TCanvas("canvas_2D", "canvas_2D", 80, 80, 1000, 700) for ispec in range(nspec) : hist_corr[ispec].Draw( draw_opt[ispec] ) # x and y projections: canvas_1D = TCanvas("canvas_1D", "canvas_1D", 50, 50, 1000, 700) canvas_1D.Divide(2) for ispec in range(nspec) : canvas_1D.cd( 1 ) hist_par0[ispec].Draw( draw_opt[ispec] ) canvas_1D.cd( 2 ) hist_par1[ispec].Draw( draw_opt[ispec] ) # ----------------------------------------------------------------------------------- # # Your calculation of the Fisher discriminant here... # ----------------------------------------------------------------------------------- # # Calculate means and widths (RMS) of the various variables: # Put the (co-)variances into a matrix, and invert it to get the Fisher coefficients: # NOTE: You can get the covariances from the 2D histograms, using "GetCovariance()". covmat_A = TMatrixD(nvar, nvar) # Covariance matrix for blue points covmat_B = TMatrixD(nvar, nvar) # Covariance matrix for red points covmat_comb = TMatrixD(nvar, nvar) # Summed covariance matrix # invcovmat_comb = covmat_comb.Invert() # Inverted matrix # Calculate the four Fisher coefficients: # Loop over the data, and calculate the Fisher discriminant value, and see how well # the separation works: raw_input( ' ... ' ) # ----------------------------------------------------------------------------------- # # Questions # ----------------------------------------------------------------------------------- # # # As always, make sure that you know what the code is doing so far, and what the aim of # the exercise is (i.e. which problem to solve, and how). Then start to expand on it. # # 1) Look at the 1D distributions of the two discriminating variables for the two species, # and see how well you can separate them by eye. It seems somewhat possible, but # certainly far from perfect... # Once you consider the 2D distribution (scatter plot), then it is clear, that some # cut along a line at an angle will work much better. This exercise is about finding # that line, and thus the perpendicular axis to project the data onto! # # 2) Calculate the mean, widths (RMS) and covariance of each discriminating variable # (pair of variables for covariance) for each species, and put these into the # matrices defined. # # 3) From the inverted summed matrix and vectors of means, calculate the four Fisher # coefficients, and given these, calculate the Fisher discriminant for the two # species in question. # # 4) What separation did you get, and is it notably better than what you obtain by eye? # Also, do your weights make sense? I.e. are they comparable to the widths of the # corresponding variable. # As a measure of how good the separation obtained is, we consider the "distance" # between the two distributions as a measure of goodness: # separation = (mu_A - mu_B)**2 / (sigma_A**2 + sigma_B**2) # # Compare the separation you get from each of the two 1D histograms of par0 and par1 # with what you get from the Fisher discriminant, using the above formula. # # ----------------------------------------------------------------------------------- #