#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
#  Python macro for constructing a Fisher discriminant from two correlated,
#  Gaussian-distributed variables (i.e. 2D data).
#  The macro creates artificial random data for two different types of processes,
#  and the goal is then to separate these by constructing a Fisher discriminant.
#
#  References:
#    Glen Cowan, Statistical Data Analysis, pages 51-57
#    http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   22nd of December 2017
#
# ----------------------------------------------------------------------------------- #

# ----------------------------------------------------------------------------------- #
# Imports
# ----------------------------------------------------------------------------------- #

# First, import the modules you want to use:
from __future__ import print_function, division
import numpy as np                 # Matlab-like syntax for linear algebra and functions
import matplotlib.pyplot as plt    # Plots and figures like you know them from Matlab
from numpy.linalg import inv

plt.close('all')                   # Close all open figures

r = np.random                      # Random generator
r.seed(42)                         # Set a random seed (but a fixed one)

SavePlots = False                  # For now, don't save plots (once you trust your code, switch on)

# ----------------------------------------------------------------------------------- #
# Functions
# ----------------------------------------------------------------------------------- #

# Function for generating a pair of correlated random Gaussian numbers.
# What is essentially done is to produce an UNcorrelated pair of random numbers,
# and then rotate them by the angle that yields the requested correlation.
# NOTE: The rotation angle is ill-defined for sig1 == sig2 (division by zero).
def get_corr( mu1, sig1, mu2, sig2, rho12 ) :
    theta = 0.5 * np.arctan( 2.0 * rho12 * sig1 * sig2 / ( sig1**2 - sig2**2 ) )
    sigu = np.sqrt( np.abs( ((sig1*np.cos(theta))**2 - (sig2*np.sin(theta))**2 ) / ( np.cos(theta)**2 - np.sin(theta)**2) ) )
    sigv = np.sqrt( np.abs( ((sig2*np.cos(theta))**2 - (sig1*np.sin(theta))**2 ) / ( np.cos(theta)**2 - np.sin(theta)**2) ) )
    u = r.normal( 0.0, sigu )                     # Uncorrelated pair (u,v)...
    v = r.normal( 0.0, sigv )
    x = mu1 + np.cos(theta)*u - np.sin(theta)*v   # ...rotated into the correlated pair (x,y)
    y = mu2 + np.sin(theta)*u + np.cos(theta)*v
    return [x, y]

# Covariance (ddof=0, i.e. normalised by N) between two arrays of equal length:
def GetCovariance(X, Y):
    return np.cov(X, Y, ddof=0)[0, 1]
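# Quick sanity check of get_corr (a sketch, not part of the original exercise): generate
# many pairs and confirm that the sample correlation is close to the one requested.
# Commented out by default, since running it advances the random stream and would change
# the data generated below for the fixed seed.
# xy_test = np.array( [ get_corr(0.0, 1.0, 0.0, 2.0, 0.75) for _ in range(10000) ] )
# print("Requested rho = 0.75, sample rho = {:.3f}".format(np.corrcoef(xy_test[:, 0], xy_test[:, 1])[0, 1]))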
# ----------------------------------------------------------------------------------- #
# Fisher discriminant script
# ----------------------------------------------------------------------------------- #

# ----------------------------------------------------------------------------------- #
# Define parameters
# ----------------------------------------------------------------------------------- #

nspec = 2                          # Number of 'species': signal / background
nvar  = 2                          # Number of variables for discrimination

mean_par0  = [ 15.0 , 12.0 ]       # Process type 1,2 mean in x direction
width_par0 = [  2.0 ,  3.0 ]       # Process type 1,2 width in x direction
mean_par1  = [ 50.0 , 55.0 ]       # ... y
width_par1 = [  6.0 ,  7.0 ]       # ... y
corr       = [ 0.80 , 0.90 ]       # Coefficient of correlation
ndata = 2000                       # Amount of data you want to create

# ----------------------------------------------------------------------------------- #
# Generate data
# ----------------------------------------------------------------------------------- #

# For each "species", produce a number of (x,y) points:
par0 = np.zeros((ndata, nspec))
par1 = np.zeros((ndata, nspec))

for ispec in range( nspec ) :
    for iexp in range( ndata ) :
        # Get linearly correlated random numbers...
        values = get_corr( mean_par0[ispec], width_par0[ispec], mean_par1[ispec], width_par1[ispec], corr[ispec] )
        par0[iexp, ispec] = values[0]
        par1[iexp, ispec] = values[1]

# ----------------------------------------------------------------------------------- #
# Plot your generated data
# ----------------------------------------------------------------------------------- #

# x and y projections:
fig_1D, ax_1D = plt.subplots(ncols=2, figsize=(10, 6))
ax_1D[0].hist(par0[:, 0], 50, (0, 25), histtype='step', label='hist_par0_spec0', color='Red')
ax_1D[0].hist(par0[:, 1], 50, (0, 25), histtype='step', label='hist_par0_spec1', color='Blue')
ax_1D[0].set(title='hist_par0', xlim=(0, 25))
ax_1D[0].legend(loc='upper left')

ax_1D[1].hist(par1[:, 0], 50, (20, 80), histtype='step', label='hist_par1_spec0', color='Red')
ax_1D[1].hist(par1[:, 1], 50, (20, 80), histtype='step', label='hist_par1_spec1', color='Blue')
ax_1D[1].set(title='hist_par1', xlim=(20, 80))
ax_1D[1].legend(loc='upper left')

plt.show(block=False)
if (SavePlots) :
    fig_1D.savefig('Hist_InputVars.pdf', dpi=600)

# Wait with drawing this 2D distribution, so that you think about the 1D distributions first!
# # Correlation plot:
# fig_corr, ax_corr = plt.subplots(figsize=(10, 6))
# ax_corr.scatter(par0[:, 0], par1[:, 0], color='Red', s=10)
# ax_corr.scatter(par0[:, 1], par1[:, 1], color='Blue', s=10)
# ax_corr.set(xlabel='Parameter_a', ylabel='Parameter_b', title='Correlation')
# if SavePlots:
#     fig_corr.savefig('Hist_InputVars2D.pdf', dpi=600)

# ----------------------------------------------------------------------------------- #
# Your calculation of the Fisher discriminant here...
# ----------------------------------------------------------------------------------- #

# Calculate means and widths (RMS) of the various variables:
Mean_A0 = 0.0                      # Get the value yourself to understand where to get it from!
Mean_B0 = 0.0                      # Get the value yourself to understand where to get it from!
RMS_A0  = 0.0                      # Get the value yourself to understand where to get it from!
RMS_B0  = 0.0                      # Get the value yourself to understand where to get it from!

# Put the (co-)variances into a matrix, and invert it to get the Fisher coefficients.
# NOTE: You can get the covariances using the supplied function "GetCovariance".
covmat_sum = np.zeros((nvar, nvar))            # Summed covariance matrix
covmat_sum[0, 0] = RMS_A0**2 + RMS_B0**2       # Variances of variable 1 for sample A and B
# covmat_sum[0, 1] = something with the covariances...

print("Combined covariance matrix:")
print(covmat_sum)
print("")
print("Inverted combined covariance matrix:")
# covmat_sum_inv = inv(covmat_sum)  # With the given values, this will fail, as the matrix is singular!
# print(covmat_sum_inv)
print("")

# Calculate the two Fisher coefficients:
# wf = ???

# Apply the Fisher weights to the data and see how well the separation works:
# data_fisher = ?
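# ----------------------------------------------------------------------------------- #
# Reference sketch (not the official solution): one way the blanks above could be
# filled in. The names mu_A, mu_B, cov_A and cov_B are illustrative choices, not part
# of the original exercise. Commented out, so that you can first try it yourself!
# ----------------------------------------------------------------------------------- #
# mu_A = np.array([par0[:, 0].mean(), par1[:, 0].mean()])   # Means of (par0, par1) for species A...
# mu_B = np.array([par0[:, 1].mean(), par1[:, 1].mean()])   # ...and for species B
# cov_A = np.cov(par0[:, 0], par1[:, 0], ddof=0)            # Covariance matrix of species A
# cov_B = np.cov(par0[:, 1], par1[:, 1], ddof=0)            # Covariance matrix of species B
# covmat_sum = cov_A + cov_B                                # Summed (within-class) covariance matrix
# wf = inv(covmat_sum).dot(mu_A - mu_B)                     # Fisher coefficients: w = S^-1 (mu_A - mu_B)
# data_fisher = wf[0]*par0 + wf[1]*par1                     # F = w_x*x + w_y*y, shape (ndata, nspec)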
# Plot the Fisher discriminant:
# fig_fisher, ax_fisher = plt.subplots(figsize=(10, 6))
# ax_fisher.hist(data_fisher[:, 0], 200, (-22, 3), histtype='step', color='Red', label='spec0')
# ax_fisher.hist(data_fisher[:, 1], 200, (-22, 3), histtype='step', color='Blue', label='spec1')
# ax_fisher.set(xlim=(-22, 3), xlabel='Fisher-discriminant')
# ax_fisher.legend(loc='upper left')
# if (SavePlots) :
#     fig_fisher.savefig('Hist_FisherOutput.pdf', dpi=600)

# Fill in the separation below yourself:
print(" The separation between the samples is: {:5.2f}".format( 0 ))

try:
    __IPYTHON__
except NameError:
    input('Press Enter to exit')   # Use raw_input here if running Python 2

# ----------------------------------------------------------------------------------- #
# Questions
# ----------------------------------------------------------------------------------- #
#
# As always, make sure that you know what the code is doing so far, and what the aim of
# the exercise is (i.e. which problem to solve, and how). Then start to expand on it.
#
# 1) Look at the 1D distributions of the two discriminating variables for the two species,
#    and see how well you can separate them by eye. It seems somewhat possible, but
#    certainly far from perfect...
#    Once you consider the 2D distribution (scatter plot - to be uncommented by you!),
#    it becomes clear that a cut along a line at an angle will work much better.
#    This exercise is about finding that optimal line, and thus the perpendicular axis
#    to project the data onto!
#
# 2) Calculate the means, widths (RMS) and covariances of the discriminating variables
#    (pairs of variables for the covariances) for each species, and put these into the
#    matrices defined above.
#
# 3) From the inverted summed matrix and the vectors of means, calculate the two Fisher
#    coefficients, and from these calculate the Fisher discriminant for the two
#    species in question, i.e. F = c_x * x + c_y * y for each point (x,y).
#
# 4) What separation did you get, and is it notably better than what you obtain by eye?
#    Also, do your weights make sense? I.e. do their relative sizes match the widths of
#    the corresponding variables (the narrower variable should carry the larger weight)?
#    As a simple measure of how good the obtained separation is, we use the "distance"
#    between the two distributions:
#       separation = (mu_A - mu_B)**2 / (sigma_A**2 + sigma_B**2)
#    Compare the separation you get from each of the two 1D histograms of par0 and par1
#    with what you get from the Fisher discriminant, using the above formula (a helper
#    for this is sketched below).
#    Of course the ultimate comparison should be done using ROC curves!
#
# ----------------------------------------------------------------------------------- #
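# ----------------------------------------------------------------------------------- #
# Sketch for question 4): a small helper implementing the separation formula quoted
# above. calc_separation is an illustrative name, not part of the original exercise,
# and the last line assumes that data_fisher has been computed. Uncomment to use.
# ----------------------------------------------------------------------------------- #
# def calc_separation(a, b):
#     return (a.mean() - b.mean())**2 / (a.std()**2 + b.std()**2)
#
# print("Separation in par0:   {:5.2f}".format(calc_separation(par0[:, 0], par0[:, 1])))
# print("Separation in par1:   {:5.2f}".format(calc_separation(par1[:, 0], par1[:, 1])))
# print("Separation in Fisher: {:5.2f}".format(calc_separation(data_fisher[:, 0], data_fisher[:, 1])))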