#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # ROOT macro for analysing the famous Anderson-Fisher Iris data set from Gaspe Peninsula. # # Edgar Anderson took 50 samples of three species (Setosa, Versicolor, and Virginica) # of Iris at the Gaspe Peninsula, and measured four distinguishing features on each # flower: # Sepal Length # Sepal Width # Petal Length # Petal Width # Using these measurements, Ronald Fisher was able to make a classification scheme, for # which he invented the Fisher Linear Discriminant. # # References: # Glen Cowan, SDA, pages 51-57 # http://en.wikipedia.org/wiki/Linear_discriminant_analysis # http://en.wikipedia.org/wiki/Iris_flower_data_set # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 5th of October 2014 # # Author: Lars Egholm Pedersen (NBI) # Email: egholm@nbi.dk # Date: 6th of October 2013 # # # ----------------------------------------------------------------------------------- # from ROOT import * # ----------------------------------------------------------------------------------- # # Functions # ----------------------------------------------------------------------------------- # def sqr( a ) : return a*a # ----------------------------------------------------------------------------------- # # Full solution to the fisher discriminant exercise # ----------------------------------------------------------------------------------- # gROOT.Reset() # Setting of general plotting style: gStyle.SetCanvasColor(0) gStyle.SetFillColor(0) # Setting what to be shown in statistics box (using two different methods): gStyle.SetOptStat("emr") gStyle.SetOptFit(1111) # ----------------------------------------------------------------------------------- # # Define parameters # ----------------------------------------------------------------------------------- # # Define path to datafile filepath = "DataSet_AndersonFisherIris.txt" nspec = 3 # Setosa, Versicolor, Virginica nvar = 4 # Sepal Length, Sepal Width, Petal Length, Petal Width xmin = [3.5, 1.7, 0.0, 0.0] # Range for different variables xmax = [8.5, 4.7, 7.5, 3.0] # Range for different variables nbin = [ 25, 30, 25, 30] # Number of bins for different variables var_name = [ "Sepal Length", "Sepal Width", "Petal Length", "Petal Width" ] # Variable name spec_name = [ "Setosa" , "Versicolor" , "Virginica" ] # Species name index_size = [1.0, 1.0, 1.0] # Size of marker index_style = [ 24, 25, 26] # Style of marker index_color = [ 2, 3, 4] # Color of marker # NOTE: For options, see: http://root.cern.ch/root/html/TAttMarker.html # ----------------------------------------------------------------------------------- # # Define all your graphics here # ----------------------------------------------------------------------------------- # # Dist will now be a list of lists containing 1D histograms: dist = [ [ TH1D( "dist_spec%d_var%d"%(ispec, ivar), "dist", nbin[ivar], xmin[ivar], xmax[ivar] ) for ivar in range(nvar) ] #Innermost list is defined per variable for ispec in range(nspec) ] #Outputmost list is difined per species # Likewise, corr will be a list of a list of lists of 2D histograms (scatter plots for checking correlations): corr = [ [ [ TH2D( "corr_spec%d_var%d_%d"%(ispec, ivar, jvar), "corr", # Initialize by name and title nbin[ivar], xmin[ivar], xmax[ivar] , # Number of bins in x-dir, range nbin[jvar], xmin[jvar], xmax[jvar] ) # Number of bins in y-dir, range for jvar in range( nvar ) ] # Innermost list is defined per variable for ivar in range( nvar ) ] # Next if also defined per variable for ispec in range( nspec) ] # Outermost is defined per species # Note/example: # If you for instance want to access the histogram for species two, variable one, it is: dist[2][1] # Likewise, the correlation histogram between variable two and three for species one is: corr[1][2][3] # ----------------------------------------------------------------------------------- # # Loop over and read data from input # ----------------------------------------------------------------------------------- # data = [] # Container for all variables with open( filepath, 'r' ) as infile : counter = 0 for line in infile: data.append(line.strip().split()) # Read line -> Convert to list for ivar in range( nvar ) : data[-1][ivar] = float(data[-1][ivar]) # First nvar values are floats data[-1][4] = int(data[-1][4])-1 # Last one is an integer width species type: 1,2,3 # Start counting at 0, to match your lists : 0,1,2 # Print some numbers as a sanity check if (counter < 5) : print" Read data: %5.2f %5.2f %5.2f %5.2f %d "%(data[-1][0] , data[-1][1] , data[-1][2] , data[-1][3] , data[-1][4] ) counter += 1 # Fill 1D histograms for ivar in range( nvar ) : dist[data[-1][4]][ivar].Fill( data[-1][ivar] ) # Fill 2D correlation histograms for jvar in range( nvar ) : corr[data[-1][4]][ivar][jvar].Fill( data[-1][ivar], data[-1][jvar] ) # ----------------------------------------------------------------------------------- # # Display data: # ----------------------------------------------------------------------------------- # text = TLatex() # Usefull for drawing text on plots text.SetNDC() text.SetTextFont(1) text.SetTextColor(1) text.SetTextSize(0.06) # Draw 1D histograms: canvas = TCanvas("canvas", "", 1200, 900) canvas.SetFillColor(0) canvas.Divide(2,2) # Take care of a single odd case: dist[0][1].SetMaximum(13.0) for ivar in range( nvar ) : canvas.cd( ivar+1 ) for ispec in range( nspec ) : dist[ispec][ivar].SetLineColor( index_color[ispec] ) dist[ispec][ivar].SetLineWidth( 2 ) if ispec == 0 : dist[ispec][ivar].Draw() else : dist[ispec][ivar].Draw( "same" ) canvas.Update() canvas.Draw() # Draw 2D histograms: canvas2 = TCanvas("canvas2","",1200, 900) canvas2.SetFillColor(0) canvas2.Divide(nvar,nvar) for ivar in range( nvar ) : for jvar in range( nvar ) : canvas2.cd( ivar*nvar+jvar+1 ) if ivar == jvar : text.DrawLatex( 0.17,0.22, var_name[ivar] ) else : for ispec in range( nspec ) : corr[ispec][ivar][jvar].SetMarkerSize(index_size[ispec]) corr[ispec][ivar][jvar].SetMarkerStyle(index_style[ispec]) corr[ispec][ivar][jvar].SetMarkerColor(index_color[ispec]) if ispec == 0 : corr[ispec][ivar][jvar].Draw() else : corr[ispec][ivar][jvar].Draw("same") canvas2.Update() canvas2.Draw() # ----------------------------------------------------------------------------------- # # Make Fisher discriminant between species Versicolor (1) and Virginica (2): # (i.e. not considering Setosa (0) for now) # ----------------------------------------------------------------------------------- # # Calculate means and widths (RMS) of the various variables: # Put the (co-)variances into a matrix, and invert it to get the Fisher coefficients: # NOTE: You can get the covariances from the 2D histograms, using "GetCovariance()". covmat_Vers = TMatrixD(nvar, nvar) # Covariance matrix for Versicolor covmat_Virg = TMatrixD(nvar, nvar) # Covariance matrix for Verginica covmat_comb = TMatrixD(nvar, nvar) # Summed covariance matrix # invcovmat_comb = covmat_comb.Invert() # Inverted matrix # Calculate the four Fisher coefficients: # Loop over the data (species Versicolor (1) and Virginica (2)), and calculate their # Fisher discriminant value, and see how well the separation works: raw_input( ' ... ' ) # ----------------------------------------------------------------------------------- # # Questions # ----------------------------------------------------------------------------------- # # # As always, make sure that you know what the code is doing so far, and what the aim of # the exercise is (i.e. which problem to solve, and how). Then start to expand on it. # # 1) Look at the distributions of the four discriminating variables for the two species # Versicolor (1) and Virginica (2), and see how well you can separate them by eye. # It seems reasonably possible, but certainly not perfect... # # 2) Calculate the mean, widths (RMS) and covariance of each discriminating variable # (pair of variables for covariance) for each species, and put these into the # matrices defined. # # 3) From the inverted summed matrix, calculate the four Fisher coefficients, and # given these, calculate the Fisher discriminant for the two species in question. # # 4) What separation did you get, and is it notably better than what you obtain by eye? # Also, do your weights make sense? I.e. are they comparable to the widths of the # corresponding variable. # # ----------------------------------------------------------------------------------- #