#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # ROOT macro for analysing the famous Anderson-Fisher Iris data set from Gaspe Peninsula. # # Edgar Anderson took 50 samples of three species (Setosa, Versicolor, and Virginica) # of Iris at the Gaspe Peninsula, and measured four distinguishing features on each # flower: # Sepal Length # Sepal Width # Petal Length # Petal Width # Using these measurements, Ronald Fisher was able to make a classification scheme, for # which he invented the Fisher Linear Discriminant. # # References: # Glen Cowan, SDA, pages 51-57 # http://en.wikipedia.org/wiki/Iris_flower_data_set # http://en.wikipedia.org/wiki/Linear_discriminant_analysis # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 6th of October 2013 # # Author: Lars Egholm Pedersen (NBI) # Email: egholm@nbi.dk # Date: 6th of October 2013 # # # ----------------------------------------------------------------------------------- # # ----------------------------------------------------------------------------------- # # Imports # ----------------------------------------------------------------------------------- # from ROOT import * # ----------------------------------------------------------------------------------- # # Functions # ----------------------------------------------------------------------------------- # def sqr( a ) : return a*a # ----------------------------------------------------------------------------------- # # Full solution to the fisher discriminant exercise # ----------------------------------------------------------------------------------- # gROOT.Reset() # Setting of general plotting style: gStyle.SetCanvasColor(0) gStyle.SetFillColor(0) # Setting what to be shown in statistics box (using two different methods): gStyle.SetOptStat("emr") gStyle.SetOptFit(1111) # ----------------------------------------------------------------------------------- # # Define parameters # ----------------------------------------------------------------------------------- # # Define path to datafile filepath = "DataSet_AndersonFisherIris.txt" nspec = 3 # Setosa, Versicolor, Virginica nvar = 4 # Sepal Length, Sepal Width, Petal Length, Petal Width xmin = [3.5, 1.7, 0.0, 0.0] #Range for different variables xmax = [8.5, 4.7, 7.5, 3.0] nbin = [ 25, 30, 25, 30] #Number of bins for different variables var_name = [ "Sepal Length", "Sepal Width", "Petal Length", "Petal Width" ] #Variable name spec_name = [ "Setosa" , "Versicolor" , "Virginica" ] #Species name index_size = [1.0, 1.0, 1.0] #Size of marker index_style = [ 24, 25, 26] #Style of marker index_color = [ 2, 3, 4] #Color of marker #For options, see: http://root.cern.ch/root/html/TAttMarker.html # ----------------------------------------------------------------------------------- # # Define all your graphics here # ----------------------------------------------------------------------------------- # #Dist will now be a list of lists containing histograms dist = [ [ TH1D( "dist_spec%d_var%d"%(ispec, ivar), "dist", nbin[ivar], xmin[ivar], xmax[ivar] ) for ivar in range(nvar) ] #Innermost list is defined per variable for ispec in range(nspec) ] #Outputmost list is difined per species corr = [ [ [ TH2D( "corr_spec%d_var%d_%d"%(ispec, ivar, jvar), "corr", # Initialize by name and title nbin[ivar], xmin[ivar], xmax[ivar] , # Number of bins in x-dir, range nbin[jvar], xmin[jvar], xmax[jvar] ) # Number of bins in y-dir, range for jvar in range( nvar ) ] # Innermost list is defined per variable for ivar in range( nvar ) ] # Next if also defined per variable for ispec in range( nspec) ] # Outermost is defined per species #If you for instance want to access the histogram for species two, variable one : #dist[2][1] #And correlation histogram between variable two and three for species one : #corr[1][2][3] # ----------------------------------------------------------------------------------- # # Loop over and read data from input # ----------------------------------------------------------------------------------- # data = [] #Container for all variables with open( filepath, 'r' ) as infile : counter = 0 for line in infile: data.append(line.strip().split()) # Read line -> Convert to list for ivar in range( nvar ) : data[-1][ivar] = float(data[-1][ivar]) # First nvar values are floats data[-1][4] = int(data[-1][4])-1 # Last one is an integer width species type: 1,2,3 # Start counting at 0, to match your lists : 0,1,2 #Print some numbers as a sanity check if counter < 5 : print" Read data: %5.2f %5.2f %5.2f %5.2f %d "%(data[-1][0] , data[-1][1] , data[-1][2] , data[-1][3] , data[-1][4] ) counter += 1 #Fill histograms for ivar in range( nvar ) : dist[data[-1][4]][ivar].Fill( data[-1][ivar] ) #Fill correlation histograms for jvar in range( nvar ) : corr[data[-1][4]][ivar][jvar].Fill( data[-1][ivar], data[-1][jvar] ) # ----------------------------------------------------------------------------------- # # Display data: # ----------------------------------------------------------------------------------- # text = TLatex() #Usefull for drawing text on plots text.SetNDC() text.SetTextFont(1) text.SetTextColor(1) text.SetTextSize(0.06) # Draw 1D histograms: canvas = TCanvas("canvas","",1200,900) canvas.SetFillColor(0) canvas.Divide(2,2) # Take care of a single odd case: dist[0][1].SetMaximum(13.0) for ivar in range( nvar ) : canvas.cd( ivar+1 ) for ispec in range( nspec ) : dist[ispec][ivar].SetLineColor( index_color[ispec] ) dist[ispec][ivar].SetLineWidth( 2 ) if ispec == 0 : dist[ispec][ivar].Draw() else : dist[ispec][ivar].Draw( "same" ) canvas.Update() canvas.Draw() # Draw 2D histograms: canvas2 = TCanvas("canvas2","",1200, 900) canvas2.SetFillColor(0) canvas2.Divide(nvar,nvar) for ivar in range( nvar ) : for jvar in range( nvar ) : canvas2.cd( ivar*nvar+jvar+1 ) if ivar == jvar : text.DrawLatex( 0.17,0.22, var_name[ivar] ) else : for ispec in range( nspec ) : corr[ispec][ivar][jvar].SetMarkerSize(index_size[ispec]) corr[ispec][ivar][jvar].SetMarkerStyle(index_style[ispec]) corr[ispec][ivar][jvar].SetMarkerColor(index_color[ispec]) if ispec == 0 : corr[ispec][ivar][jvar].Draw() else : corr[ispec][ivar][jvar].Draw("same") canvas2.Update() canvas2.Draw() # ----------------------------------------------------------------------------------- # # Make Fisher discriminant between species Versicolor (1) and Virginica (2): # (i.e. not considering Setosa(0) for now) # ----------------------------------------------------------------------------------- # # Means and widths (RMS): mu = [ [0.0]*nvar for i in range(nspec) ] # means covmat_comb = TMatrixD(nvar, nvar) # summed covariance matrix wf = [0.0]*nvar # fisher weights raw_input( ' ... ' ) # ----------------------------------------------------------------------------------- # # Questions # ----------------------------------------------------------------------------------- # # # Calculate the discriminant... # # Also what separation did you get, and do your weights make sense? # # ----------------------------------------------------------------------------------- #