#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # Python macro for analysing the famous Anderson-Fisher Iris data set from Gaspe Peninsula. # # Edgar Anderson took 50 samples of three species (Setosa, Versicolor, and Virginica) # of Iris at the Gaspe Peninsula, and measured four distinguishing features on each # flower: # Sepal Length # Sepal Width # Petal Length # Petal Width # Using these measurements, Ronald Fisher was able to make a classification scheme, for # which he invented the Fisher Linear Discriminant. # # References: # Glen Cowan, Statistical Data Analysis, pages 51-57 # http://en.wikipedia.org/wiki/Linear_discriminant_analysis # http://en.wikipedia.org/wiki/Iris_flower_data_set # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 28th of December 2017 # # ----------------------------------------------------------------------------------- # # First, import the modules you want to use: from __future__ import print_function, division import numpy as np # Matlab like syntax for linear algebra and functions import matplotlib.pyplot as plt # Plots and figures like you know them from Matlab from numpy.linalg import inv # Inveert a matrix plt.close('all') # To close all open figures r = np.random # Random generator r.seed(42) # Set a random seed (but a fixed one) SavePlots = True # For now, don't save plots (once you trust your code, switch on) # ----------------------------------------------------------------------------------- # # Functions # ----------------------------------------------------------------------------------- # def GetCovariance(X, Y): return np.cov(X, Y, ddof=0)[0, 1] # ----------------------------------------------------------------------------------- # # Define parameters # ----------------------------------------------------------------------------------- # # Define path to datafile filepath = "DataSet_AndersonFisherIris.txt" nspec = 3 # Setosa, Versicolor, Virginica nvar = 4 # Sepal Length, Sepal Width, Petal Length, Petal Width xmin = [3.5, 1.7, 0.0, 0.0] # Range for different variables xmax = [8.5, 4.7, 7.5, 3.0] # Range for different variables nbin = [ 25, 30, 25, 30] # Number of bins for different variables var_name = [ "Sepal Length", "Sepal Width", "Petal Length", "Petal Width" ] # Variable name spec_name = [ "Setosa" , "Versicolor" , "Virginica" ] # Species name colors = ['Red', 'Green', 'Blue'] markers = ['o', '^', 's'] # ----------------------------------------------------------------------------------- # # Loop over and read data from input: # ----------------------------------------------------------------------------------- # data = [] # Container for all variables with open( filepath, 'r' ) as infile : counter = 0 for line in infile: data.append(line.strip().split()) # Read line -> Convert to list for ivar in range( nvar ) : data[-1][ivar] = float(data[-1][ivar]) # First nvar values are floats data[-1][4] = int(data[-1][4])-1 # Last one is an integer width species type: 1,2,3 # Start counting at 0, to match your lists : 0,1,2 # Print some numbers as a sanity check if (counter < 5) : print(" Read data: {:5.2f} {:5.2f} {:5.2f} {:5.2f} {:d} ".format(data[-1][0], data[-1][1], data[-1][2], data[-1][3], data[-1][4])) counter += 1 # Numpy version: data = np.loadtxt(filepath) data[:, -1] -= 1 # Subtract one from all the last integers in the data: # Last one is an integer width species type: 1,2,3 # Start counting at 0, to match your lists : 0,1,2 # ----------------------------------------------------------------------------------- # # Display data: # ----------------------------------------------------------------------------------- # mu = np.zeros((nspec, nvar)) RMS = np.zeros((nspec, nvar)) fig_1D, ax_1D = plt.subplots(nrows=2, ncols=2, figsize=(10,8)) ax_1D = ax_1D.flatten() for ivar in range(nvar): # for each variable plot each species' corresponding histogram of that variable for ispec in range(nspec): data_spec = data[data[:, -1] == ispec] # extract the relevant species data_spec_var = data_spec[:, ivar] # extract the relevant variable ax_1D[ivar].hist( data_spec_var, nbin[ivar], range=(xmin[ivar], xmax[ivar]), histtype='step', color = colors[ispec], label=spec_name[ispec], lw=2) mu[ispec, ivar] = data_spec_var.mean() RMS[ispec, ivar] = data_spec_var.std(ddof=1) ax_1D[ivar].set(title=var_name[ivar], ylabel='Counts') ax_1D[ivar].legend() fig_1D.tight_layout() plt.show(block=False) # Draw 2D histograms: fig_corr, ax_corr = plt.subplots(nrows=4, ncols=4, figsize=(12, 10)) covariances = np.zeros((nvar, nvar, nspec)) for ivar in range(nvar): for jvar in range(nvar): ax = ax_corr[ivar, jvar] for ispec in range(nspec): data_ispec = data[data[:, -1] == ispec] # extract the relevant species data_ispec_ivar = data_ispec[:, ivar] data_ispec_jvar = data_ispec[:, jvar] covariances[ivar, jvar, ispec] = GetCovariance(data_ispec_ivar, data_ispec_jvar) if ivar == jvar: # i.e. diagonal ax.text( 0.5, 0.5, var_name[ivar], horizontalalignment='center', verticalalignment='center', transform=ax.transAxes) else: ax.scatter(data_ispec_ivar, data_ispec_jvar, marker = markers[ispec], color = colors[ispec]) fig_corr.tight_layout() plt.show(block=False) # ----------------------------------------------------------------------------------- # # Make Fisher discriminant between species Versicolor (1) and Virginica (2): # (i.e. not considering Setosa (0) for now) # ----------------------------------------------------------------------------------- # # Calculate means and widths (RMS) of the various variables: # Put the (co-)variances into a matrix, and invert it to get the Fisher coefficients: #covmat_Vers = ? # Covariance matrix for Versicolor #covmat_Virg = ? # Covariance matrix for Verginica # covmat_comb = covmat_Vers + covmat_Virg # Summed covariance matrix # covmat_comb_inv = inv(covmat_comb) # Inverted matrix # Calculate the four Fisher coefficients: # Loop over the data (species Versicolor (1) and Virginica (2)), and calculate their # Fisher discriminant value, and see how well the separation works: try: __IPYTHON__ except: raw_input('Press Enter to exit') # ----------------------------------------------------------------------------------- # # Questions # ----------------------------------------------------------------------------------- # # # As always, make sure that you know what the code is doing so far, and what the aim of # the exercise is (i.e. which problem to solve, and how). Then start to expand on it. # # 1) Look at the distributions of the four discriminating variables for the two species # Versicolor (1) and Virginica (2), and see how well you can separate them by eye. # It seems reasonably possible, but certainly not perfect... # # 2) Calculate the mean, widths (RMS) and covariance of each discriminating variable # (pair of variables for covariance) for each species, and put these into the # matrices defined. # # 3) From the inverted summed matrix, calculate the four Fisher coefficients, and # given these, calculate the Fisher discriminant for the two species in question. # # 4) What separation did you get, and is it notably better than what you obtain by eye? # Possibly make two ROC curves to quantify this. # # ----------------------------------------------------------------------------------- #