// ----------------------------------------------------------------------------------- // /* ROOT macro for analysing the famous Anderson-Fisher Iris data set from Gaspe Peninsula. Edgar Anderson took 50 samples of three species (Setosa, Versicolor, and Virginica) of Iris at the Gaspe Peninsula, and measured four distinguishing features on each flower: Sepal Length Sepal Width Petal Length Petal Width Using these measurements, Ronald Fisher was able to make a classification scheme, for which he invented the Fisher Linear Discriminant. References: Glen Cowan, SDA, pages 51-57 http://en.wikipedia.org/wiki/Iris_flower_data_set Author: Troels C. Petersen (NBI/CERN) Email: Troels.Petersen@cern.ch Date: 16th of September 2009 */ // ----------------------------------------------------------------------------------- // //---------------------------------------------------------------------------------- // Run in ROOT by: .x FisherDiscriminant.c // Output: FisherDiscriminant.eps //---------------------------------------------------------------------------------- double sqr(double a) { return a*a; } // ----------------------------------------------------------------------------------- // void FisherDiscriminant_Solution() { // ----------------------------------------------------------------------------------- // gROOT->Reset(); // Setting of general plotting style: gStyle->SetCanvasColor(0); gStyle->SetFillColor(0); // Setting what to be shown in statistics box (using two different methods): gStyle->SetOptStat("emr"); gStyle->SetOptFit(1111); // ------------------------------------------------------------------ // // Read data from file: // ------------------------------------------------------------------ // // Open data file: FILE *data = fopen("../DataSet_AndersonFisherIris2.txt","r"); const int Nspec = 3; // Setosa, Versicolor, Virginica const int Nvar = 4; // Sepal Length, Sepal Width, Petal Length, Petal Width const int Nmax = 200; double xmin[Nvar] = {3.5, 1.7, 0.0, 0.0}; double xmax[Nvar] = {8.5, 4.7, 7.5, 3.0}; double Nbin[Nvar] = { 25, 30, 25, 30}; // Make 12 1D and 48 2D histograms: char name[20]; TH1F* Dist[Nspec][Nvar]; TH2F* Corr[Nspec][Nvar][Nvar]; for (int ispec = 0; ispec < Nspec; ispec++) { for (int ivar = 0; ivar < Nvar; ivar++) { sprintf(name, "Dist_Spec%1d_Var%1d", ispec, ivar); Dist[ispec][ivar] = new TH1F(name, name, Nbin[ivar], xmin[ivar], xmax[ivar]); for (int jvar = 0; jvar < Nvar; jvar++) { sprintf(name, "Corr_Spec%1d_Var%1d%1d", ispec, ivar, jvar); Corr[ispec][ivar][jvar] = new TH2F(name, name, Nbin[ivar], xmin[ivar], xmax[ivar], Nbin[jvar], xmin[jvar], xmax[jvar]); } } } // Loop over and read data as long as there is data (i.e. not End-Of-File (EOF)). int n = 0; double x[Nvar][Nmax]; int species[Nmax]; while (fscanf(data, "%lf %lf %lf %lf %d \n", &x[0][n], &x[1][n], &x[2][n], &x[3][n], &species[n]) != EOF) { if (n < 5) printf(" Read data: %5.2f %5.2f %5.2f %5.2f %d \n", x[0][n], x[1][n], x[2][n], x[3][n], species[n]); // Fill histograms: for (int ivar = 0; ivar < Nvar; ivar++) { Dist[species[n]-1][ivar]->Fill(x[ivar][n]); for (int jvar = 0; jvar < Nvar; jvar++) { Corr[species[n]-1][ivar][jvar]->Fill(x[ivar][n],x[jvar][n]); } } n++; } printf(" Found %d entries. \n", n); fclose(data); // ------------------------------------------------------------------ // // Display data: // ------------------------------------------------------------------ // // Define the array of variables (numbered 0-3): char* Variable_name[Nvar] = {"Sepal Length", "Sepal Width", "Petal Length", "Petal Width"}; TLatex *text = new TLatex(); text->SetNDC(); text->SetTextFont(1); text->SetTextColor(1); text->SetTextSize(0.06); int index_size[Nspec] = {1.0, 1.0, 1.0}; int index_style[Nspec] = { 24, 25, 26}; int index_color[Nspec] = { 2, 3, 4}; // Draw 1D histograms: canvas = new TCanvas("canvas","",250,20,750,750); canvas->SetFillColor(0); canvas->Divide(2,2); // Take care of a single odd case: Dist[0][1]->SetMaximum(13.0); for (int ivar = 0; ivar < Nvar; ivar++) { canvas->cd(ivar+1); for (int ispec = 0; ispec < Nspec; ispec++) { Dist[ispec][ivar]->SetLineColor(index_color[ispec]); Dist[ispec][ivar]->SetLineWidth(2); if (ispec == 0) Dist[ispec][ivar]->Draw(); else Dist[ispec][ivar]->Draw("same"); } } // Draw 2D histograms: canvas2 = new TCanvas("canvas2","",280,50,750,750); canvas2->SetFillColor(0); canvas2->Divide(4,4); for (int ivar = 0; ivar < Nvar; ivar++) { for (int jvar = 0; jvar < Nvar; jvar++) { canvas2->cd(ivar*4+jvar+1); if (ivar == jvar) { // Text: // text->DrawLatex(0.17,0.22,Variable_name[ivar]); } else { for (int ispec = 0; ispec < Nspec; ispec++) { Corr[ispec][ivar][jvar]->SetMarkerSize(index_size[ispec]); Corr[ispec][ivar][jvar]->SetMarkerStyle(index_style[ispec]); Corr[ispec][ivar][jvar]->SetMarkerColor(index_color[ispec]); if (ispec == 0) Corr[ispec][ivar][jvar]->Draw(); else Corr[ispec][ivar][jvar]->Draw("same"); } } } } // ------------------------------------------------------------------ // // Make Fisher discriminant between species Versicolor and Virginica: // ------------------------------------------------------------------ // canvasFi = new TCanvas("canvasFi","",310,80,750,750); canvasFi->SetFillColor(0); canvasFi->Divide(1,2); // Fisher discriminant WITHOUT taking correlations into account! // ------------------------------------------------------------- // This requires only the means and the widths (RMS): double mu[Nspec][Nvar], sig[Nspec][Nvar]; for (int ispec = 0; ispec < Nspec; ispec++) { for (int ivar = 0; ivar < Nvar; ivar++) { mu[ispec][ivar] = Dist[ispec][ivar]->GetMean(); sig[ispec][ivar] = Dist[ispec][ivar]->GetRMS(); } } // Calculate Fisher coefficients (not using correlations): double wf_nocorr[Nvar]; for (int ivar = 0; ivar < Nvar; ivar++) { wf_nocorr[ivar] = (mu[2][ivar] - mu[1][ivar]) / (sqr(sig[1][ivar]) + sqr(sig[2][ivar])); } // Plot distribution of Fisher TH1F* Dist_Fisher_nocorr[2]; Dist_Fisher_nocorr[0] = new TH1F("Dist_Fisher_nocorr_Spec1", "", 60, 20.0, 50.0); Dist_Fisher_nocorr[1] = new TH1F("Dist_Fisher_nocorr_Spec2", "", 60, 20.0, 50.0); for (int i=50; i < n; i++) { double fisher_nocorr = wf_nocorr[0] * x[0][i] + wf_nocorr[1] * x[1][i] + wf_nocorr[2] * x[2][i] + wf_nocorr[3] * x[3][i]; Dist_Fisher_nocorr[species[i]-2]->Fill(fisher_nocorr); } canvasFi->cd(1); Dist_Fisher_nocorr[0]->SetLineWidth(2); Dist_Fisher_nocorr[1]->SetLineWidth(2); Dist_Fisher_nocorr[0]->SetLineColor(3); Dist_Fisher_nocorr[1]->SetLineColor(4); Dist_Fisher_nocorr[0]->Draw(); Dist_Fisher_nocorr[1]->Draw("same"); printf(" The separation between the samples is: %5.2f \n", (Dist_Fisher_nocorr[1]->GetMean() - Dist_Fisher_nocorr[0]->GetMean()) / sqrt(sqr(Dist_Fisher_nocorr[0]->GetRMS()) + sqr(Dist_Fisher_nocorr[1]->GetRMS()))); // Fisher discriminant WITH correlations taken into account! // --------------------------------------------------------- // This also requires the inverted covariance matrices: printf(" Covariance matrix (combination of two types): \n"); TMatrixD CovMat1(Nvar,Nvar), CovMat2(Nvar,Nvar), CovMatComb(Nvar,Nvar); for (int ivar = 0; ivar < Nvar; ivar++) { for (int jvar = 0; jvar < Nvar; jvar++) { CovMat1(ivar,jvar) = Corr[1][ivar][jvar]->GetCovariance(); CovMat2(ivar,jvar) = Corr[2][ivar][jvar]->GetCovariance(); CovMatComb(ivar,jvar) = CovMat1(ivar,jvar) + CovMat2(ivar,jvar); printf(" %7.3f", CovMatComb(ivar,jvar)); } printf("\n"); } printf(" Inverted covariance matrix: \n"); CovMatComb.Invert(); for (int ivar = 0; ivar < Nvar; ivar++) { for (int jvar = 0; jvar < Nvar; jvar++) { printf(" %7.3f", CovMatComb(ivar,jvar)); } printf("\n"); } // Calculate Fisher coefficients: double wf[Nvar] = {0.0, 0.0, 0.0, 0.0}; for (int ivar = 0; ivar < Nvar; ivar++) { for (int jvar = 0; jvar < Nvar; jvar++) { wf[ivar] += (mu[2][jvar] - mu[1][jvar]) * CovMatComb(ivar,jvar); } } // Plot distribution of Fisher TH1F* Dist_Fisher[2]; Dist_Fisher[0] = new TH1F("Dist_Fisher_Spec1", "", 80, 0.0, 80.0); Dist_Fisher[1] = new TH1F("Dist_Fisher_Spec2", "", 80, 0.0, 80.0); for (int i=50; i < n; i++) { double fisher = wf[0] * x[0][i] + wf[1] * x[1][i] + wf[2] * x[2][i] + wf[3] * x[3][i]; Dist_Fisher[species[i]-2]->Fill(fisher); } canvasFi->cd(2); Dist_Fisher[0]->SetLineWidth(2); Dist_Fisher[1]->SetLineWidth(2); Dist_Fisher[0]->SetLineColor(3); Dist_Fisher[1]->SetLineColor(4); Dist_Fisher[0]->Draw(); Dist_Fisher[1]->Draw("same"); printf(" The separation between the samples is: %5.2f \n", (Dist_Fisher[1]->GetMean() - Dist_Fisher[0]->GetMean()) / sqrt(sqr(Dist_Fisher[0]->GetRMS()) + sqr(Dist_Fisher[1]->GetRMS()))); }