// ----------------------------------------------------------------------------------- // /* ROOT macro for analysing the famous Anderson-Fisher Iris data set from Gaspe Peninsula. Edgar Anderson took 50 samples of three species (Setosa, Versicolor, and Virginica) of Iris at the Gaspe Peninsula, and measured four distinguishing features on each flower: Sepal Length Sepal Width Petal Length Petal Width Using these measurements, Ronald Fisher was able to make a classification scheme, for which he invented the Fisher Linear Discriminant. References: Glen Cowan, SDA, pages 51-57 http://en.wikipedia.org/wiki/Iris_flower_data_set Author: Troels C. Petersen (NBI) Email: petersen@nbi.dk Date: 2nd of October 2011 */ // ----------------------------------------------------------------------------------- // double sqr(double a) { return a*a; } // ----------------------------------------------------------------------------------- // void FisherDiscriminant_FullSolution() { // ----------------------------------------------------------------------------------- // gROOT->Reset(); // Setting of general plotting style: gStyle->SetCanvasColor(0); gStyle->SetFillColor(0); // Setting what to be shown in statistics box (using two different methods): gStyle->SetOptStat("emr"); gStyle->SetOptFit(1111); // ------------------------------------------------------------------ // // Read data from file: // ------------------------------------------------------------------ // // Open data file: FILE *data = fopen("DataSet_AndersonFisherIris2.txt","r"); const int Nspec = 3; // Setosa, Versicolor, Virginica const int Nvar = 4; // Sepal Length, Sepal Width, Petal Length, Petal Width const int Nmax = 200; double xmin[Nvar] = {3.5, 1.7, 0.0, 0.0}; double xmax[Nvar] = {8.5, 4.7, 7.5, 3.0}; double Nbin[Nvar] = { 25, 30, 25, 30}; // Make 12 1D (3 species and 4 variables) and 48 2D (3 species and 4*4 correlations) histograms: char name[20]; TH1F* Dist[Nspec][Nvar]; TH2F* Corr[Nspec][Nvar][Nvar]; for (int ispec = 0; ispec < Nspec; ispec++) { for (int ivar = 0; ivar < Nvar; ivar++) { sprintf(name, "Dist_Spec%1d_Var%1d", ispec, ivar); Dist[ispec][ivar] = new TH1F(name, name, Nbin[ivar], xmin[ivar], xmax[ivar]); for (int jvar = 0; jvar < Nvar; jvar++) { sprintf(name, "Corr_Spec%1d_Var%1d%1d", ispec, ivar, jvar); Corr[ispec][ivar][jvar] = new TH2F(name, name, Nbin[ivar], xmin[ivar], xmax[ivar], Nbin[jvar], xmin[jvar], xmax[jvar]); } } } // Loop over and read data as long as there is data (i.e. not End-Of-File (EOF)). int n = 0; double x[Nvar][Nmax]; int species[Nmax]; while (fscanf(data, "%lf %lf %lf %lf %d \n", &x[0][n], &x[1][n], &x[2][n], &x[3][n], &species[n]) != EOF) { if (n < 5) printf(" Read data: %5.2f %5.2f %5.2f %5.2f %d \n", x[0][n], x[1][n], x[2][n], x[3][n], species[n]); // Fill histograms: for (int ivar = 0; ivar < Nvar; ivar++) { Dist[species[n]-1][ivar]->Fill(x[ivar][n]); for (int jvar = 0; jvar < Nvar; jvar++) { Corr[species[n]-1][ivar][jvar]->Fill(x[ivar][n],x[jvar][n]); } } n++; } printf(" Found %d entries. \n", n); fclose(data); // ------------------------------------------------------------------ // // Display data: // ------------------------------------------------------------------ // // Define the array of variables (numbered 0-3): char* Variable_name[Nvar] = {"Sepal Length", "Sepal Width", "Petal Length", "Petal Width"}; TLatex *text = new TLatex(); text->SetNDC(); text->SetTextFont(1); text->SetTextColor(1); text->SetTextSize(0.06); int index_size[Nspec] = {1.0, 1.0, 1.0}; int index_style[Nspec] = { 24, 25, 26}; int index_color[Nspec] = { 2, 3, 4}; // Draw 1D histograms: canvas = new TCanvas("canvas","",250,20,550,550); canvas->SetFillColor(0); canvas->Divide(2,2); // Take care of a single odd case: Dist[0][1]->SetMaximum(13.0); for (int ivar = 0; ivar < Nvar; ivar++) { canvas->cd(ivar+1); for (int ispec = 0; ispec < Nspec; ispec++) { Dist[ispec][ivar]->SetLineColor(index_color[ispec]); Dist[ispec][ivar]->SetLineWidth(2); if (ispec == 0) Dist[ispec][ivar]->Draw(); else Dist[ispec][ivar]->Draw("same"); } } // Draw 2D histograms: canvas2 = new TCanvas("canvas2","",280,50,550,550); canvas2->SetFillColor(0); canvas2->Divide(4,4); for (int ivar = 0; ivar < Nvar; ivar++) { for (int jvar = 0; jvar < Nvar; jvar++) { canvas2->cd(ivar*4+jvar+1); if (ivar == jvar) { // Text: text->DrawLatex(0.17,0.22,Variable_name[ivar]); } else { for (int ispec = 0; ispec < Nspec; ispec++) { Corr[ispec][ivar][jvar]->SetMarkerSize(index_size[ispec]); Corr[ispec][ivar][jvar]->SetMarkerStyle(index_style[ispec]); Corr[ispec][ivar][jvar]->SetMarkerColor(index_color[ispec]); if (ispec == 0) Corr[ispec][ivar][jvar]->Draw(); else Corr[ispec][ivar][jvar]->Draw("same"); } } } } // ------------------------------------------------------------------ // // Make Fisher discriminant between species Versicolor (1) and Virginica (2): // (i.e. not considering Setosa(0) for now) // ------------------------------------------------------------------ // // Means and widths (RMS): double mu[Nspec][Nvar], sig[Nspec][Nvar]; for (int ispec = 0; ispec < Nspec; ispec++) { for (int ivar = 0; ivar < Nvar; ivar++) { mu[ispec][ivar] = Dist[ispec][ivar]->GetMean(); sig[ispec][ivar] = Dist[ispec][ivar]->GetRMS(); } } // This also requires the inverted covariance matrices: printf(" Combined covariance matrix: \n"); TMatrixD CovMat1(Nvar,Nvar), CovMat2(Nvar,Nvar), CovMatComb(Nvar,Nvar); for (int ivar = 0; ivar < Nvar; ivar++) { for (int jvar = 0; jvar < Nvar; jvar++) { CovMat1(ivar,jvar) = Corr[1][ivar][jvar]->GetCovariance(); CovMat2(ivar,jvar) = Corr[2][ivar][jvar]->GetCovariance(); CovMatComb(ivar,jvar) = CovMat1(ivar,jvar)+CovMat2(ivar,jvar); printf(" %7.3f", CovMatComb(ivar,jvar)); } printf("\n"); } printf(" Inverted combined covariance matrix: \n"); CovMatComb.Invert(); for (int ivar = 0; ivar < Nvar; ivar++) { for (int jvar = 0; jvar < Nvar; jvar++) { printf(" %7.3f", CovMatComb(ivar,jvar)); } printf("\n"); } // ------------------------------------------------------------------ // // Calculate Fisher coefficients and plot them: // ------------------------------------------------------------------ // double wf[Nvar] = {0.0, 0.0, 0.0, 0.0}; for (int ivar = 0; ivar < Nvar; ivar++) { for (int jvar = 0; jvar < Nvar; jvar++) { wf[ivar] += (mu[2][jvar] - mu[1][jvar]) * CovMatComb(ivar,jvar); } } // Plot distribution of Fisher canvasFi = new TCanvas("canvasFi","",50,20,550,550); canvasFi->SetFillColor(0); TH1F* Dist_Fisher[2]; Dist_Fisher[0] = new TH1F("Dist_Fisher_Spec1", "", 40, 0.0, 20.0); Dist_Fisher[1] = new TH1F("Dist_Fisher_Spec2", "", 40, 0.0, 20.0); for (int i=50; i < n; i++) { double fisher = wf[0] * x[0][i] + wf[1] * x[1][i] + wf[2] * x[2][i] + wf[3] * x[3][i]; Dist_Fisher[species[i]-2]->Fill(fisher); } Dist_Fisher[0]->SetLineWidth(2); Dist_Fisher[1]->SetLineWidth(2); Dist_Fisher[0]->SetLineColor(3); Dist_Fisher[1]->SetLineColor(4); Dist_Fisher[0]->Draw(); Dist_Fisher[1]->Draw("same"); printf(" The separation between the samples is: %5.2f \n", (Dist_Fisher[1]->GetMean() - Dist_Fisher[0]->GetMean()) / sqrt(sqr(Dist_Fisher[0]->GetRMS()) + sqr(Dist_Fisher[1]->GetRMS()))); canvasFi->Update(); // canvasFi->SaveAs("FisherIrisSeparation.png"); }