#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#  ROOT macro for analysing the famous Anderson-Fisher Iris data set from Gaspe Peninsula.
#
#  Edgar Anderson took 50 samples of three species (Setosa, Versicolor, and Virginica)
#  of Iris at the Gaspe Peninsula, and measured four distinguishing features on each
#  flower:
#    Sepal Length
#    Sepal Width
#    Petal Length
#    Petal Width
#  Using these measurements, Ronald Fisher was able to make a classification scheme, for
#  which he invented the Fisher Linear Discriminant.
#
#  References:
#    Glen Cowan, SDA, pages 51-57
#    http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#    http://en.wikipedia.org/wiki/Iris_flower_data_set
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   5th of October 2014
#
#  Author: Lars Egholm Pedersen (NBI)
#  Email:  egholm@nbi.dk
#  Date:   6th of October 2013
#
#
# ----------------------------------------------------------------------------------- #

from ROOT import *


# ----------------------------------------------------------------------------------- #
# Functions
# ----------------------------------------------------------------------------------- #
def sqr( a ) : 
    return a*a


# ----------------------------------------------------------------------------------- #
# Full solution to the fisher discriminant exercise
# ----------------------------------------------------------------------------------- #
gROOT.Reset()

# Setting of general plotting style:
gStyle.SetCanvasColor(0)
gStyle.SetFillColor(0)
# Setting what to be shown in statistics box (using two different methods):
gStyle.SetOptStat("emr")
gStyle.SetOptFit(1111)


# ----------------------------------------------------------------------------------- #
# Define parameters
# ----------------------------------------------------------------------------------- #

# Define path to datafile
filepath = "DataSet_AndersonFisherIris.txt"

nspec =   3  # Setosa, Versicolor, Virginica
nvar  =   4  # Sepal Length, Sepal Width, Petal Length, Petal Width

xmin = [3.5, 1.7, 0.0, 0.0]    # Range for different variables
xmax = [8.5, 4.7, 7.5, 3.0]    # Range for different variables
nbin = [ 25,  30,  25,  30]    # Number of bins for different variables

var_name  = [ "Sepal Length", "Sepal Width", "Petal Length", "Petal Width" ]  # Variable name
spec_name = [ "Setosa"      , "Versicolor" , "Virginica"                   ]  # Species name

index_size  = [1.0, 1.0, 1.0]   # Size of marker
index_style = [ 24,  25,  26]   # Style of marker
index_color = [  2,   3,   4]   # Color of marker
# NOTE: For options, see: http://root.cern.ch/root/html/TAttMarker.html


# ----------------------------------------------------------------------------------- #
# Define all your graphics here
# ----------------------------------------------------------------------------------- #

# Dist will now be a list of lists containing 1D histograms:
dist = [ [ TH1D( "dist_spec%d_var%d"%(ispec, ivar), "dist", nbin[ivar], xmin[ivar], xmax[ivar] )
           for ivar in range(nvar) ] #Innermost list is defined per variable
         for ispec in range(nspec) ] #Outputmost list is difined per species

# Likewise, corr will be a list of a list of lists of 2D histograms (scatter plots for checking correlations):
corr = [ [ [ TH2D( "corr_spec%d_var%d_%d"%(ispec, ivar, jvar), "corr", # Initialize by name and title
                   nbin[ivar], xmin[ivar], xmax[ivar] ,                # Number of bins in x-dir, range
                   nbin[jvar], xmin[jvar], xmax[jvar] )                # Number of bins in y-dir, range
             for jvar in range( nvar ) ]                               # Innermost list is defined per variable
           for ivar in range( nvar )   ]                               # Next if also defined per variable
         for ispec in range( nspec)    ]                               # Outermost is defined per species

# Note/example:
# If you for instance want to access the histogram for species two, variable one, it is: dist[2][1]
# Likewise, the correlation histogram between variable two and three for species one is: corr[1][2][3]


# ----------------------------------------------------------------------------------- #
# Loop over and read data from input
# ----------------------------------------------------------------------------------- #

data     = []     # Container for all variables
with open( filepath, 'r' ) as infile : 
    counter = 0

    for line in infile: 

        data.append(line.strip().split())                                  # Read line -> Convert to list
        for ivar in range( nvar ) : data[-1][ivar] = float(data[-1][ivar]) # First nvar values are floats
        data[-1][4] = int(data[-1][4])-1                                   # Last one is an integer width species type: 1,2,3
                                                                           # Start counting at 0, to match your lists : 0,1,2
        # Print some numbers as a sanity check
        if (counter < 5) : print"  Read data: %5.2f %5.2f %5.2f %5.2f   %d "%(data[-1][0] , 
                                                                              data[-1][1] , 
                                                                              data[-1][2] , 
                                                                              data[-1][3] , 
                                                                              data[-1][4] )
        counter += 1
 
        # Fill 1D histograms
        for ivar in range( nvar ) : 
            dist[data[-1][4]][ivar].Fill( data[-1][ivar] )
            
            # Fill 2D correlation histograms
            for jvar in range( nvar ) : 
                corr[data[-1][4]][ivar][jvar].Fill( data[-1][ivar], data[-1][jvar] )


# ----------------------------------------------------------------------------------- #
# Display data:
# ----------------------------------------------------------------------------------- #

text = TLatex()     # Usefull for drawing text on plots
text.SetNDC()
text.SetTextFont(1)
text.SetTextColor(1)
text.SetTextSize(0.06)

# Draw 1D histograms:
canvas = TCanvas("canvas", "", 1200, 900)
canvas.SetFillColor(0)
canvas.Divide(2,2)

# Take care of a single odd case:
dist[0][1].SetMaximum(13.0) 

for ivar in range( nvar ) : 
    canvas.cd( ivar+1 )
    for ispec in range( nspec ) : 
        dist[ispec][ivar].SetLineColor( index_color[ispec] )
        dist[ispec][ivar].SetLineWidth( 2                  )

        if ispec == 0 : dist[ispec][ivar].Draw()
        else          : dist[ispec][ivar].Draw( "same" )

canvas.Update()
canvas.Draw()

# Draw 2D histograms:
canvas2 = TCanvas("canvas2","",1200, 900)
canvas2.SetFillColor(0)
canvas2.Divide(nvar,nvar)

for ivar in range( nvar ) :
    for jvar in range( nvar ) : 
        canvas2.cd( ivar*nvar+jvar+1 )
        if ivar == jvar : 
            text.DrawLatex( 0.17,0.22, var_name[ivar] )

        else : 
            for ispec in range( nspec ) : 
                corr[ispec][ivar][jvar].SetMarkerSize(index_size[ispec])
                corr[ispec][ivar][jvar].SetMarkerStyle(index_style[ispec])
                corr[ispec][ivar][jvar].SetMarkerColor(index_color[ispec])
                if ispec == 0 : corr[ispec][ivar][jvar].Draw()
                else          : corr[ispec][ivar][jvar].Draw("same")

canvas2.Update()
canvas2.Draw()


# ----------------------------------------------------------------------------------- #
# Make Fisher discriminant between species Versicolor (1) and Virginica (2):
# (i.e. not considering Setosa (0) for now)
# ----------------------------------------------------------------------------------- #

# Calculate means and widths (RMS) of the various variables:


# Put the (co-)variances into a matrix, and invert it to get the Fisher coefficients:
# NOTE: You can get the covariances from the 2D histograms, using "GetCovariance()".
covmat_Vers = TMatrixD(nvar, nvar)        # Covariance matrix for Versicolor
covmat_Virg = TMatrixD(nvar, nvar)        # Covariance matrix for Verginica
covmat_comb = TMatrixD(nvar, nvar)        # Summed covariance matrix

# invcovmat_comb = covmat_comb.Invert()     # Inverted matrix


# Calculate the four Fisher coefficients:


# Loop over the data (species Versicolor (1) and Virginica (2)), and calculate their
# Fisher discriminant value, and see how well the separation works:


raw_input( ' ... ' )

# ----------------------------------------------------------------------------------- #
# Questions
# ----------------------------------------------------------------------------------- #
#
# As always, make sure that you know what the code is doing so far, and what the aim of
# the exercise is (i.e. which problem to solve, and how). Then start to expand on it.
#
# 1) Look at the distributions of the four discriminating variables for the two species
#    Versicolor (1) and Virginica (2), and see how well you can separate them by eye.
#    It seems reasonably possible, but certainly not perfect...
#
# 2) Calculate the mean, widths (RMS) and covariance of each discriminating variable
#    (pair of variables for covariance) for each species, and put these into the
#    matrices defined.
#
# 3) From the inverted summed matrix, calculate the four Fisher coefficients, and
#    given these, calculate the Fisher discriminant for the two species in question.
#
# 4) What separation did you get, and is it notably better than what you obtain by eye?
#    Also, do your weights make sense? I.e. are they comparable to the widths of the
#    corresponding variable.
# 
# ----------------------------------------------------------------------------------- #