#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#  Python/ROOT macro for constructing a Fisher disciminant from two 2D Gaussianly
#  distributed correlated variables.
#  The macro creates artificial random data for two different types of processes,
#  and the goal is then to separate these by constructing a Fisher discriminant.
#
#  References:
#    Glen Cowan, SDA, pages 51-57
#    http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#    http://en.wikipedia.org/wiki/Iris_flower_data_set
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   5th of October 2014
#
#  Author: Lars Egholm Pedersen
#  Email:  egholm@nbi.dk
#  Date:   6th of October 2013
#
# ----------------------------------------------------------------------------------- #

from ROOT import *


# ----------------------------------------------------------------------------------- #
# Functions
# ----------------------------------------------------------------------------------- #
def sqr( a ) : 
    return a*a

# Function for generating a set of correlated random gaussian numbers.
def get_corr( mu1, sig1, mu2, sig2, rho12 ) : 
    r = TRandom3(0) # Just gonna initialize random generator in function for simplicity
                    # Be carefull ALWAYS to use random seed that is changing faster
                    # than you are calling the function!!

    theta = 0.5 * atan( 2.0 * rho12 * sig1 * sig2 / ( sqr(sig1) - sqr(sig2) ) )
    sigu = sqrt( fabs( (sqr(sig1*cos(theta)) - sqr(sig2*sin(theta)) ) / ( sqr(cos(theta)) - sqr(sin(theta))) ) )
    sigv = sqrt( fabs( (sqr(sig2*cos(theta)) - sqr(sig1*sin(theta)) ) / ( sqr(cos(theta)) - sqr(sin(theta))) ) )

    u = r.Gaus( 0.0, sigu )
    v = r.Gaus( 0.0, sigv )

    x = mu1 + cos(theta)*u - sin(theta)*v
    y = mu2 + sin(theta)*u + cos(theta)*v

    return [x,y]


# ----------------------------------------------------------------------------------- #
# Fisher discriminant script.
# ----------------------------------------------------------------------------------- #
gROOT.Reset()

# Setting of general plotting style:
gStyle.SetCanvasColor(0)
gStyle.SetFillColor(0)

# Setting what to be shown in statistics box (using two different methods):
gStyle.SetOptStat("emr")
gStyle.SetOptFit(1111)


# ----------------------------------------------------------------------------------- #
# Define parameters
# ----------------------------------------------------------------------------------- #

nspec = 2                     # Number of 'species' : signal / background
nvar  = 2                     # Number of variables for discrimination

mean_par0  = [ 15.0 , 12.0  ] # Process type 1,2 mean in x direction
width_par0 = [  2.0 ,  3.0  ] # Process type 1,2 width in x direction
mean_par1  = [ 50.0 , 55.0  ] # ... y
width_par1 = [  6.0 ,  7.0  ] # ... y
corr       = [  0.80,  0.90 ] # Coefficient of correlation

ndata    = 2000          # Amount of data you want to create

color_index  = [ 2,  4]  # Make process type 1 red and two blue...
marker_index = [24, 25]  # Marker style for correlation plot

par_name = ["Parameter_a", "Parameter_b"] #Parameter names
draw_opt = ["", "same"]                   #Drawing option ... 


# ----------------------------------------------------------------------------------- #
# Define all your graphics here
# ----------------------------------------------------------------------------------- #

hist_par0 = [ TH1D("hist_par0_spec0", "hist_par0_spec0", 50,  0.0, 25.0) , #Histograms for "par0"
              TH1D("hist_par0_spec1", "hist_par0_spec1", 50,  0.0, 25.0) ] #Projection

hist_par1 = [ TH1D("hist_par1_spec0", "hist_par1_spec0", 50, 20.0, 80.0) , #Histograms for "par1"
              TH1D("hist_par1_spec1", "hist_par1_spec1", 50, 20.0, 80.0) ] #Projection

hist_corr   = [ TH2D("corr_par0_par1_spec0", "corr_par0_par1_spec0", 50, 0.0, 25.0, 50, 20.0, 80.0) , #Correlation 
                TH2D("corr_par0_par1_spec1", "corr_par0_par1_spec1", 50, 0.0, 25.0, 50, 20.0, 80.0) ] #histograms

hist_fisher = [ TH1D("hist_fisher_spec0", "hist_fisher_spec0", 200, -2.0, 2.0) , #Histograms for final
                TH1D("hist_fisher_spec1", "hist_fisher_spec1", 200, -2.0, 2.0) ] #fisher projection


# ----------------------------------------------------------------------------------- #
# Generate data and fill initial histograms
# ----------------------------------------------------------------------------------- #

for ispec in range( nspec ) : 
    for iexp in range( ndata ) : 
        #Get liniarly correlated random numbers...
        values = get_corr( mean_par0[ispec], width_par0[ispec], mean_par1[ispec], width_par1[ispec], corr[ispec] )

        hist_par0[ispec].Fill( values[0] )
        hist_par1[ispec].Fill( values[1] ) 
        hist_corr[ispec].Fill( values[0], values[1] )


# ----------------------------------------------------------------------------------- #
# Specify color etc of your histograms
# ----------------------------------------------------------------------------------- #

for ispec in range( nspec ) : 

    hist_par0[ispec].SetLineColor(   color_index[ispec] )
    hist_par1[ispec].SetLineColor(   color_index[ispec] )
    hist_fisher[ispec].SetLineColor( color_index[ispec] )

    hist_par0[ispec].GetXaxis().SetTitle(   par_name[0] )
    hist_par1[ispec].GetXaxis().SetTitle(   par_name[1] )
    hist_fisher[ispec].GetXaxis().SetTitle( "Fisher Discriminant" )

    hist_corr[ispec].SetMarkerColor( color_index[ispec] ) 
    hist_corr[ispec].SetMarkerStyle(marker_index[ispec] ) 

    hist_corr[ispec].GetXaxis().SetTitle( par_name[0] )
    hist_corr[ispec].GetYaxis().SetTitle( par_name[1] )


# ----------------------------------------------------------------------------------- #
# Plot your generated data
# ----------------------------------------------------------------------------------- #

# Correlation plot:
canvas_2D = TCanvas("canvas_2D", "canvas_2D", 80, 80, 1000, 700)

for ispec in range(nspec) : 
    hist_corr[ispec].Draw( draw_opt[ispec] )


# x and y projections:
canvas_1D = TCanvas("canvas_1D", "canvas_1D", 50, 50, 1000, 700)
canvas_1D.Divide(2)

for ispec in range(nspec) : 
    canvas_1D.cd( 1 ) 
    hist_par0[ispec].Draw( draw_opt[ispec] )
    canvas_1D.cd( 2 ) 
    hist_par1[ispec].Draw( draw_opt[ispec] )


# ----------------------------------------------------------------------------------- #
# Your calculation of the Fisher discriminant here...
# ----------------------------------------------------------------------------------- #

# Calculate means and widths (RMS) of the various variables:


# Put the (co-)variances into a matrix, and invert it to get the Fisher coefficients:
# NOTE: You can get the covariances from the 2D histograms, using "GetCovariance()".
covmat_A    = TMatrixD(nvar, nvar)        # Covariance matrix for blue points
covmat_B    = TMatrixD(nvar, nvar)        # Covariance matrix for red points
covmat_comb = TMatrixD(nvar, nvar)        # Summed covariance matrix

# invcovmat_comb = covmat_comb.Invert()     # Inverted matrix


# Calculate the four Fisher coefficients:


# Loop over the data, and calculate the Fisher discriminant value, and see how well
# the separation works:


raw_input( ' ... ' )

# ----------------------------------------------------------------------------------- #
# Questions
# ----------------------------------------------------------------------------------- #
#
# As always, make sure that you know what the code is doing so far, and what the aim of
# the exercise is (i.e. which problem to solve, and how). Then start to expand on it.
#
# 1) Look at the 1D distributions of the two discriminating variables for the two species,
#    and see how well you can separate them by eye. It seems somewhat possible, but
#    certainly far from perfect...
#    Once you consider the 2D distribution (scatter plot), then it is clear, that some
#    cut along a line at an angle will work much better. This exercise is about finding
#    that line, and thus the perpendicular axis to project the data onto!
#
# 2) Calculate the mean, widths (RMS) and covariance of each discriminating variable
#    (pair of variables for covariance) for each species, and put these into the
#    matrices defined.
#
# 3) From the inverted summed matrix and vectors of means, calculate the four Fisher
#    coefficients, and given these, calculate the Fisher discriminant for the two
#    species in question.
#
# 4) What separation did you get, and is it notably better than what you obtain by eye?
#    Also, do your weights make sense? I.e. are they comparable to the widths of the
#    corresponding variable.
#    As a measure of how good the separation obtained is, we consider the "distance"
#    between the two distributions as a measure of goodness:
#      separation = (mu_A - mu_B)**2 / (sigma_A**2 + sigma_B**2)
#
#    Compare the separation you get from each of the two 1D histograms of par0 and par1
#    with what you get from the Fisher discriminant, using the above formula.
# 
# ----------------------------------------------------------------------------------- #