#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
#  Python macro for constructing a Fisher discriminant from two correlated,
#  Gaussian-distributed variables (i.e. 2D data).
#  The macro creates artificial random data for two different types of processes,
#  and the goal is then to separate these by constructing a Fisher discriminant.
#
#  References:
#    Glen Cowan, Statistical Data Analysis, pages 51-57
#    http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   22nd of December 2017
#
# ----------------------------------------------------------------------------------- #

# ----------------------------------------------------------------------------------- #
# Imports
# ----------------------------------------------------------------------------------- #

# First, import the modules you want to use:
from __future__ import print_function, division
import numpy as np                 # Matlab-like syntax for linear algebra and functions
import matplotlib.pyplot as plt    # Plots and figures like you know them from Matlab
from numpy.linalg import inv

plt.close('all')                   # Close all open figures

r = np.random                      # Random generator
r.seed(42)                         # Set a random seed (but a fixed one)

SavePlots = False                  # For now, don't save plots (once you trust your code, switch on)

# ----------------------------------------------------------------------------------- #
# Functions
# ----------------------------------------------------------------------------------- #

# Function for generating a pair of correlated random Gaussian numbers.
# What is essentially done is to produce an UNcorrelated pair of random numbers,
# and then rotate them by the angle that yields the requested correlation.
# NOTE: The rotation angle is ill-defined for sig1 == sig2 (division by zero).
def get_corr( mu1, sig1, mu2, sig2, rho12 ) :
    theta = 0.5 * np.arctan( 2.0 * rho12 * sig1 * sig2 / ( sig1**2 - sig2**2 ) )
    sigu = np.sqrt( np.abs( ((sig1*np.cos(theta))**2 - (sig2*np.sin(theta))**2 ) / ( np.cos(theta)**2 - np.sin(theta)**2) ) )
    sigv = np.sqrt( np.abs( ((sig2*np.cos(theta))**2 - (sig1*np.sin(theta))**2 ) / ( np.cos(theta)**2 - np.sin(theta)**2) ) )
    u = r.normal( 0.0, sigu )                     # Uncorrelated pair (u,v)...
    v = r.normal( 0.0, sigv )
    x = mu1 + np.cos(theta)*u - np.sin(theta)*v   # ...rotated into the correlated pair (x,y)
    y = mu2 + np.sin(theta)*u + np.cos(theta)*v
    return [x, y]

# Covariance (ddof=0, i.e. normalised by N) between two arrays of equal length:
def GetCovariance(X, Y):
    return np.cov(X, Y, ddof=0)[0, 1]
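# Quick sanity check of get_corr (a sketch, not part of the original exercise): generate
# many pairs and confirm that the sample correlation is close to the one requested.
# Commented out by default, since running it advances the random stream and would change
# the data generated below for the fixed seed.
# xy_test = np.array( [ get_corr(0.0, 1.0, 0.0, 2.0, 0.75) for _ in range(10000) ] )
# print("Requested rho = 0.75, sample rho = {:.3f}".format(np.corrcoef(xy_test[:, 0], xy_test[:, 1])[0, 1]))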
# ----------------------------------------------------------------------------------- #
# Fisher discriminant script
# ----------------------------------------------------------------------------------- #

# ----------------------------------------------------------------------------------- #
# Define parameters
# ----------------------------------------------------------------------------------- #

nspec = 2                          # Number of 'species': signal / background
nvar  = 2                          # Number of variables for discrimination

mean_par0  = [ 15.0 , 12.0 ]       # Process type 1,2 mean in x direction
width_par0 = [  2.0 ,  3.0 ]       # Process type 1,2 width in x direction
mean_par1  = [ 50.0 , 55.0 ]       # ... y
width_par1 = [  6.0 ,  7.0 ]       # ... y
corr       = [ 0.80 , 0.90 ]       # Coefficient of correlation
ndata = 2000                       # Amount of data you want to create

# ----------------------------------------------------------------------------------- #
# Generate data
# ----------------------------------------------------------------------------------- #

# For each "species", produce a number of (x,y) points:
par0 = np.zeros((ndata, nspec))
par1 = np.zeros((ndata, nspec))

for ispec in range( nspec ) :
    for iexp in range( ndata ) :
        # Get linearly correlated random numbers...
        values = get_corr( mean_par0[ispec], width_par0[ispec], mean_par1[ispec], width_par1[ispec], corr[ispec] )
        par0[iexp, ispec] = values[0]
        par1[iexp, ispec] = values[1]

# ----------------------------------------------------------------------------------- #
# Plot your generated data
# ----------------------------------------------------------------------------------- #

# x and y projections:
fig_1D, ax_1D = plt.subplots(ncols=2, figsize=(10, 6))
ax_1D[0].hist(par0[:, 0], 50, (0, 25), histtype='step', label='hist_par0_spec0', color='Red')
ax_1D[0].hist(par0[:, 1], 50, (0, 25), histtype='step', label='hist_par0_spec1', color='Blue')
ax_1D[0].set(title='hist_par0', xlim=(0, 25))
ax_1D[0].legend(loc='upper left')

ax_1D[1].hist(par1[:, 0], 50, (20, 80), histtype='step', label='hist_par1_spec0', color='Red')
ax_1D[1].hist(par1[:, 1], 50, (20, 80), histtype='step', label='hist_par1_spec1', color='Blue')
ax_1D[1].set(title='hist_par1', xlim=(20, 80))
ax_1D[1].legend(loc='upper left')

plt.show(block=False)
if (SavePlots) :
    fig_1D.savefig('Hist_InputVars.pdf', dpi=600)

# Wait with drawing this 2D distribution, so that you think about the 1D distributions first!
# # Correlation plot:
# fig_corr, ax_corr = plt.subplots(figsize=(10, 6))
# ax_corr.scatter(par0[:, 0], par1[:, 0], color='Red', s=10)
# ax_corr.scatter(par0[:, 1], par1[:, 1], color='Blue', s=10)
# ax_corr.set(xlabel='Parameter_a', ylabel='Parameter_b', title='Correlation')
# if SavePlots:
#     fig_corr.savefig('Hist_InputVars2D.pdf', dpi=600)

# ----------------------------------------------------------------------------------- #
# Your calculation of the Fisher discriminant here...
# ----------------------------------------------------------------------------------- #

# Calculate means and widths (RMS) of the various variables:
Mean_A0 = 0.0                      # Get the value yourself to understand where to get it from!
Mean_B0 = 0.0                      # Get the value yourself to understand where to get it from!
RMS_A0  = 0.0                      # Get the value yourself to understand where to get it from!
RMS_B0  = 0.0                      # Get the value yourself to understand where to get it from!

# Put the (co-)variances into a matrix, and invert it to get the Fisher coefficients.
# NOTE: You can get the covariances using the supplied function "GetCovariance".
covmat_sum = np.zeros((nvar, nvar))            # Summed covariance matrix
covmat_sum[0, 0] = RMS_A0**2 + RMS_B0**2       # Variances of variable 1 for sample A and B
# covmat_sum[0, 1] = something with the covariances...

print("Combined covariance matrix:")
print(covmat_sum)
print("")
print("Inverted combined covariance matrix:")
# covmat_sum_inv = inv(covmat_sum)  # With the given values, this will fail, as the matrix is singular!
# print(covmat_sum_inv)
print("")

# Calculate the two Fisher coefficients:
# wf = ???

# Apply the Fisher weights to the data and see how well the separation works:
# data_fisher = ?
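# ----------------------------------------------------------------------------------- #
# Reference sketch (not the official solution): one way the blanks above could be
# filled in. The names mu_A, mu_B, cov_A and cov_B are illustrative choices, not part
# of the original exercise. Commented out, so that you can first try it yourself!
# ----------------------------------------------------------------------------------- #
# mu_A = np.array([par0[:, 0].mean(), par1[:, 0].mean()])   # Means of (par0, par1) for species A...
# mu_B = np.array([par0[:, 1].mean(), par1[:, 1].mean()])   # ...and for species B
# cov_A = np.cov(par0[:, 0], par1[:, 0], ddof=0)            # Covariance matrix of species A
# cov_B = np.cov(par0[:, 1], par1[:, 1], ddof=0)            # Covariance matrix of species B
# covmat_sum = cov_A + cov_B                                # Summed (within-class) covariance matrix
# wf = inv(covmat_sum).dot(mu_A - mu_B)                     # Fisher coefficients: w = S^-1 (mu_A - mu_B)
# data_fisher = wf[0]*par0 + wf[1]*par1                     # F = w_x*x + w_y*y, shape (ndata, nspec)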
# Plot the Fisher discriminant:
# fig_fisher, ax_fisher = plt.subplots(figsize=(10, 6))
# ax_fisher.hist(data_fisher[:, 0], 200, (-22, 3), histtype='step', color='Red', label='spec0')
# ax_fisher.hist(data_fisher[:, 1], 200, (-22, 3), histtype='step', color='Blue', label='spec1')
# ax_fisher.set(xlim=(-22, 3), xlabel='Fisher-discriminant')
# ax_fisher.legend(loc='upper left')
# if (SavePlots) :
#     fig_fisher.savefig('Hist_FisherOutput.pdf', dpi=600)

# Fill in the separation below yourself:
print(" The separation between the samples is: {:5.2f}".format( 0 ))

try:
    __IPYTHON__
except NameError:
    input('Press Enter to exit')   # Use raw_input here if running Python 2

# ----------------------------------------------------------------------------------- #
# Questions
# ----------------------------------------------------------------------------------- #
#
# As always, make sure that you know what the code is doing so far, and what the aim of
# the exercise is (i.e. which problem to solve, and how). Then start to expand on it.
#
# 1) Look at the 1D distributions of the two discriminating variables for the two species,
#    and see how well you can separate them by eye. It seems somewhat possible, but
#    certainly far from perfect...
#    Once you consider the 2D distribution (scatter plot - to be uncommented by you!),
#    it becomes clear that a cut along a line at an angle will work much better.
#    This exercise is about finding that optimal line, and thus the perpendicular axis
#    to project the data onto!
#
# 2) Calculate the means, widths (RMS) and covariances of the discriminating variables
#    (pairs of variables for the covariances) for each species, and put these into the
#    matrices defined above.
#
# 3) From the inverted summed matrix and the vectors of means, calculate the two Fisher
#    coefficients, and from these calculate the Fisher discriminant for the two
#    species in question, i.e. F = c_x * x + c_y * y for each point (x,y).
#
# 4) What separation did you get, and is it notably better than what you obtain by eye?
#    Also, do your weights make sense? I.e. do their relative sizes match the widths of
#    the corresponding variables (the narrower variable should carry the larger weight)?
#    As a simple measure of how good the obtained separation is, we use the "distance"
#    between the two distributions:
#       separation = (mu_A - mu_B)**2 / (sigma_A**2 + sigma_B**2)
#    Compare the separation you get from each of the two 1D histograms of par0 and par1
#    with what you get from the Fisher discriminant, using the above formula (a helper
#    for this is sketched below).
#    Of course the ultimate comparison should be done using ROC curves!
#
# ----------------------------------------------------------------------------------- #
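# ----------------------------------------------------------------------------------- #
# Sketch for question 4): a small helper implementing the separation formula quoted
# above. calc_separation is an illustrative name, not part of the original exercise,
# and the last line assumes that data_fisher has been computed. Uncomment to use.
# ----------------------------------------------------------------------------------- #
# def calc_separation(a, b):
#     return (a.mean() - b.mean())**2 / (a.std()**2 + b.std()**2)
#
# print("Separation in par0:   {:5.2f}".format(calc_separation(par0[:, 0], par0[:, 1])))
# print("Separation in par1:   {:5.2f}".format(calc_separation(par1[:, 0], par1[:, 1])))
# print("Separation in Fisher: {:5.2f}".format(calc_separation(data_fisher[:, 0], data_fisher[:, 1])))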