#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#  Python macro for analysing the famous Anderson-Fisher Iris data set from Gaspe Peninsula.
#
#  Edgar Anderson took 50 samples of three species (Setosa, Versicolor, and Virginica)
#  of Iris at the Gaspe Peninsula, and measured four distinguishing features on each
#  flower:
#    Sepal Length
#    Sepal Width
#    Petal Length
#    Petal Width
#  Using these measurements, Ronald Fisher was able to make a classification scheme, for
#  which he invented the Fisher Linear Discriminant.
#
#  References:
#    Glen Cowan, Statistical Data Analysis, pages 51-57
#    http://en.wikipedia.org/wiki/Linear_discriminant_analysis
#    http://en.wikipedia.org/wiki/Iris_flower_data_set
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   28th of December 2017
#
# ----------------------------------------------------------------------------------- #

# First, import the modules you want to use:
from __future__ import print_function, division
import numpy as np                                     # Matlab like syntax for linear algebra and functions
import matplotlib.pyplot as plt                        # Plots and figures like you know them from Matlab
from numpy.linalg import inv                           # Inveert a matrix 
plt.close('all')          # To close all open figures

r = np.random             # Random generator
r.seed(42)                # Set a random seed (but a fixed one)
SavePlots = True          # For now, don't save plots (once you trust your code, switch on)

# ----------------------------------------------------------------------------------- #
# Functions
# ----------------------------------------------------------------------------------- #

def GetCovariance(X, Y):
    return np.cov(X, Y, ddof=0)[0, 1]

# ----------------------------------------------------------------------------------- #
# Define parameters
# ----------------------------------------------------------------------------------- #

# Define path to datafile
filepath = "DataSet_AndersonFisherIris.txt"

nspec =   3  # Setosa, Versicolor, Virginica
nvar  =   4  # Sepal Length, Sepal Width, Petal Length, Petal Width

xmin = [3.5, 1.7, 0.0, 0.0]    # Range for different variables
xmax = [8.5, 4.7, 7.5, 3.0]    # Range for different variables
nbin = [ 25,  30,  25,  30]    # Number of bins for different variables

var_name  = [ "Sepal Length", "Sepal Width", "Petal Length", "Petal Width" ]  # Variable name
spec_name = [ "Setosa"      , "Versicolor" , "Virginica"                   ]  # Species name

colors = ['Red', 'Green', 'Blue']
markers = ['o', '^', 's']

# ----------------------------------------------------------------------------------- #
# Loop over and read data from input:
# ----------------------------------------------------------------------------------- #

data = []          # Container for all variables
with open( filepath, 'r' ) as infile :
    counter = 0
    for line in infile:
        data.append(line.strip().split())                                  # Read line -> Convert to list
        for ivar in range( nvar ) : data[-1][ivar] = float(data[-1][ivar]) # First nvar values are floats
        data[-1][4] = int(data[-1][4])-1                                   # Last one is an integer width species type: 1,2,3
                                                                           # Start counting at 0, to match your lists : 0,1,2
        # Print some numbers as a sanity check
        if (counter < 5) : 
            print("  Read data: {:5.2f} {:5.2f} {:5.2f} {:5.2f}   {:d} ".format(data[-1][0], data[-1][1], data[-1][2], data[-1][3], data[-1][4]))
        counter += 1


# Numpy version:
data = np.loadtxt(filepath)
data[:, -1] -= 1            # Subtract one from all the last integers in the data:
                            # Last one is an integer width species type: 1,2,3
                            # Start counting at 0, to match your lists : 0,1,2


# ----------------------------------------------------------------------------------- #
# Display data:
# ----------------------------------------------------------------------------------- #

mu = np.zeros((nspec, nvar))
RMS = np.zeros((nspec, nvar))

fig_1D, ax_1D = plt.subplots(nrows=2, ncols=2, figsize=(10,8))
ax_1D = ax_1D.flatten()

for ivar in range(nvar): # for each variable plot each species' corresponding histogram of that variable
    for ispec in range(nspec):
        data_spec = data[data[:, -1] == ispec]  # extract the relevant species
        data_spec_var = data_spec[:, ivar]      # extract the relevant variable
                
        ax_1D[ivar].hist(   data_spec_var, 
                            nbin[ivar], 
                            range=(xmin[ivar], xmax[ivar]), 
                            histtype='step', 
                            color = colors[ispec],
                            label=spec_name[ispec],
                            lw=2)
        
        mu[ispec, ivar] = data_spec_var.mean()
        RMS[ispec, ivar] = data_spec_var.std(ddof=1)
         
    ax_1D[ivar].set(title=var_name[ivar], ylabel='Counts')
    ax_1D[ivar].legend()
           
fig_1D.tight_layout()
plt.show(block=False)


# Draw 2D histograms:
fig_corr, ax_corr = plt.subplots(nrows=4, ncols=4, figsize=(12, 10))
covariances = np.zeros((nvar, nvar, nspec))
for ivar in range(nvar):
    for jvar in range(nvar):
        
        ax = ax_corr[ivar, jvar]
        
        for ispec in range(nspec):
        
            data_ispec = data[data[:, -1] == ispec]  # extract the relevant species
            data_ispec_ivar = data_ispec[:, ivar]
            data_ispec_jvar = data_ispec[:, jvar]
                            
            covariances[ivar, jvar, ispec] = GetCovariance(data_ispec_ivar, data_ispec_jvar)

            if ivar == jvar: # i.e. diagonal
                ax.text( 0.5, 0.5, var_name[ivar], 
                                          horizontalalignment='center', 
                                          verticalalignment='center', 
                                          transform=ax.transAxes)
            else:
                ax.scatter(data_ispec_ivar, data_ispec_jvar, marker = markers[ispec], color = colors[ispec])

fig_corr.tight_layout()
plt.show(block=False)


# ----------------------------------------------------------------------------------- #
# Make Fisher discriminant between species Versicolor (1) and Virginica (2):
# (i.e. not considering Setosa (0) for now)
# ----------------------------------------------------------------------------------- #

# Calculate means and widths (RMS) of the various variables:


# Put the (co-)variances into a matrix, and invert it to get the Fisher coefficients:
#covmat_Vers = ?        # Covariance matrix for Versicolor
#covmat_Virg = ?        # Covariance matrix for Verginica

# covmat_comb =  covmat_Vers + covmat_Virg       	    # Summed covariance matrix
 
# covmat_comb_inv = inv(covmat_comb)			    # Inverted matrix

# Calculate the four Fisher coefficients:


# Loop over the data (species Versicolor (1) and Virginica (2)), and calculate their
# Fisher discriminant value, and see how well the separation works:


try:
    __IPYTHON__
except:
    raw_input('Press Enter to exit')


# ----------------------------------------------------------------------------------- #
# Questions
# ----------------------------------------------------------------------------------- #
#
# As always, make sure that you know what the code is doing so far, and what the aim of
# the exercise is (i.e. which problem to solve, and how). Then start to expand on it.
#
# 1) Look at the distributions of the four discriminating variables for the two species
#    Versicolor (1) and Virginica (2), and see how well you can separate them by eye.
#    It seems reasonably possible, but certainly not perfect...
#
# 2) Calculate the mean, widths (RMS) and covariance of each discriminating variable
#    (pair of variables for covariance) for each species, and put these into the
#    matrices defined.
#
# 3) From the inverted summed matrix, calculate the four Fisher coefficients, and
#    given these, calculate the Fisher discriminant for the two species in question.
#
# 4) What separation did you get, and is it notably better than what you obtain by eye?
#    Possibly make two ROC curves to quantify this.
#
# ----------------------------------------------------------------------------------- #