# I'm using this excellent tutorial for reference:
# https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
# It's useful because its plotting handles cases where the 'target' ('y')
# takes more values than just 0 or 1.

# Import all the relevant packages (and always numpy, you never know!)
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the dataset into a pandas DataFrame - use your own filepath to the data.
# (delim_whitespace is deprecated in newer pandas; sep=r'\s+' is the equivalent.)
df = pd.read_csv('AlephBtag_MC_small_v2.csv',
                 names=['energy', 'cTheta', 'phi', 'prob_b', 'spheri', 'pt2rel',
                        'multip', 'bqvjet', 'ptlrel', 'nnbjet', 'isb'],
                 skiprows=1,
                 delim_whitespace=True)

# Separating out the features
features = ['energy', 'cTheta', 'phi', 'prob_b', 'spheri', 'pt2rel',
            'multip', 'bqvjet', 'ptlrel']
x = df.loc[:, features].values

# Separating out the target (not used below, but kept for completeness)
y = df.loc[:, ['isb']].values

# Standardizing the features, so no single feature dominates the PCA
x = StandardScaler().fit_transform(x)

# PCA projection to 2D
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1', 'principal component 2'])

# Concatenating the DataFrames along axis=1. finalDf is the final DataFrame
# before plotting the data.
finalDf = pd.concat([principalDf, df[['isb']]], axis=1)

# Visualize the 2D projection
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

# 's' is the marker size; I reduced it significantly from the tutorial's
# s=50 to better distinguish the data points on the plot.
targets = [0, 1]
colors = ['r', 'g']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['isb'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
               finalDf.loc[indicesToKeep, 'principal component 2'],
               c=color,
               s=3)
ax.legend(targets)
ax.grid()

# The explained variance tells you how much information (variance) can be
# attributed to each of the principal components.
print(pca.explained_variance_ratio_)
# print(pca.singular_values_)

plt.show()

try:
    __IPYTHON__
except NameError:
    input('Press Enter to exit')
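
# Optional follow-up (my addition, not part of the tutorial): fit PCA with all
# nine components and print the cumulative explained variance, to judge whether
# the 2D projection above is losing much structure. A minimal sketch assuming
# the standardized array 'x' from above; 'pca_full' and 'cum_var' are
# illustrative names I chose. If you use it, run it before the exit prompt.
pca_full = PCA().fit(x)  # no n_components: keep every component
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
for i, cv in enumerate(cum_var, start=1):
    print(f'{i} component(s) explain {cv:.1%} of the variance')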
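
# Another optional check (again my addition): inspect the PCA loadings to see
# which original features dominate each plotted component. 'loadings' is a
# name I chose for illustration; pca.components_ is the actual scikit-learn
# attribute (one row per component, one column per feature). As above, run
# this before the exit prompt.
loadings = pd.DataFrame(pca.components_,
                        columns=features,
                        index=['principal component 1', 'principal component 2'])
print(loadings.round(3))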