# I'm using this excellent tutorial for reference:
# https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
# It's useful because its plotting handles cases where the 'target' ('y')
# takes more values than just 0 or 1.

# Import all the relevant packages (and always numpy, you never know!)
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the dataset into a pandas DataFrame - use your own filepath to the data.
# (delim_whitespace is deprecated in newer pandas; sep=r'\s+' is the equivalent.)
df = pd.read_csv('AlephBtag_MC_small_v2.csv',
                 names=['energy', 'cTheta', 'phi', 'prob_b', 'spheri', 'pt2rel',
                        'multip', 'bqvjet', 'ptlrel', 'nnbjet', 'isb'],
                 skiprows=1,
                 delim_whitespace=True)

# Separating out the features
features = ['energy', 'cTheta', 'phi', 'prob_b', 'spheri', 'pt2rel',
            'multip', 'bqvjet', 'ptlrel']
x = df.loc[:, features].values

# Separating out the target (not used below, but kept for completeness)
y = df.loc[:, ['isb']].values

# Standardizing the features, so no single feature dominates the PCA
x = StandardScaler().fit_transform(x)

# PCA projection to 2D
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1', 'principal component 2'])

# Concatenating the DataFrames along axis=1. finalDf is the final DataFrame
# before plotting the data.
finalDf = pd.concat([principalDf, df[['isb']]], axis=1)

# Visualize the 2D projection
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

# 's' is the marker size; I reduced it significantly from the tutorial's
# s=50 to better distinguish the data points on the plot.
targets = [0, 1]
colors = ['r', 'g']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['isb'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
               finalDf.loc[indicesToKeep, 'principal component 2'],
               c=color,
               s=3)
ax.legend(targets)
ax.grid()

# The explained variance tells you how much information (variance) can be
# attributed to each of the principal components.
print(pca.explained_variance_ratio_)
# print(pca.singular_values_)

plt.show()

try:
    __IPYTHON__
except NameError:
    input('Press Enter to exit')
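
# Optional follow-up (my addition, not part of the tutorial): fit PCA with all
# nine components and print the cumulative explained variance, to judge whether
# the 2D projection above is losing much structure. A minimal sketch assuming
# the standardized array 'x' from above; 'pca_full' and 'cum_var' are
# illustrative names I chose. If you use it, run it before the exit prompt.
pca_full = PCA().fit(x)  # no n_components: keep every component
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
for i, cv in enumerate(cum_var, start=1):
    print(f'{i} component(s) explain {cv:.1%} of the variance')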
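
# Another optional check (again my addition): inspect the PCA loadings to see
# which original features dominate each plotted component. 'loadings' is a
# name I chose for illustration; pca.components_ is the actual scikit-learn
# attribute (one row per component, one column per feature). As above, run
# this before the exit prompt.
loadings = pd.DataFrame(pca.components_,
                        columns=features,
                        index=['principal component 1', 'principal component 2'])
print(loadings.round(3))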