"""Predict gender from questionnaire answers via PCA + a thresholded linear fit.

Loads a data matrix from the web (rows = features, columns = respondents; the
0'th row is the gender target), standardizes the answers, projects them onto
the k leading principal components, fits a linear regression to the gender
column, and reports the fraction of correct thresholded predictions.
"""
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA

# Load the data: rows are features, columns are respondents.
dat1 = pd.read_table("http://www.nbi.dk/~mathies/RGender.dat", delimiter=' ')

# Inspect the data (printed explicitly — bare expressions are no-ops in a script).
print(dat1.shape)   # shape of the data
print(dat1.index)   # the row names (features)

### Make a feature and target dataframe
# "bfi": every row except the 0'th — the questionnaire answers.
bfi = dat1.iloc[1:, :]
# "gender": the 0'th row on its own — the target we try to predict.
gender = dat1.iloc[0:1, :]

### Scaling!
# Transpose so samples are rows, then standardize each feature column to
# zero mean and unit standard deviation.
p1 = preprocessing.scale(bfi.transpose(), axis=0, with_mean=True, with_std=True, copy=True)
# Expected shape: (n_samples, n_features) — presumably 43 features; TODO confirm
# against the downloaded file.
print(np.shape(p1))

# Perform the PCA on the standardized samples.
pca = PCA()
transformed = pca.fit_transform(p1)

# Plot the standard deviation of each component, ordered by size.
plt.figure()
plt.plot(np.std(transformed, axis=0), '.')

# The first five components of the first person.
print(transformed[0, 0:5])

# Keep the "k" leading principal components as predictors, gender as target.
k = 5
d1 = transformed[:, 0:k]

# Generalized linear model.
# NOTE(review): a LinearRegression thresholded at 0.5 is a crude classifier;
# sklearn's LogisticRegression would be the standard choice. The linear fit is
# kept here to preserve the original analysis.
g1 = linear_model.LinearRegression()
g1.fit(d1, gender.transpose())

# Predict once and reuse: continuous scores, then thresholded class labels.
probabilities = g1.predict(d1)
prediction = probabilities > 0.5
print(prediction[0:5])

# Accuracy: fraction of respondents whose thresholded prediction matches the
# recorded gender. Assumes gender is coded so that "== 1" marks one class
# (presumably 0/1 coding — TODO confirm against the data file).
Is_prediction_equal_to_actual_gender = np.array(gender == 1)[0] == prediction.transpose()
# (Name kept as in the original, typo and all, for compatibility.)
Procent_right = np.sum(Is_prediction_equal_to_actual_gender) / len(prediction)
print(Procent_right)