"""Predict gender from questionnaire answers via PCA + a thresholded linear fit.

Loads a data matrix from the web (rows = features, columns = respondents; the
0'th row is the gender target), standardizes the answers, projects them onto
the k leading principal components, fits a linear regression to the gender
column, and reports the fraction of correct thresholded predictions.
"""
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA

# Load the data: rows are features, columns are respondents.
dat1 = pd.read_table("http://www.nbi.dk/~mathies/RGender.dat", delimiter=' ')

# Inspect the data (printed explicitly — bare expressions are no-ops in a script).
print(dat1.shape)   # shape of the data
print(dat1.index)   # the row names (features)

### Make a feature and target dataframe
# "bfi": every row except the 0'th — the questionnaire answers.
bfi = dat1.iloc[1:, :]
# "gender": the 0'th row on its own — the target we try to predict.
gender = dat1.iloc[0:1, :]

### Scaling!
# Transpose so samples are rows, then standardize each feature column to
# zero mean and unit standard deviation.
p1 = preprocessing.scale(bfi.transpose(), axis=0, with_mean=True, with_std=True, copy=True)
# Expected shape: (n_samples, n_features) — presumably 43 features; TODO confirm
# against the downloaded file.
print(np.shape(p1))

# Perform the PCA on the standardized samples.
pca = PCA()
transformed = pca.fit_transform(p1)

# Plot the standard deviation of each component, ordered by size.
plt.figure()
plt.plot(np.std(transformed, axis=0), '.')

# The first five components of the first person.
print(transformed[0, 0:5])

# Keep the "k" leading principal components as predictors, gender as target.
k = 5
d1 = transformed[:, 0:k]

# Generalized linear model.
# NOTE(review): a LinearRegression thresholded at 0.5 is a crude classifier;
# sklearn's LogisticRegression would be the standard choice. The linear fit is
# kept here to preserve the original analysis.
g1 = linear_model.LinearRegression()
g1.fit(d1, gender.transpose())

# Predict once and reuse: continuous scores, then thresholded class labels.
probabilities = g1.predict(d1)
prediction = probabilities > 0.5
print(prediction[0:5])

# Accuracy: fraction of respondents whose thresholded prediction matches the
# recorded gender. Assumes gender is coded so that "== 1" marks one class
# (presumably 0/1 coding — TODO confirm against the data file).
Is_prediction_equal_to_actual_gender = np.array(gender == 1)[0] == prediction.transpose()
# (Name kept as in the original, typo and all, for compatibility.)
Procent_right = np.sum(Is_prediction_equal_to_actual_gender) / len(prediction)
print(Procent_right)