{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Running PCA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We start by loading the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First X entry:  [ 4.68299e+01  3.87100e-01  8.52800e-01  3.27000e-02  2.10500e-01\n",
      "  3.02000e-02  4.77600e-01  3.14000e-02 -3.00000e-04  1.64200e-01]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Read the table, skipping the first row (presumably a header line).\n",
    "Btag1 = np.loadtxt('AlephBtag_MC_small_v2.csv', skiprows=1)\n",
    "\n",
    "# The last column goes into y; every other column goes into X.\n",
    "X, y = Btag1[:, :-1], Btag1[:, -1]\n",
    "\n",
    "print('First X entry: ', X[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, we scale the data, as we did last week"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First scaled X entry:  [6.33633634e-01 7.07393567e-01 6.35024112e-01 4.80480480e-01\n",
      " 2.53053053e-01 2.84284284e-01 8.07057057e-01 3.03561914e-01\n",
      " 9.99999998e-08 2.96963630e-01]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import quantile_transform\n",
    "\n",
    "# Quantile-transform every feature column of X (uniform output by default).\n",
    "# A fixed random_state keeps the result reproducible: the function draws a\n",
    "# random subsample when the data has more rows than its `subsample` limit.\n",
    "X_scaled = quantile_transform(X, random_state=0, copy=True)\n",
    "\n",
    "print('First scaled X entry: ', X_scaled[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, we fit a PCA with 2 components and project the scaled data onto them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First PCA X entry: [-0.27576634 -0.13737593]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Keep only the two leading principal components. random_state only matters\n",
    "# when scikit-learn selects a randomized/arpack SVD solver; fixing it keeps\n",
    "# the projection reproducible in that case.\n",
    "pca = PCA(n_components=2, random_state=0)\n",
    "X_pca = pca.fit_transform(X_scaled)\n",
    "\n",
    "print('First PCA X entry:', X_pca[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, we can print the fraction of the total variance explained by each component"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.27906251 0.14808326]\n"
     ]
    }
   ],
   "source": [
    "# Fraction of the total variance captured by each fitted component.\n",
    "print(pca.explained_variance_ratio_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, we can plot the transformed data, coloured by class (red: `y == 1`, blue: all other entries)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Figure size 1200x800 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Boolean masks select the rows of each class in one vectorised step and\n",
    "# keep the two-column shape even if a class has no entries, which the\n",
    "# previous list-comprehension version did not.\n",
    "is_true = (y == 1)\n",
    "truths = X_pca[is_true]\n",
    "falses = X_pca[~is_true]\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 8))\n",
    "# Draw the y != 1 points first so that the y == 1 points are painted on top.\n",
    "ax.scatter(falses[:, 0], falses[:, 1], color='blue', alpha=.1)\n",
    "ax.scatter(truths[:, 0], truths[:, 1], color='red', alpha=.1)\n",
    "ax.set_xlabel('First principal component')\n",
    "ax.set_ylabel('Second principal component')\n",
    "# The markers are nearly transparent, so the colour key goes in the title\n",
    "# rather than in a legend whose handles would be just as faint.\n",
    "ax.set_title('PCA projection (red: y == 1, blue: y != 1)')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}