# -*- coding: utf-8 -*-
"""
Regression on Housing price data, with the aim of estimating the sales price.

Authors: Christian Michelsen & Troels Petersen
Date: 2016

"""

from __future__ import division, print_function   # if running Python 2.7, run this line

import numpy as np                # numerical analysis (pip install numpy --user)
import matplotlib
#matplotlib.use("pdf")
import matplotlib.pyplot as plt   # plotting package (Usually comes with Python, pip install matplotlib --user)
import pandas as pd               # excel-like dataframes  (pip install pandas --user)
import sklearn                    # scikit learn, machine learning package (pip install scikit_learn --user)
import seaborn as sns             # nicer standard plot looks (pip install seaborn --user)
#import xgboost as xgb             # machine learning, gradient boosting, package 


#==============================================================================
#  initial parameters
#==============================================================================


# Start from a clean slate: close any figures left open by a previous run.
plt.close('all')

# --- Output control ----------------------------------------------------------
CreatePlots = True    # draw the figures on screen
SavePlots = True      # additionally write the figures to PDF files
VeryVerbose = False   # print extra detail beyond the headline results

# --- Which analysis stages to run --------------------------------------------
do_ML_Models = True                  # fit each ML model once with fixed settings
do_GridSearch = True                 # scan hyperparameters (NOTE: this is very slow)
do_Classification = True             # binary classification example
do_Multilabel_Classification = True  # multi-class classification example

# --- Display defaults --------------------------------------------------------
pd.set_option('display.max_rows', 10)   # print at most 10 rows of a DataFrame
sns.set_context("poster")               # larger fonts / lines in the plots
current_palette = sns.color_palette()   # colour list, e.g. current_palette[0]


#==============================================================================
#  Read data
#==============================================================================


# Banner: three lines, emitted through a single print call.
print("\n".join([
    "\n\n  --------------------------------------------------------------  ",
    "      Housing prices analysis",
    "  ------------------------------------------------------------------ \n\n ",
]))

# Load the main table. Other frequently needed read_csv options:
# sep=';', decimal=".", na_values=['NA', 'NAN', '.']
df = pd.read_csv('HousingPrices_Cleaned.csv')
print("Dataframe loaded with dimension: ", df.shape, "\n")
print(df.head(), "\n")

# Numeric columns are detected automatically, the date columns are not:
# parse them explicitly; entries that do not match the format become NaT.
dates_columns = ['DATE_OF_SALES_PRICE', 'DATE_OF_PREVIOUS_SALES_PRICE_FIRST', 'OMREGNINGS_DATO']
df[dates_columns] = df[dates_columns].apply(pd.to_datetime, format="%Y-%m-%d", errors='coerce')

# Column types after the date conversion
types = df.dtypes
print("Types of data:", types, "\n")


# Example of attaching additional data: each extra table is glued on
# column-wise (rows are matched through the index).
GPS = pd.read_csv('GPS_data.csv')
SeaDist = pd.read_csv('SEA_DIST.csv')
for extra_table in (GPS, SeaDist):
    df = pd.concat([df, extra_table], axis=1)



# =============================================================================
# Initial overview of the data
# =============================================================================


# get percentage of NaNs for each column
# Percentage of missing values (NaNs) in every column.
N_nans = df.isnull().sum(axis=0) / len(df) * 100

# Bar chart of the percentage of NaNs for the different columns:
if CreatePlots:

    fig, ax = plt.subplots(figsize=(18,9)) # create the figure and its axes handle (ax)
    x = np.arange(len(df.columns)) # one bar position per column: 0 .. n_columns-1
    plt.bar(x, N_nans, align='center') # bars are centred on their x position
    plt.xticks(x, df.columns, rotation=90) # label every bar with its column name, rotated to fit
    ax.tick_params(axis='x', which='major', labelsize=10) # small font for the many tick labels
    # The bars are centred on 0 .. n-1, so the axis has to run from -0.5 to n-0.5;
    # limits of (0, n) would cut the first bar in half and leave a gap on the right.
    plt.xlim(-0.5, len(df.columns) - 0.5)
    plt.title('Percentage of NaNs for the different categories')
    plt.xlabel('Category')
    plt.ylabel('Percent')
    fig.set_tight_layout(True)
    if SavePlots:
        fig.savefig('fig_all_NaNs.pdf', format="pdf", dpi=300)
    plt.show(block=False)
    del x # do not leave the helper array behind in the module namespace


# =============================================================================
# data cleanup    
# =============================================================================


# choose only columns that have less than 10% NaNs:
# Column selection: keep the columns in which fewer than 10% of the entries are NaN.
columns_good = df.columns[N_nans < 10]
print("Columns available to use after throwing out bad ones: ", columns_good.tolist(), "\n")

# Row selection: drop every row that still contains a NaN in the kept columns.
# (An alternative would be imputation, i.e. replacing NaNs by the mean/median.)
df_good = df.loc[:, columns_good].copy().dropna(axis=0)
print("After removing NaNs, new dataframe dimension: ", df_good.shape)
print("i.e. {:.2f}% of rows remaining. \n".format(len(df_good) / len(df) * 100))

# Quality cut: require a house size above 20 m^2 (further cuts could be added here).
df_good = df_good.loc[df_good['SIZE_OF_HOUSE'] > 20]

# Input variables for the ML models.
columns_to_keep = [
    'SIZE_OF_HOUSE',
    'POSTAL_CODE',
    'CONSTRUCTION_YEAR',
    'SCHOOL_DISTANCE_1',
    'SUPERMARKET_DISTANCE_1',
]
# To use every good column instead, uncomment the line below:
#columns_to_keep = columns_good

# Work on copies so that df_good itself is never modified by later steps.
X = df_good.loc[:, columns_to_keep].copy()   # input variables
y = df_good.KOEBESUM_BELOEB.copy()           # target: the sales price


# =============================================================================
# Overview of data
# =============================================================================


def overview_plot(X, y, X_strings, title_strings, x_lims, n_ticks=3):
    """Draw a two-row overview figure of the input variables.

    Top row: a 50-bin step histogram of each variable inside its plotting
    window. Bottom row: a hexbin map of each variable against the price
    (divided by 1e6), restricted to prices below 8e6 and to the variable's
    plotting window, annotated with the correlation coefficient between
    the variable and ``y``.

    Parameters
    ----------
    X : table of input variables; one panel column is drawn per column of X
        and columns are looked up by label (``X[label]``).
    y : target values, same length as X.
    X_strings : column labels of X, one per panel.
        NOTE(review): the correlation annotation assumes these labels are in
        the same order as the columns of X - confirm at the call site.
    title_strings : panel titles, one per panel.
    x_lims : one (low, high) pair per panel, used as histogram range and
        as x-axis window of the hexbin panel.
    n_ticks : ``nbins`` value handed to ``locator_params`` for the x axes.

    The figure is saved as 'fig_InputVariableOverview.pdf' when the
    module-level flag ``SavePlots`` is set, and shown without blocking.
    """

    n_vars = X.shape[1]
    fig = plt.figure(figsize=(16, 7))

    # --- top row: distribution of every input variable ---
    for col in range(n_vars):
        axis = plt.subplot(2, n_vars, col + 1)
        axis.hist(X[X_strings[col]], bins=50, range=x_lims[col],
                  histtype='step', linewidth=1.1)
        axis.set_title(title_strings[col], size=16)
        axis.set_ylabel('Counts')
        axis.locator_params(axis='x', nbins=n_ticks)

    # Last row of the correlation matrix = correlation of y with each variable
    # (the final entry, y with itself, is dropped).
    rho = np.corrcoef(X.T, y)[-1, :-1]

    # --- bottom row: price versus every input variable ---
    for col in range(n_vars):
        values = X[X_strings[col]]
        in_window = (y < 8e6) & (x_lims[col][0] < values) & (values < x_lims[col][1])
        axis = plt.subplot(2, n_vars, n_vars + col + 1)
        with sns.axes_style("white"):
            axis.hexbin(values[in_window], y[in_window]/1e6, gridsize=20)
        axis.set_title(title_strings[col], size=16)
        axis.set_ylabel('House Price')
        axis.set_ylim(0, 8)
        axis.set_xlim(x_lims[col])
        axis.locator_params(axis='x', nbins=n_ticks)
        label = r'$\rho$ = {:.3f}'.format(rho[col])
        axis.text(0.98, 0.9, label, color='k', horizontalalignment='right',
                  verticalalignment='center', transform=axis.transAxes,
                  fontdict={'size': 10})

    fig.set_tight_layout(True)
    if SavePlots:
        fig.savefig('fig_InputVariableOverview.pdf', format='pdf', dpi=600)
    plt.show(block=False)
    

# Panel titles and plotting windows, listed in the same order as columns_to_keep.
title_strings = [
    'Size of House',
    'Postal Code',
    'Construction Year',
    'School Distance 1',
    'Supermarket Distance 1',
]
xlims = [
    (20, 300),
    (1000, 10000),
    (1700, 2017),
    (0, 7000),
    (0, 4000),
]
if CreatePlots:
    overview_plot(X, y, columns_to_keep, title_strings=title_strings, x_lims=xlims)


#==============================================================================
#  Split data up into test and train
#==============================================================================


# Number of examples to use: one fifth of the data keeps the run time short.
# To use everything, set N_total = len(X) or use the plain copies further down.
N_total = len(X) // 5

# X and y are sampled with identical settings (same n, same seed), so the
# same rows are drawn from both.
_sample_settings = dict(n=N_total, replace=False, random_state=42)
X_sampled = X.sample(**_sample_settings)
y_sampled = y.sample(**_sample_settings)
#X_sampled = X.copy()
#y_sampled = y.copy()
print("Size of X_sampled = {}, y_sampled = {}\n\n".format(X_sampled.shape, y_sampled.shape))
# Another possible selection: only houses in e.g. CPH:
#X_sampled = X[X.POSTAL_CODE < 2500].copy()
#y_sampled = y[X.POSTAL_CODE < 2500].copy()

# Hold out 25% of the sampled examples for testing (fixed seed).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled, y_sampled, test_size=0.25, random_state=42)


# =============================================================================
# Useful functions to use later
# =============================================================================


#from sklearn.metrics import mean_absolute_error, mean_squared_error

def MAE(y_true, y_pred):
    """Return the mean absolute error, i.e. the mean of |y_true - y_pred|.

    Equivalent to sklearn.metrics.mean_absolute_error(y_true, y_pred).
    """
    residuals = y_true - y_pred
    return np.mean(np.abs(residuals))

def MSE(y_true, y_pred):
    """Return the mean squared error, i.e. the mean of (y_true - y_pred)^2.

    Equivalent to sklearn.metrics.mean_squared_error(y_true, y_pred).
    """
    residuals = y_true - y_pred
    return np.mean(residuals * residuals)
 
def print_MAE_MSE(y_true, y_pred, method_string):
    """Print MAE (in units of 1e6) and MSE (in units of 1e12) for one method.

    ``method_string`` is the label placed at the start of the printed line.
    """
    template = "{}: Mean Absolute Error = {:.3f} 1e6, Mean Squared Error = {:.3f} 1e12"
    mae_scaled = MAE(y_true, y_pred)/1e6
    mse_scaled = MSE(y_true, y_pred)/1e12
    print(template.format(method_string, mae_scaled, mse_scaled))


def calc_z(y_true, y_pred):
    """Return the relative prediction error z = (y_pred - y_true) / y_true.

    Positive z means the prediction is above the true value.
    """
    return (y_pred - y_true) / y_true


def overview_methods(y_true, y_pred, title_strings, x_lims=(-1, 10), n_ticks=4, GridSearchString=''):
    """Compare the predictions of several methods with the true values.

    One panel column is drawn per entry of ``title_strings``.
    Top row: histograms of ``y_true`` and of the method's prediction (both
    divided by 1e6), annotated with the method's MAE and MSE.
    Bottom row: histogram of the relative error z (see ``calc_z``) between
    -2 and 3, annotated with mean/std of all z values and with mean/std of
    only the z values inside that window (the tilde quantities).

    Parameters
    ----------
    y_true : true target values.
    y_pred : sequence of predictions; ``y_pred[i]`` belongs to ``title_strings[i]``.
    title_strings : method names, used as panel titles.
    x_lims : histogram range of the top row, in units of 1e6.
    n_ticks : ``nbins`` value handed to ``locator_params`` for the x axes.
    GridSearchString : suffix inserted into the name of the saved PDF.

    The figure is saved as 'fig_MLresultsOverview_<GridSearchString>.pdf' when
    the module-level flag ``SavePlots`` is set, and shown without blocking.
    """
    n_methods = len(title_strings)
    fig = plt.figure(figsize=(16, 8))
    annotation_font = {'size': 10}

    # --- top row: distribution of true and predicted prices ---
    for col, title in enumerate(title_strings):
        panel = fig.add_subplot(2, n_methods, col + 1)
        panel.hist(y_true/1e6, bins=50, range=x_lims, histtype='step',
                   lw=1.1, alpha=0.9, color=current_palette[2])
        panel.hist(y_pred[col]/1e6, bins=50, range=x_lims, histtype='step',
                   lw=1.3, color=current_palette[0])
        panel.set_title(title, size=16)
        panel.set_xlabel('DKK')
        panel.set_ylabel('Counts')
        panel.locator_params(axis='x', nbins=n_ticks)
        mae = MAE(y_true, y_pred[col])
        mse = MSE(y_true, y_pred[col])
        summary = 'MAE = {:.3f} 1e6 \nMSE = {:.3f} 1e12'.format(mae/1e6, mse/1e12)
        panel.text(0.35, 0.9, summary, color='k', horizontalalignment='left',
                   verticalalignment='center', transform=panel.transAxes,
                   fontdict=annotation_font)

    # --- bottom row: distribution of the relative error z ---
    zmin, zmax = -2, 3
    for col, title in enumerate(title_strings):
        z = calc_z(y_true, y_pred[col])
        inside = (zmin < z) & (z < zmax)   # z values within the plotted window
        panel = fig.add_subplot(2, n_methods, n_methods + col + 1)
        panel.hist(z, bins=50, range=(zmin, zmax), histtype='step',
                   lw=1.3, label='z', color=current_palette[0])
        panel.set_xlim(zmin, zmax)
        panel.set_title(title, size=16)
        panel.set_xlabel('z')
        panel.set_ylabel('Counts')
        panel.locator_params(axis='x', nbins=n_ticks)
        summary = '\n'.join([
            r'$\mu$ = {:.0f}'.format(z.mean()),
            r'$\sigma$ = {:.0f}'.format(z.std()),
            r'$\~\mu$ = {:.3f}'.format(z[inside].mean()),
            r'$\~\sigma$ = {:.3f}'.format(z[inside].std()),
        ])
        panel.text(0.65, 0.95, summary, color='k', horizontalalignment='left',
                   verticalalignment='top', transform=panel.transAxes,
                   fontdict=annotation_font)

    fig.set_tight_layout(True)
    if SavePlots:
        fig.savefig('fig_MLresultsOverview_%s.pdf'%(GridSearchString), format='pdf', dpi=600)
    plt.show(block=False)

from sklearn.pipeline import make_pipeline
from sklearn import preprocessing





# =============================================================================
# Models
# =============================================================================


if do_ML_Models:
    # Fit every regression model once with fixed settings and report its
    # MAE / MSE on the test set. Each model sits behind a StandardScaler
    # in a pipeline so that all inputs are on a comparable scale.

    #  Linear Regression
    from sklearn.linear_model import LinearRegression
    clf_lin = make_pipeline( preprocessing.StandardScaler(), LinearRegression()) # makes a pipeline, to scale input properly
    clf_lin.fit(X_train, y_train) # fits the model
    y_pred_lin = clf_lin.predict(X_test) # predicts using the model
    print_MAE_MSE(y_test, y_pred_lin, 'Lin')
    
    # K Nearest Neighbours
    from sklearn.neighbors import KNeighborsRegressor #, RadiusNeighborsRegressor
    clf_KNN = make_pipeline( preprocessing.StandardScaler(), KNeighborsRegressor(n_neighbors=10))
    clf_KNN.fit(X_train, y_train)
    y_pred_KNN = clf_KNN.predict(X_test)
    print_MAE_MSE(y_test, y_pred_KNN, 'KNN')
    
    # Decision Tree
    # The split criterion is left at its default (mean squared error): the
    # explicit spelling 'mse' is rejected by recent scikit-learn releases,
    # whereas the default works in every version.
    from sklearn.tree import DecisionTreeRegressor
    clf_DT = make_pipeline( preprocessing.StandardScaler(), DecisionTreeRegressor())
    clf_DT.fit(X_train, y_train)
    y_pred_DT = clf_DT.predict(X_test)
    print_MAE_MSE(y_test, y_pred_DT, 'DT ')
    
    #  Random forest
    from sklearn.ensemble import RandomForestRegressor
    clf_RF = make_pipeline( preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=50))
    clf_RF.fit(X_train, y_train)
    y_pred_RF = clf_RF.predict(X_test)
    print_MAE_MSE(y_test, y_pred_RF, 'RF ')
    
    # Neural Network
    from sklearn.neural_network import MLPRegressor
    clf_NN = make_pipeline( preprocessing.StandardScaler(), MLPRegressor(hidden_layer_sizes=(10,5,2), max_iter=1000, solver='lbfgs'))
    clf_NN.fit(X_train, y_train)
    y_pred_NN = clf_NN.predict(X_test)
    print_MAE_MSE(y_test, y_pred_NN, 'NN ')
    
    ##  Xgboost
    #from xgboost import XGBRegressor
    #clf_XGB = make_pipeline( preprocessing.StandardScaler(), XGBRegressor(n_estimators=50))
    #clf_XGB.fit(X_train, y_train)
    #y_pred_XGB = clf_XGB.predict(X_test)
    #print_MAE_MSE(y_test, y_pred_XGB, 'XGB')
    
    y_pred_all = [y_pred_lin, y_pred_KNN, y_pred_DT, y_pred_RF, y_pred_NN] # y_pred_XGB
    methods = ['Lin', 'KNN', 'DT', 'RF', 'NN'] #  'XGB',
    # Only draw the comparison figure when plotting is switched on
    # (same guard as in the GridSearch section).
    if CreatePlots:
        overview_methods(y_test.values, y_pred_all, methods)
    
    
    if VeryVerbose:
        # steps[1][1] is the estimator behind the scaler in each pipeline.
        print("Linear coefficients:") 
        print(clf_lin.steps[1][1].coef_)
        print("DT feature importance:")
        print(clf_DT.steps[1][1].feature_importances_)
        print("RF feature importance:")
        print(clf_RF.steps[1][1].feature_importances_)
    #    print("XGB feature importance:")
    #    print(clf_XGB.steps[1][1].feature_importances_)
    #    print("NN coefficients:")
    #    print(clf_NN.steps[1][1].coefs_)


# =============================================================================
# Decision Tree visualization
# =============================================================================


def Decision_Tree_Visualization(X_train, y_train):
    """Fit a regression tree and a classification tree and export both as PDF.

    The regression tree is fitted on (X_train, y_train) and written to
    'DecisionTree_Regression.pdf'. The classification tree is fitted on the
    binary target "y_train < 2e6" (1 if below, else 0) and written to
    'DecisionTree_Classification.pdf'. Only the top 5 levels are drawn.

    Parameters
    ----------
    X_train : training inputs. Feature names in the drawing are taken from
        ``X_train.columns``; if X_train has no ``columns`` attribute the
        module-level table ``X`` is used as before.
    y_train : training targets; must provide ``.apply``.

    Requires the optional package ``pydotplus``.
    """
    from sklearn.tree import export_graphviz
    from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
    import pydotplus 

    # Label the nodes with the columns of the data that is actually fitted,
    # not with whatever the global X happens to contain. A plain list is
    # passed because some scikit-learn releases reject a pandas Index here.
    if hasattr(X_train, 'columns'):
        feature_names = list(X_train.columns)
    else:
        feature_names = list(X.columns)

    # Default split criterion = mean squared error; the explicit spelling
    # 'mse' is rejected by recent scikit-learn releases.
    clf_reg = DecisionTreeRegressor().fit(X_train, y_train)
    DT_data_reg = export_graphviz(clf_reg, out_file=None, feature_names=feature_names, 
                              filled=True, rounded=True, special_characters=True, max_depth=5) 
    graph_reg = pydotplus.graph_from_dot_data(DT_data_reg) 
    graph_reg.write_pdf("DecisionTree_Regression.pdf") 
    
    
    clf_clas = DecisionTreeClassifier().fit(X_train, y_train.apply(lambda x: 1 if x < 2e6 else 0))
    DT_data_clas = export_graphviz(clf_clas, out_file=None, feature_names=feature_names, class_names = ['0', '1'],
                              filled=True, rounded=True, special_characters=True, max_depth=5)   
    graph_clas = pydotplus.graph_from_dot_data(DT_data_clas) 
    graph_clas.write_pdf("DecisionTree_Classification.pdf") 
    
# Switch for the decision-tree PDF export; off by default.
do_viz = False
if do_viz:
    Decision_Tree_Visualization(X_train, y_train)


# =============================================================================
# bad predictions?
# =============================================================================


def get_bad_prediction(df, y_pred, n=3):
    """Return the test examples with the most extreme relative errors.

    The relative error z of ``y_pred`` with respect to the module-level
    ``y_test`` is computed with ``calc_z``; the rows of ``df`` belonging to
    the test examples are then sorted by z.

    Parameters
    ----------
    df : table containing (at least) the rows of the test set, looked up by
        the index labels of ``y_test``.
    y_pred : predictions for the test set, in the same order as ``y_test``.
    n : number of rows to take from each end of the sorted table.

    Returns
    -------
    Table with the n lowest-z rows followed by the n highest-z rows, with
    the extra columns 'Prediction' and 'Price' (both divided by 1e6) and 'z'.
    """
    z = calc_z(y_test, y_pred)
    df_bad = df.loc[z.index].copy()
    # Store the predictions that were passed in (previously the global
    # y_pred_RF was written here regardless of the argument).
    df_bad['Prediction'] = np.asarray(y_pred) / 1e6
    df_bad['Price'] = y_test / 1e6
    df_bad['z'] = z
    df_bad.sort_values('z', ascending=True, inplace=True)
    n_worst_predictions = pd.concat([df_bad.iloc[:n, :], df_bad.iloc[-n:, :]])
    return n_worst_predictions

# Collect the most extreme random-forest predictions (needs the models above).
if do_ML_Models:
    n_worst_predictions_RF = get_bad_prediction(df_good, y_pred_RF)

# ... and show them in verbose mode only.
# To print all rows/columns, wrap the loop in:
#    with pd.option_context('display.max_rows', None, 'display.max_columns', 30):
if do_ML_Models and VeryVerbose:
    for item in ("\n\n The n Worst predicictions", n_worst_predictions_RF, "\n\n"):
        print(item)
    
    
# =============================================================================
# Performing GridSearch
# =============================================================================


if do_GridSearch:
    # Scan a small grid of hyperparameters for every model with 5-fold
    # cross-validation on the training set, then evaluate the refitted best
    # model on the test set. The best / worst parameter sets are collected
    # in params_best / params_worst.
    
    print("\n\nRunning GridSearch, please stay patient \n")
    
    from sklearn.model_selection import GridSearchCV
    # make_scorer is imported from sklearn.metrics: the former private module
    # sklearn.metrics.scorer no longer exists in recent scikit-learn releases.
    from sklearn.metrics import make_scorer
    params_best = {}
    params_worst = {}
    
    def custom_loss():
        """Return a scorer built on MSE; lower values are better."""
        return make_scorer(MSE, greater_is_better=False)
    
    # Linear regression has nothing to scan. It is (re)fitted here so that
    # this section does not depend on do_ML_Models having been run.
    from sklearn.linear_model import LinearRegression
    clf_lin = make_pipeline( preprocessing.StandardScaler(), LinearRegression())
    clf_lin.fit(X_train, y_train)
    y_pred_lin = clf_lin.predict(X_test)
    print_MAE_MSE(y_test, y_pred_lin, 'Lin')
    
    # K Nearest Neighbours
    from sklearn.neighbors import KNeighborsRegressor #, RadiusNeighborsRegressor
    hyperparameters_KNN = { 'kneighborsregressor__n_neighbors' : [5, 10, 20],
                              'kneighborsregressor__weights' : ['uniform', 'distance'] }    
    clf_KNN = make_pipeline( preprocessing.StandardScaler(), KNeighborsRegressor())
    clf_KNN = GridSearchCV(clf_KNN, hyperparameters_KNN, cv=5, n_jobs=-1, scoring=custom_loss()) # alternatives: 'neg_mean_squared_error', neg_mean_absolute_error, neg_median_absolute_error
#    clf_KNN = GridSearchCV(clf_KNN, hyperparameters_KNN, cv=5, n_jobs=-1, scoring='neg_mean_squared_error') # alternatives: 'neg_mean_squared_error', neg_mean_absolute_error, neg_median_absolute_error
    clf_KNN.fit(X_train, y_train)
    y_pred_KNN = clf_KNN.predict(X_test)
    print_MAE_MSE(y_test, y_pred_KNN, 'KNN')
    # rank_test_score is 1 for the best setting, so argmin / argmax pick
    # the best / worst parameter combination.
    params_best['KNN'] = clf_KNN.cv_results_['params'][np.argmin(clf_KNN.cv_results_['rank_test_score'])]
    params_worst['KNN'] = clf_KNN.cv_results_['params'][np.argmax(clf_KNN.cv_results_['rank_test_score'])]
    
    
    #  Decision tree
    from sklearn.tree import DecisionTreeRegressor
    hyperparameters_DT = { 'decisiontreeregressor__max_features': ["sqrt", 'log2', None], 
                           'decisiontreeregressor__max_depth': [None, 50, 30, 10]}
    clf_DT = make_pipeline( preprocessing.StandardScaler(), DecisionTreeRegressor())
    clf_DT = GridSearchCV(clf_DT, hyperparameters_DT, cv=5, n_jobs=-1, scoring=custom_loss()) 
    clf_DT.fit(X_train, y_train)
    y_pred_DT = clf_DT.predict(X_test)
    print_MAE_MSE(y_test, y_pred_DT, 'DT ')
    params_best['DT'] = clf_DT.cv_results_['params'][np.argmin(clf_DT.cv_results_['rank_test_score'])]
    params_worst['DT'] = clf_DT.cv_results_['params'][np.argmax(clf_DT.cv_results_['rank_test_score'])]
        
    
    #  Random forest
    # max_features=None (use all features) replaces the former 'auto', which
    # had the same meaning for regressors but is rejected by recent
    # scikit-learn releases.
    from sklearn.ensemble import RandomForestRegressor
    hyperparameters_RF = { 'randomforestregressor__max_features' : [None, 'sqrt', 'log2'],
                           'randomforestregressor__max_depth': [None, 5, 3], 
                           'randomforestregressor__n_estimators': [50, 100]}
    clf_RF = make_pipeline( preprocessing.StandardScaler(), RandomForestRegressor())
    clf_RF = GridSearchCV(clf_RF, hyperparameters_RF, cv=5, n_jobs=-1, scoring=custom_loss()) 
    clf_RF.fit(X_train, y_train)
    y_pred_RF = clf_RF.predict(X_test)
    print_MAE_MSE(y_test, y_pred_RF, 'RF ')
    params_best['RF'] = clf_RF.cv_results_['params'][np.argmin(clf_RF.cv_results_['rank_test_score'])]
    params_worst['RF'] = clf_RF.cv_results_['params'][np.argmax(clf_RF.cv_results_['rank_test_score'])]
    
    
    # Neural Network
    from sklearn.neural_network import MLPRegressor
    hyperparameters_NN = { 'mlpregressor__alpha' : [0.0001, 0.001]}
                          # 'mlpregressor__activation': ['relu']}
    clf_NN = make_pipeline( preprocessing.StandardScaler(), MLPRegressor(hidden_layer_sizes=(10,5,2), max_iter=1000, solver='lbfgs'))
    clf_NN = GridSearchCV(clf_NN, hyperparameters_NN, cv=5, n_jobs=-1, scoring=custom_loss()) 
    clf_NN.fit(X_train, y_train)
    y_pred_NN = clf_NN.predict(X_test)
    print_MAE_MSE(y_test, y_pred_NN, 'NN ')
    params_best['NN'] = clf_NN.cv_results_['params'][np.argmin(clf_NN.cv_results_['rank_test_score'])]
    params_worst['NN'] = clf_NN.cv_results_['params'][np.argmax(clf_NN.cv_results_['rank_test_score'])]
    

#    #  Xgboost
#    from xgboost import XGBRegressor
#    hyperparameters_XGB = { 'xgbregressor__max_depth' : [1, 3, 5], 
#                            'xgbregressor__min_child_weight': [3, 5, 10],  # None
#                            'xgbregressor__n_estimators' : [50, 100, 150]}    
#    clf_XGB = make_pipeline( preprocessing.StandardScaler(), XGBRegressor())
#    clf_XGB = GridSearchCV(clf_XGB, hyperparameters_XGB, cv=5, n_jobs=-1, scoring=custom_loss()) 
#    clf_XGB.fit(X_train, y_train)
#    y_pred_XGB = clf_XGB.predict(X_test)
#    print_MAE_MSE(y_test, y_pred_XGB, 'XGB')
    
    
    y_pred_all = [y_pred_lin, y_pred_KNN, y_pred_DT, y_pred_RF, y_pred_NN] # y_pred_XGB
    methods = ['Lin', 'KNN', 'DT', 'RF', 'NN']
    if CreatePlots:
        overview_methods(y_test.values, y_pred_all, methods, GridSearchString='GridSearch')

    if VeryVerbose:
        # best_estimator_ is the refitted pipeline; steps[1][1] is the model behind the scaler.
        print("KNN best params", clf_KNN.best_params_)
        print("DT best params", clf_DT.best_params_)
        print("DT feature importance", clf_DT.best_estimator_.steps[1][1].feature_importances_)
        print("RF best params", clf_RF.best_params_)
        print("RF feature importance", clf_RF.best_estimator_.steps[1][1].feature_importances_)
        print("NN coefficients", clf_NN.best_estimator_.steps[1][1].coefs_)
#        print("XGB best params", clf_XGB.best_params_)
#        print("XGB feature importance", clf_XGB.best_estimator_.steps[1][1].feature_importances_)



# =============================================================================
#  Classification.
#
#   I want to buy an apartment, but my bank says I have a budget of "price_cut".
#   Can I afford to buy apartment X?
#
# =============================================================================


def ACC(y_true, y_pred):
    """Return the accuracy of ``y_pred`` with respect to ``y_true``.

    Thin wrapper around sklearn.metrics.accuracy_score.
    """
    from sklearn import metrics as sk_metrics
    return sk_metrics.accuracy_score(y_true, y_pred)

def F1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` with respect to ``y_true``.

    Thin wrapper around sklearn.metrics.f1_score with its default settings.
    The arguments are forwarded in the documented (y_true, y_pred) order;
    they used to be passed swapped, which exchanges the roles of precision
    and recall inside f1_score.
    """
    from sklearn.metrics import f1_score
    return f1_score(y_true, y_pred)

def print_ACC_F1(y_true, y_pred, method_string):
    """Print accuracy and F1 score of one classifier on a single line.

    ``method_string`` is the label placed at the start of the printed line.
    """
    accuracy = ACC(y_true, y_pred)
    f1_value = F1(y_true, y_pred)
    print("{}: Accuracy = {:.3f}, F1 score = {:.3f}".format(method_string, accuracy, f1_value))


if do_Classification:
    # Binary classification example: label every sampled house as
    # "affordable" (1, price below price_cut) or "not affordable" (0), train
    # several classifiers on that label and ask each one about a candidate.
    
    print("\nRunning classification")
    
    # Turn the sales price into a 0/1 label, working on a copy of y_sampled.
    y_cls = y_sampled.copy()
    price_cut = 2e6
    price_mask = y_cls < price_cut
    y_cls[price_mask] = 1
    y_cls[~price_mask] = 0
    # could also be done with the one-liner:
    #y_cls = y_cls.apply(lambda x: 1 if x < price_cut else 0)
    
    # Same split settings (test_size, random_state) as for the regression above.
    X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_sampled, y_cls, test_size=0.25, random_state=42)
    
    
    #SGD, linear stochastic gradient descent
    from sklearn.linear_model import SGDClassifier
    cls_SGD = make_pipeline( preprocessing.StandardScaler(), SGDClassifier()) # makes a pipeline, to scale input properly
    cls_SGD.fit(X_train_cls, y_train_cls) # fits the model
    y_pred_SGD_cls = cls_SGD.predict(X_test_cls) # predicts using the model
    print_ACC_F1(y_test_cls, y_pred_SGD_cls, 'SGD')
    
    #  SVM Linear (Support vector machine)
    from sklearn.svm import LinearSVC
    cls_SVM = make_pipeline( preprocessing.StandardScaler(), LinearSVC()) # makes a pipeline, to scale input properly
    cls_SVM.fit(X_train_cls, y_train_cls) # fits the model
    y_pred_SVM_cls = cls_SVM.predict(X_test_cls) # predicts using the model
    print_ACC_F1(y_test_cls, y_pred_SVM_cls, 'SVM')
    
    # K Nearest Neighbours
    from sklearn.neighbors import KNeighborsClassifier #, RadiusNeighborsRegressor
    cls_KNN = make_pipeline( preprocessing.StandardScaler(), KNeighborsClassifier(n_neighbors=10))
    cls_KNN.fit(X_train_cls, y_train_cls)
    y_pred_KNN_cls = cls_KNN.predict(X_test_cls)
    print_ACC_F1(y_test_cls, y_pred_KNN_cls, 'KNN')
    
    # Decision Tree
    from sklearn.tree import DecisionTreeClassifier
    cls_DT = make_pipeline( preprocessing.StandardScaler(), DecisionTreeClassifier(criterion='gini'))
    cls_DT.fit(X_train_cls, y_train_cls)
    y_pred_DT_cls = cls_DT.predict(X_test_cls)
    print_ACC_F1(y_test_cls, y_pred_DT_cls, 'DT ')
    
    #  Random forest
    from sklearn.ensemble import RandomForestClassifier
    cls_RF = make_pipeline( preprocessing.StandardScaler(), RandomForestClassifier(n_estimators=50))
    cls_RF.fit(X_train_cls, y_train_cls)
    y_pred_RF_cls = cls_RF.predict(X_test_cls)
    print_ACC_F1(y_test_cls, y_pred_RF_cls, 'RF ')
    
    # I want to buy a new apartment in Valby. It is 95 m² and built in 1960. It is close to supermarkets (250m), but far away from any schools (1000m).
    # Can I afford this apartment?
    # The values are given in the order of columns_to_keep: size, postal code,
    # construction year, school distance, supermarket distance.
    house_candidate = np.array([95, 2500, 1960., 1000., 250.]).reshape(1, -1) # the last reshape is needed for the dimensions of the array to match X

    # Ask every trained classifier; label 0 means "price not below price_cut".
    methods_cls = ['SGD', 'SVM', 'KNN', 'DT ', 'RF ']
    cls_all = [cls_SGD, cls_SVM, cls_KNN, cls_DT, cls_RF]
    print("\nClassification, or 'Can I afford that appartment?' \n")
    for name, cls in zip(methods_cls, cls_all):
        string = "For method {} I ".format(name)
        if cls.predict(house_candidate) == 0:
            string += "cannot "
        else:
            string += "can    "
        string += 'afford the appartment'
        print(string)
    print("\n\n")
        
        
# =============================================================================

# Multilabel classification

# =============================================================================


def plot_confusion_matrix(cm, classes, method,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print and plot a confusion matrix.

    Parameters
    ----------
    cm : 2D array, confusion matrix with true labels along the rows and
        predicted labels along the columns.
    classes : tick labels, one per class.
    method : name inserted into the file name of the saved figure.
    normalize : if True, every row is divided by its sum, so each row shows
        the fraction of that true class assigned to each predicted class.
        Rows without any entries are left at zero.
    title : figure title, also printed above the matrix.
    cmap : matplotlib colour map for the image.

    The normalisation is applied before anything is drawn or printed, so
    the image colours, the printed matrix and the numbers written into the
    cells all refer to the same values. The figure is saved as
    'fig_DataConfusionMatrix_<method>.pdf' when the module-level flag
    ``SavePlots`` is set.
    """

    import itertools

    if normalize:
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        # Avoid 0/0 for classes that do not occur among the true labels.
        cm = cm.astype('float') / np.where(row_sums == 0, 1, row_sums)

    fig = plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    
    print()
    print(title)
    print(cm)

    # Cells darker than half the maximum get white text, the others black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        string = "{:.3f}".format(cm[i, j])
        plt.text(j, i, string,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.grid(0)
    fig.set_tight_layout(True)
    if SavePlots:
        fig.savefig('fig_DataConfusionMatrix_%s.pdf'%(method), format="pdf", dpi=300)

        
if do_Multilabel_Classification:
    # Multi-class example: predict the floor class of a flat (ground floor,
    # floors 1-4, 5th floor) from its other properties, then look at how the
    # price depends on the floor number.
    print("Running Multilabel classification: \n")
    
    print("DataFrame shape:", df.shape)
    # Use only examples that have a floor number available
    df_floors = df[~df.FLOOR_NUMBER.isnull()].copy()
    
    def mask_df_floors(df_floors, mask_floors):
        """Apply a row mask and the NaN cleanup to the floor table.

        Keeps the rows of ``df_floors`` selected by the boolean mask
        ``mask_floors``, then keeps only columns with less than 10% NaNs and
        finally drops every row that still contains a NaN. The table shapes
        after each step are printed. Returns the cleaned table.
        """
    
        print("Shape after cut on NaN floor number: ", df_floors.shape)
         
        df_floors2 = df_floors[mask_floors].copy()
        print("Shape after cut on floor number and max floor number: ", df_floors2.shape)
        
        # percentage of NaNs per column, evaluated after the row selection
        N_nans_floors = df_floors2.isnull().sum(axis=0) / len(df_floors2) * 100
        
        # choose only columns that have less than 10% NaNs:
        columns_good_floors = df_floors2.columns[N_nans_floors < 10]
        print("\nColumns available to use for the multilabel classification: ")
        print(columns_good_floors.tolist())
        print()
        
        # keep only rows with data (i.e. no nans). 
        # Other suggestions here would be imputation (replace NaNs with mean/median)
        df_floors_good = df_floors2[columns_good_floors].dropna(axis=0)
        print("After removing NaNs, new dataframe dimension: ", df_floors_good.shape)
        print("i.e. {:.2f}% of rows remaining. \n".format(len(df_floors_good) / len(df_floors2) * 100))
        
        return df_floors_good
    
    
    
    # Selection: floor number between 0 and 5 where MAX_FLOOR_NUMBER equals 5.
    mask_floors = (0 <= df_floors.FLOOR_NUMBER) & (df_floors.FLOOR_NUMBER <= 5) & (df_floors.MAX_FLOOR_NUMBER == 5)
#    mask_floors = (0 <= df_floors.FLOOR_NUMBER) & (df_floors.FLOOR_NUMBER <= 5) & (df_floors.MAX_FLOOR_NUMBER == 5) & (df_floors.SIZE_OF_HOUSE > 20) & (df_floors.KOEBESUM_BELOEB < 50e6)
    df_floors_good = mask_df_floors(df_floors, mask_floors)
    
    # now create the smaller dataframe on which we want to test ML models on:
    columns_to_keep_floors = ['SIZE_OF_HOUSE', 'POSTAL_CODE', 'CONSTRUCTION_YEAR',
                              'HOUSE_NUMBER', 'KOEBESUM_BELOEB',
                              'SCHOOL_DISTANCE_1', 'SUPERMARKET_DISTANCE_1']

    
    X_floors = df_floors_good[columns_to_keep_floors].copy() # copy to ensure that we wont be editing the original data
    y_floors = df_floors_good['FLOOR_NUMBER'].copy()
    # Merge floors 1-4 into one class labelled 3, leaving the classes 0, 3 and 5.
    y_floors[ (1 <= y_floors) & (y_floors < 5) ] = 3
    print ("Size of X_floors = {}, y_floors = {}\n\n".format(X_floors.shape, y_floors.shape))
  
    # Hold out 20% of the examples for testing (fixed seed).
    from sklearn.model_selection import train_test_split
    X_train_floor, X_test_floor, y_train_floor, y_test_floor = train_test_split(X_floors, y_floors, test_size=0.20, random_state=42)


    
    # Random forest classifier behind a StandardScaler
    from sklearn.ensemble import RandomForestClassifier
    cls_multi_RF = make_pipeline( preprocessing.StandardScaler(), RandomForestClassifier(n_estimators=100))    
    cls_multi_RF.fit(X_train_floor, y_train_floor)
    y_pred_RF_labels = cls_multi_RF.predict(X_test_floor)

    # K nearest neighbours classifier behind a StandardScaler
    from sklearn.neighbors import KNeighborsClassifier #, RadiusNeighborsRegressor
    cls_multi_KNN = make_pipeline( preprocessing.StandardScaler(), KNeighborsClassifier(n_neighbors=10))
    cls_multi_KNN.fit(X_train_floor, y_train_floor)
    y_pred_KNN_labels = cls_multi_KNN.predict(X_test_floor)


    # Confusion matrices on the test set. cnf_matrix_all_3 is the matrix of a
    # trivial classifier that always answers class 3; it is not used further
    # in this section.
    from sklearn.metrics import confusion_matrix
    cnf_matrix_RF = confusion_matrix(y_test_floor, y_pred_RF_labels)
    cnf_matrix_KNN = confusion_matrix(y_test_floor, y_pred_KNN_labels)
    cnf_matrix_all_3 = confusion_matrix(y_test_floor, 3*np.ones_like(y_test_floor))
    
    # NOTE: the values printed below are fractions (0-1), not percentages.
    print("Percentage of different categories:")
    print("Class 0: {:.3f}, Class 3: {:.3f}, Class 5: {:.3f}".format((y_floors==0).sum() / len(y_floors), (y_floors==3).sum() / len(y_floors), (y_floors==5).sum() / len(y_floors)))
    
    if CreatePlots:
        # plt.figure()
        plot_confusion_matrix(cnf_matrix_RF, classes=['ST', '1-4', '5'], method='RF', normalize=True,
                              title='Normalized RF confusion matrix')
        # plt.figure()
        plot_confusion_matrix(cnf_matrix_KNN, classes=['ST', '1-4', '5'], method='KNN', normalize=True,
                              title='Normalized KNN confusion matrix')


# =============================================================================
# Check relationship between floor number and price
# =============================================================================


    print("\n\nRelationship between floor number and price: \n")
    
    # Here we consider, which selection we want to impose (to be considered with care!):
    mask_floors = (0 <= df_floors.FLOOR_NUMBER) & (df_floors.FLOOR_NUMBER <= 5) & (df_floors.MAX_FLOOR_NUMBER == 5) 
    # mask_floors = (0 <= df_floors.FLOOR_NUMBER) & (df_floors.FLOOR_NUMBER <= 5) & (df_floors.MAX_FLOOR_NUMBER == 5) & (df_floors.SIZE_OF_HOUSE > 20)
    # mask_floors = (0 <= df_floors.FLOOR_NUMBER) & (df_floors.FLOOR_NUMBER <= 5) & (df_floors.MAX_FLOOR_NUMBER == 5) & (df_floors.SIZE_OF_HOUSE > 20) & (df_floors.KOEBESUM_BELOEB < 50e6)
    
    df_floors_good = mask_df_floors(df_floors, mask_floors)
    
    # Small table: floor number (as integer), price divided by 1e6, and price
    # per unit of SIZE_OF_HOUSE (not divided by 1e6).
    df_price_and_floor = pd.concat([df_floors_good['FLOOR_NUMBER'].astype('int64'), df_floors_good['KOEBESUM_BELOEB']/1e6], axis=1)
    df_price_and_floor['SQUARE_M_PRICE'] = df_floors_good['KOEBESUM_BELOEB'] / df_floors_good['SIZE_OF_HOUSE']# .rename('SQUARE_M_PRICE')
    
    # Summary statistics per floor number
    print("\nDescribe the relationship between Floor number and KOEBESUM: \n")
    print(df_price_and_floor.groupby(['FLOOR_NUMBER'])['KOEBESUM_BELOEB'].describe())
    print("\nDescribe the relationship between Floor number and SQUARE_M_PRICE: \n")
    print(df_price_and_floor.groupby(['FLOOR_NUMBER'])['SQUARE_M_PRICE'].describe())

    print("\n\n\nMeans:")
    print(df_price_and_floor.groupby(['FLOOR_NUMBER']).mean())
    print("\nMedians:")
    print(df_price_and_floor.groupby(['FLOOR_NUMBER']).median())



# =============================================================================
# 
# =============================================================================