# -*- coding: utf-8 -*- """ Regression on Housing price data, with the aim of estimating the sales price. Authors: Christian Michelsen & Troels Petersen Date: 2016 """ from __future__ import division, print_function # if running Python 2.7, run this line import numpy as np # numerical analysis (pip install numpy --user) import matplotlib #matplotlib.use("pdf") import matplotlib.pyplot as plt # plotting package (Usually comes with Python, pip install matplotlib --user) import pandas as pd # excel-like dataframes (pip install pandas --user) import sklearn # scikit learn, machine learning package (pip install scikit_learn --user) import seaborn as sns # nicer standard plot looks (pip install seaborn --user) #import xgboost as xgb # machine learning, gradient boosting, package #============================================================================== # initial parameters #============================================================================== CreatePlots = True # Plot results to screen SavePlots = True # Save these plots VeryVerbose = False # Print more than the bare results do_ML_Models = True # Run Machine Learning algorithms once (in their standard setting) do_GridSearch = True # Run ML testing many different settings (NOTE: This is very slow). do_Classification = True # example of classification do_Multilabel_Classification = True plt.close('all') # close all previously open figures pd.set_option('display.max_rows', 10) # number of max rows to print for a DataFrame sns.set_context("poster") # change some standard settings of plots current_palette = sns.color_palette() # some nicer colors. Can be accessed with current_palette[0] eg. 
#==============================================================================
#  Read data
#==============================================================================

print("\n\n -------------------------------------------------------------- ")
print(" Housing prices analysis")
print(" ------------------------------------------------------------------ \n\n ")

# read csv file:
# (other often used options: sep=';', decimal=".", na_values=['NA', 'NAN', '.'])
df = pd.read_csv('HousingPrices_Cleaned.csv')
print("Dataframe loaded with dimension: ", df.shape, "\n")
print(df.head(), "\n")

# pandas recognizes floats and integers automatically, but needs some help
# with the dates. Unparsable entries become NaT because of errors='coerce'.
dates_columns = ['DATE_OF_SALES_PRICE',
                 'DATE_OF_PREVIOUS_SALES_PRICE_FIRST',
                 'OMREGNINGS_DATO']
for column in dates_columns:
    df[column] = pd.to_datetime(df[column], format="%Y-%m-%d", errors='coerce')

# types of data in the different columns
types = df.dtypes
print("Types of data:", types, "\n")

# how to add additional data (columns are appended side by side, matched on index):
GPS = pd.read_csv('GPS_data.csv')
df = pd.concat([df, GPS], axis=1)
SeaDist = pd.read_csv('SEA_DIST.csv')
df = pd.concat([df, SeaDist], axis=1)


# =============================================================================
#  Initial overview of the data
# =============================================================================

# get percentage of NaNs for each column
N_nans = df.isnull().sum(axis=0) / len(df) * 100

# Create plot of the percentage of NaNs for the different columns:
if CreatePlots:
    fig, ax = plt.subplots(figsize=(18, 9))  # the figure and its axes handle (ax)
    x = np.arange(len(df.columns))           # one bar position per column
    plt.bar(x, N_nans, align='center')
    plt.xticks(x, df.columns, rotation=90)   # column names as rotated tick labels
    ax.tick_params(axis='x', which='major', labelsize=10)
    # Bars are centred on 0 .. n-1, so pad by half a bar on either side;
    # limits of (0, n) would clip the first bar and leave a gap on the right.
    plt.xlim(-0.5, len(df.columns) - 0.5)
    plt.title('Percentage of NaNs for the different categories')
    plt.xlabel('Category')
    plt.ylabel('Percent')
    fig.set_tight_layout(True)
    if SavePlots:
        fig.savefig('fig_all_NaNs.pdf', format="pdf", dpi=300)
    plt.show(block=False)
    del x


# =============================================================================
#  data cleanup
# =============================================================================

# choose only columns that have less than 10% NaNs:
columns_good = df.columns[N_nans < 10]
print("Columns available to use after throwing out bad ones: ", columns_good.tolist(), "\n")

# keep only rows with data (i.e. no nans).
# Other suggestions here would be imputation (replace NaNs with mean/median)
df_good = df[columns_good].copy().dropna(axis=0)
print("After removing NaNs, new dataframe dimension: ", df_good.shape)
print("i.e. {:.2f}% of rows remaining. \n".format(len(df_good) / len(df) * 100))

# drop sizes smaller than 20 m^2 (or any other cuts)
df_good = df_good[df_good.SIZE_OF_HOUSE > 20]

# now create the smaller dataframe on which we want to test ML models on:
columns_to_keep = ['SIZE_OF_HOUSE', 'POSTAL_CODE', 'CONSTRUCTION_YEAR',
                   'SCHOOL_DISTANCE_1', 'SUPERMARKET_DISTANCE_1']
# alternatively use all columns by uncommenting this line below:
#columns_to_keep = columns_good

X = df_good[columns_to_keep].copy()  # copy to ensure that we wont be editing the original data
y = df_good['KOEBESUM_BELOEB'].copy()


# =============================================================================
#  Overview of data
# =============================================================================

def overview_plot(X, y, X_strings, title_strings, x_lims, n_ticks=3):
    """Plot a two-row overview of the input variables.

    The top row holds one histogram per input variable; the bottom row holds
    a hexbin plot of that variable against the target (shown in units of
    1e6), annotated with their correlation coefficient.

    Parameters
    ----------
    X : pandas.DataFrame
        Input variables; one panel column is drawn per column of ``X``.
    y : pandas.Series
        Target values, index-aligned with ``X``.
    X_strings : sequence of str
        Column names of ``X`` to plot, one per panel.
    title_strings : sequence of str
        Panel titles, in the same order as ``X_strings``.
    x_lims : sequence of (float, float)
        Plotting range of each variable, in the same order as ``X_strings``.
    n_ticks : int, optional
        Maximum number of x-axis tick intervals per panel.

    The figure is written to 'fig_InputVariableOverview.pdf' when the
    module-level flag ``SavePlots`` is set.
    """
    N = X.shape[1]
    fig = plt.figure(figsize=(16, 7))
    panels = list(zip(range(N), X_strings, title_strings, x_lims))

    # Top row: distribution of each input variable.
    for i, column, title, x_lim in panels:
        ax_hist = plt.subplot(2, N, 1 + i)
        ax_hist.hist(X[column], bins=50, range=x_lim, histtype='step', linewidth=1.1)
        ax_hist.set_title(title, size=16)
        ax_hist.set_ylabel('Counts')
        ax_hist.locator_params(axis='x', nbins=n_ticks)

    # Bottom row: each input variable against the target.
    for i, column, title, x_lim in panels:
        x = X[column]
        # Restrict to the displayed window so outliers do not dominate the binning.
        mask = (y < 8e6) & (x_lim[0] < x) & (x < x_lim[1])
        # Correlate the column that is actually plotted, so the annotation stays
        # correct even when X_strings is ordered differently from X.columns.
        correlation = np.corrcoef(x, y)[0, 1]

        ax_scatter = plt.subplot(2, N, 1 + i + N)
        with sns.axes_style("white"):
            ax_scatter.hexbin(x[mask], y[mask] / 1e6, gridsize=20)
        ax_scatter.set_title(title, size=16)
        ax_scatter.set_ylabel('House Price')
        ax_scatter.set_ylim(0, 8)
        ax_scatter.set_xlim(x_lim)
        ax_scatter.locator_params(axis='x', nbins=n_ticks)
        text = r'$\rho$ = {:.3f}'.format(correlation)
        ax_scatter.text(0.98, 0.9, text, color='k',
                        horizontalalignment='right', verticalalignment='center',
                        transform=ax_scatter.transAxes, fontdict={'size': 10})

    fig.set_tight_layout(True)
    if SavePlots:
        fig.savefig('fig_InputVariableOverview.pdf', format='pdf', dpi=600)
    plt.show(block=False)


title_strings = ['Size of House', 'Postal Code', 'Construction Year',
                 'School Distance 1', 'Supermarket Distance 1']
xlims = [(20, 300), (1000, 10000), (1700, 2017), (0, 7000), (0, 4000)]

if CreatePlots:
    overview_plot(X, y, columns_to_keep, title_strings, xlims)


#==============================================================================
#  Split data up into test and train
#==============================================================================

# total number of examples to use. For all, use: len(X), or uncomment below
N_total = len(X) // 5
X_sampled = X.sample(n=N_total, replace=False, random_state=42)
# Pick the targets by index label so that every row of X_sampled is paired
# with its own price, instead of relying on a second, identically seeded draw.
y_sampled = y.loc[X_sampled.index]
#X_sampled = X.copy()
#y_sampled = y.copy()
print ("Size of X_sampled = {}, y_sampled = {}\n\n".format(X_sampled.shape, y_sampled.shape))

# Other cuts could be applied at this point as well, e.g. on the postal code.
# Example of a further cut: only use houses in CPH:
#X_sampled = X[X.POSTAL_CODE < 2500].copy()
#y_sampled = y[X.POSTAL_CODE < 2500].copy()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled, y_sampled, test_size=0.25, random_state=42)


# =============================================================================
#  Useful functions to use later
# =============================================================================

# The two error measures below are also available in scikit-learn as
# sklearn.metrics.mean_absolute_error and sklearn.metrics.mean_squared_error.

def MAE(y_true, y_pred):
    """Return the mean of the absolute differences between truth and prediction."""
    residuals = y_true - y_pred
    return np.mean(np.abs(residuals))


def MSE(y_true, y_pred):
    """Return the mean of the squared differences between truth and prediction."""
    residuals = y_true - y_pred
    return np.mean(residuals**2)


def print_MAE_MSE(y_true, y_pred, method_string):
    """Print MAE (in units of 1e6) and MSE (in units of 1e12), prefixed by `method_string`."""
    template = ("{}: Mean Absolute Error = {:.3f} 1e6, Mean Squared Error "
                "= {:.3f} 1e12")
    mae_scaled = MAE(y_true, y_pred) / 1e6
    mse_scaled = MSE(y_true, y_pred) / 1e12
    print(template.format(method_string, mae_scaled, mse_scaled))


def calc_z(y_true, y_pred):
    """Return the relative deviation of the prediction, (y_pred - y_true) / y_true."""
    return (y_pred - y_true) / y_true


def overview_methods(y_true, y_pred, title_strings, x_lims=(-1, 10), n_ticks=4, GridSearchString=''):
    """Compare the predictions of several methods in one figure.

    One panel column is drawn per entry of `title_strings`:

    * top row: histograms of the true values and of the method's predictions
      (both divided by 1e6), annotated with MAE and MSE;
    * bottom row: histogram of the relative deviation z (see `calc_z`),
      annotated with its mean and standard deviation, both for all entries
      and for the entries inside the plotted z range only.

    `y_pred` is indexed by position, `y_pred[i]` belonging to
    `title_strings[i]`. `x_lims` is the range of the top-row histograms and
    `n_ticks` the maximum number of x-axis tick intervals. The figure is saved
    as 'fig_MLresultsOverview_<GridSearchString>.pdf' when the module-level
    flag `SavePlots` is set.
    """
    n_methods = len(title_strings)
    fig = plt.figure(figsize=(16, 8))

    # --- top row: true versus predicted distribution -------------------------
    for col, title in enumerate(title_strings):
        prediction = y_pred[col]
        axis = fig.add_subplot(2, n_methods, 1+col)
        axis.hist(y_true/1e6, bins=50, range=x_lims, histtype='step',
                  lw=1.1, alpha=0.9, color=current_palette[2])
        axis.hist(prediction/1e6, bins=50, range=x_lims, histtype='step',
                  lw=1.3, color=current_palette[0])
        axis.set_title(title, size=16)
        axis.set_xlabel('DKK')
        axis.set_ylabel('Counts')
        axis.locator_params(axis='x', nbins=n_ticks)

        label = 'MAE = {:.3f} 1e6 \nMSE = {:.3f} 1e12'.format(
            MAE(y_true, prediction)/1e6, MSE(y_true, prediction)/1e12)
        axis.text(0.35, 0.9, label, color='k',
                  horizontalalignment='left', verticalalignment='center',
                  transform=axis.transAxes, fontdict={'size': 10})

    # --- bottom row: relative deviation z ------------------------------------
    zmin, zmax = -2, 3
    for col, title in enumerate(title_strings):
        z = calc_z(y_true, y_pred[col])
        inside = (zmin < z) & (z < zmax)   # entries within the plotted z range

        axis = fig.add_subplot(2, n_methods, 1+col+n_methods)
        axis.hist(z, bins=50, range=(zmin, zmax), histtype='step',
                  lw=1.3, label='z', color=current_palette[0])
        axis.set_xlim(zmin, zmax)
        axis.set_title(title, size=16)
        axis.set_xlabel('z')
        axis.set_ylabel('Counts')
        axis.locator_params(axis='x', nbins=n_ticks)

        # First two lines use all entries, last two only those inside the range.
        label = '\n'.join([
            r'$\mu$ = {:.0f}'.format(z.mean()),
            r'$\sigma$ = {:.0f}'.format(z.std()),
            r'$\~\mu$ = {:.3f}'.format(z[inside].mean()),
            r'$\~\sigma$ = {:.3f}'.format(z[inside].std()),
        ])
        axis.text(0.65, 0.95, label, color='k',
                  horizontalalignment='left', verticalalignment='top',
                  transform=axis.transAxes, fontdict={'size': 10})

    fig.set_tight_layout(True)
    if SavePlots:
        fig.savefig('fig_MLresultsOverview_%s.pdf'%(GridSearchString), format='pdf', dpi=600)
    plt.show(block=False)


from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
# =============================================================================
#  Models
# =============================================================================

if do_ML_Models:

    # Every model is wrapped in a pipeline, so that the input is scaled properly
    # before it reaches the estimator.

    # Linear Regression
    from sklearn.linear_model import LinearRegression
    clf_lin = make_pipeline(preprocessing.StandardScaler(), LinearRegression())
    clf_lin.fit(X_train, y_train)           # fits the model
    y_pred_lin = clf_lin.predict(X_test)    # predicts using the model
    print_MAE_MSE(y_test, y_pred_lin, 'Lin')

    # K Nearest Neighbours
    from sklearn.neighbors import KNeighborsRegressor  #, RadiusNeighborsRegressor
    clf_KNN = make_pipeline(preprocessing.StandardScaler(), KNeighborsRegressor(n_neighbors=10))
    clf_KNN.fit(X_train, y_train)
    y_pred_KNN = clf_KNN.predict(X_test)
    print_MAE_MSE(y_test, y_pred_KNN, 'KNN')

    # Decision Tree
    # The split criterion is left at its default (mean squared error): the
    # explicit name 'mse' is not accepted by recent scikit-learn releases.
    from sklearn.tree import DecisionTreeRegressor
    clf_DT = make_pipeline(preprocessing.StandardScaler(), DecisionTreeRegressor())
    clf_DT.fit(X_train, y_train)
    y_pred_DT = clf_DT.predict(X_test)
    print_MAE_MSE(y_test, y_pred_DT, 'DT ')

    # Random forest
    from sklearn.ensemble import RandomForestRegressor
    clf_RF = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=50))
    clf_RF.fit(X_train, y_train)
    y_pred_RF = clf_RF.predict(X_test)
    print_MAE_MSE(y_test, y_pred_RF, 'RF ')

    # Neural Network
    from sklearn.neural_network import MLPRegressor
    clf_NN = make_pipeline(preprocessing.StandardScaler(),
                           MLPRegressor(hidden_layer_sizes=(10, 5, 2), max_iter=1000, solver='lbfgs'))
    clf_NN.fit(X_train, y_train)
    y_pred_NN = clf_NN.predict(X_test)
    print_MAE_MSE(y_test, y_pred_NN, 'NN ')

    ## Xgboost (optional, requires the xgboost package)
    #from xgboost import XGBRegressor
    #clf_XGB = make_pipeline( preprocessing.StandardScaler(), XGBRegressor(n_estimators=50))
    #clf_XGB.fit(X_train, y_train)
    #y_pred_XGB = clf_XGB.predict(X_test)
    #print_MAE_MSE(y_test, y_pred_XGB, 'XGB')

    y_pred_all = [y_pred_lin, y_pred_KNN, y_pred_DT, y_pred_RF, y_pred_NN]  # y_pred_XGB
    methods = ['Lin', 'KNN', 'DT', 'RF', 'NN']  # 'XGB',

    # Only draw the comparison figure when plotting is switched on, as is done
    # for the grid-search results.
    if CreatePlots:
        overview_methods(y_test.values, y_pred_all, methods)

    if VeryVerbose:
        # steps[1][1] is the estimator that follows the scaler in each pipeline
        print("Linear coefficients:")
        print(clf_lin.steps[1][1].coef_)
        print("DT feature importance:")
        print(clf_DT.steps[1][1].feature_importances_)
        print("RF feature importance:")
        print(clf_RF.steps[1][1].feature_importances_)
        # print("XGB feature importance:")
        # print(clf_XGB.steps[1][1].feature_importances_)
        # print("NN coefficients:")
        # print(clf_NN.steps[1][1].coefs_)


# =============================================================================
#  Decision Tree visualization
# =============================================================================

def Decision_Tree_Visualization(X_train, y_train):
    """Fit two decision trees and write a drawing of each to a PDF file.

    A regression tree is fitted on ``y_train`` directly and written to
    'DecisionTree_Regression.pdf'. A classification tree is fitted on the
    binary label "``y_train`` below 2e6" (1 if below, else 0) and written to
    'DecisionTree_Classification.pdf'. Both drawings are limited to a depth
    of 5.

    Parameters
    ----------
    X_train : pandas.DataFrame or array-like
        Training inputs. When it has a ``columns`` attribute, those names are
        used to label the tree nodes.
    y_train : pandas.Series
        Training targets.

    Requires the ``pydotplus`` package.
    """
    from sklearn.tree import export_graphviz
    from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
    import pydotplus

    # Take the feature names from the data that is actually fitted, not from a
    # module-level variable; fall back to unnamed features for plain arrays.
    columns = getattr(X_train, 'columns', None)
    feature_names = None if columns is None else list(columns)

    # Regression tree (default split criterion, i.e. mean squared error)
    clf_reg = DecisionTreeRegressor().fit(X_train, y_train)
    DT_data_reg = export_graphviz(clf_reg, out_file=None, feature_names=feature_names,
                                  filled=True, rounded=True,
                                  special_characters=True, max_depth=5)
    graph_reg = pydotplus.graph_from_dot_data(DT_data_reg)
    graph_reg.write_pdf("DecisionTree_Regression.pdf")

    # Classification tree on the label "price below 2e6"
    is_cheap = (y_train < 2e6).astype(int)
    clf_clas = DecisionTreeClassifier().fit(X_train, is_cheap)
    DT_data_clas = export_graphviz(clf_clas, out_file=None, feature_names=feature_names,
                                   class_names=['0', '1'],
                                   filled=True, rounded=True,
                                   special_characters=True, max_depth=5)
    graph_clas = pydotplus.graph_from_dot_data(DT_data_clas)
    graph_clas.write_pdf("DecisionTree_Classification.pdf")


do_viz = False
if do_viz:
    Decision_Tree_Visualization(X_train, y_train)
# =============================================================================
#  bad predictions?
# =============================================================================

def get_bad_prediction(df, y_pred, n=3, y_true=None):
    """Return the rows with the most extreme relative prediction errors.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame containing (at least) the rows of the test sample; the rows
        are looked up by the index labels of ``y_true``.
    y_pred : array-like
        Predictions, in the same order as ``y_true``.
    n : int, optional
        Number of rows to return from each end of the error distribution.
    y_true : pandas.Series, optional
        True values. Defaults to the module-level ``y_test``.

    Returns
    -------
    pandas.DataFrame
        The ``n`` rows with the smallest and the ``n`` rows with the largest z
        (see ``calc_z``), sorted by z, with the added columns 'Prediction' and
        'Price' (both in units of 1e6) and 'z'.
    """
    if y_true is None:
        y_true = y_test

    # Positional array, so that the predictions pair up with y_true by order.
    y_pred = np.asarray(y_pred)

    z = calc_z(y_true, y_pred)
    df_bad = df.loc[z.index].copy()
    # Use the predictions that were passed in, not those of one fixed model.
    df_bad['Prediction'] = y_pred / 1e6
    df_bad['Price'] = y_true / 1e6
    df_bad['z'] = z
    df_bad = df_bad.sort_values('z', ascending=True)

    n_worst_predictions = pd.concat([df_bad.iloc[:n, :], df_bad.iloc[-n:, :]])
    return n_worst_predictions


if do_ML_Models:
    n_worst_predictions_RF = get_bad_prediction(df_good, y_pred_RF)

    if VeryVerbose:
        print("\n\n The n Worst predicictions")
        # with pd.option_context('display.max_rows', None, 'display.max_columns', 30):
        print(n_worst_predictions_RF)
        print("\n\n")


# =============================================================================
#  Performing GridSearch
# =============================================================================

if do_GridSearch:

    print("\n\nRunning GridSearch, please stay patient \n")

    from sklearn.model_selection import GridSearchCV
    # make_scorer is part of the public sklearn.metrics API; the private
    # sklearn.metrics.scorer module no longer exists in recent releases.
    from sklearn.metrics import make_scorer

    params_best = {}
    params_worst = {}

    def custom_loss():
        """Return a scorer based on MSE, marked as a loss (lower is better)."""
        return make_scorer(MSE, greater_is_better=False)

    def _grid_search(estimator, hyperparameters, label):
        """Grid-search `estimator` behind a standard scaler and report the result.

        Runs a 5-fold cross-validated search on the training sample, prints
        MAE/MSE of the refitted best model on the test sample, and stores the
        best and worst ranked parameter sets under `label` (stripped of
        padding) in `params_best` / `params_worst`.

        Returns the fitted GridSearchCV object and its test-sample predictions.
        """
        pipeline = make_pipeline(preprocessing.StandardScaler(), estimator)
        # alternatives for scoring: 'neg_mean_squared_error',
        # 'neg_mean_absolute_error', 'neg_median_absolute_error'
        search = GridSearchCV(pipeline, hyperparameters, cv=5, n_jobs=-1,
                              scoring=custom_loss())
        search.fit(X_train, y_train)
        y_pred = search.predict(X_test)
        print_MAE_MSE(y_test, y_pred, label)

        ranks = search.cv_results_['rank_test_score']
        tried = search.cv_results_['params']
        key = label.strip()
        params_best[key] = tried[np.argmin(ranks)]
        params_worst[key] = tried[np.argmax(ranks)]
        return search, y_pred

    # Linear regression has nothing to tune. It is fitted here (again) so that
    # this section also runs when do_ML_Models is switched off.
    from sklearn.linear_model import LinearRegression
    clf_lin = make_pipeline(preprocessing.StandardScaler(), LinearRegression())
    clf_lin.fit(X_train, y_train)
    y_pred_lin = clf_lin.predict(X_test)

    # K Nearest Neighbours
    from sklearn.neighbors import KNeighborsRegressor  #, RadiusNeighborsRegressor
    hyperparameters_KNN = {'kneighborsregressor__n_neighbors': [5, 10, 20],
                           'kneighborsregressor__weights': ['uniform', 'distance']}
    clf_KNN, y_pred_KNN = _grid_search(KNeighborsRegressor(), hyperparameters_KNN, 'KNN')

    # Decision tree
    from sklearn.tree import DecisionTreeRegressor
    hyperparameters_DT = {'decisiontreeregressor__max_features': ["sqrt", 'log2', None],
                          'decisiontreeregressor__max_depth': [None, 50, 30, 10]}
    clf_DT, y_pred_DT = _grid_search(DecisionTreeRegressor(), hyperparameters_DT, 'DT ')

    # Random forest
    # None (= use all features) replaces the former 'auto', which meant the
    # same for regressors and is no longer accepted by recent scikit-learn.
    from sklearn.ensemble import RandomForestRegressor
    hyperparameters_RF = {'randomforestregressor__max_features': [None, 'sqrt', 'log2'],
                          'randomforestregressor__max_depth': [None, 5, 3],
                          'randomforestregressor__n_estimators': [50, 100]}
    clf_RF, y_pred_RF = _grid_search(RandomForestRegressor(), hyperparameters_RF, 'RF ')

    # Neural Network
    from sklearn.neural_network import MLPRegressor
    hyperparameters_NN = {'mlpregressor__alpha': [0.0001, 0.001]}
    #                     'mlpregressor__activation': ['relu']}
    clf_NN, y_pred_NN = _grid_search(
        MLPRegressor(hidden_layer_sizes=(10, 5, 2), max_iter=1000, solver='lbfgs'),
        hyperparameters_NN, 'NN ')

    # # Xgboost (optional, requires the xgboost package)
    # from xgboost import XGBRegressor
    # hyperparameters_XGB = { 'xgbregressor__max_depth'        : [1, 3, 5],
    #                         'xgbregressor__min_child_weight': [3, 5, 10], # None
    #                         'xgbregressor__n_estimators'    : [50, 100, 150]}
    # clf_XGB, y_pred_XGB = _grid_search(XGBRegressor(), hyperparameters_XGB, 'XGB')

    y_pred_all = [y_pred_lin, y_pred_KNN, y_pred_DT, y_pred_RF, y_pred_NN]  # y_pred_XGB
    methods = ['Lin', 'KNN', 'DT', 'RF', 'NN']

    if CreatePlots:
        overview_methods(y_test.values, y_pred_all, methods, GridSearchString='GridSearch')

    if VeryVerbose:
        # best_estimator_.steps[1][1] is the estimator that follows the scaler
        print("KNN best params", clf_KNN.best_params_)
        print("DT best params", clf_DT.best_params_)
        print("DT feature importance", clf_DT.best_estimator_.steps[1][1].feature_importances_)
        print("RF best params", clf_RF.best_params_)
        print("RF feature importance", clf_RF.best_estimator_.steps[1][1].feature_importances_)
        print("NN coefficients", clf_NN.best_estimator_.steps[1][1].coefs_)
        # print("XGB best params", clf_XGB.best_params_)
        # print("XGB feature importance", clf_XGB.best_estimator_.steps[1][1].feature_importances_)
# =============================================================================
#  Classification.
#
#  I want to buy an appartment, but my bank says I have a budget of "price_cut".
#  Can I afford to buy X house?
#
# =============================================================================

def ACC(y_true, y_pred):
    """Return the accuracy of the predicted labels (sklearn `accuracy_score`)."""
    from sklearn.metrics import accuracy_score
    return accuracy_score(y_true, y_pred)


def F1(y_true, y_pred):
    """Return the F1 score of the predicted labels (sklearn `f1_score`)."""
    from sklearn.metrics import f1_score
    # f1_score expects the true labels first, then the predictions.
    return f1_score(y_true, y_pred)


def print_ACC_F1(y_true, y_pred, method_string):
    """Print accuracy and F1 score of a prediction, prefixed by `method_string`."""
    string = ("{}: Accuracy = {:.3f}, F1 score = {:.3f}").format(
        method_string, ACC(y_true, y_pred), F1(y_true, y_pred))
    print(string)


if do_Classification:

    print("\nRunning classification")

    # Label 1: price below the budget (affordable), label 0: otherwise.
    price_cut = 2e6
    y_cls = (y_sampled < price_cut).astype(int)

    X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
        X_sampled, y_cls, test_size=0.25, random_state=42)

    # Every classifier is wrapped in a pipeline, to scale the input properly.

    # SGD, linear stochastic gradient descent
    from sklearn.linear_model import SGDClassifier
    cls_SGD = make_pipeline(preprocessing.StandardScaler(), SGDClassifier())
    cls_SGD.fit(X_train_cls, y_train_cls)           # fits the model
    y_pred_SGD_cls = cls_SGD.predict(X_test_cls)    # predicts using the model
    print_ACC_F1(y_test_cls, y_pred_SGD_cls, 'SGD')

    # SVM Linear (Support vector machine)
    from sklearn.svm import LinearSVC
    cls_SVM = make_pipeline(preprocessing.StandardScaler(), LinearSVC())
    cls_SVM.fit(X_train_cls, y_train_cls)
    y_pred_SVM_cls = cls_SVM.predict(X_test_cls)
    print_ACC_F1(y_test_cls, y_pred_SVM_cls, 'SVM')

    # K Nearest Neighbours
    from sklearn.neighbors import KNeighborsClassifier  #, RadiusNeighborsRegressor
    cls_KNN = make_pipeline(preprocessing.StandardScaler(), KNeighborsClassifier(n_neighbors=10))
    cls_KNN.fit(X_train_cls, y_train_cls)
    y_pred_KNN_cls = cls_KNN.predict(X_test_cls)
    print_ACC_F1(y_test_cls, y_pred_KNN_cls, 'KNN')

    # Decision Tree
    from sklearn.tree import DecisionTreeClassifier
    cls_DT = make_pipeline(preprocessing.StandardScaler(), DecisionTreeClassifier(criterion='gini'))
    cls_DT.fit(X_train_cls, y_train_cls)
    y_pred_DT_cls = cls_DT.predict(X_test_cls)
    print_ACC_F1(y_test_cls, y_pred_DT_cls, 'DT ')

    # Random forest
    from sklearn.ensemble import RandomForestClassifier
    cls_RF = make_pipeline(preprocessing.StandardScaler(), RandomForestClassifier(n_estimators=50))
    cls_RF.fit(X_train_cls, y_train_cls)
    y_pred_RF_cls = cls_RF.predict(X_test_cls)
    print_ACC_F1(y_test_cls, y_pred_RF_cls, 'RF ')

    # I want to buy a new appartment In Valby. It is 95 m² and built in 1960.
    # It is close to supermarkets (250m), but far away from any schools (1000m).
    # Can I afford this house?
    # The values follow the column order of the training data; passing them as
    # a one-row DataFrame with the same column names keeps input and training
    # data consistent.
    house_candidate = pd.DataFrame([[95, 2500, 1960., 1000., 250.]],
                                   columns=X_sampled.columns)

    methods_cls = ['SGD', 'SVM', 'KNN', 'DT ', 'RF ']
    cls_all = [cls_SGD, cls_SVM, cls_KNN, cls_DT, cls_RF]

    print("\nClassification, or 'Can I afford that appartment?' \n")
    for name, cls in zip(methods_cls, cls_all):
        string = "For method {} I ".format(name)
        # predict returns one label per row; there is exactly one row here
        if cls.predict(house_candidate)[0] == 0:
            string += "cannot "
        else:
            string += "can "
        string += 'afford the appartment'
        print(string)
    print("\n\n")


# =============================================================================
#  Multilabel classification
# =============================================================================

def plot_confusion_matrix(cm, classes, method,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print and plot a confusion matrix.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix of counts (rows: true label, columns:
        predicted label). It is printed as given, before any normalisation.
    classes : sequence of str
        Tick labels, one per row/column of ``cm``.
    method : str
        Tag used in the output file name,
        'fig_DataConfusionMatrix_<method>.pdf'.
    normalize : bool, optional
        If True, every row is divided by its sum before plotting, so each
        cell shows the fraction of the true class. Rows without any entries
        are shown as zeros.
    title : str, optional
        Title of the plot (also printed).
    cmap : matplotlib colormap, optional
        Colormap of the image.

    The figure is saved when the module-level flag ``SavePlots`` is set.
    """
    import itertools

    print()
    print(title)
    print(cm)

    # Normalise before drawing, so that the image colours, the cell texts and
    # the text-colour threshold all refer to the same numbers.
    if normalize:
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_sums,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_sums != 0)

    fig = plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # White text on dark cells, black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        string = "{:.3f}".format(cm[i, j])
        plt.text(j, i, string,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.grid(False)
    fig.set_tight_layout(True)
    if SavePlots:
        fig.savefig('fig_DataConfusionMatrix_%s.pdf'%(method), format="pdf", dpi=300)


if do_Multilabel_Classification:

    print("Running Multilabel classification: \n")
    print("DataFrame shape:", df.shape)

    # Use only examples that has floor number available
    df_floors = df[~df.FLOOR_NUMBER.isnull()].copy()

    def mask_df_floors(df_floors, mask_floors):
        """Apply `mask_floors` to `df_floors` and drop sparse columns and incomplete rows.

        Keeps the rows selected by the boolean mask, then only the columns
        with less than 10% NaNs, then only the rows without any NaN. The
        shapes after each step are printed. Returns the cleaned DataFrame.
        """
        print("Shape after cut on NaN floor number: ", df_floors.shape)

        df_floors2 = df_floors[mask_floors].copy()
        print("Shape after cut on floor number and max floor number: ", df_floors2.shape)

        N_nans_floors = df_floors2.isnull().sum(axis=0) / len(df_floors2) * 100

        # choose only columns that have less than 10% NaNs:
        columns_good_floors = df_floors2.columns[N_nans_floors < 10]
        print("\nColumns available to use for the multilabel classification: ")
        print(columns_good_floors.tolist())
        print()

        # keep only rows with data (i.e. no nans).
        # Other suggestions here would be imputation (replace NaNs with mean/median)
        df_floors_good = df_floors2[columns_good_floors].dropna(axis=0)
        print("After removing NaNs, new dataframe dimension: ", df_floors_good.shape)
        print("i.e. {:.2f}% of rows remaining. \n".format(len(df_floors_good) / len(df_floors2) * 100))

        return df_floors_good

    mask_floors = ((0 <= df_floors.FLOOR_NUMBER) & (df_floors.FLOOR_NUMBER <= 5)
                   & (df_floors.MAX_FLOOR_NUMBER == 5))
    # Possible additional cuts:
    #   & (df_floors.SIZE_OF_HOUSE > 20) & (df_floors.KOEBESUM_BELOEB < 50e6)
    df_floors_good = mask_df_floors(df_floors, mask_floors)

    # now create the smaller dataframe on which we want to test ML models on:
    columns_to_keep_floors = ['SIZE_OF_HOUSE', 'POSTAL_CODE', 'CONSTRUCTION_YEAR',
                              'HOUSE_NUMBER', 'KOEBESUM_BELOEB',
                              'SCHOOL_DISTANCE_1', 'SUPERMARKET_DISTANCE_1']

    X_floors = df_floors_good[columns_to_keep_floors].copy()  # copy to ensure that we wont be editing the original data
    y_floors = df_floors_good['FLOOR_NUMBER'].copy()
    # Merge floors 1-4 into one class (labelled 3), leaving the classes 0, 3 and 5.
    y_floors[(1 <= y_floors) & (y_floors < 5)] = 3
    floor_labels = [0, 3, 5]
    floor_class_names = ['ST', '1-4', '5']

    print ("Size of X_floors = {}, y_floors = {}\n\n".format(X_floors.shape, y_floors.shape))

    from sklearn.model_selection import train_test_split
    X_train_floor, X_test_floor, y_train_floor, y_test_floor = train_test_split(
        X_floors, y_floors, test_size=0.20, random_state=42)

    from sklearn.ensemble import RandomForestClassifier
    cls_multi_RF = make_pipeline(preprocessing.StandardScaler(), RandomForestClassifier(n_estimators=100))
    cls_multi_RF.fit(X_train_floor, y_train_floor)
    y_pred_RF_labels = cls_multi_RF.predict(X_test_floor)

    from sklearn.neighbors import KNeighborsClassifier  #, RadiusNeighborsRegressor
    cls_multi_KNN = make_pipeline(preprocessing.StandardScaler(), KNeighborsClassifier(n_neighbors=10))
    cls_multi_KNN.fit(X_train_floor, y_train_floor)
    y_pred_KNN_labels = cls_multi_KNN.predict(X_test_floor)

    # Passing the labels explicitly keeps every matrix 3x3 and in the order of
    # floor_class_names, even if a class is missing from the test sample.
    from sklearn.metrics import confusion_matrix
    cnf_matrix_RF = confusion_matrix(y_test_floor, y_pred_RF_labels, labels=floor_labels)
    cnf_matrix_KNN = confusion_matrix(y_test_floor, y_pred_KNN_labels, labels=floor_labels)
    # Reference: the matrix obtained when always predicting class 3.
    cnf_matrix_all_3 = confusion_matrix(y_test_floor, 3*np.ones_like(y_test_floor), labels=floor_labels)

    print("Percentage of different categories:")
    print("Class 0: {:.3f}, Class 3: {:.3f}, Class 5: {:.3f}".format(
        (y_floors == 0).sum() / len(y_floors),
        (y_floors == 3).sum() / len(y_floors),
        (y_floors == 5).sum() / len(y_floors)))

    if CreatePlots:
        plot_confusion_matrix(cnf_matrix_RF, classes=floor_class_names, method='RF',
                              normalize=True, title='Normalized RF confusion matrix')
        plot_confusion_matrix(cnf_matrix_KNN, classes=floor_class_names, method='KNN',
                              normalize=True, title='Normalized KNN confusion matrix')

    # =========================================================================
    #  Check relationship between floor number and price
    # =========================================================================

    print("\n\nRelationship between floor number and price: \n")

    # Here we consider, which selection we want to impose (to be considered with care!):
    mask_floors = ((0 <= df_floors.FLOOR_NUMBER) & (df_floors.FLOOR_NUMBER <= 5)
                   & (df_floors.MAX_FLOOR_NUMBER == 5))
    # Possible additional cuts:
    #   & (df_floors.SIZE_OF_HOUSE > 20)
    #   & (df_floors.SIZE_OF_HOUSE > 20) & (df_floors.KOEBESUM_BELOEB < 50e6)
    df_floors_good = mask_df_floors(df_floors, mask_floors)

    # Price is kept in units of 1e6; the square metre price is left unscaled.
    df_price_and_floor = pd.concat([df_floors_good['FLOOR_NUMBER'].astype('int64'),
                                    df_floors_good['KOEBESUM_BELOEB']/1e6], axis=1)
    df_price_and_floor['SQUARE_M_PRICE'] = (df_floors_good['KOEBESUM_BELOEB']
                                            / df_floors_good['SIZE_OF_HOUSE'])

    print("\nDescribe the relationship between Floor number and KOEBESUM: \n")
    print(df_price_and_floor.groupby(['FLOOR_NUMBER'])['KOEBESUM_BELOEB'].describe())

    print("\nDescribe the relationship between Floor number and SQUARE_M_PRICE: \n")
    print(df_price_and_floor.groupby(['FLOOR_NUMBER'])['SQUARE_M_PRICE'].describe())

    print("\n\n\nMeans:")
    print(df_price_and_floor.groupby(['FLOOR_NUMBER']).mean())
    print("\nMedians:")
    print(df_price_and_floor.groupby(['FLOOR_NUMBER']).median())