# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""


import numpy as np                # numerical analysis
import pandas as pd               # excel-like dataframes 

#==============================================================================
# 
#==============================================================================


print "\n\n  --------------------------------------------------------------  "
print "      Housing prices analysis"
print "  ------------------------------------------------------------------ \n\n "

## read csv file
df = pd.read_csv('HousingPrices_Org.csv', sep=';', decimal=".", na_values=['', ' ', '  ', '   ', '.', 'NULL'], low_memory=False)  # parse_dates=True, infer_datetime_format=True  na_values=['.'],   , error_bad_lines=True, encoding="utf-8")

cols_to_use = ['MI_OBJ_OIS_PROPERTY_ID', 
               'MI_OBJ_OIS_MUNICIPALITY_NUMBER',
               'MI_OBJ_OIS_POSTAL_CODE', 
               'MI_OBJ_OIS_SALES_PRICE',
               'MI_OBJ_OIS_DATE_OF_SALES_PRICE',
               'MI_OBJ_OIS_PREVIOUS_SALES_PRICE_FIRST',
               'MI_OBJ_OIS_DATE_OF_PREVIOUS_SALES_PRICE_FIRST',
               'MI_OBJ_OIS_PROPERTY_ADDRESS', 
               'MI_OBJ_OIS_HOUSE_NUMBER',
               'MI_OBJ_OIS_FLOOR_NUMBER',
               'MI_OBJ_OIS_MAX_FLOOR_NUMBER_BUILDING',
               'MI_OBJ_OIS_LAND_ZONE',
               'MI_OBJ_OIS_SIZE_OF_HOUSE', 
               'MI_OBJ_OIS_CONSTRUCTION_YEAR',     
               'MI_OBJ_OIS_ROOF_MATERIAL',
               'MI_KNN_PROPERTY_CONDITION',
               'MI_KNN_AVG_GEO_DISTANCE_NEIGHBORS',
#               'MI_OBJ_OIS_N_COORDINATE', # UTM format
#               'MI_OBJ_OIS_E_COORDINATE', # UTM format
               'C20_1MONTH%', 'C20_3MONTH%', 'C20_6MONTH%', 'C20_12MONTH%', 
               'SCHOOL_DISTANCE_1', 'SCHOOL_DISTANCE_2', 'SCHOOL_DISTANCE_3', 
               'SUPERMARKET_DISTANCE_1', 'SUPERMARKET_DISTANCE_2', 'SUPERMARKET_DISTANCE_3',
               'KOEBESUM_BELOEB',
               'OMREGNINGS_DATO',
           ]

df = df[cols_to_use]

names_to_use = ['PROPERTY_ID', 
               'MUNICIPALITY_NUMBER',
                'POSTAL_CODE', 
                'SALES_PRICE',
                'DATE_OF_SALES_PRICE',
                'PREVIOUS_SALES_PRICE_FIRST',
                'DATE_OF_PREVIOUS_SALES_PRICE_FIRST',
                'ADDRESS', 
                'HOUSE_NUMBER',
                'FLOOR_NUMBER',
                'MAX_FLOOR_NUMBER',
                'LAND_ZONE',
                'SIZE_OF_HOUSE', 
                'CONSTRUCTION_YEAR',     
                'ROOF_MATERIAL',
                'PROPERTY_CONDITION',
                'AVG_GEO_DISTANCE_NEIGHBORS',
#                'N_COORDINATE', # UTM format
#                'E_COORDINATE', # UTM format
                'C20_1MONTH', 'C20_3MONTH', 'C20_6MONTH', 'C20_12MONTH', 
                'SCHOOL_DISTANCE_1', 'SCHOOL_DISTANCE_2', 'SCHOOL_DISTANCE_3', 
                'SUPERMARKET_DISTANCE_1', 'SUPERMARKET_DISTANCE_2', 'SUPERMARKET_DISTANCE_3',
                'KOEBESUM_BELOEB', 
                'OMREGNINGS_DATO',
            ]

df.columns = names_to_use


# In order to make everything nummerical data, we exchange "ST" (Eng: Ground level)
# and "KL" (Eng: Basement) with the suitable floor number:
df.loc[df['FLOOR_NUMBER'] == 'ST', 'FLOOR_NUMBER'] = 0
df.loc[df['FLOOR_NUMBER'] == 'KL', 'FLOOR_NUMBER'] = -1

df.loc[df['MAX_FLOOR_NUMBER'] == 'ST', 'MAX_FLOOR_NUMBER'] = 0
df.loc[df['MAX_FLOOR_NUMBER'] == 'KL', 'MAX_FLOOR_NUMBER'] = -1



df.to_csv('HousingPrices_Cleaned.csv', index=False)


















