#!/usr/bin/env python
# coding: utf-8

# # A larger example
#
# Here we will use a dataset on Italian wine. The dataset is taken from
# https://archive.ics.uci.edu/ml/machine-learning-databases/wine/
#
# The actual data is in the file wine.data, and a description of the data can be found in wine.names.
#
# If we look at the beginning of the data file we see:
#
# ```
# 1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
# 1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050
# 1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185
# 1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480
# 1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735
# 1,14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450
# 1,14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290
# 1,14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295
# 1,14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045
# 1,13.86,1.35,2.27,16,98,2.98,3.15,.22,1.85,7.22,1.01,3.55,1045
# 1,14.1,2.16,2.3,18,105,2.95,3.32,.22,2.38,5.75,1.25,3.17,1510
# 1,14.12,1.48,2.32,16.8,95,2.2,2.43,.26,1.57,5,1.17,2.82,1280
# 1,13.75,1.73,2.41,16,89,2.6,2.76,.29,1.81,5.6,1.15,2.9,1320
# 1,14.75,1.73,2.39,11.4,91,3.1,3.69,.43,2.81,5.4,1.25,2.73,1150
# 1,14.38,1.87,2.38,12,102,3.3,3.64,.29,2.96,7.5,1.2,3,1547
# 1,13.63,1.81,2.7,17.2,112,2.85,2.91,.3,1.46,7.3,1.28,2.88,1310
# 1,14.3,1.92,2.72,20,120,2.8,3.14,.33,1.97,6.2,1.07,2.65,1280
# ```
#
# First comes the class of the wine, followed by the measurements, which are (taken from wine.names):
#
# 1) Alcohol
# 2) Malic acid
# 3) Ash
# 4) Alcalinity of ash
# 5) Magnesium
# 6) Total phenols
# 7) Flavanoids
# 8) Nonflavanoid phenols
# 9) Proanthocyanins
# 10) Color intensity
# 11) Hue
# 12) OD280/OD315 of diluted wines
# 13) Proline
#
# The data is clearly a simple CSV file, so we start by reading it.

# In[6]:

import csv
import numpy

with open('wine.data') as input_file:
    # Read all rows and convert the strings to floating point numbers.
    raw_data = numpy.array([row for row in csv.reader(input_file)]).astype(float)

labels = raw_data[:, 0]
data = raw_data[:, 1:]

# This time the data are not confined to the simple [0, 1] range, so we normalize each column.

# In[7]:

_, num_c = data.shape
for i in range(num_c):
    # Scale every column so that its largest value becomes 1. Note that
    # data is a view into raw_data, so this also normalizes raw_data.
    data[:, i] = data[:, i] / numpy.max(data[:, i])
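# The explicit loop keeps the idea visible, but the same normalization can be
# done as one broadcast operation. A minimal sketch, assuming data is the
# two-dimensional array defined above (the in-place /= keeps data a view into
# raw_data):

# In[ ]:

# data.max(axis=0) holds the maximum of each of the 13 columns;
# broadcasting stretches it over all rows, so every column is
# divided by its own maximum in a single step.
data /= data.max(axis=0)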
# We know that there are 13 columns in data, but at this point we may as well make our distance measure independent of the number of dimensions.

# In[8]:

def all_distances(point, db):
    """Return the Euclidean distance from point to every entry in db."""
    result = []
    for entry in db:
        distance = 0.0
        # Sum the squared differences over all dimensions.
        for dim in zip(point, entry):
            distance += (dim[0] - dim[1])**2
        result.append(numpy.sqrt(distance))
    return numpy.array(result)
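# The nested loops work for any number of dimensions, but NumPy can express
# the same computation without explicit Python loops. A minimal equivalent
# sketch (all_distances_vectorized is a hypothetical name, not from the
# original):

# In[ ]:

def all_distances_vectorized(point, db):
    # db - point broadcasts point across every row of db; summing the
    # squared differences along axis 1 gives one squared distance per row.
    return numpy.sqrt(((db - point) ** 2).sum(axis=1))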
# We can reuse the simple election mechanism: the k nearest neighbors each cast a vote, and the most common label wins.

# In[9]:

import collections

def classify(point, k=5):
    distances = all_distances(point, data)
    votes = []
    for _ in range(k):
        # Find the closest remaining point ...
        winner = numpy.argmin(distances)
        votes.append(labels[winner])
        # ... and rule it out of the following rounds.
        distances[winner] = numpy.inf
    return collections.Counter(votes).most_common(1)[0][0]
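# Overwriting the minimum k times works, but numpy.argsort can select the k
# nearest neighbors directly. A minimal sketch of an equivalent classifier,
# reusing the imports and data defined above (classify_argsort is a
# hypothetical name):

# In[ ]:

def classify_argsort(point, k=5):
    # Indices of all database entries, ordered from nearest to farthest.
    order = numpy.argsort(all_distances(point, data))
    # Let the labels of the k nearest entries vote.
    return collections.Counter(labels[order[:k]]).most_common(1)[0][0]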
# Now we can test the result against the database itself. Since data is a view into raw_data, the points in raw_data are already normalized to the same scale as the database.

# In[10]:

score = 0
for point in raw_data:
    # point[0] is the true class, point[1:] are the measurements.
    if point[0] == classify(point[1:], 6):
        score += 1
print('Matched', score, 'of', len(raw_data))

# The result is quite satisfactory. However, since we are matching against the database itself, each tested point is also part of the database, and a point is always its own nearest neighbor. That is an unfair advantage compared to a real-world scenario. Eliminating this bias is left as an exercise; it is quite simple, though.
#
# You should also play around with values of k, to find the best number of neighbors to match against.
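# As a hint for both suggestions, here is a minimal sketch that skips entry i
# when classifying point i, and repeats the test for several values of k. It
# reuses the imports and arrays defined above (classify_without_self is a
# hypothetical helper, not part of the original code):

# In[ ]:

def classify_without_self(i, k):
    distances = all_distances(raw_data[i, 1:], data)
    # The point must not vote for its own label.
    distances[i] = numpy.inf
    order = numpy.argsort(distances)
    return collections.Counter(labels[order[:k]]).most_common(1)[0][0]

for k in range(1, 10):
    score = sum(1 for i in range(len(raw_data))
                if raw_data[i, 0] == classify_without_self(i, k))
    print('k =', k, '- matched', score, 'of', len(raw_data))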