#!/usr/bin/env python
# coding: utf-8

# # A larger example
#
# Here we will use a dataset on Italian wine. The dataset is taken from
# https://archive.ics.uci.edu/ml/machine-learning-databases/wine/
#
# The actual data is in the file wine.data, and a description of the data can be found in wine.names.
#
# If we look at the beginning of the data file we see:
#
# ```
# 1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
# 1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050
# 1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185
# 1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480
# 1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735
# 1,14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450
# 1,14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290
# 1,14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295
# 1,14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045
# 1,13.86,1.35,2.27,16,98,2.98,3.15,.22,1.85,7.22,1.01,3.55,1045
# 1,14.1,2.16,2.3,18,105,2.95,3.32,.22,2.38,5.75,1.25,3.17,1510
# 1,14.12,1.48,2.32,16.8,95,2.2,2.43,.26,1.57,5,1.17,2.82,1280
# 1,13.75,1.73,2.41,16,89,2.6,2.76,.29,1.81,5.6,1.15,2.9,1320
# 1,14.75,1.73,2.39,11.4,91,3.1,3.69,.43,2.81,5.4,1.25,2.73,1150
# 1,14.38,1.87,2.38,12,102,3.3,3.64,.29,2.96,7.5,1.2,3,1547
# 1,13.63,1.81,2.7,17.2,112,2.85,2.91,.3,1.46,7.3,1.28,2.88,1310
# 1,14.3,1.92,2.72,20,120,2.8,3.14,.33,1.97,6.2,1.07,2.65,1280
# ```
#
# First comes the class of the wine, followed by the measurements, which are (taken from wine.names):
#
# 1) Alcohol
# 2) Malic acid
# 3) Ash
# 4) Alcalinity of ash
# 5) Magnesium
# 6) Total phenols
# 7) Flavanoids
# 8) Nonflavanoid phenols
# 9) Proanthocyanins
# 10) Color intensity
# 11) Hue
# 12) OD280/OD315 of diluted wines
# 13) Proline
#
# The data is clearly a simple CSV file, so we start by reading it.

# In[6]:

import csv
import numpy

with open('wine.data') as input_file:
    # Read all rows and convert the strings to floating point numbers.
    raw_data = numpy.array([row for row in csv.reader(input_file)]).astype(float)

labels = raw_data[:, 0]
data = raw_data[:, 1:]

# This time the data are not confined to the simple [0, 1] range, so we normalize each column.

# In[7]:

_, num_c = data.shape
for i in range(num_c):
    # Scale every column so that its largest value becomes 1. Note that
    # data is a view into raw_data, so this also normalizes raw_data.
    data[:, i] = data[:, i] / numpy.max(data[:, i])
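# The explicit loop keeps the idea visible, but the same normalization can be
# done as one broadcast operation. A minimal sketch, assuming data is the
# two-dimensional array defined above (the in-place /= keeps data a view into
# raw_data):

# In[ ]:

# data.max(axis=0) holds the maximum of each of the 13 columns;
# broadcasting stretches it over all rows, so every column is
# divided by its own maximum in a single step.
data /= data.max(axis=0)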
# We know that there are 13 columns in data, but at this point we may as well make our distance measure independent of the number of dimensions.

# In[8]:

def all_distances(point, db):
    """Return the Euclidean distance from point to every entry in db."""
    result = []
    for entry in db:
        distance = 0.0
        # Sum the squared differences over all dimensions.
        for dim in zip(point, entry):
            distance += (dim[0] - dim[1])**2
        result.append(numpy.sqrt(distance))
    return numpy.array(result)
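# The nested loops work for any number of dimensions, but NumPy can express
# the same computation without explicit Python loops. A minimal equivalent
# sketch (all_distances_vectorized is a hypothetical name, not from the
# original):

# In[ ]:

def all_distances_vectorized(point, db):
    # db - point broadcasts point across every row of db; summing the
    # squared differences along axis 1 gives one squared distance per row.
    return numpy.sqrt(((db - point) ** 2).sum(axis=1))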
# We can reuse the simple election mechanism: the k nearest neighbors each cast a vote, and the most common label wins.

# In[9]:

import collections

def classify(point, k=5):
    distances = all_distances(point, data)
    votes = []
    for _ in range(k):
        # Find the closest remaining point ...
        winner = numpy.argmin(distances)
        votes.append(labels[winner])
        # ... and rule it out of the following rounds.
        distances[winner] = numpy.inf
    return collections.Counter(votes).most_common(1)[0][0]
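# Overwriting the minimum k times works, but numpy.argsort can select the k
# nearest neighbors directly. A minimal sketch of an equivalent classifier,
# reusing the imports and data defined above (classify_argsort is a
# hypothetical name):

# In[ ]:

def classify_argsort(point, k=5):
    # Indices of all database entries, ordered from nearest to farthest.
    order = numpy.argsort(all_distances(point, data))
    # Let the labels of the k nearest entries vote.
    return collections.Counter(labels[order[:k]]).most_common(1)[0][0]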
# Now we can test the result against the database itself. Since data is a view into raw_data, the points in raw_data are already normalized to the same scale as the database.

# In[10]:

score = 0
for point in raw_data:
    # point[0] is the true class, point[1:] are the measurements.
    if point[0] == classify(point[1:], 6):
        score += 1
print('Matched', score, 'of', len(raw_data))

# The result is quite satisfactory. However, since we are matching against the database itself, each tested point is also part of the database, and a point is always its own nearest neighbor. That is an unfair advantage compared to a real-world scenario. Eliminating this bias is left as an exercise; it is quite simple, though.
#
# You should also play around with values of k, to find the best number of neighbors to match against.
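# As a hint for both suggestions, here is a minimal sketch that skips entry i
# when classifying point i, and repeats the test for several values of k. It
# reuses the imports and arrays defined above (classify_without_self is a
# hypothetical helper, not part of the original code):

# In[ ]:

def classify_without_self(i, k):
    distances = all_distances(raw_data[i, 1:], data)
    # The point must not vote for its own label.
    distances[i] = numpy.inf
    order = numpy.argsort(distances)
    return collections.Counter(labels[order[:k]]).most_common(1)[0][0]

for k in range(1, 10):
    score = sum(1 for i in range(len(raw_data))
                if raw_data[i, 0] == classify_without_self(i, k))
    print('k =', k, '- matched', score, 'of', len(raw_data))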