#!/usr/bin/env python # coding: utf-8 # # k-Nearest Neighbors algorithm # # For further reading this tutorial is recomended # # http://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/' # # First we will generate random numbers with very simple differences, and use them to understand KNN. The data will overlap, but we hope to find a clear tendency. # # # # We will model Huey, Dewey and Louie - who share a room for at any point in time does have an afinity to stay close to their own bed. # # In[7]: import numpy import matplotlib.pyplot as plt data = numpy.zeros((500,3)) data[:,0] = numpy.random.choice([-0.5, 0.0, 0.5],size=500) data[:, 1:] = numpy.random.normal(0.5, 0.15, (500,2)) #Structure data in label and positions labels = data[:, 0] position = data[:, 1:] #Introduce bias to data position[:, 0] += labels #Views for nicer viewing huey = position[labels == -0.5] dewey = position[labels == 0.0] louie = position[labels == 0.5] plt.plot(huey[:, 0], huey[:, 1] , 'bo') plt.plot(dewey[:,0], dewey[:, 1] , 'ro') plt.plot(louie[:,0], louie[:, 1] , 'go') plt.show() # Now we can easily make a function that finds the euclidean distance between a new point and all points in the room. # In[8]: def all_dist(observation, data): return numpy.sqrt((data[:, 0] - observation[0])**2 + (data[:, 1] - observation[1])**2) # In[9]: distances = all_dist((0,0), position) votes = [] for _ in range(5): winner = numpy.argmin(distances) votes.append(labels[winner]) distances[winner] = 1000 #Just set so high that it cannot win again print(votes) # In[10]: import collections winner = collections.Counter(votes).most_common(1)[0][0] #Counter returns a list of tuples if winner == -0.5: print('I guess Huey') elif winner == 0.0: print('I guess Dewey') elif winner == 0.5: print('I guess Louie') # Lets produce a set of random Dewey positions and see how well we fare. # In[11]: wrong = 0 for _ in range(100): distances = all_dist(numpy.random.normal(0.5, 0.15, 2), position) votes = [] for _ in range(5): winner = numpy.argmin(distances) votes.append(labels[winner]) distances[winner] = 1000 #Just set so high that it cannot win again winner = collections.Counter(votes).most_common(1)[0][0] #Counter returns a list of tuples if winner != 0.0: wrong += 1 print('Got it wrong in',wrong,'cases of 100') # So obviously, while we should get most correct classifications, the afinity of our ducklings is not sufficient to get it right every time.