#!/usr/bin/env python
# coding: utf-8

# # Classifying cancer from 32 parameters
#
# Data is taken from
# https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
#
# We simply read all the data, drop the patient ID and place the label into
# an array of its own.

# In[5]:

import csv
import numpy

with open('wdbc.data') as input_file:
    text_data = [row for row in csv.reader(input_file, delimiter=',')]

for line in text_data:
    _ = line.pop(0)  # We remove the ID - no need for it

# Diagnosis letters ('M'/'B'), one per patient, joined into a single string.
known_labels = ','.join([line.pop(0) for line in text_data])

# BUG FIX: numpy.float was removed in NumPy 1.24; the builtin float is the
# documented replacement and behaves identically here.
raw_data = numpy.array(text_data).astype(float)
# Scale each feature column into [0, 1] so no single feature dominates the
# Euclidean distances used below.
data = raw_data / numpy.max(raw_data, axis=0)

# Now we can write a generic clustering mechanism, similar to the small
# previous example.

# In[6]:


def all_dist(observation, data):
    """Return the Euclidean distance from ``observation`` to every row of ``data``.

    BUG FIX: the original hard-coded columns 0 and 1 only, silently ignoring
    the remaining 28 feature columns of this data set.  This version uses
    every column and works for any dimensionality (backward compatible for
    2-column input).
    """
    return numpy.sqrt(numpy.sum((data - observation) ** 2, axis=1))


def cluster(data, k):
    """Naive k-means over the rows of ``data``.

    Parameters:
        data: 2-D array, one sample per row.
        k: number of clusters to form.

    Returns:
        1-D array giving the winning cluster index for each sample.
    """
    samples, _ = data.shape
    # BUG FIX: draw k *distinct* rows as the initial centroids.  The original
    # drew with replacement (repeated numpy.random.randint), so two centroids
    # could coincide, leaving an empty cluster whose numpy.average is NaN.
    centroids = data[numpy.random.choice(samples, k, replace=False), :]
    done = False
    while not done:
        distances = numpy.empty((k, samples))
        for d in range(k):
            distances[d, :] = all_dist(centroids[d], data)
        # Each sample joins its nearest centroid.
        winners = numpy.argmin(distances, axis=0)
        clusters = [data[winners == i, :] for i in range(k)]
        prev_centroids = centroids
        centroids = numpy.array([numpy.average(c, axis=0) for c in clusters])
        # BUG FIX: the original tested numpy.sum(prev - new) == 0, which can
        # falsely report convergence when positive and negative centroid
        # movements cancel out.  Element-wise equality is the correct test.
        if numpy.array_equal(prev_centroids, centroids):
            done = True
    return winners


# Now we can find the clusters; since we have only two categories it is
# rather fast.  We cannot know if category 0 is malign or benign, but have
# to assume that the smaller category is malign.  We thus change the labels
# to that assumption.  Then we can easily compare the classifications of
# each patient and check how well we did.
# In[7]:

# NOTE(review): the original text of this cell was corrupted by an HTML
# de-tagging pass -- every span between a '<' and the following '>' was
# removed, taking all comparison operators and several statements with it
# (e.g. "if a<b: ... len(c)>0]" survived only as "if a0]").  The lost spans
# below are reconstructed from the surviving fragments and the surrounding
# markdown, and each reconstruction is marked with a TODO to confirm against
# the original notebook.

# Cluster into the two known categories and compare with the diagnosis
# labels; the smaller cluster is assumed to be the malign one.
clusters = cluster(data, 2)
a, b = numpy.bincount(clusters)
labels = known_labels + ''
# TODO(review): reconstructed -- map 'M'/'B' onto whichever cluster index
# ended up smaller, so the strings can be compared numerically.
if a < b:
    labels = labels.replace('M', '0').replace('B', '1')
else:
    labels = labels.replace('M', '1').replace('B', '0')
numeric_labels = numpy.array(labels.split(',')).astype(int)
print(numpy.sum(numeric_labels == clusters), 'of', len(clusters),
      'correctly classified')


def cluster(data, k, centroids=()):
    """k-means that can be seeded with centroids and drops empty clusters.

    TODO(review): the original 'def' line and opening statements were lost
    to the corruption described above; this signature is inferred from the
    call sites below (always called as cluster(data, n_centroids, centroids)).

    Returns:
        (winners, centroids): cluster index per sample, and the final
        centroid array (possibly fewer than k if clusters emptied out).
    """
    samples, _ = data.shape
    if len(centroids) != k:
        # No usable seed: pick k random rows as starting centroids.
        centroids = numpy.array(
            [data[numpy.random.randint(samples), :] for _ in range(k)])
    done = False
    while not done:
        k = len(centroids)
        distances = numpy.empty((k, samples))
        for d in range(k):
            distances[d, :] = all_dist(centroids[d], data)
        winners = numpy.argmin(distances, axis=0)
        prev_centroids = centroids
        clusters = [data[winners == i, :] for i in range(k)]
        # Drop clusters that attracted no samples (surviving fragment:
        # "... 0] k = len(clusters)").
        clusters = [c for c in clusters if len(c) > 0]
        k = len(clusters)
        centroids = numpy.array([numpy.average(c, axis=0) for c in clusters])
        if len(prev_centroids) == len(centroids):
            if numpy.sum(prev_centroids - centroids) == 0:
                done = True
    return winners, centroids


# Start with many centroids and repeatedly merge the two closest ones until
# only target_k remain.
target_k = 2
n_centroids = 25
centroids = []
while n_centroids > target_k:
    clusters, centroids = cluster(data, n_centroids, centroids)
    if (n_centroids > target_k) and (len(centroids) == n_centroids):
        # Pairwise centroid distances.  NOTE(review): sqrt of the square,
        # summed afterwards, yields the Manhattan (L1) distance, not the
        # Euclidean -- this reproduces the surviving original expression;
        # confirm whether Euclidean was intended.
        centroid_dist = numpy.sum(
            numpy.sqrt((centroids[:, numpy.newaxis, :] - centroids) ** 2),
            axis=2)
        # Mask the zero self-distances on the diagonal.
        centroid_dist[centroid_dist == 0] = 1000.0
        centroids = list(centroids)
        minpos = numpy.argmin(centroid_dist)
        # Replace the two closest centroids by their midpoint.
        point0 = centroids.pop(minpos // n_centroids)
        point1 = centroids.pop((minpos % n_centroids) - 1)  # -1 because we pop
        centroids.append((point0 + point1) / 2)
        n_centroids -= 1
    else:
        # Some clusters emptied out; continue with however many survived.
        n_centroids = len(centroids)

# We have the number of required centroids now.
clusters, centroids = cluster(data, n_centroids, centroids)
a, b = numpy.bincount(clusters)
labels = known_labels + ''
# TODO(review): everything from "if a<..." to the end of the file was lost
# in extraction; reconstructed to mirror the comparison performed earlier
# in this cell.
if a < b:
    labels = labels.replace('M', '0').replace('B', '1')
else:
    labels = labels.replace('M', '1').replace('B', '0')
numeric_labels = numpy.array(labels.split(',')).astype(int)
print(numpy.sum(numeric_labels == clusters), 'of', len(clusters),
      'correctly classified')