#!/usr/bin/env python
# coding: utf-8

# # Classifying cancer from 32 parameters
#
# Data is taken from
# https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
#
# We simply read all the data, drop the patient ID and place the label into
# an array of its own.

# In[5]:

import csv
import numpy

with open('wdbc.data') as input_file:
    text_data = [row for row in csv.reader(input_file, delimiter=',')]

for line in text_data:
    _ = line.pop(0)  # We remove the ID - no need for it

# Diagnosis letters ('M'/'B'), one per patient, joined into a single string.
known_labels = ','.join([line.pop(0) for line in text_data])

# BUG FIX: numpy.float was removed in NumPy 1.24; the builtin float is the
# documented replacement and behaves identically here.
raw_data = numpy.array(text_data).astype(float)
# Scale each feature column into [0, 1] so no single feature dominates the
# Euclidean distances used below.
data = raw_data / numpy.max(raw_data, axis=0)

# Now we can write a generic clustering mechanism, similar to the small
# previous example.

# In[6]:


def all_dist(observation, data):
    """Return the Euclidean distance from ``observation`` to every row of ``data``.

    BUG FIX: the original hard-coded columns 0 and 1 only, silently ignoring
    the remaining 28 feature columns of this data set.  This version uses
    every column and works for any dimensionality (backward compatible for
    2-column input).
    """
    return numpy.sqrt(numpy.sum((data - observation) ** 2, axis=1))


def cluster(data, k):
    """Naive k-means over the rows of ``data``.

    Parameters:
        data: 2-D array, one sample per row.
        k: number of clusters to form.

    Returns:
        1-D array giving the winning cluster index for each sample.
    """
    samples, _ = data.shape
    # BUG FIX: draw k *distinct* rows as the initial centroids.  The original
    # drew with replacement (repeated numpy.random.randint), so two centroids
    # could coincide, leaving an empty cluster whose numpy.average is NaN.
    centroids = data[numpy.random.choice(samples, k, replace=False), :]
    done = False
    while not done:
        distances = numpy.empty((k, samples))
        for d in range(k):
            distances[d, :] = all_dist(centroids[d], data)
        # Each sample joins its nearest centroid.
        winners = numpy.argmin(distances, axis=0)
        clusters = [data[winners == i, :] for i in range(k)]
        prev_centroids = centroids
        centroids = numpy.array([numpy.average(c, axis=0) for c in clusters])
        # BUG FIX: the original tested numpy.sum(prev - new) == 0, which can
        # falsely report convergence when positive and negative centroid
        # movements cancel out.  Element-wise equality is the correct test.
        if numpy.array_equal(prev_centroids, centroids):
            done = True
    return winners


# Now we can find the clusters; since we have only two categories it is
# rather fast.  We cannot know if category 0 is malign or benign, but have
# to assume that the smaller category is malign.  We thus change the labels
# to that assumption.  Then we can easily compare the classifications of
# each patient and check how well we did.
# In[7]:

# NOTE(review): the original text of this cell was corrupted by an HTML
# de-tagging pass -- every span between a '<' and the following '>' was
# removed, taking all comparison operators and several statements with it
# (e.g. "if a<b: ... len(c)>0]" survived only as "if a0]").  The lost spans
# below are reconstructed from the surviving fragments and the surrounding
# markdown, and each reconstruction is marked with a TODO to confirm against
# the original notebook.

# Cluster into the two known categories and compare with the diagnosis
# labels; the smaller cluster is assumed to be the malign one.
clusters = cluster(data, 2)
a, b = numpy.bincount(clusters)
labels = known_labels + ''
# TODO(review): reconstructed -- map 'M'/'B' onto whichever cluster index
# ended up smaller, so the strings can be compared numerically.
if a < b:
    labels = labels.replace('M', '0').replace('B', '1')
else:
    labels = labels.replace('M', '1').replace('B', '0')
numeric_labels = numpy.array(labels.split(',')).astype(int)
print(numpy.sum(numeric_labels == clusters), 'of', len(clusters),
      'correctly classified')


def cluster(data, k, centroids=()):
    """k-means that can be seeded with centroids and drops empty clusters.

    TODO(review): the original 'def' line and opening statements were lost
    to the corruption described above; this signature is inferred from the
    call sites below (always called as cluster(data, n_centroids, centroids)).

    Returns:
        (winners, centroids): cluster index per sample, and the final
        centroid array (possibly fewer than k if clusters emptied out).
    """
    samples, _ = data.shape
    if len(centroids) != k:
        # No usable seed: pick k random rows as starting centroids.
        centroids = numpy.array(
            [data[numpy.random.randint(samples), :] for _ in range(k)])
    done = False
    while not done:
        k = len(centroids)
        distances = numpy.empty((k, samples))
        for d in range(k):
            distances[d, :] = all_dist(centroids[d], data)
        winners = numpy.argmin(distances, axis=0)
        prev_centroids = centroids
        clusters = [data[winners == i, :] for i in range(k)]
        # Drop clusters that attracted no samples (surviving fragment:
        # "... 0] k = len(clusters)").
        clusters = [c for c in clusters if len(c) > 0]
        k = len(clusters)
        centroids = numpy.array([numpy.average(c, axis=0) for c in clusters])
        if len(prev_centroids) == len(centroids):
            if numpy.sum(prev_centroids - centroids) == 0:
                done = True
    return winners, centroids


# Start with many centroids and repeatedly merge the two closest ones until
# only target_k remain.
target_k = 2
n_centroids = 25
centroids = []
while n_centroids > target_k:
    clusters, centroids = cluster(data, n_centroids, centroids)
    if (n_centroids > target_k) and (len(centroids) == n_centroids):
        # Pairwise centroid distances.  NOTE(review): sqrt of the square,
        # summed afterwards, yields the Manhattan (L1) distance, not the
        # Euclidean -- this reproduces the surviving original expression;
        # confirm whether Euclidean was intended.
        centroid_dist = numpy.sum(
            numpy.sqrt((centroids[:, numpy.newaxis, :] - centroids) ** 2),
            axis=2)
        # Mask the zero self-distances on the diagonal.
        centroid_dist[centroid_dist == 0] = 1000.0
        centroids = list(centroids)
        minpos = numpy.argmin(centroid_dist)
        # Replace the two closest centroids by their midpoint.
        point0 = centroids.pop(minpos // n_centroids)
        point1 = centroids.pop((minpos % n_centroids) - 1)  # -1 because we pop
        centroids.append((point0 + point1) / 2)
        n_centroids -= 1
    else:
        # Some clusters emptied out; continue with however many survived.
        n_centroids = len(centroids)

# We have the number of required centroids now.
clusters, centroids = cluster(data, n_centroids, centroids)
a, b = numpy.bincount(clusters)
labels = known_labels + ''
# TODO(review): everything from "if a<..." to the end of the file was lost
# in extraction; reconstructed to mirror the comparison performed earlier
# in this cell.
if a < b:
    labels = labels.replace('M', '0').replace('B', '1')
else:
    labels = labels.replace('M', '1').replace('B', '0')
numeric_labels = numpy.array(labels.split(',')).astype(int)
print(numpy.sum(numeric_labels == clusters), 'of', len(clusters),
      'correctly classified')