######################################################################
# D. Jason Koskinen
# April 29, 2016
#
# So this is the code to check the students
# submissions for their classification algorithms
# for the cancer question on the 2016 exam for
# Advanced Methods in Applied Statistics
######################################################################

from array import array
import io
import numpy
import os
import root_numpy as rnp

import random
import ROOT
from ROOT import *

import sys

# Presumably a switch for writing plots to disk; nothing in the visible
# part of this script reads it.
savePlots = False


# Global ROOT style settings (gStyle is provided by the wildcard ROOT
# import above): axis-title text size for the x, y and z axes, axis-title
# offset for the x and y axes, and the histogram line width.
gStyle.SetTitleSize( 0.055,"xyz")
gStyle.SetTitleOffset( 0.66,"xy")
gStyle.SetHistLineWidth( 2)

##############################
# Now check
##############################

lsdir = os.listdir("/Users/koskinen/Documents/Courses/AdvancedMethodsInAppliedStatistics2016/StudentExams/Advanced Methods in Applied Statistics_174202-2")

benign_student_files    = []
malignant_student_files = []

# Look through the list of files from the
# exam submissions and grab those
# that match the file names and file types
# which were explicitly part of the exam
# question for the classification exercise.
# It is not required to use a BDT, but
# the grading is based on the final
# purity in correctly classifying the
# sample as either benign or malignant.

for entry in lsdir:
    if 'BENIGN' in entry.upper():
        benign_student_files.append("/Users/koskinen/Documents/Courses/AdvancedMethodsInAppliedStatistics2016/StudentExams/Advanced Methods in Applied Statistics_174202-2/%s" % entry)
        print entry
    # end if
    if 'MALIGNANT' in entry.upper():
        malignant_student_files.append("/Users/koskinen/Documents/Courses/AdvancedMethodsInAppliedStatistics2016/StudentExams/Advanced Methods in Applied Statistics_174202-2/%s" % entry)
        print entry
    # end if
# end if

infile_mal_true = numpy.loadtxt("data/malignant_true.txt", delimiter = " ")
infile_ben_true = numpy.loadtxt("data/benign_true.txt", delimiter = " ")

for ben, mal in zip( benign_student_files, malignant_student_files):
    print ben

    infile_mal_ID = numpy.loadtxt( mal)
    infile_ben_ID = numpy.loadtxt( ben)

    overlap_mal = numpy.intersect1d(infile_mal_true[0:], infile_mal_ID[0:])
    overlap_ben = numpy.intersect1d(infile_ben_true[0:], infile_ben_ID[0:])

    print "len(infile_mal_true): ", len(infile_mal_true)
    print "len(overlap_mal): ", len(overlap_mal)
    
    print "len(infile_ben_true): ", len(infile_ben_true)
    print "len(overlap_ben): ", len(overlap_ben)

    print "efficiency: ", (len(overlap_mal)+len(overlap_ben))*1.0/(len(infile_mal_true)+len(infile_ben_true))
    print "\n\n"
# end for


# Block until the user presses Enter so the process does not exit as soon
# as the results have been printed.
raw_input('Press Enter to exit')