##############################
# D. Jason Koskinen
# Dec. 22, 2015
#
# The exercise herein is to take a txt file and read in multiple
# data sets of X and Y input and calculate basic statistics
# quantities.
#
# Everything is read into normal Python lists and then converted
# to numpy arrays so that the calculations are easy.
#
# The data set is somewhat of a classic in statistics and is
# known as Anscombe's quartet.
##############################
import numpy as np
from scipy import stats


def read_datasets(path):
    """Parse *path* into a list of data sets, each a list of [x, y] pairs.

    File format (as used for this exercise):
      * a line with exactly 3 whitespace-separated tokens marks the
        start of a new data set (a header such as 'x y 1');
      * a line whose first token parses as a number is a data row
        whose first two columns are x and y;
      * anything else (blank lines, other text) is skipped.
    """
    # A list of lists-of-pairs; normally lists of lists are a bad
    # sign, but it works out fine for this small exercise.
    datasets = []
    with open(path) as infile:
        for line in infile:
            tokens = line.split()
            if len(tokens) == 3:
                # Header line: the data set changes here. 'continue'
                # so a header is never also parsed as a data row.
                datasets.append([])
                continue
            if not tokens:
                continue
            try:
                # Values read from text are strings, so cast to float
                # explicitly. Parsing with float() is more general
                # than str.isdigit(): it also accepts float-valued or
                # negative x, which isdigit() would silently drop.
                x = float(tokens[0])
                y = float(tokens[1])
            except (ValueError, IndexError):
                continue
            datasets[-1].append([x, y])
    return datasets


def analyze(datasets, slope=0.48, intercept=3.02, sigma=1.22):
    """Print variance, linear-regression, and chi-squared summaries.

    *slope* and *intercept* are the fixed reference line from the
    exercise (y = 0.48 x + 3.02); *sigma* is the assumed +/- 1.22
    uncertainty on each y measurement.
    """
    for i, dataset in enumerate(datasets):
        data = np.asarray(dataset)

        # linregress on an (N, 2) array splits it along the length-2
        # dimension into x and y. Computed once and reused (the
        # original recomputed it for every use).
        fit_slope, fit_intercept = stats.linregress(data)[:2]

        # Flattening the (N, 2) array includes the x column in the
        # variance, which is not what we want. Kept deliberately as
        # part of the exercise.
        print("Variance for dataset %i: %f (WRONG VALUE)" % (i, np.var(data)))
        # axis=0 gives a per-column variance; [1] selects the y column.
        # Two ways to think about the degrees of freedom:
        #   A) the line is given, so dof = N       -> ddof=0 (biased)
        #   B) use the 'unbiased' estimator, N - 1 -> ddof=1
        # numpy defaults to ddof=0.
        print("Variance for dataset %i: %f (CORRECT VALUE FOR BIASED VARIANCE)" % (i, np.var(data, axis=0, ddof=0)[1]))
        print("Variance for dataset %i: %f (CORRECT VALUE FOR UNBIASED VARIANCE)" % (i, np.var(data, axis=0, ddof=1)[1]))
        print("linear regression: y=%0.2fx + %0.2f" % (fit_slope, fit_intercept))

        # Observed data are the y values (2nd column). Expected values
        # come from the fixed reference line evaluated at each x.
        observed = data[:, 1]
        expected = [x * slope + intercept for x in data[:, 0]]

        chisq_value = 0.0      # chi-squared with variance = expected value
        chisq_valuewith = 0.0  # chi-squared with fixed sigma uncertainty
        for exp_y, obs_y in zip(expected, observed):
            resid_sq = (exp_y - obs_y) * (exp_y - obs_y)
            chisq_value += resid_sq / exp_y
            chisq_valuewith += resid_sq / (sigma * sigma)

        print("chi-squared By hand: ", chisq_value)
        # NOTE(review): recent SciPy versions require
        # sum(observed) ~= sum(expected) in chisquare and raise
        # ValueError otherwise -- confirm against the installed version.
        print("chi-squared From SciPy: ", stats.chisquare(observed, expected)[0])
        print("chi-squared (w/ \pm 1.22 uncertainty): ", chisq_valuewith)
        # Deliberately divides by N (the line is given), not by
        # N minus the number of fitted parameters.
        print("Reduced chi-squared: ", chisq_value / len(data))
        print("Reduced chi-squared (w/ \pm 1.22 uncertainty): ", chisq_valuewith / len(data))
        print("\n\n")


def main():
    """Read Anscombe's quartet from disk and print the summaries."""
    datasets = read_datasets("../data/FranksNumbers.txt")
    analyze(datasets)

    # There is a larger question here related to the calculation of
    # the chi-squared value: we can compute it, but if we do not know
    # what the data actually are (money, number of cows, speed of a
    # toddler, etc.) can the chi-squared or the reduced chi-squared
    # tell us anything meaningful?


if __name__ == "__main__":
    main()