#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#  Python/ROOT script testing randomness of single digits (i.e. [0-9]).
#  
#  Given a series of seemingly random digits, how would you go about testing, if they
#  were truly random? This exercise explores the many tests that such data can be
#  submitted to.
#  
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   10th of December 2015
#  
# ----------------------------------------------------------------------------------- #

#Load modules here
from ROOT import *

#Set plotting style
gStyle.SetOptStat("eMR")
gStyle.SetOptFit(1111)

# ----------------------------------------------------------------------------------- #
# Define your functions here
# ----------------------------------------------------------------------------------- #

#Calculate the sqr of a function
def sqr( a ) : return a*a

#Calculate the chi2 between a 2D histogram and a constant
#Assumes Poisson errors on histogram bin contents
def get_chi2_ndf( hist = TH2D(), const = -99999.9, D1 = True ) : 
    chi2 = 0.0
    ndf  = 0

    if D1 : 
        for ibin in range( hist.GetNbinsX() ) : 
            ndf  += 1
            chi2 += sqr( hist.GetBinContent( ibin+1 ) - const ) / sqr(hist.GetBinError( ibin+1 ))

    else : 
        for ibin in range( hist.GetNbinsX() ) : 
            for jbin in range( hist.GetNbinsY() ) : 
                ndf  += 1
                chi2 += sqr( hist.GetBinContent( ibin+1, jbin+1 ) - const ) / sqr(hist.GetBinError( ibin+1, jbin+1 ))

    return [chi2, ndf]

# ----------------------------------------------------------------------------------- #
# Define and get input
# ----------------------------------------------------------------------------------- #

#Define list of input files
infiles = ["./data_RandomDigits2015A.txt" ]
#           "./data_RandomDigits2011.txt" , #Add these when you got a good feeling
#           "./data_RandomDigits2012.txt" ] #what is going on in the 2013 case

#List containing all numbers
numbers = []

#Loop over input files open them in read mode
for ifile in infiles : 
    with open( ifile, "r" ) as current_file : 
        # Extract current file info : loop through each line in the file, loop through each character
        # in the line, demand character is not empty ("") and convert the result to an integer
        # Finally add result to the numbers list
        numbers += [int(char) for line in current_file for char in line.strip() if char is not ""]

# ----------------------------------------------------------------------------------- #
# Define your output
# ----------------------------------------------------------------------------------- #

#Where everything is draw

hist_number   = TH1D("hist_number"  , "Numbers posted"                    , 10, -0.5, 9.5               ) #Plot all digits
corr_number   = TH2D("corr_number"  , "Correlation to previous number"    , 10, -0.5, 9.5, 10, -0.5, 9.5) #Correlation to prev.

hist_odd_even = TH1D("hist_odd_even", "Even and odd numbers"              ,  2, -0.5, 1.5               ) #Is number even or odd
corr_odd_even = TH2D("corr_odd_even", "Correlation to last odd or even"   ,  2, -0.5, 1.5,  2, -0.5, 1.5) #Correlation to prev.

hist_high_low = TH1D("hist_high_low", "Above and equal to or below 5"     ,  2, -0.5, 1.5               ) #Is number >= or < 5
corr_high_low = TH2D("corr_high_low", "Corrlation to last high/low number",  2, -0.5, 1.5,  2, -0.5, 1.5) #Correlation to prev.

#Make sure your histograms scale the errors correctly when normalizing
hist_number.Sumw2()  
corr_number.Sumw2()  
hist_odd_even.Sumw2()
corr_odd_even.Sumw2()
hist_high_low.Sumw2()
corr_high_low.Sumw2()

#Fill 1d histograms
for i in range(len(numbers)) : 
    hist_number.Fill(      numbers[i]   )
    hist_odd_even.Fill(    numbers[i]%2 )
    hist_high_low.Fill(    numbers[i]/5 ) #Note use of inter devision!

    #Fill correlation plot with current and previous number
    if i > 0 : 
        corr_number.Fill(  numbers[i]  , numbers[i-1]  )
        corr_odd_even.Fill(numbers[i]%2, numbers[i-1]%2)
        corr_high_low.Fill(numbers[i]/5, numbers[i-1]/5)

#Normalize histograms
hist_number.Scale( 1.0 / hist_number.Integral() )
corr_number.Scale( 1.0 / corr_number.Integral() )
hist_odd_even.Scale( 1.0 / hist_odd_even.Integral() )
corr_odd_even.Scale( 1.0 / corr_odd_even.Integral() )
hist_high_low.Scale( 1.0 / hist_high_low.Integral() )
corr_high_low.Scale( 1.0 / corr_high_low.Integral() )

# ----------------------------------------------------------------------------------- #
# Calculate the chi^2 given a constant value (Constant is not a free parameter here)
# ----------------------------------------------------------------------------------- #

chi2_ndf_number_hist   = get_chi2_ndf( hist_number  , 1.0/10 , D1 = True  )
chi2_ndf_odd_even_hist = get_chi2_ndf( hist_odd_even, 1.0/2  , D1 = True  )
chi2_ndf_high_low_hist = get_chi2_ndf( hist_high_low, 1.0/2  , D1 = True  )

chi2_ndf_number_corr   = get_chi2_ndf( corr_number  , 1.0/100, D1 = False )
chi2_ndf_odd_even_corr = get_chi2_ndf( corr_odd_even, 1.0/4  , D1 = False )
chi2_ndf_high_low_corr = get_chi2_ndf( corr_high_low, 1.0/4  , D1 = False )

# Offset to number of degrees of freedom (choosing if the overall number of events is know, which I would say it is):
dNdof = 0

#Write your result : 
print "Compatability that distributions stem from random numbers : "
print "Raw numbers                : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_number_hist[0], chi2_ndf_number_hist[1]-dNdof, 
                                                                                           TMath.Prob(chi2_ndf_number_hist[0], chi2_ndf_number_hist[1]-dNdof))

print "Raw numbers correlation    : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_number_corr[0], chi2_ndf_number_corr[1]-dNdof, 
                                                                                           TMath.Prob(chi2_ndf_number_corr[0], chi2_ndf_number_corr[1]-dNdof))

print "Odd even                   : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_odd_even_hist[0], chi2_ndf_odd_even_hist[1]-dNdof, 
                                                                                           TMath.Prob(chi2_ndf_odd_even_hist[0], chi2_ndf_odd_even_hist[1]-dNdof))

print "Odd even correlation       : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_odd_even_corr[0], chi2_ndf_odd_even_corr[1]-dNdof, 
                                                                                           TMath.Prob(chi2_ndf_odd_even_corr[0], chi2_ndf_odd_even_corr[1]-dNdof))

print "High Low                   : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_high_low_hist[0], chi2_ndf_high_low_hist[1]-dNdof, 
                                                                                           TMath.Prob(chi2_ndf_high_low_hist[0], chi2_ndf_high_low_hist[1]-dNdof))

print "High Low correlation       : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_high_low_corr[0], chi2_ndf_high_low_corr[1]-dNdof, 
                                                                                           TMath.Prob(chi2_ndf_high_low_corr[0], chi2_ndf_high_low_corr[1]-dNdof))
                                                                                          

# ----------------------------------------------------------------------------------- #
# Fit 1D distributions and draw output
# ----------------------------------------------------------------------------------- #

canvas = TCanvas("canvas", "canvas", 1200, 900)
canvas.Divide(3,2)

hist_number.SetMinimum(0.0)
hist_odd_even.SetMinimum(0.0)
hist_high_low.SetMinimum(0.0)


canvas.cd(1); hist_number.Fit("pol0"); hist_number.Draw("e")
canvas.cd(2); hist_odd_even.Fit("pol0"); hist_odd_even.Draw("e")   
canvas.cd(3); hist_high_low.Fit("pol0"); hist_high_low.Draw("e")   
canvas.cd(4); corr_number.Draw("colz")
canvas.cd(5); corr_odd_even.Draw("colz")
canvas.cd(6); corr_high_low.Draw("colz")

canvas.Update()


# ----------------------------------------------------------------------------------- #
# Test for sequences using Poisson hypothesis (for Nseq = 2, 3, and 4):
# ----------------------------------------------------------------------------------- #

# Poisson:
# -------------
def func_Poisson(x, p) :
    if (x[0] > -0.5) :
        return p[0] * TMath.PoissonI(x[0]+0.5, p[1])
    else : return 0.0


# Count how many of each sequency:
# ------------------------------------------------------------------------ #
Hist_Dist3 = TH1F("Hist_Dist3", "Hist_Dist3", 1000, -0.5, 999.5)
for i in range(-2, len(numbers)-2) : 
    seq = 100*numbers[i] + 10*numbers[i+1] + numbers[i+2]
    Hist_Dist3.Fill(seq)

# Put all counts into histogram, which should be Poisson:
Hist_Pois3 = TH1F("Hist_Pois3", "Hist_Pois3",   41, -0.5,  40.5)
for i in range( Hist_Dist3.GetNbinsX() ) : 
    Hist_Pois3.Fill( Hist_Dist3.GetBinContent(i+1) )

fPoisson3 = TF1("fPoisson3", func_Poisson, -0.5, 15+0.5, 2)
fPoisson3.SetParameters(1000.0, 2.725)
fPoisson3.SetNpx(1000)        # Set the number of intervals used when drawing function!
fPoisson3.SetLineColor(kMagenta)
fPoisson3.Draw("same")

canvas3 = TCanvas("canvas3", "", 20, 20, 1200, 900)

Hist_Pois3.Fit("fPoisson3", "LR")
Hist_Pois3.GetXaxis().SetRangeUser(-0.5, 25.5)
Hist_Pois3.Draw("e")

canvas3.Update()


"""
# Count how many of each sequency:
# ------------------------------------------------------------------------ #
Hist_Dist4 = TH1F("Hist_Dist4", "Hist_Dist4", 1000, -0.5, 999.5)
for i in range(-3, len(numbers)-3) : 
    seq = 1000*numbers[i] + 100*numbers[i+1] + 10*numbers[i+2] + numbers[i+3]
    Hist_Dist4.Fill(seq)

# Put all counts into histogram, which should be Poisson:
Hist_Pois4 = TH1F("Hist_Pois4", "Hist_Pois4", 41, -0.5,  40.5)
for i in range( Hist_Dist4.GetNbinsX() ) : 
    Hist_Pois4.Fill( Hist_Dist4.GetBinContent(i+1) )

fPoisson4 = TF1("fPoisson4", func_Poisson, -0.5, 15+0.5, 2)
fPoisson4.SetParameters(10000.0, 0.2725)
fPoisson4.SetNpx(1000)        # Set the number of intervals used when drawing function!
fPoisson4.SetLineColor(kMagenta)
fPoisson4.Draw("same")

canvas4 = TCanvas("canvas4", "", 40, 40, 1200, 900)
canvas4.SetLogy()

Hist_Pois4.Fit("fPoisson4", "LR")
Hist_Pois4.GetXaxis().SetRangeUser(-0.5, 25.5)
Hist_Pois4.Draw("e")

canvas4.Update()
"""


raw_input( ' ... ' )


# ----------------------------------------------------------------------------------- #
# 
# First look at the random digits, and see if you see any patterns? Probably rarely, mostly
# because there are many numbers, and patterns are not that visible. So, you will have to
# work a bit...  with statistical tests.
# 
# Before even looking at the above program, think about what statistical tests you
# could submit the sample to. Then consider, how to actually carry out these tests.
# We will try in class to compile and discuss a list, before we start working on it!!!
# 
# 
# 
# Questions:
# ----------
# 1) Are each digit represented roughly equally many times? Are there as many
#    even as odd numbers? And do people have a tedency to choose an even number
#    after an odd and vice versa? How about high vs. low numbers?
# 
# 2) Are people "afraid" of putting the same digit twice in a row? And how about three
#    or four identical digits in a row?
# 
# 3) Try to count, which digits follows which digits. That should be 100 counts in total.
#    If the numbers were truly random, what would you expect then? Is that what you
#    observe?
#    Can you test this for example with a Chi-Square (think about how many numbers you
#    expect in each bin) and/or a likelihood?
# 
# 4) Were the students of 2011 and 2012 better or worse at picking random numbers?
# 
# ----------------------------------------------------------------------------------- #