#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
#  Python macro for testing Raphael Weldon's dice.
#
# ----------------------------------------------------------------------------------- #
#
#  Walter Frank Raphael Weldon DSc FRS (15 March 1860 - 13 April 1906),
#  generally called Raphael Weldon, was an English evolutionary
#  biologist and a founder of biometry. He was the joint founding
#  editor of Biometrika, with Francis Galton and Karl Pearson.
#
#  http://en.wikipedia.org/wiki/Walter_Frank_Raphael_Weldon
#
#  Weldon and his two colleagues (Francis Galton and Karl Pearson) were
#  interested in statistics (to say the least!), and in order to do a
#  simple hypothesis test, Weldon rolled 12 dice 26306 times
#  (i.e. 315672 throws), which he wrote about in a letter to Galton
#  dated 2nd of February 1894 (which is how we have this data).
#
#  There were actually four data sets, as follows:
#
#  I: 12 dice were rolled 26306 times, and the number of 5s and 6s was
#     counted, with the following result:
#        0     1     2     3     4     5     6     7     8     9    10    11    12
#      185  1149  3265  5475  6114  5194  3067  1331   403   105    14     4     0
#
#  II: 7006 of the 26306 experiments were performed by a clerk, deemed
#      by Galton to be "reliable and accurate", yielding:
#        0     1     2     3     4     5     6     7     8     9    10    11    12
#       45   327   886  1475  1571  1404   787   367   112    29     2     1     0
#
#  III: In this subset of the data, 4096 of the rolls were scrutinized,
#       with only a 6 counting as a success:
#        0     1     2     3     4     5     6     7     8     9    10    11    12
#      447  1145  1181   796   380   115    24     8     0     0     0     0     0
#
#  IV: Finally, a subset of 4096 rolls was considered, counting 4s, 5s
#      and 6s as successes:
#        0     1     2     3     4     5     6     7     8     9    10    11    12
#        0     7    60   198   430   731   948   847   536   257    71    11     0
#
#  References:
#    Kemp, A.W. and Kemp, C.D. (1991), "Weldon's dice data revisited",
#    American Statistician, 45, 216-222.
#
# ----------------------------------------------------------------------------------- #
#
#  Author: Troels C. Petersen
#  Email:  petersen@nbi.dk
#  Date:   4th of December 2015
#
# ----------------------------------------------------------------------------------- #
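# ----------------------------------------------------------------------------------- #
#
#  The statistical model used throughout (stated here for reference): if a single
#  die shows a success (e.g. a 5 or a 6) with probability p, then the number of
#  successes k among the n = 12 dice in one throw follows a binomial distribution,
#
#      P(k) = n! / (k! * (n-k)!) * p^k * (1-p)^(n-k),
#
#  so the expected count in bin k is N_exp(k) = N_throws * P(k). For a perfectly
#  fair die the "naive" hypothesis is p(5 or 6) = 2/6 = 1/3, which is what is
#  tested first below.
#
# ----------------------------------------------------------------------------------- #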
from ROOT import *
from array import array
from math import sqrt, log     # sqrt and log are used below, and "from ROOT import *" does not provide them

# ----------------------------------------------------------------------------------- #

def sqr(a):
    return a*a

# ----------------------------------------------------------------------------------- #

# Setting what is to be shown in the statistics box:
gStyle.SetOptStat("")
gStyle.SetOptFit(0)

# Random numbers from the Mersenne-Twister:
r = TRandom3()
r.SetSeed(0)

# Settings for the script:
verbose   = True
SavePlots = False
Ndices    = 12
Noutcome  = 13

# ------------------------------------------------------------------ #
# Data:
# ------------------------------------------------------------------ #

sum1  = 0
data1 = [185, 1149, 3265, 5475, 6114, 5194, 3067, 1331, 403, 105, 14,  4, 0]
data2 = [ 45,  327,  886, 1475, 1571, 1404,  787,  367, 112,  29,  2,  1, 0]
data3 = [447, 1145, 1181,  796,  380,  115,   24,    8,   0,   0,  0,  0, 0]
data4 = [  0,    7,   60,  198,  430,  731,  948,  847, 536, 257, 71, 11, 0]

print "  N   data1  data2  data3  data4"
for i in range(Noutcome):
    if verbose:
        print " %2d:  %5d  %5d  %5d  %5d"%(i, data1[i], data2[i], data3[i], data4[i])
    sum1 += data1[i]

# ------------------------------------------------------------------ #
# Testing the simple hypothesis (p_naive = 1/3):
# ------------------------------------------------------------------ #

# Make histograms with data and prediction:
Hist_data1 = TH1F("Hist_data1", "Weldon's dice", Noutcome, -0.5, Noutcome-0.5)
Hist_bino1 = TH1F("Hist_bino1", "Distribution of hypothesis", Noutcome, -0.5, Noutcome-0.5)

# Make factorial array:
factorial = [0]*Noutcome
factorial[0] = 1.0
for i in range(1, Noutcome):
    factorial[i] = factorial[i-1] * i

# Make the prediction from the "naive" (i.e. p(5 or 6) = 1/3) guess:
p_naive = 1.0 / 3.0
chi2 = 0.0
Ndof = 0
print " - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
print "  N     data          naive pred (p=1/3)   dChi2"
for i in range(Noutcome):
    p_bin = factorial[Ndices] / (factorial[i] * factorial[Ndices-i]) * pow(p_naive, i) * pow(1.0-p_naive, Ndices-i)
    bino = sum1 * p_bin
    Hist_data1.SetBinContent(i+1, data1[i])
    Hist_data1.SetBinError(i+1, sqrt(data1[i]))
    Hist_bino1.SetBinContent(i+1, bino)
    Hist_bino1.SetBinError(i+1, 0.0)
    dchi2 = 0.0                        # Empty bins contribute nothing to the Chi2
    if data1[i] > 0.0:
        dchi2 = sqr(data1[i]-bino)/data1[i]
        chi2 += dchi2
        Ndof += 1
    if verbose:
        print " %2d: %7.1f +- %5.1f   vs. %7.1f   %6.2f"%(i, data1[i], sqrt(data1[i]), bino, dchi2)
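# The Chi2 above uses the Poisson error estimated from the data, sigma_i^2 = O_i,
# which is why empty bins are skipped and Ndof counts only the bins actually used.
# As a minimal cross-check (not part of the original macro), one can recompute it
# in Pearson's classic form with the prediction in the denominator, sigma_i^2 = E_i;
# for counts this large the two versions should agree closely:
chi2_pearson = 0.0
for i in range(Noutcome):
    p_bin = factorial[Ndices] / (factorial[i] * factorial[Ndices-i]) * pow(p_naive, i) * pow(1.0-p_naive, Ndices-i)
    Nexp = sum1 * p_bin
    if data1[i] > 0.0:
        chi2_pearson += sqr(data1[i] - Nexp) / Nexp
if verbose:
    print " Pearson Chi2 (errors from prediction): %7.2f"%(chi2_pearson)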
# Plot histograms with data and prediction:
canvas = TCanvas("canvas", "", 20, 20, 900, 600)
# Hist_data1.SetMinimum(0.0)
Hist_data1.SetLineWidth(2)
Hist_data1.SetLineColor(kRed)
Hist_data1.Draw("e")
Hist_bino1.SetLineWidth(2)
Hist_bino1.SetLineColor(kBlue)
Hist_bino1.SetLineStyle(2)
Hist_bino1.Draw("same")

pvalue_naive = TMath.Prob(chi2, Ndof)     # Ndof = number of non-empty bins, counted above
print " ChiSquare between data and prediction: %7.2f"%(chi2)
print " Probability of matching data:          %8.6f"%(pvalue_naive)

canvas.Update()
if SavePlots:
    canvas.SaveAs("Binomial_WeldonDices_NaiveHypothesis.pdf")

# ------------------------------------------------------------------ #
# Chi2 and LLH fit:
# ------------------------------------------------------------------ #

# Choose the range of p(5 or 6) values and the number of scan points:
Nbins = 200
p_min = 0.33
p_max = 0.35
delta = (p_max-p_min) / Nbins
Hist_Chi2 = TH1F("Hist_Chi2", "Distribution of Chi2 values", Nbins+1, p_min-delta/2.0, p_max+delta/2.0)
Hist_LLH  = TH1F("Hist_LLH",  "Distribution of -2ln(likelihood) values", Nbins+1, p_min-delta/2.0, p_max+delta/2.0)

for ip in range(Nbins+1):
    p_fit = p_min + (p_max-p_min) * ip / Nbins     # Value of p(5 or 6) to be considered
    chi2 = 0.0
    llh  = -99.9      # Shift the -2ln(llh) to match the Chi2 scale (offset found from running once!)
    for i in range(Noutcome):
        p_bin = factorial[Ndices] / (factorial[i] * factorial[Ndices-i]) * pow(p_fit, i) * pow(1.0-p_fit, Ndices-i)
        Nexp_bino = sum1 * p_bin
        Nobs_data = data1[i]
        if Nobs_data > 0.0:
            chi2 += sqr(Nobs_data - Nexp_bino) / Nobs_data
            llh  += -2.0*log(TMath.Poisson(Nobs_data, Nexp_bino))
    Hist_Chi2.SetBinContent(ip+1, chi2)
    Hist_Chi2.SetBinError(ip+1, 0.001)
    Hist_LLH.SetBinContent(ip+1, llh)
    Hist_LLH.SetBinError(ip+1, 0.001)

# Plot the two scans, fitting each with a parabola around the minimum:
canvas2 = TCanvas("canvas2", "", 50, 50, 900, 600)

chi2fit = TF1("chi2fit", "[0] + [1]*(x-[2])*(x-[2])", 0.333, 0.340)
chi2fit.SetParameters(Noutcome, 1000000.0, 0.3378)
Hist_Chi2.Fit("chi2fit", "r")
Hist_Chi2.GetXaxis().SetRangeUser(0.3328, 0.3402)
Hist_Chi2.SetLineWidth(2)
Hist_Chi2.SetLineColor(kBlue)

llhfit = TF1("llhfit", "[0] + [1]*(x-[2])*(x-[2])", 0.333, 0.340)
llhfit.SetParameters(Noutcome, 1000000.0, 0.3378)
Hist_LLH.Fit("llhfit", "r")
Hist_LLH.SetLineWidth(2)
Hist_LLH.SetLineColor(kRed)

Hist_Chi2.Draw()
Hist_LLH.Draw("same")
canvas2.Update()
if SavePlots:
    canvas2.SaveAs("Chi2andLLHparabolas.pdf")

# Print some central information:
chi2_fit_chi2 = chi2fit.GetParameter(0)
Ndof = Noutcome - 1     # There are no free fit parameters, but one bin is empty!
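# A note on reading off the result (added for reference): near its minimum the scan
# is parabolic, Chi2(p) = [0] + [1]*(p - [2])^2, so [2] is the best-fit probability
# and [0] the Chi2 at the minimum. The one-sigma uncertainty is where the Chi2 (or
# -2ln(llh)) has risen by 1 above the minimum, i.e. where [1]*sigma^2 = 1, giving
# sigma = 1/sqrt([1]), as used below.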
p_fit_chi2  = chi2fit.GetParameter(2)
ep_fit_chi2 = 1.0 / sqrt(chi2fit.GetParameter(1))
print "\n Result for the ChiSquare fit:"
print "   The probability p(5 or 6) is estimated to be:  %7.5f +- %7.5f"%(p_fit_chi2, ep_fit_chi2)
print "   The distance to the naive hypothesis is:       %4.1f sigma"%((p_fit_chi2-p_naive) / ep_fit_chi2)
print "   The probability of the best fit hypothesis is: Prob(%4.1f, %2d) = %5.3f"%(chi2_fit_chi2, Ndof, TMath.Prob(chi2_fit_chi2, Ndof))

# Note that the distance (in number of sigmas) to the naive hypothesis could also
# have been obtained as: Nsigma = sqrt(chi2_naive - chi2_bestfit).
# The numbers here are: sqrt(35.7 - 9.8) = sqrt(25.9) = 5.1 sigma.

p_fit_llh  = llhfit.GetParameter(2)
ep_fit_llh = 1.0 / sqrt(llhfit.GetParameter(1))
print "\n Result for the likelihood fit:"
print "   The probability p(5 or 6) is estimated to be:  %7.5f +- %7.5f"%(p_fit_llh, ep_fit_llh)
print "   The distance to the naive hypothesis is:       %4.1f sigma"%((p_fit_llh-p_naive) / ep_fit_llh)
print "   The probability of the best fit hypothesis... cannot be calculated directly, as for the Chi2!"

canvas2.Update()
if SavePlots:
    canvas2.SaveAs("Binomial_WeldonDices_Chi2andLLHfit.pdf")

# ------------------------------------------------------------------ #
# Simple one-sample test (using the average number as test statistic):
# ------------------------------------------------------------------ #

Ntotal  = 0.0
average = 0.0
for i in range(Noutcome):
    Ntotal  += data1[i]
    average += i*data1[i]
average = average / Ntotal

rms = 0.0
for i in range(Noutcome):
    rms += data1[i] * sqr(float(i) - average)
rms = sqrt(rms / (Ntotal-1.0))    # Standard deviation of the number of successes per throw
error_ave = rms/sqrt(Ntotal)
print " Rough estimate of p(5 or 6) = %6.4f +- %6.4f \n"%(average/Ndices, error_ave/Ndices)

# ------------------------------------------------------------------ #
# Simple ROOT-based fit to obtain the best value of p(5 or 6):
#
# This is of course the best way to do things, as ROOT is MUCH
# better at fitting than I am, even for the simplest 1D case!
# ------------------------------------------------------------------ #

def func_Binomial(x, p):
    # Binomial prediction: p[0] = normalisation, p[1] = number of dice, p[2] = p(success).
    if x[0] > -0.5:
        k = int(x[0]+0.5)
        return p[0] * TMath.Binomial(int(p[1]), k) * TMath.Power(p[2], k) * TMath.Power(1.0-p[2], int(p[1])-k)
    else:
        return 0.0

fit_bino = TF1("fit_bino", func_Binomial, -0.5, Ndices+0.5, 3)
fit_bino.SetParameters(sum1, Ndices, 1.0/3.0)
fit_bino.FixParameter(0, sum1)
fit_bino.FixParameter(1, Ndices)
fit_bino.SetNpx(1000)     # Set the number of intervals used when drawing the function!
fit_bino.SetLineColor(kBlue)

canvas3 = TCanvas("canvas3", "", 80, 80, 900, 600)
# canvas3.SetLogy()
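# A note on the two fit calls below (added for reference): parameters [0]
# (normalisation) and [1] (number of dice) are fixed, so only p(5 or 6) is free.
# The first Fit call uses ROOT's default Chi2 method; the second uses option "L"
# for a binned likelihood fit, which handles the sparsely populated bins at high
# N better, and "+" to keep both fitted functions on the histogram.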
Hist_data1.Fit("fit_bino", "")       # Default Chi2 fit
Hist_data1.Fit("fit_bino", "L+")     # Binned likelihood fit, added on top
Hist_bino1.Draw("same")

# Legend:
leg = TLegend(0.50, 0.65, 0.89, 0.89)
leg.SetFillColor(kWhite)
leg.SetLineColor(kWhite)
leg.AddEntry(Hist_data1, " Data", "L")
leg.AddEntry(Hist_bino1, " Naive prediction (p56 = 1/3, pChi2 = 0.0007)", "L")
leg.AddEntry(fit_bino,   " Best fit (p56 = 0.3377, pChi2 = 0.64)", "L")
leg.Draw()

canvas3.Update()
if SavePlots:
    canvas3.SaveAs("ROOTfits.pdf")

raw_input('Press Enter to exit')

# ----------------------------------------------------------------------------------- #
#
# Questions:
# ----------
#  0) What distribution should the number of 5s and 6s follow? And what uncertainty
#     would you assign to each bin?
#
#  1) Consider first just the first (and largest) dataset. Plot the data along with
#     the "naive" prediction, and calculate the Chi2. What is the probability of this
#     Chi2, given the number of degrees of freedom? Is the naive p = 1/3 very likely?
#
#  2) Next, consider an alternative hypothesis with a "non-naive" value of the
#     probability of a 5 or a 6, and find the optimal value of this probability by
#     scanning through the possible values and calculating the Chi2 for each of them.
#     Fitting (the minimum of) this Chi2 distribution, what is the uncertainty on this
#     probability? And how probable is the Chi2 of this new hypothesis?
#
#  3) Many of the bins don't have much data in them. Therefore, repeat the above scan,
#     but now calculating the likelihood (or rather -2*log(llh)). Does this improve
#     the measurement of the probability?
#
#  4) Do the same exercise for the other three datasets. Do they all show the same
#     behaviour, and is the large statistics needed?
#
#
# Advanced questions:
# -------------------
#  1) Assuming that the dice rolled all had the same probabilities associated with
#     each value, do a "global fit" of all four datasets, finding the probabilities
#     which best explain all four datasets. A minimal sketch of one possible starting
#     point is given below.
#
# ----------------------------------------------------------------------------------- #
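# ----------------------------------------------------------------------------------- #
#
# Sketch for advanced question 1 (an illustration only, under an assumed model, and
# not part of the original macro): suppose faces 4, 5 and 6 share one common
# probability q, with faces 1-3 sharing the remainder. Then p(5 or 6) = 2q for
# datasets I and II, p(6) = q for dataset III, and p(4, 5 or 6) = 3q for dataset IV,
# and a single scan of q over the combined Chi2 gives a global estimate (note that
# this deliberately ignores the fact that datasets II-IV are subsets of dataset I):
#
#   def chi2_dataset(data, p):
#       Nthrows = sum(data)
#       chi2 = 0.0
#       for k in range(Noutcome):
#           p_bin = factorial[Ndices] / (factorial[k] * factorial[Ndices-k]) * pow(p, k) * pow(1.0-p, Ndices-k)
#           if data[k] > 0:
#               chi2 += sqr(data[k] - Nthrows*p_bin) / data[k]
#       return chi2
#
#   best_q, best_chi2 = 0.0, 1.0e9
#   for iq in range(1001):
#       q = 0.160 + 0.020 * iq / 1000.0     # Scan window around 1/6 (assumed)
#       chi2_tot = ( chi2_dataset(data1, 2.0*q) + chi2_dataset(data2, 2.0*q)
#                  + chi2_dataset(data3, q)     + chi2_dataset(data4, 3.0*q) )
#       if chi2_tot < best_chi2:
#           best_q, best_chi2 = q, chi2_tot
#   print " Global fit under the equal-bias model: q = %6.4f (Chi2 = %6.1f)"%(best_q, best_chi2)
#
# ----------------------------------------------------------------------------------- #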