#!/usr/bin/env python
#
# ----------------------------------------------------------------------------------- #
#
# Python macro for illustrating test statistics, among them the Kolmogorov-Smirnov test.
#
# The Kolmogorov-Smirnov test (KS-test) is a general test of whether two distributions
# in 1D are the same. This program applies both a binned and an unbinned KS test, and
# compares them to a Chi2 test and a simple comparison of means.
#
# The distributions compared are two unit Gaussians, where one is then modified, and
# the sensitivity of each test is considered.
#
# References:
#   Barlow:
#   http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test
#
# Author: Troels C. Petersen (NBI)
# Email:  petersen@nbi.dk
# Date:   22nd of September 2014
#
# ----------------------------------------------------------------------------------- #

from ROOT import *
from array import array
from math import sqrt

# ----------------------------------------------------------------------------------- #

def sqr(a) : return a*a

# ----------------------------------------------------------------------------------- #

# Set what is shown in the statistics box:
gStyle.SetOptStat("emr")
gStyle.SetOptFit(1111)

# Random numbers from the Mersenne-Twister:
r = TRandom3()
r.SetSeed(1)

verbose = True

# ----------------------------------------------------------------------------------- #
# Make histograms:
# ----------------------------------------------------------------------------------- #

# Resulting p-values for each test (means, Chi-Squared binned, KS binned, KS unbinned):
Hist_ProbMu  = TH1F("Hist_ProbMu",  "Hist_ProbMu",  50, 0.0, 1.0)
Hist_ProbCSb = TH1F("Hist_ProbCSb", "Hist_ProbCSb", 50, 0.0, 1.0)
Hist_ProbKSb = TH1F("Hist_ProbKSb", "Hist_ProbKSb", 50, 0.0, 1.0)
Hist_ProbKS  = TH1F("Hist_ProbKS",  "Hist_ProbKS",  50, 0.0, 1.0)

# ----------------------------------------------------------------------------------- #
# Generate and test data:
# ----------------------------------------------------------------------------------- #

# How many experiments, and how many events in each:
Nexp    = 1
Nevents = 1000

# Define the two Gaussians to be generated:
dist_meanA  = 0.0
dist_widthA = 1.0
dist_meanB  = 0.0
dist_widthB = 1.0

# Two distributions to be compared:
Hist_gaussA = TH1F("Hist_gaussA", "Hist_gaussA", 100, -5.0, 5.0)
Hist_gaussB = TH1F("Hist_gaussB", "Hist_gaussB", 100, -5.0, 5.0)

for iexp in range(Nexp) :

    # Make empty arrays, lists, and sums (index 1: sum of x, index 2: sum of x^2):
    x_gaussA_array = array( 'd' )
    x_gaussB_array = array( 'd' )
    x_gaussA = []
    x_gaussB = []
    sumA = [0.0, 0.0, 0.0]
    sumB = [0.0, 0.0, 0.0]

    # Generate data:
    # --------------
    for i in range(Nevents) :

        xA = r.Gaus(dist_meanA, dist_widthA)
        sumA[1] += xA
        sumA[2] += xA*xA
        x_gaussA_array.append(xA)
        Hist_gaussA.Fill(xA)

        xB = r.Gaus(dist_meanB, dist_widthB)
        sumB[1] += xB
        sumB[2] += xB*xB
        x_gaussB_array.append(xB)
        Hist_gaussB.Fill(xB)

    # Test distributions:
    # -------------------

    # Test mean and width:
    meanA  = sumA[1] / float(Nevents)
    widthA = sqrt(sumA[2] / float(Nevents) - meanA*meanA)
    meanB  = sumB[1] / float(Nevents)
    widthB = sqrt(sumB[2] / float(Nevents) - meanB*meanB)

    dMean      = meanA - meanB
    dMeanError = sqrt(sqr(widthA)/float(Nevents) + sqr(widthB)/float(Nevents))
    pMean      = TMath.Erfc(dMean / dMeanError / sqrt(2.0)) / 2.0     # Turn a number of sigmas into a probability!
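    # A two-sided version of this test could also be used (a possible variation, shown
    # only as a sketch; the variable name is illustrative and not used below):
    #   pMean_twosided = TMath.Erfc(abs(dMean) / dMeanError / sqrt(2.0))
    # The one-sided form above keeps the sign of dMean, which is fine for this exercise,
    # since its distribution is still uniform when A and B are identical.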
    Hist_ProbMu.Fill(pMean)

    # Binned Chi2 and Kolmogorov-Smirnov tests
    # (see options at http://root.cern.ch/root/html/TH1.html#TH1:KolmogorovTest):
    pCSbinned = Hist_gaussA.Chi2Test(Hist_gaussB, "UU")
    pKSbinned = Hist_gaussA.KolmogorovTest(Hist_gaussB, "UU")
    Hist_ProbCSb.Fill(pCSbinned)
    Hist_ProbKSb.Fill(pKSbinned)

    # Unbinned Kolmogorov-Smirnov test on two sorted arrays (requires sorting!):
    x_gaussA_array = array('d', sorted(x_gaussA_array))
    x_gaussB_array = array('d', sorted(x_gaussB_array))
    pKS = TMath.KolmogorovTest(Nevents, x_gaussA_array, Nevents, x_gaussB_array, "D")
    Hist_ProbKS.Fill(pKS)

    if (verbose and iexp < 10) :
        print " %4d:  pMean: %6.4f  pCSbinned: %6.4f  pKSbinned: %6.4f  pKS: %6.4f"%(iexp, pMean, pCSbinned, pKSbinned, pKS)

    if (Nexp > 1) :
        Hist_gaussA.Reset("ICE")
        Hist_gaussB.Reset("ICE")
    else :
        # Make a white canvas and plot the two distributions:
        c0 = TCanvas("c0", "", 120, 20, 900, 600)
        c0.SetFillColor(0)

        Hist_gaussA.SetLineWidth(2)
        Hist_gaussA.SetLineColor(2)
        Hist_gaussA.Draw("e")        # Option "e" means "with errors (Poisson, i.e. sqrt(N))"

        Hist_gaussB.SetLineWidth(2)
        Hist_gaussB.SetLineColor(4)
        Hist_gaussB.Draw("e same")


# ------------------------------------------------------------------ #
# Show the distributions of the test results:
# ------------------------------------------------------------------ #

if (Nexp > 1) :

    # Make a white canvas divided into four regions:
    c1 = TCanvas("c1", "", 150, 70, 500, 700)
    c1.SetFillColor(0)
    c1.Divide(1, 4)

    c1.cd(1)
    Hist_ProbMu.SetMinimum(0.0)
    Hist_ProbMu.SetLineWidth(2)
    Hist_ProbMu.SetLineColor(4)
    Hist_ProbMu.Draw()

    c1.cd(2)
    Hist_ProbCSb.SetMinimum(0.0)
    Hist_ProbCSb.SetLineWidth(2)
    Hist_ProbCSb.SetLineColor(4)
    Hist_ProbCSb.Draw()

    c1.cd(3)
    Hist_ProbKSb.SetMinimum(0.0)
    Hist_ProbKSb.SetLineWidth(2)
    Hist_ProbKSb.SetLineColor(4)
    Hist_ProbKSb.Draw()

    c1.cd(4)
    Hist_ProbKS.SetMinimum(0.0)
    Hist_ProbKS.SetLineWidth(2)
    Hist_ProbKS.SetLineColor(4)
    Hist_ProbKS.Draw()

    c1.Update()
    # c1.SaveAs("TestDist.pdf")

# ----------------------------------------------------------------------------------
raw_input('Press Enter to exit')
# ----------------------------------------------------------------------------------
#
# Questions:
# ----------
#  1) First run the program to display the two distributions A and B, when
#     - They are the same.
#     - The mean of A is increased.
#     - The width of A is enlarged.
#     Get a feel for how much you need to change the distribution before you can see
#     by eye that they are not the same.
#     Could you, for the test of the means, have calculated how much of a change in the
#     mean is needed for the difference to be statistically significant? Do so, and see
#     if it roughly matches your guess/estimate from above!
#
#  2) Now run the tests 100 times, where A and B are unit Gaussians and thus identical.
#     How should the distributions of the test probabilities come out? And is this the
#     case, approximately?
#     As it happens, the binning has a small impact on the KS-probability.
#     From: http://www-d0.fnal.gov/~yinh/worknote/root/k-test.txt
#       // 1. The value of PROB should not be expected to have exactly the correct
#       //    distribution for binned data.
#       // 2. The user is responsible for seeing to it that the bin widths are
#       //    small compared with any physical phenomena of interest.
#       // 3. The effect of binning (if any) is always to make the value of PROB
#       //    slightly too big. That is, setting an acceptance criterion of (PROB > 0.05)
#       //    will assure that at most 5% of truly compatible histograms are rejected,
#       //    and usually somewhat less.
#     (NOTE: I'm sorry that I haven't found a way to make the warning go away!)
#
#  3) Repeat the changes in question 1), and see which tests "react" most to these
#     modifications.
#     How much of a change in the mean is required for 95% of the tests (of each kind)
#     to give a probability below 5%? How much is required for the width?
#
#  4) Try to test distributions other than the Gaussian (e.g. exponential, binomial,
#     etc.), and see how the tests perform.
#
#     NOTE: The unbinned Kolmogorov-Smirnov test has the great advantage that it can
#           handle ANY distribution (even the Cauchy distribution - remind yourself of
#           that one!). The reason is that it doesn't care how far out an outlier is.
#
# Advanced:
#  5) Obviously, the test of the means is not sensitive to a change in the width.
#     Make such a test yourself by calculating the widths and the uncertainty on the
#     widths (or perhaps try the F-test!). A sketch of one possible width test is
#     included at the bottom of this file.
#     Note that for a (unit) Gaussian the uncertainty on the width is of the same order
#     as that on the mean! That is generally good to know :-)
#
# Very advanced:
#  6) Implement in ROOT the following tests:
#      - Lilliefors test
#      - Shapiro-Wilk test
#      - Anderson-Darling test
#      - Cramer-von-Mises test
#      - Jarque-Bera test
#      - Kuiper's test
#      - Mann-Whitney-Wilcoxon test
#      - Siegel-Tukey test
#     and quantify under various conditions the power of each and the correlation
#     among them. Write it up, and send it to a statistics journal. :-)
#
# ----------------------------------------------------------------------------------
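
# ----------------------------------------------------------------------------------- #
# A possible starting point for question 5): a test of the widths, analogous to the
# test of the means above. This is only a sketch, assuming Gaussian input, for which
# the uncertainty on the sample width is approximately sigma / sqrt(2N). The function
# name and signature are illustrative, and the function is not called in this macro.
# ----------------------------------------------------------------------------------- #

def prob_width_difference(widthA, widthB, Nevents) :
    # Uncertainty on each width, in the Gaussian approximation sigma / sqrt(2N):
    errWidthA = widthA / sqrt(2.0*Nevents)
    errWidthB = widthB / sqrt(2.0*Nevents)

    # Difference in widths in units of its uncertainty, turned into a (one-sided)
    # probability, just as for the means:
    dWidth      = widthA - widthB
    dWidthError = sqrt(sqr(errWidthA) + sqr(errWidthB))
    return TMath.Erfc(dWidth / dWidthError / sqrt(2.0)) / 2.0

# -----------------------------------------------------------------------------------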