// ----------------------------------------------------------------------------------- // /* ROOT macro for illustrating the power of the Kolmogorov-Smirnov test. The Kolmogorov-Smirnov test (KS-test) is a general test for two distributions in 1D (for example to histograms) are the same. This program applies both a binned and an unbinned KS test, and compares it to a Chi2 test. References: http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test Author: Troels C. Petersen (CERN) Email: Troels.Petersen@cern.ch Date: 6th of October 2011 */ // ----------------------------------------------------------------------------------- // #include #include #include "TMath.h" double sqr(double a) { return a*a; } // ----------------------------------------------------------------------------------- // KolmogorovSmirnovTest() { // ----------------------------------------------------------------------------------- // gROOT->Reset(); gStyle->SetOptStat(1111); // gStyle->SetOptStat(0); gStyle->SetOptFit(1111); // gStyle->SetOptFit(0); gStyle->SetStatBorderSize(1); gStyle->SetStatFontSize(0.055); gStyle->SetCanvasColor(4); gStyle->SetPalette(1); // Random numbers from the Mersenne-Twister: TRandom3 r; // r.SetSeed(0); // Initializes with the current time (i.e. random!) r.SetSeed(1); // Initializes with 1, "fixing" random number array. gSystem->Load("libMathCore"); bool verbose = true; const int Nexp = 1; const int Nevents = 1000; double meanA = 0.1; double widthA = 1.0; double meanB = 0.0; double widthB = 1.0; // ------------------------------------------------------------------ // // Make Histograms and vectors: // ------------------------------------------------------------------ // TH1F* Hist_gaussA = new TH1F("Hist_gaussA", "Hist_gaussA", 100, -5.0, 5.0); TH1F* Hist_gaussB = new TH1F("Hist_gaussB", "Hist_gaussB", 100, -5.0, 5.0); TH1F* Hist_ProbMu = new TH1F("Hist_ProbMu", "Hist_ProbMu", 50, 0.0, 1.0); TH1F* Hist_ProbCSb = new TH1F("Hist_ProbCSb", "Hist_ProbCSb", 50, 0.0, 1.0); TH1F* Hist_ProbKSb = new TH1F("Hist_ProbKSb", "Hist_ProbKSb", 50, 0.0, 1.0); TH1F* Hist_ProbKS = new TH1F("Hist_ProbKS" , "Hist_ProbKS", 50, 0.0, 1.0); // ------------------------------------------------------------------ // // Loop over generating and fitting data: // ------------------------------------------------------------------ // Int_t Ndof, igood; Double_t chi2, res; for (int iexp=0; iexp < Nexp; iexp++) { vector x_gaussA; vector x_gaussB; double x_gaussA_array[Nevents]; double x_gaussB_array[Nevents]; double sumA[3] = {0.0, 0.0, 0.0}; double sumB[3] = {0.0, 0.0, 0.0}; // Generate data: // -------------- for (int i=0; i < Nevents; i++) { double xA = r.Gaus(meanA,widthA); sumA[1] += xA; sumA[2] += xA*xA; x_gaussA.push_back(xA); Hist_gaussA->Fill(xA); } for (int i=0; i < Nevents; i++) { double xB = r.Gaus(meanB,widthB); sumB[1] += xB; sumB[2] += xB*xB; x_gaussB.push_back(xB); Hist_gaussB->Fill(xB); } // Test distributions: // ------------------- // Test mean and width: sumA[1] = sumA[1] / double(Nevents); sumA[2] = sqrt(sumA[2] / double(Nevents) - sumA[1]*sumA[1]); sumB[1] = sumB[1] / double(Nevents); sumB[2] = sqrt(sumB[2] / double(Nevents) - sumB[1]*sumB[1]); double dMean = sumA[1] - sumB[1]; double dMeanError = sqrt(sqr(sumA[2])/double(Nevents) + sqr(sumB[2])/double(Nevents)); double pMean = TMath::Erfc(dMean / dMeanError / sqrt(2.0)) / 2.0; Hist_ProbMu->Fill(pMean); // Unbinned Kolmogorov-Smirnov Test (requires sorting): sort(x_gaussA.begin(), x_gaussA.end()); sort(x_gaussB.begin(), x_gaussB.end()); copy(x_gaussA.begin(), x_gaussA.end(), x_gaussA_array); copy(x_gaussB.begin(), x_gaussB.end(), x_gaussB_array); double pKS = TMath::KolmogorovTest(Nevents, x_gaussA_array, Nevents, x_gaussB_array, ""); Hist_ProbKS->Fill(pKS); // Binned Chi2 and Kolmogorov-Smirnov Test: double pCSbinned = Hist_gaussA->Chi2Test(Hist_gaussB, "UU"); double pKSbinned = Hist_gaussA->KolmogorovTest(Hist_gaussB, "UU"); Hist_ProbCSb->Fill(pCSbinned); Hist_ProbKSb->Fill(pKSbinned); if (verbose) printf(" %4d: pMean: %6.4f pCSbinned: %6.4f pKSbinned: %6.4f pKS: %6.4f \n", iexp, pMean, pCSbinned, pKSbinned, pKS); if (Nexp > 1) Hist_gaussA->Reset("ICE"); if (Nexp > 1) Hist_gaussB->Reset("ICE"); } // ------------------------------------------------------------------ // // Show the distribution of fitting results: // ------------------------------------------------------------------ // if (Nexp == 1) { // Make a white canvas: c0 = new TCanvas("c0","",220,20,500,300); c0->SetFillColor(0); Hist_gaussA->SetLineWidth(2); Hist_gaussA->SetLineColor(2); Hist_gaussA->Draw(); Hist_gaussB->SetLineWidth(2); Hist_gaussB->SetLineColor(4); Hist_gaussB->Draw("same"); c0->Update(); // c0->SaveAs("Dist.eps"); } // ------------------------------------------------------------------ // // Show the distribution of fitting results: // ------------------------------------------------------------------ // else { // Make a white canvas: c1 = new TCanvas("c1","",250,70,500,800); c1->SetFillColor(0); c1->Divide(1,4); c1->cd(1); Hist_ProbMu->SetMinimum(0.0); Hist_ProbMu->SetLineWidth(2); Hist_ProbMu->SetLineColor(4); Hist_ProbMu->Draw(); c1->cd(2); Hist_ProbCSb->SetMinimum(0.0); Hist_ProbCSb->SetLineWidth(2); Hist_ProbCSb->SetLineColor(4); Hist_ProbCSb->Draw(); c1->cd(3); Hist_ProbKSb->SetMinimum(0.0); Hist_ProbKSb->SetLineWidth(2); Hist_ProbKSb->SetLineColor(4); Hist_ProbKSb->Draw(); c1->cd(4); Hist_ProbKS->SetMinimum(0.0); Hist_ProbKS->SetLineWidth(2); Hist_ProbKS->SetLineColor(4); Hist_ProbKS->Draw(); c1->Update(); // c1->SaveAs("TestDist.eps"); } } //---------------------------------------------------------------------------------- /* Questions: ---------- 1) First run the program to display the two distributions A and B, when - They are the same. - The mean of A is increased. - The width of A is enlarged. Get a feel for how much you need to change the distribution, before you can by eye see that they are not the same. Could you for the test of the means have calculated this? Do so, and see if it somewhat matches you number from above! 2) Now run the tests 100 times, where A and B are unit Gaussians and thus identical. How should the distributions of the test probabilities come out? And is this the case? 3) Repeat the changes in question 1), and see which tests "reacts" most to these modifications. How much of a change in the mean is required for 95% of the tests (of each kind) to give a probability below 5%? How much is required for the width? 4) Obviously, the test of the means is not sensitive the a change in the width. Make such a test yourself by calculating the widths and the uncertainty on the widths (see Cowan, end of chapter 5). Note that the uncertainty on the widths is close to that of the means! That is generally good to know :-) 5) The unbinned Kolmogorov-Smirnov test has the great advantage that it can handle ANY distribution (even the Cauchy distribution - remind yourself of that one!). Try to test different distributions than the Gaussian one (e.g. exponential, binomial, etc.), and see how the tests performs. Advanced questions: ------------------- 1) Implement in ROOT the following tests: - Lilliefors test - Shapiro-Wilk test - Anderson-Darling test - Cramér-von-Mises test - Jarque-Bera test - Kuiper's test - Mann-Whitney-Wilcoxon test - Siegel-Tukey test and quantify under various conditions the power of each and the correlation among them. Write it up, and send it to a statistics journal. :-) */ //----------------------------------------------------------------------------------