#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # Python/ROOT script testing randomness of single digits (i.e. [0-9]). # # Given a series of seemingly random digits, how would you go about testing, if they # were truly random? This exercise explores the many tests that such data can be # submitted to. # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 10th of December 2015 # # ----------------------------------------------------------------------------------- # #Load modules here from ROOT import * #Set plotting style gStyle.SetOptStat("eMR") gStyle.SetOptFit(1111) # ----------------------------------------------------------------------------------- # # Define your functions here # ----------------------------------------------------------------------------------- # #Calculate the sqr of a function def sqr( a ) : return a*a #Calculate the chi2 between a 2D histogram and a constant #Assumes Poisson errors on histogram bin contents def get_chi2_ndf( hist = TH2D(), const = -99999.9, D1 = True ) : chi2 = 0.0 ndf = 0 if D1 : for ibin in range( hist.GetNbinsX() ) : ndf += 1 chi2 += sqr( hist.GetBinContent( ibin+1 ) - const ) / sqr(hist.GetBinError( ibin+1 )) else : for ibin in range( hist.GetNbinsX() ) : for jbin in range( hist.GetNbinsY() ) : ndf += 1 chi2 += sqr( hist.GetBinContent( ibin+1, jbin+1 ) - const ) / sqr(hist.GetBinError( ibin+1, jbin+1 )) return [chi2, ndf] # ----------------------------------------------------------------------------------- # # Define and get input # ----------------------------------------------------------------------------------- # #Define list of input files infiles = ["./data_RandomDigits2015A.txt" ] # "./data_RandomDigits2011.txt" , #Add these when you got a good feeling # "./data_RandomDigits2012.txt" ] #what is going on in the 2013 case #List containing all numbers numbers = [] #Loop over input files open them in read mode for ifile in infiles : with open( ifile, "r" ) as current_file : # Extract current file info : loop through each line in the file, loop through each character # in the line, demand character is not empty ("") and convert the result to an integer # Finally add result to the numbers list numbers += [int(char) for line in current_file for char in line.strip() if char is not ""] # ----------------------------------------------------------------------------------- # # Define your output # ----------------------------------------------------------------------------------- # #Where everything is draw hist_number = TH1D("hist_number" , "Numbers posted" , 10, -0.5, 9.5 ) #Plot all digits corr_number = TH2D("corr_number" , "Correlation to previous number" , 10, -0.5, 9.5, 10, -0.5, 9.5) #Correlation to prev. hist_odd_even = TH1D("hist_odd_even", "Even and odd numbers" , 2, -0.5, 1.5 ) #Is number even or odd corr_odd_even = TH2D("corr_odd_even", "Correlation to last odd or even" , 2, -0.5, 1.5, 2, -0.5, 1.5) #Correlation to prev. hist_high_low = TH1D("hist_high_low", "Above and equal to or below 5" , 2, -0.5, 1.5 ) #Is number >= or < 5 corr_high_low = TH2D("corr_high_low", "Corrlation to last high/low number", 2, -0.5, 1.5, 2, -0.5, 1.5) #Correlation to prev. #Make sure your histograms scale the errors correctly when normalizing hist_number.Sumw2() corr_number.Sumw2() hist_odd_even.Sumw2() corr_odd_even.Sumw2() hist_high_low.Sumw2() corr_high_low.Sumw2() #Fill 1d histograms for i in range(len(numbers)) : hist_number.Fill( numbers[i] ) hist_odd_even.Fill( numbers[i]%2 ) hist_high_low.Fill( numbers[i]/5 ) #Note use of inter devision! #Fill correlation plot with current and previous number if i > 0 : corr_number.Fill( numbers[i] , numbers[i-1] ) corr_odd_even.Fill(numbers[i]%2, numbers[i-1]%2) corr_high_low.Fill(numbers[i]/5, numbers[i-1]/5) #Normalize histograms hist_number.Scale( 1.0 / hist_number.Integral() ) corr_number.Scale( 1.0 / corr_number.Integral() ) hist_odd_even.Scale( 1.0 / hist_odd_even.Integral() ) corr_odd_even.Scale( 1.0 / corr_odd_even.Integral() ) hist_high_low.Scale( 1.0 / hist_high_low.Integral() ) corr_high_low.Scale( 1.0 / corr_high_low.Integral() ) # ----------------------------------------------------------------------------------- # # Calculate the chi^2 given a constant value (Constant is not a free parameter here) # ----------------------------------------------------------------------------------- # chi2_ndf_number_hist = get_chi2_ndf( hist_number , 1.0/10 , D1 = True ) chi2_ndf_odd_even_hist = get_chi2_ndf( hist_odd_even, 1.0/2 , D1 = True ) chi2_ndf_high_low_hist = get_chi2_ndf( hist_high_low, 1.0/2 , D1 = True ) chi2_ndf_number_corr = get_chi2_ndf( corr_number , 1.0/100, D1 = False ) chi2_ndf_odd_even_corr = get_chi2_ndf( corr_odd_even, 1.0/4 , D1 = False ) chi2_ndf_high_low_corr = get_chi2_ndf( corr_high_low, 1.0/4 , D1 = False ) # Offset to number of degrees of freedom (choosing if the overall number of events is know, which I would say it is): dNdof = 0 #Write your result : print "Compatability that distributions stem from random numbers : " print "Raw numbers : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_number_hist[0], chi2_ndf_number_hist[1]-dNdof, TMath.Prob(chi2_ndf_number_hist[0], chi2_ndf_number_hist[1]-dNdof)) print "Raw numbers correlation : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_number_corr[0], chi2_ndf_number_corr[1]-dNdof, TMath.Prob(chi2_ndf_number_corr[0], chi2_ndf_number_corr[1]-dNdof)) print "Odd even : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_odd_even_hist[0], chi2_ndf_odd_even_hist[1]-dNdof, TMath.Prob(chi2_ndf_odd_even_hist[0], chi2_ndf_odd_even_hist[1]-dNdof)) print "Odd even correlation : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_odd_even_corr[0], chi2_ndf_odd_even_corr[1]-dNdof, TMath.Prob(chi2_ndf_odd_even_corr[0], chi2_ndf_odd_even_corr[1]-dNdof)) print "High Low : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_high_low_hist[0], chi2_ndf_high_low_hist[1]-dNdof, TMath.Prob(chi2_ndf_high_low_hist[0], chi2_ndf_high_low_hist[1]-dNdof)) print "High Low correlation : Chi^2/NDF = %6.2f / %6.2f -> P(Chi2^2, NDF) = %10.8f"%(chi2_ndf_high_low_corr[0], chi2_ndf_high_low_corr[1]-dNdof, TMath.Prob(chi2_ndf_high_low_corr[0], chi2_ndf_high_low_corr[1]-dNdof)) # ----------------------------------------------------------------------------------- # # Fit 1D distributions and draw output # ----------------------------------------------------------------------------------- # canvas = TCanvas("canvas", "canvas", 1200, 900) canvas.Divide(3,2) hist_number.SetMinimum(0.0) hist_odd_even.SetMinimum(0.0) hist_high_low.SetMinimum(0.0) canvas.cd(1); hist_number.Fit("pol0"); hist_number.Draw("e") canvas.cd(2); hist_odd_even.Fit("pol0"); hist_odd_even.Draw("e") canvas.cd(3); hist_high_low.Fit("pol0"); hist_high_low.Draw("e") canvas.cd(4); corr_number.Draw("colz") canvas.cd(5); corr_odd_even.Draw("colz") canvas.cd(6); corr_high_low.Draw("colz") canvas.Update() # ----------------------------------------------------------------------------------- # # Test for sequences using Poisson hypothesis (for Nseq = 2, 3, and 4): # ----------------------------------------------------------------------------------- # # Poisson: # ------------- def func_Poisson(x, p) : if (x[0] > -0.5) : return p[0] * TMath.PoissonI(x[0]+0.5, p[1]) else : return 0.0 # Count how many of each sequency: # ------------------------------------------------------------------------ # Hist_Dist3 = TH1F("Hist_Dist3", "Hist_Dist3", 1000, -0.5, 999.5) for i in range(-2, len(numbers)-2) : seq = 100*numbers[i] + 10*numbers[i+1] + numbers[i+2] Hist_Dist3.Fill(seq) # Put all counts into histogram, which should be Poisson: Hist_Pois3 = TH1F("Hist_Pois3", "Hist_Pois3", 41, -0.5, 40.5) for i in range( Hist_Dist3.GetNbinsX() ) : Hist_Pois3.Fill( Hist_Dist3.GetBinContent(i+1) ) fPoisson3 = TF1("fPoisson3", func_Poisson, -0.5, 15+0.5, 2) fPoisson3.SetParameters(1000.0, 2.725) fPoisson3.SetNpx(1000) # Set the number of intervals used when drawing function! fPoisson3.SetLineColor(kMagenta) fPoisson3.Draw("same") canvas3 = TCanvas("canvas3", "", 20, 20, 1200, 900) Hist_Pois3.Fit("fPoisson3", "LR") Hist_Pois3.GetXaxis().SetRangeUser(-0.5, 25.5) Hist_Pois3.Draw("e") canvas3.Update() """ # Count how many of each sequency: # ------------------------------------------------------------------------ # Hist_Dist4 = TH1F("Hist_Dist4", "Hist_Dist4", 1000, -0.5, 999.5) for i in range(-3, len(numbers)-3) : seq = 1000*numbers[i] + 100*numbers[i+1] + 10*numbers[i+2] + numbers[i+3] Hist_Dist4.Fill(seq) # Put all counts into histogram, which should be Poisson: Hist_Pois4 = TH1F("Hist_Pois4", "Hist_Pois4", 41, -0.5, 40.5) for i in range( Hist_Dist4.GetNbinsX() ) : Hist_Pois4.Fill( Hist_Dist4.GetBinContent(i+1) ) fPoisson4 = TF1("fPoisson4", func_Poisson, -0.5, 15+0.5, 2) fPoisson4.SetParameters(10000.0, 0.2725) fPoisson4.SetNpx(1000) # Set the number of intervals used when drawing function! fPoisson4.SetLineColor(kMagenta) fPoisson4.Draw("same") canvas4 = TCanvas("canvas4", "", 40, 40, 1200, 900) canvas4.SetLogy() Hist_Pois4.Fit("fPoisson4", "LR") Hist_Pois4.GetXaxis().SetRangeUser(-0.5, 25.5) Hist_Pois4.Draw("e") canvas4.Update() """ raw_input( ' ... ' ) # ----------------------------------------------------------------------------------- # # # First look at the random digits, and see if you see any patterns? Probably rarely, mostly # because there are many numbers, and patterns are not that visible. So, you will have to # work a bit... with statistical tests. # # Before even looking at the above program, think about what statistical tests you # could submit the sample to. Then consider, how to actually carry out these tests. # We will try in class to compile and discuss a list, before we start working on it!!! # # # # Questions: # ---------- # 1) Are each digit represented roughly equally many times? Are there as many # even as odd numbers? And do people have a tedency to choose an even number # after an odd and vice versa? How about high vs. low numbers? # # 2) Are people "afraid" of putting the same digit twice in a row? And how about three # or four identical digits in a row? # # 3) Try to count, which digits follows which digits. That should be 100 counts in total. # If the numbers were truly random, what would you expect then? Is that what you # observe? # Can you test this for example with a Chi-Square (think about how many numbers you # expect in each bin) and/or a likelihood? # # 4) Were the students of 2011 and 2012 better or worse at picking random numbers? # # ----------------------------------------------------------------------------------- #