#!/usr/bin/env python
# ----------------------------------------------------------------------------------- #
#
#  ROOT macro for illustrating the importance of VISUAL INSPECTION of a dataset.
#
#  The example is called "Anscombe's Quartet" (F.J. Anscombe, "Graphs in Statistical
#  Analysis," American Statistician, 27 [February 1973], 17-21) and consists of four
#  datasets, which have the same:
#   - Mean of each x variable                      9.0
#   - Variance of each x variable                 10.0
#   - Mean of each y variable                      7.5
#   - Variance of each y variable                  3.75
#   - Correlation between each x and y variable    0.816
#   - Linear regression line                       y = 3 + 0.5x
#
#  However, they are very different! For more information on Anscombe's Quartet, see:
#    https://en.wikipedia.org/wiki/Anscombe's_quartet
#
#  Author: Troels C. Petersen (NBI/CERN)
#  Email:  Troels.Petersen@cern.ch
#  Date:   26th of August 2013
#
#  Author: Lars Egholm Pedersen (NBI/CERN)
#  Email:  egholm@cern.ch
#  Date:
#
# ----------------------------------------------------------------------------------- #

from array import array

# ----------------------------------------------------------------------------------
# Run by      ./anscombes_quartet.py
# Output:     AnscombesQuartet.eps
# ----------------------------------------------------------------------------------

# ------------------------------------------------------------------ #
# Get data in arrays:
# ------------------------------------------------------------------ #

# Define data samples. ROOT objects want arrays with a specified data type
# (here floats, 'f').
x = [
    array('f', [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0]),
    array('f', [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0]),
    array('f', [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0]),
    array('f', [8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 19.0, 8.0, 8.0, 8.0]),
]
y = [
    array('f', [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]),
    array('f', [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]),
    array('f', [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]),
    array('f', [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]),
]

# Derived from the data (4 datasets of 11 points each) so that the sizes handed
# to ROOT can never disagree with the actual array lengths.
Ndatasets = len(x)
Npoints = len(x[0])


def main():
    """Fit each of the four datasets with a straight line, plot and save them.

    Draws the four graphs with their fits in a 2x2 canvas, writes the canvas
    to "AnscombesQuartet.eps" and then waits for the user to press enter so
    that the window stays open.
    """
    # ROOT is imported here, with explicit names, rather than through
    # "from ROOT import *" at the top of the file: this keeps the namespace
    # clean and lets the data tables above be imported without ROOT installed.
    # NOTE(review): recent PyROOT releases reportedly reject the wildcard
    # import under Python 3 - confirm against the ROOT release notes.
    from ROOT import TCanvas, TF1, TGraph, gROOT, gStyle

    gROOT.Reset()

    # Set the showing of statistics and fitting results
    # (0 means off, 1111 means all on):
    gStyle.SetOptStat(1111)
    # gStyle.SetOptStat(0)
    gStyle.SetOptFit(1111)
    # gStyle.SetOptFit(0)

    # Statistics and fitting results are placed in:
    # gStyle.SetStatX(0.52)                # Top left corner.
    # gStyle.SetStatY(0.86)
    gStyle.SetStatX(0.89)                  # Bottom right corner.
    gStyle.SetStatY(0.33)

    # Set the graphics:
    gStyle.SetStatBorderSize(1)
    gStyle.SetStatFontSize(0.055)
    gStyle.SetCanvasColor(4)               # Style default; the canvas below is set to white
    gStyle.SetPalette(1)

    # ------------------------------------------------------------------ #
    # Fit data:
    # ------------------------------------------------------------------ #

    # Create a list of TGraphs.
    # TGraphs are initialized by (number of points, x-coord, y-coord).
    graph = [TGraph(len(xs), xs, ys) for xs, ys in zip(x, y)]

    fit_p1 = TF1("fit_p1", "[0]*x + [1]", 3.5, 20.0)   # Make a linear function
    fit_p1.SetParameters(1.0, 1.0)                     # Set its parameters
    # The same function is used for all four fits, so style it once here
    # instead of on every pass through the loop.
    fit_p1.SetLineColor(4)                             # Set function line color
    fit_p1.SetLineWidth(2)                             # Set function line width

    # Make canvas:
    canvas = TCanvas("canvas", "", 650, 20, 800, 600)  # Make a new window
    canvas.SetFillColor(0)                             # Make it white
    canvas.Divide(2, 2)                                # Divide it into a 2x2 window

    # Fit and plot graphs (pads are numbered from 1):
    for ipad, dataset_graph in enumerate(graph, 1):    # Loop over data sets
        canvas.cd(ipad)                                # Point at the relevant window

        dataset_graph.SetMarkerColor(2)                # Set data marker color
        dataset_graph.SetMarkerStyle(20)               # Set data marker type
        dataset_graph.SetMarkerSize(1.0)               # Set data marker size
        dataset_graph.Fit("fit_p1", "r")               # Fit data with function (in its range)
        dataset_graph.Draw("AP")                       # Draw data in window

    canvas.Update()                                    # Make sure everything is in!
    canvas.SaveAs("AnscombesQuartet.eps")              # Print window to file

    # Tell python to wait here until you do something. raw_input() only
    # exists in Python 2; in Python 3 the same function is called input().
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter(' ... Press enter to exit ... ')


if __name__ == "__main__":
    main()

# ----------------------------------------------------------------------------------
#
# First acquaint yourself with the program, and get yourself a "free" (possibly first)
# look at how ROOT works. Understand that each of the four distributions is being
# fitted with a linear function (here called "fit_p1") and the results plotted. There
# are comments for most lines in the macro!
#
# Run the macro, and then take a close look at each of the four results.
#
#
# Questions:
# ----------
#  1) Looking closely at each of the four fits, determine which points give the largest
#     contribution to the "mismatch" (that is chi-square) between the data and the fit.
#
#  2) Consider how YOU would treat each of the four datasets and fit them!
#
#
# Advanced questions:
# -------------------
#  1) How would you with (smarter) statistical techniques detect that something was
#     not right without looking?
#
# ----------------------------------------------------------------------------------