#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # ROOT macro for fitting the number of urban areas with populations above a # certain number. # # The urban areas of the world have almost always grown in size, and while the largest # areas (> 3.000.000 inhabitants) are easily identified, it is harder to keep track of # the medium size ones (2.000.000 < inhabitants < 3.000.000) and especially of the # smaller (1.000.000 < inhabitants < 2.000.000). # A way to estimate the number of medium size areas is by extrapolating from the those # more populous, which is the goal of this exercise. Given a dataset, which lists the # largest urban areas, these are to be fitted in order to give an estimate of the # number of medium size urban areas. # # References: # Cowan: Chapter 7 # Bevington: Chapter 9 # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 29th of September 2013 # # ----------------------------------------------------------------------------------- # from ROOT import * from array import array # ----------------------------------------------------------------------------------- # # ----------------------------------------------------------------------------------- # # Setting what to be shown in statistics box: gStyle.SetOptStat("emr") gStyle.SetOptFit(1111) # Random numbers from the Mersenne-Twister: r = TRandom3() r.SetSeed(0) verbose = True Nverbose = 10 # ----------------------------------------------------------------------------------- # def func_pop(x, par) : # ----------------------------------------------------------------------------------- # return par[0] * exp(-x[0]/ par[1]) # ----------------------------------------------------------------------------------- # # ----------------------------------------------------------------------------------- # # Define file, lists and counter: file = "data_CityPopulation1.txt" cityrank = [] citysize = [] Nread = 0 Nmax = 250 # Define histogram (for once it doesn't matter if it is a histogram or a graph): Hist_pop = TH1F("Hist_pop", "Size and ranking of world cities", Nmax, 0.5, float(Nmax)+0.5) # Loop over files and open them with open( file, 'r' ) as f : # Loop over lines and extract variables of interest for line in f: line = line.strip() columns = line.split() if (len(columns) == 7) : # Put the values read into a list and histogram: cityrank.append(int(columns[0])) citysize.append(float(columns[1])) Hist_pop.SetBinContent(cityrank[-1], citysize[-1]) uncertainty = sqrt(citysize[-1]) # NOTE: This needs YOUR evaluation!!! Hist_pop.SetBinError(cityrank[-1], uncertainty) if (verbose and Nread < Nverbose) : print " City rank %3d: Population = %10.1f "%(cityrank[-1], citysize[-1]) Nread += 1 f.close() print " Number of entries read: ", Nread # ------------------------------------------------------------------ # # Fit data: # ------------------------------------------------------------------ # # Define fitting function (see above) and its range and number of parameters: fit_pop = TF1("fit_pop", func_pop, 1.0, 115.0, 2) # Give the fit a good starting point: fit_pop.SetParameters(1000000.0, 50.0) # Make canvas and histogram for defining ranges: canvas = TCanvas("canvas","", 650, 20, 600, 450) canvas.SetFillColor(0) canvas.SetLogy() Hist_pop.GetXaxis().SetTitle("City ranking by population") Hist_pop.GetYaxis().SetTitle("Number of inhabitants") Hist_pop.SetMinimum(500000.0) # Fit and plot graph: fit_pop.SetLineColor(2) fit_pop.SetLineWidth(2) Hist_pop.Fit("fit_pop", "R") Hist_pop.Draw("e") line1 = TLine(0.0, 1000000.0, 240.0, 1000000.0) line1.Draw("same") line2 = TLine(0.0, 2000000.0, 240.0, 2000000.0) line2.Draw("same") line3 = TLine(0.0, 3000000.0, 150.0, 3000000.0) line3.Draw("same") canvas.Update() # canvas.SaveAs("Population_fit.png") # ----------------------------------------------------------------------------------- # raw_input('Press Enter to exit') # ---------------------------------------------------------------------------------- # # Start by taking a close look at the data, both by inspecting the numbers, and then by # considering the histograms produced by running the macro. # # # Questions: # ---------- # 1) Think about what uncertainties to assign the population number! This is not # easy, as it is NOT an uncertainty on the actual number of citizens, but rather # how much one can expect this to jump around from one to the next. Consider the # data, and then decide on some expression for it. # Once you have put your estimate into the program, try to run the program and # see if your estimate is reasonable. If not, then change it. Eventually, the fit # Chi-square will tell you, if you did OK! # # 2) Define a (better) fitting function, which actually matches the data, and then # make an estimate at how many urban areas in the world with a population above # 2.000.000 inhabitants. Remember to assign an uncertainty to your estimate, # also including a basic systematic uncertainty if possible. # # 3) Consider again the data, this time visually. Are there any irregularities, or # other features, which affects the fit quality? Try to change the fitting range, # such that this is avoided. Does this result in a better fit? Would you like to # change your answer from above? # # 4) Once you have an answer, ask your teacher for the true answer (which we have!), # and then consider if you did OK or not. # # 5) Repeat the exercise with DataSet2.txt, which contains further data, and try to # give an estimate of the number of cities above 1.000.000 inhabitants. This time, # we do NOT know the answer! But try to compare with others, who have an estimate. # # # Advanced Questions: # ------------------- # 1) The dataset (population and area) is obtained by using many different techniques, # coded by the letters A through H, defined as follows: # A: National census authority data. # B: Demographia land area estimate based upon map or satellite photograph analysis. # C: Demographia population "build up" from third, fourth or fifth order jurisdictions. # D: Population estimate based upon United Nations agglomeration estimate. # E: Demographia population estimate from national census authority agglomeration data. # F: Other Demographia population estimate. # G: Estimate based upon projected growth rate from last census. # H: Combination of adjacent national census authority agglomerations. # The most reliable population estimates are "A" and "H." Population estimates "C" are # generally reliable. Population estimates "D" and "E" are less reliable. Population # estimates coded "F" and "G" are the least reliable. # # Investigate how much of an effect (and thus systematic error) a variation of the # uncertainties depending on the classification would have on the result both for # determination of the medium size and the smaller urban areas. # # ----------------------------------------------------------------------------------