#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#  ROOT macro for fitting the number of urban areas with populations above a
#  certain number.
#
#  The urban areas of the world have almost always grown in size, and while the largest
#  areas (> 3.000.000 inhabitants) are easily identified, it is harder to keep track of
#  the medium size ones (2.000.000 < inhabitants < 3.000.000) and especially of the
#  smaller (1.000.000 < inhabitants < 2.000.000).
#  A way to estimate the number of medium size areas is by extrapolating from the those
#  more populous, which is the goal of this exercise. Given a dataset, which lists the
#  largest urban areas, these are to be fitted in order to give an estimate of the
#  number of medium size urban areas.
#
#  References:
#    Cowan: Chapter 7
#    Bevington: Chapter 9
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   29th of September 2013
#
# ----------------------------------------------------------------------------------- #


from ROOT import *
from array import array


# ----------------------------------------------------------------------------------- #
# ----------------------------------------------------------------------------------- #

# Setting what to be shown in statistics box:
gStyle.SetOptStat("emr")
gStyle.SetOptFit(1111)

# Random numbers from the Mersenne-Twister:
r = TRandom3()
r.SetSeed(0)

verbose = True
Nverbose = 10


# ----------------------------------------------------------------------------------- #
def func_pop(x, par) :
# ----------------------------------------------------------------------------------- #
    return par[0] * exp(-x[0]/ par[1])


# ----------------------------------------------------------------------------------- #
# ----------------------------------------------------------------------------------- #

# Define file, lists and counter:
file = "data_CityPopulation1.txt"
cityrank = []
citysize = []
Nread = 0
Nmax = 250

# Define histogram (for once it doesn't matter if it is a histogram or a graph):
Hist_pop = TH1F("Hist_pop", "Size and ranking of world cities", Nmax, 0.5, float(Nmax)+0.5)

# Loop over files and open them
with open( file, 'r' ) as f : 

    # Loop over lines and extract variables of interest
    for line in f:
        line = line.strip()
        columns = line.split()
        if (len(columns) == 7) :
            
            # Put the values read into a list and histogram:
            cityrank.append(int(columns[0]))
            citysize.append(float(columns[1]))
            Hist_pop.SetBinContent(cityrank[-1], citysize[-1])
            uncertainty = sqrt(citysize[-1])          # NOTE: This needs YOUR evaluation!!!
            Hist_pop.SetBinError(cityrank[-1], uncertainty)

            if (verbose and Nread < Nverbose) :
                print "  City rank %3d:   Population = %10.1f "%(cityrank[-1], citysize[-1])
            Nread += 1

f.close()
print "  Number of entries read: ", Nread


# ------------------------------------------------------------------ #
# Fit data:
# ------------------------------------------------------------------ #

# Define fitting function (see above) and its range and number of parameters:
fit_pop = TF1("fit_pop", func_pop, 1.0, 115.0, 2)
# Give the fit a good starting point:
fit_pop.SetParameters(1000000.0, 50.0)

# Make canvas and histogram for defining ranges:
canvas = TCanvas("canvas","", 650, 20, 600, 450)
canvas.SetFillColor(0)
canvas.SetLogy()
Hist_pop.GetXaxis().SetTitle("City ranking by population")
Hist_pop.GetYaxis().SetTitle("Number of inhabitants")
Hist_pop.SetMinimum(500000.0)

# Fit and plot graph:
fit_pop.SetLineColor(2)
fit_pop.SetLineWidth(2)
Hist_pop.Fit("fit_pop", "R")
Hist_pop.Draw("e")

line1 = TLine(0.0, 1000000.0, 240.0, 1000000.0)
line1.Draw("same")
line2 = TLine(0.0, 2000000.0, 240.0, 2000000.0)
line2.Draw("same")
line3 = TLine(0.0, 3000000.0, 150.0, 3000000.0)
line3.Draw("same")

canvas.Update()
# canvas.SaveAs("Population_fit.png")


# ----------------------------------------------------------------------------------- #
raw_input('Press Enter to exit')


# ---------------------------------------------------------------------------------- 
#
# Start by taking a close look at the data, both by inspecting the numbers, and then by
# considering the histograms produced by running the macro.
#
#
# Questions:
# ----------
# 1) Think about what uncertainties to assign the population number! This is not
#    easy, as it is NOT an uncertainty on the actual number of citizens, but rather
#    how much one can expect this to jump around from one to the next. Consider the
#    data, and then decide on some expression for it.
#    Once you have put your estimate into the program, try to run the program and
#    see if your estimate is reasonable. If not, then change it. Eventually, the fit
#    Chi-square will tell you, if you did OK!
#
# 2) Define a (better) fitting function, which actually matches the data, and then
#    make an estimate at how many urban areas in the world with a population above
#    2.000.000 inhabitants. Remember to assign an uncertainty to your estimate,
#    also including a basic systematic uncertainty if possible.
#
# 3) Consider again the data, this time visually. Are there any irregularities, or
#    other features, which affects the fit quality? Try to change the fitting range,
#    such that this is avoided. Does this result in a better fit? Would you like to
#    change your answer from above?
#
# 4) Once you have an answer, ask your teacher for the true answer (which we have!),
#    and then consider if you did OK or not.
#
# 5) Repeat the exercise with DataSet2.txt, which contains further data, and try to
#    give an estimate of the number of cities above 1.000.000 inhabitants. This time,
#    we do NOT know the answer! But try to compare with others, who have an estimate.
#
#
# Advanced Questions:
# -------------------
# 1) The dataset (population and area) is obtained by using many different techniques,
#    coded by the letters A through H, defined as follows:
#      A: National census authority data.
#      B: Demographia land area estimate based upon map or satellite photograph analysis.
#      C: Demographia population "build up" from third, fourth or fifth order jurisdictions.
#      D: Population estimate based upon United Nations agglomeration estimate.
#      E: Demographia population estimate from national census authority agglomeration data.
#      F: Other Demographia population estimate.
#      G: Estimate based upon projected growth rate from last census.
#      H: Combination of adjacent national census authority agglomerations.
#    The most reliable population estimates are "A" and "H." Population estimates "C" are
#    generally reliable. Population estimates "D" and "E" are less reliable. Population
#    estimates coded "F" and "G" are the least reliable.
#
#    Investigate how much of an effect (and thus systematic error) a variation of the
#    uncertainties depending on the classification would have on the result both for
#    determination of the medium size and the smaller urban areas.
#
# ----------------------------------------------------------------------------------