#!/usr/bin/env python

# ---------------------------------------------------------------------------------- #
#                                  "Fitting is an art!"
# ---------------------------------------------------------------------------------- #
# 
#  Python script for testing which fitting proceedure is likely to give the "best"
#  results. The three cases in question are:
#   * Gaussian distribution on constant background (peak searching)
#   * Double exponential distribution (high correlations)
#   * Polynomial fit with possible discontinuity
#
#  Consider each case and argue/discuss which fitting function and method should be used,
#  and if the fits come out correctly.
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   6th of January 2018
# 
# ---------------------------------------------------------------------------------- #

from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from iminuit import Minuit
from probfit import Chi2Regression
plt.close('all')

SavePlots = False         # Determining if plots are saved or not

r = np.random
r.seed(5)


# function to create a nice string output
def nice_string_output(names, values, extra_spacing = 2):
    max_values = len(max(values, key=len))
    max_names = len(max(names, key=len))
    string = ""
    for name, value in zip(names, values):
        string += "{0:s} {1:>{spacing}} \n".format(name, value,
                   spacing = extra_spacing + max_values + max_names - len(name))
    return string[:-2]

# function to draw parameters from a fit on the figure in one line
def draw_fit_parameters(minuit_obj, axis, x, y, title=None, color='k', chi2=None, ndof=None, prob=None) :
    names = []
    values = []
    if title is not None :
        names += [title]
        values += ['']
    if chi2 is not None and ndof is not None and prob is not None :
        names += ['Chi2 / ndof','Prob']
        values += ["{:.3f} / {:d}".format(chi2, ndof), "{:.3f}".format(prob)]
    names += minuit_obj.values.keys()
    values += ["{:.3f} +/- {:.3f}".format(minuit_obj.values[val], minuit_obj.errors[val]) for val in minuit_obj.values.keys()]
    axis.text(x, y, nice_string_output(names, values), family='monospace', transform=axis.transAxes, fontsize=10, verticalalignment='top', color=color)

# function to print correlation matrix with parameter names
def pretty_print_correlation_matrix(minuit_obj) :
    corr = minuit_obj.matrix(correlation=True)
    names = minuit_obj.values.keys()
    string = '         |  '
    string += ' |  '.join('{:^7s}'.format(k) for k in names) + ' |\n'
    string += '---------' + ''.join(np.repeat('-----------',len(corr))) + '-\n'
    for i in range(len(corr)) : string += ' {:>7s} | '.format(names[i]) + ' | '.join('{: 6.5f}'.format(k) for k in corr[i]) + ' |\n'
    print(string)


#----------------------------------------------------------------------------------- #
# CASE 1: Gaussian distribution on a constant background:
#         f1(x) = N / sqrt(2pi) / sigma * exp(-0.5 * sqr((x-mu)/sigma)) for x in [-inf,inf]
#----------------------------------------------------------------------------------- #

Npoints_gauss = 200
mux    = 3.50
sigmax = 0.25
Npoints_pol0 = 1000
minx   = 0.0
maxx   = 10.0
Nbins  = 100
binwidth_gauss = (maxx-minx) / Nbins

# Fill histogram with signal and background events:
Array_gauss = np.concatenate(( r.normal(mux,sigmax,Npoints_gauss), r.uniform(minx,maxx,Npoints_pol0), r.normal(mux+2.0,sigmax,int(Npoints_gauss/3)) ))

# Make histogram of randomly generated data
Hist_gauss, Edges_gauss = np.histogram(Array_gauss, bins=Nbins, range=(minx, maxx))
Centers_gauss = (Edges_gauss[1:] + Edges_gauss[:-1])/2
Errors_gauss = np.sqrt(Hist_gauss)
Indexes_gauss = Hist_gauss > 0

# Plot the randomly generated data
fig2, ax2 = plt.subplots(figsize=(10, 6))
ax2.set_ylabel("Frequency")
ax2.errorbar(Centers_gauss[Indexes_gauss], Hist_gauss[Indexes_gauss], Errors_gauss[Indexes_gauss], fmt='k_', label='randomly generated exponential', ecolor='k', elinewidth=1, capsize=1, capthick=1)
ax2.set_ylim(0,ax2.get_ylim()[1]*1.3)
ax2.set_xlim(left=0)

# Define function (including bin width to get normalisation right):
def f_gauss(x, N, mu, sigma, background) : 
    return binwidth_gauss*N / np.sqrt(2.0*np.pi) / sigma * np.exp(-0.5*(x-mu)**2/sigma**2) + background

# Perform first fit
FitObject_gauss = Chi2Regression(f_gauss, Centers_gauss[Indexes_gauss], Hist_gauss[Indexes_gauss], Errors_gauss[Indexes_gauss])
Minuit_gauss = Minuit(FitObject_gauss, pedantic=False, N=Npoints_gauss, mu=mux, sigma=sigmax, background=Npoints_pol0/Nbins, print_level=0)
Minuit_gauss.migrad()
chi2_gauss = Minuit_gauss.fval
ndof_gauss = FitObject_gauss.ndof
prob_gauss = stats.chi2.sf(chi2_gauss, ndof_gauss)

# Plot the first fit
x_fit = np.linspace(minx, maxx, 1000)
y_fit = f_gauss(x_fit,*Minuit_gauss.args)
ax2.plot(x_fit, y_fit, 'r-', label='one peak fit')
draw_fit_parameters(Minuit_gauss, ax2, 0.05, 0.95, 'One peak fit:','r',chi2_gauss,ndof_gauss,prob_gauss)


# NOTE: If one wanted to test the G+pol0 vs. the G+G+pol0 models against each other,
#       which would of course be very relevant, then a likelihood ratio test would be
#       obvious.
#       f_gauss  would be your simple (null) hypothesis, G+pol0
#       f_gauss2 would be your advanced (alternative) hypothesis, G+G+pol0
#       Now, the test statistic D = LLH1 - LLH2 will be Chi2 distributed with Ndof = 3,
#       as the advanced model has three more parameters (from the second Gaussian).


# Finalize the figure
plt.legend(loc='best')
plt.tight_layout()
plt.show(block=False)
if (SavePlots) :
    fig2.savefig("Hist_gauss.pdf", dpi=600)
"""
"""

# ----------------------------------------------------------------------------------- #
#
# Looking at the first case, but make sure that you also do at least the second case,
# as this is at least as important as the first one! Third case is a revisit.
#
# Questions on CASE 1:
# --------------------
#  1) Look at the data plot and the corresponding fit. What type of fit is it? Does it
#     run well (or at all) without some non-zero input parameters? And once it runs, does
#     it seem to be reasonable? Why? And does the fitting function include all features
#     of the data? Why/why not? Try for 10-15 minutes and discuss it with your neighbor,
#     before reading on!
#
#            - - - - - -     10-15 minutes later    - - - - - -
#
#  2) As it happens, there seem to be a suspectable bump around 5 < x < 6. Try to write
#     an additional fitting function, which includes this bump in the model, and get the
#     fit to run. How significant is the bump, based on significance of the amplitude of
#     the second Gaussian peak? And what test would you apply to this if you wanted to
#     make a full-fledged hypothesis test between the two models? Are they nested? Can
#     you actually get a number out?
#
#            - - - - - -     20-30 minutes (success or failure) later - - - - - -
#
#  3) Imagine that you concluded that there was a new peak, and that you were sure that
#     it had the same width as the original peak. Does that help you in the fit, and if
#     so, how? Does the significance of the peak increase? Would it always do that?
#     Also imagine, that the parameter of interest is the distance between the peaks. How
#     would you now write your fitting function?
#----------------------------------------------------------------------------------- #


#----------------------------------------------------------------------------------- #
# CASE 2: Double exponential distribution:
#
#   The "bad" fitting function:
#     f2_bad(x)  = N1*exp(-t/r1) + N2*exp(-t/r2) for t in [0,inf]
#
#   The "good" fitting function:
#     f2_good(x) = N * (f/r1*exp(-t/r1) + (1-f)/r2*exp(-t/r2)) for t in [0,inf]
#
#----------------------------------------------------------------------------------- #
# NOTE: The parameters r1 and r2 need to be positive, and f in [0,1],
#       in order for this to be a PDF.
#----------------------------------------------------------------------------------- #

# The whole case have been commented out, just to ensure smooth running of CASE 1.
"""
Npoints_2exp = 4000
frac = 0.9   # Fraction that "belongs to" first exponential
r1 = 9.0
r2 = 3.0

minx_2exp = 0.0
maxx_2exp = 20.0
Nbins_2exp = 200
binwidth_2exp = (maxx_2exp - minx_2exp) / Nbins_2exp

# Transformation method:
Array_2exp = np.zeros(Npoints_2exp)
for i in xrange(Npoints_2exp) :
    if (r.uniform() < frac) : t = r.exponential(r1)
    else :                    t = r.exponential(r2)
    Array_2exp[i] = t

# Make histogram of randomly generated data
Hist_2exp, Edges_2exp = np.histogram(Array_2exp, bins=Nbins_2exp, range=(minx_2exp, maxx_2exp))
Centers_2exp = (Edges_2exp[1:] + Edges_2exp[:-1])/2
Errors_2exp = np.sqrt(Hist_2exp)
Indexes_2exp = Hist_2exp > 0

# Plot the randomly generated data
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_ylabel("Frequency")
ax.errorbar(Centers_2exp[Indexes_2exp], Hist_2exp[Indexes_2exp], Errors_2exp[Indexes_2exp], fmt='k_', label='randomly generated exponential', ecolor='k', elinewidth=1, capsize=1, capthick=1)
ax.set_ylim(bottom=0)
ax.set_xlim(left=0)

# The BAD fitting function and fit:
# ----------------------------------
def BadFitFunc_2exp(x, N1, N2, tau1, tau2) :
    return N1*np.exp(-x/tau1) + N2*np.exp(-x/tau2)

BadFitObject_2exp = Chi2Regression(BadFitFunc_2exp, Centers_2exp[Indexes_2exp], Hist_2exp[Indexes_2exp], Errors_2exp[Indexes_2exp])
BadMinuit_2exp = Minuit(BadFitObject_2exp, pedantic=False, N1=0.0, N2=0.0, tau1=0.0, tau2=0.0, print_level=0)
BadMinuit_2exp.migrad()

# Plot the bad fit
x_fit = np.linspace(minx_2exp, maxx_2exp, 1000)
y_fit = BadFitFunc_2exp(x_fit,*BadMinuit_2exp.args)
ax.plot(x_fit, y_fit, 'r-', label='the bad fit')
draw_fit_parameters(BadMinuit_2exp, ax, 0.55, 0.80, 'Bad fit:', 'r')

# Print the correlations for the bad fit
print("Correlation matrix for the bad fit - Everything is heavily correlated:")
pretty_print_correlation_matrix(BadMinuit_2exp)

# Finalize the figure
plt.legend(loc='best')
plt.tight_layout()
plt.show(block=False)
if (SavePlots) :
    fig.savefig("Hist_2exp.pdf", dpi=600)
"""

# ----------------------------------------------------------------------------------- #
# Questions on CASE 2:
# --------------------
#  1) OK, the "bad" fit doesn't work to begin with. Can you see what is missing? There
#     are in fact several things, but one is very simple to remedy. Think and discuss...
#
#            - - - - - -     5-10 minutes later    - - - - - -
#
#     Of course you need to give the fit good initial values! Do this (for example those
#     the data was produced with!), and run it again. It might work now, but actually that
#     is not always the case.
#     The reason is that the "bad" fitting function has two flaws:
#       * It does not have a correct normalisation, thus making N(1/2) and r(1/2) correlated.
#       * It does not have one overall normalisation, thus making N1 and N2 correlated.
#     This gives very high correlations between the parameters, as can be seen from the
#     correlation matrix printed.
#
#  2) Both of these problems can be avoided by rewriting the fitting function to include
#     the correct normalisation (i.e. dividing by the lifetime) and by putting only one
#     overall normalisation and then dividing the two lifetimes with a fraction (i.e. use
#     "frac" and "(1.0-frac)" as parameters in front of each exponential term).
#     Try this (define a "good" function), and see if your fit improves. The way to see
#     this would in general be to try a lot of different data, but here we will simply see
#     that the correlations are smaller (especially for the overall normalisation).
#
#            - - - - - -     20-30 minutes (success or failure) later - - - - - -
#
#     If you didn't manage to get this fit going, I've included a "good" fitting function
#     at the bottom of the program.
#
#  3) The two lifetimes are naturally very correlated with each other (and the fraction),
#     when they are very alike. The only thing one can do about this is to fix one parameter.
#     This is of course not desirable, but one can be forced to do it, if the fit does not
#     converge otherwise. Note that since the correlation is very high, it is not a great
#     loss of freedom in the fitting function.
#     A very common similar example is fitting a "Gaussian-like" peak, which happens to have
#     more than one width, for example if the data is obtained from two or more sources with
#     different resolutions. Here, one may choose to let the two (or more) Gaussians have
#     the same mean, but two different widths (the "precise" and the "poor" measurements).
#     Typically, the parameter to fix (if any) is the fraction, but never fix a parameter
#     without first having tried to let it "float".
# ----------------------------------------------------------------------------------- #


#----------------------------------------------------------------------------------- #
# CASE 3: Fitting (possible) discontinuity in graph
#         f3(x) = for x in [-1.25,1.25] modulo an offset
#----------------------------------------------------------------------------------- #

"""
# Define points for a graph with errors and a small discontinuity:
Ndata = 51
offset = 0.0

x = np.array([-1.15+offset + 0.05*i for i in xrange(Ndata)])
y = np.array([(x[i]-offset)**3 - (x[i]-offset) + r.normal(0.0,0.1) for i in xrange(Ndata)])
y[x>0.375+offset] += 0.4         # Adding a small hike
ex = np.zeros(Ndata)      # Deciding that x is precise.
ey = np.ones_like(ex)*0.1 # Stating the uncertainty in 

# Plot the randomly generated data
fig3, ax3 = plt.subplots(figsize=(10, 6))
ax3.set_ylabel("Y")
ax3.set_xlabel("X")
ax3.errorbar(x,y,xerr=ex,yerr=ey, fmt='k.', label='randomly generated data', ecolor='k', elinewidth=1, capsize=2, capthick=1)
ax3.set_xlim(-1.3+offset, 1.5+offset)

# Define fitting function (with parameter for the size of the discontinuity):
def func_discfit(x, p0, p1, p2, p3, p4) :
    if (x > 0.375+offset) :
        return p0 + p1*x + p2*x**2 + p3*x**3 + p4
    else :
        return p0 + p1*x + p2*x**2 + p3*x**3
func_discfit_vec = np.vectorize(func_discfit)

# Here we perform the fit
# NOTE that p5 is now fixed to 0.0
# Unfix p5 to take the discontinuity into account
FitObject = Chi2Regression(func_discfit, x, y, ey)
MinuitObj = Minuit(FitObject, pedantic=False, p0=0.0, p1=-1.0, p2=0.0, p3=1.0, p4=0.0, fix_p4=True, print_level=0)
MinuitObj.migrad()
chi2 = MinuitObj.fval
ndof = FitObject.ndof
prob = stats.chi2.sf(chi2, ndof)

# Draw the fit
x_fit = np.linspace(-1.3+offset, 1.5+offset, 1000)
y_fit = func_discfit_vec(x_fit,*MinuitObj.args)
ax3.plot(x_fit, y_fit, 'r-', label='the fit')
draw_fit_parameters(MinuitObj, ax3, 0.55, 0.80, chi2=chi2, ndof=ndof, prob=prob)
ax3.text(0.15, 2.0, "Set fix_p4=False to take the discontinuity into account!", fontsize=10, verticalalignment='top')

# Finalize the figure
plt.legend(loc='best')
plt.tight_layout()
plt.show(block=False)
if (SavePlots) :
    fig3.savefig("Graph_DiscotinuousFit.pdf", dpi=600)
"""

# ----------------------------------------------------------------------------------- #
# Questions on CASE 3:
# --------------------
#  1) The fit works, but gives a poor model of the data. There seem to be a discontinuity
#     around 0.375, so put this into the fit function (check that it is there), and allow
#     the parameter controlling this discontinuity to float (i.e. set fix_p4 = False).
#     Did this make the fit work better? Hopefully.
#
#  2) The "offset" is set to 0.0 to begin with. Try to increase it to 3.0, and see how
#     it is simply a translation in x. One would think that since it is a 3rd degree
#     polynomial fit, it should work nicely for ANY value of the offset.
#     However, now try to set the offset to 5.0, and 10.0. Does the fit converge nicely
#     as before? What would it require to get it to converge? Try to see, if you can make
#     it converge by doing what you suggest.
#
#            - - - - - -     10-15 minutes (success or failure) later - - - - - -
#
#  3) The problem is both in terms of the initial values of the fit, but also in the
#     intercorrelations that a significant offset introduces. With a very shifted
#     x-axis, any change in say p3 (the term in front of x**3) requires huge changes
#     in the other fitting parameters (too see this, image that the offset is 1000.0).
#     You may think that such an offset is rare, but it is in fact not, and the classic
#     case is when fitting data as a function of YEAR, e.g. 1970-2016. Always shift the
#     x-axis towards zero, in this case fx. by -2000.
#     
# ----------------------------------------------------------------------------------- #


# Finally, ensure that the program does not termine (and the plot disappears), before you press enter:
try:
    __IPYTHON__
except:
    raw_input('Press Enter to exit')


# The GOOD fitting function and fit (CASE 2):
# -------------------------------------------
#def GoodFitFunc_2exp(x, N, frac, tau1, tau2) :
#    return N*binwidth_2exp * ( frac/tau1*np.exp(-x/tau1) + (1-frac)/tau2*np.exp(-x/tau2) )
#
#GoodFitObject_2exp = Chi2Regression(GoodFitFunc_2exp, Centers_2exp[Indexes_2exp], Hist_2exp[Indexes_2exp], Errors_2exp[Indexes_2exp])
#GoodMinuit_2exp = Minuit(GoodFitObject_2exp, pedantic=False, N=Npoints_2exp, frac=frac, tau1=r1, tau2=r2, limit_frac=(0.0,1.0))
#GoodMinuit_2exp.migrad()
#x_fit = np.linspace(minx_2exp, maxx_2exp, 1000)
#y_fit = GoodFitFunc_2exp(x_fit,*GoodMinuit_2exp.args)
#ax.plot(x_fit, y_fit, 'b--', label='the good fit')
#draw_fit_parameters(GoodMinuit_2exp, ax, 0.55, 0.60, 'Good fit:', 'b')
#pretty_print_correlation_matrix(GoodMinuit_2exp)