#!/usr/bin/env python

# ----------------------------------------------------------------------------------- #
#
#  Python macro for exercise on calibration.
#
#  You're developping a new method for measuring the distance to stars, and want to
#  calibrate and thus improve this method, such that the precision obtained is unbiased
#  and has a minimal variance. You know that the method depends on several factors, such as:
#   * Amount of signal light from the star (lsig)
#   * Amount of background light in the surrounding sky (lbkg)
#   * Temperature of star (temp)
#   * Transparency of sky (tsky)
#
#  In order to determine the influence of these factors, and how much you need to correct
#  for each of them, you consider 10.000 stars with known distances (measured by another
#  method, e.g. triangulation). From these, you can find how well your own method works,
#  make corrections to biases as needed, and finally find out how precise your calibrated
#  method is. Happy calibration.
#
#  Author: Troels C. Petersen (NBI)
#  Email:  petersen@nbi.dk
#  Date:   5th of December 2017
#
# ----------------------------------------------------------------------------------- #

from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
from iminuit import Minuit                         
from probfit import Chi2Regression 
plt.close('all')

verbose = True
Nverbose = 10
SaveFigures = False


# ----------------------------------------------------------------------------------- #
# Define functions:
# ----------------------------------------------------------------------------------- #

# Function to create a nice string output for figures:
def nice_string_output(names, values, extra_spacing = 2):
    max_values = len(max(values, key=len))
    max_names = len(max(names, key=len))
    string = ""
    for name, value in zip(names, values):
        string += "{0:s} {1:>{spacing}} \n".format(name, value,
                   spacing = extra_spacing + max_values + max_names - len(name))
    return string[:-2]

# Profile x of the two arrays x and y with defined number of bins and range
# returns the x-values of the profile, the means and the standard deviations.
def profile_x(x, y, bins=(50, 50), xyrange=[(0, 50), (-1,1)]):
    
    H, xedges, yedges = np.histogram2d(x, y, bins=bins, range=xyrange)
    x_center = 0.5*(xedges[1:] + xedges[:-1])
    y_center = 0.5*(yedges[1:] + yedges[:-1])
    
    wsums = H.sum(1)
    
    mask = wsums > 0
    
    mean = (H*y_center).sum(1)[mask] / wsums[mask]
    mean_squared = (H*y_center**2).sum(1)[mask] / wsums[mask]
    std = np.sqrt( mean_squared - mean**2 ) / np.sqrt(wsums[mask]) 

    return x_center[mask], mean, std

# Above function written out with loop:
#    dim = 0
#    Ndim = H.shape[dim]
#    y = np.zeros(Ndim)
#    ey = np.zeros(Ndim)
#    for i in range(Ndim):
#        weights = H[i, :]
#        wsumi = weights.sum()
#        mean = (weights*y_center).sum() / wsumi
#        mean_squared = (weights*y_center**2).sum() / wsumi
#        std = np.sqrt( mean_squared - mean**2 ) / np.sqrt(wsumi) 
#        y[i] = mean
#        ey[i] = std


# ----------------------------------------------------------------------------------- #
# Define files, lists, counters and read data:
# ----------------------------------------------------------------------------------- #

filename = "data_calib.txt"
dknown = []
dmeas  = []
lsig   = []
lbkg   = []
temp   = []
tsky   = []

Nread = 0

# Loop over files and open them:
with open( filename, 'r' ) as f : 

    # Loop over lines and extract variables of interest
    for line in f:
        line = line.strip()
        columns = line.split()
        if (len(columns) == 6) :
            
            # Put the values read into lists:
            dknown.append(float(columns[0]))
            dmeas.append(float(columns[1]))
            lsig.append(float(columns[2]))
            lbkg.append(float(columns[3]))
            temp.append(float(columns[4]))
            tsky.append(float(columns[5]))
            if (verbose and Nread < Nverbose) :
                print("  Distance (known) = {:8.2f}    Distance (measured) = {:6.3f} ".format(dknown[-1], dmeas[-1]))
            Nread += 1

f.close()


# The numpy way (without prints, etc.):
dknown = np.array(dknown)
dmeas  = np.array(dmeas)
lsig   = np.array(lsig)
lbkg   = np.array(lbkg)
temp   = np.array(temp)
tsky   = np.array(tsky)

dknown, dmeas, lsig, lbkg, temp, tsky = np.loadtxt(filename, unpack=True)


# ------------------------------------------------------------------ #
# Make Histograms and vectors:
# ------------------------------------------------------------------ #

fig_rel, ax_rel = plt.subplots(figsize=(12, 8))
ax_rel.set_title('Hist relative resolution')
Nbins = 200
xmin, xmax = -2,2
binwidth = (xmax-xmin) / Nbins

fig_lsig2D, ax_lsig2D = plt.subplots()
ax_lsig2D.set_title('Hist lsig 2D')


# ------------------------------------------------------------------ #
# Loop over data and make plots:
# ------------------------------------------------------------------ #

# Note the initial relative resolution:
distrel = (dmeas - dknown) / dknown

# Do calibration (expand this yourself!):
# ---------------------------------------
dmeas_calib = dmeas       # So here is so far no calibration - that is what you have to change!!!

# Note the final relative resolution:
distrel_calib = (dmeas_calib - dknown) / dknown


mask_distrel       = (xmin < distrel) & (distrel < xmax)
mask_distrel_calib = (xmin < distrel_calib) & (distrel_calib < xmax)

print("  The initial and final resolutions are:  {:6.3f}  and  {:6.3f} \n".format(
        distrel[mask_distrel].std(ddof=1), distrel_calib[mask_distrel_calib].std(ddof=1)))


# =============================================================================
# Relative distance figure
# =============================================================================


ax_rel.hist(distrel, bins=Nbins, range=(xmin, xmax), histtype='step', label='Raw')
hist_rel_calib = ax_rel.hist(distrel_calib, bins=Nbins, range=(xmin, xmax), histtype='step', label='Calibration')

ax_rel.set_xlim(xmin, xmax)
ax_rel.set_xlabel('Realitive precision (dmeas - dknown) / dknown')
ax_rel.set_ylabel('Frequency')
ax_rel.legend(loc='best')

names = ['Entries', 'Mean', 'STD Dev.']
values = ["{:d}".format(len(distrel[mask_distrel])), 
          "{:.3f}".format(distrel[mask_distrel].mean()), 
          "{:.3f}".format(distrel[mask_distrel].std(ddof=1)),
          ] 

ax_rel.text(0.02, 0.95, nice_string_output(names, values), family='monospace', 
        transform=ax_rel.transAxes, fontsize=12, verticalalignment='top')
fig_rel.tight_layout()

if (SaveFigures):
    fig_rel.savefig('UncalibratedCalibrated.pdf', dpi=600)


# =============================================================================
# Calibration 2D histogram
# =============================================================================


# Histograms for making calibration:
ax_lsig2D.hist2d(lsig, distrel, bins=50, range=[(0, 50), (-1,1)], cmin=1, alpha=0.5)
ax_lsig2D.set_xlabel('Signal light from star (lsig)')
ax_lsig2D.set_ylabel('Realitive precision (dmeas - dknown) / dknown')

x_center_lsig2D, mean_lsig2D, std_lsig2D = profile_x(lsig, distrel, bins=(50, 50), xyrange=[(0, 50), (-1,1)])
x_binwidth_lsig2D = x_center_lsig2D[1] - x_center_lsig2D[0]

ax_lsig2D.errorbar(x_center_lsig2D, mean_lsig2D, xerr=x_binwidth_lsig2D/2, yerr=std_lsig2D,  fmt='r.', 
                   ecolor='r', elinewidth=1, capsize=1, capthick=1)

# Fit of the effect from "lsig":


names = ['Entries', 'Mean x', 'Mean y', 'RMS x', 'RMS y']
values = ["{:d}".format(len(lsig)), 
          "{:.3f}".format(lbkg.mean()), 
          "{:.5f}".format(distrel.mean()), 
          "{:.3f}".format(lbkg.std(ddof=1)), 
          "{:.5f}".format(distrel.std(ddof=1)) 
          ] 

ax_lsig2D.text(0.02, 0.2, nice_string_output(names, values), family='monospace', 
        transform=ax_lsig2D.transAxes, fontsize=8, verticalalignment='top')

fig_lsig2D.tight_layout()
plt.show(block=False)


try:
    __IPYTHON__
except:
    raw_input('Press Enter to exit')


# ----------------------------------------------------------------------------------- #
# 
#  As always look at the data and get a feel for each of the variables. A good idea might
#  be to plot them all to know what range to expect them in.
# 
#  Next, consider the distribution of "relative differences" (RD) between the observed
#  and actual distance: RD = (dist_obs - dist_known) / dist_known
#  The RMS is 0.27, i.e. a 27% precision, and it is neither centered at zero (as it
#  should, not to be biased), nor very Gaussian. There is also a long tail towards too
#  high values. This is what you want to improve upon!
# 
#  Finally, there is the distribution of the bias and relative precision as a function of
#  the signal luminosity (lsig). As you can see, the response does not depend on lsig,
#  and so there seems to be no (varying) bias from this variable. Check the other three
#  variables, and if you find some bias, try to correct for it, such that you get the
#  relative difference to be the most narrow Gaussian centered around 0.
# 
#  Note that if you are on average say 50% too high compared to the true values, then
#  you need to correct by 50%, i.e. divide by (1 + 0.5), and in general, if your
#  measurement is f(x) off, where f(x) describes the offset, then you need to divide
#  by (1 + f(T)). Why:
#
#    RD = (d - d_true) / d_true  =>  d_true = d / (1 + f(x))
#
#  Thus define d_calib = d / (1 + f(x)), and continue using d_calib when considering
#  other effects.
# 
# 
#  Questions:
#  ----------
#   1) What corrections do you apply for each of the variables, and how much do each of
#      them improve on the result? What is the final resolution you obtain? And do you
#      think that further improvements are possible?
#
#  Advanced Questions:
#  -------------------
#   2) Try at the end to figure out, which variables the final resolution depends on.
#      Can you give an estimate of the uncertainty for each single star?
# 
# ----------------------------------------------------------------------------------- #