#!/usr/bin/env python # ----------------------------------------------------------------------------------- # # # Python macro for exercise on calibration. # # You're developping a new method for measuring the distance to stars, and want to # calibrate and thus improve this method, such that the precision obtained is unbiased # and has a minimal variance. You know that the method depends on several factors, such as: # * Amount of signal light from the star (lsig) # * Amount of background light in the surrounding sky (lbkg) # * Temperature of star (temp) # * Transparency of sky (tsky) # # In order to determine the influence of these factors, and how much you need to correct # for each of them, you consider 10.000 stars with known distances (measured by another # method, e.g. triangulation). From these, you can find how well your own method works, # make corrections to biases as needed, and finally find out how precise your calibrated # method is. Happy calibration. # # Author: Troels C. Petersen (NBI) # Email: petersen@nbi.dk # Date: 5th of December 2017 # # ----------------------------------------------------------------------------------- # from __future__ import division, print_function import numpy as np import matplotlib.pyplot as plt from iminuit import Minuit from probfit import Chi2Regression plt.close('all') verbose = True Nverbose = 10 SaveFigures = False # ----------------------------------------------------------------------------------- # # Define functions: # ----------------------------------------------------------------------------------- # # Function to create a nice string output for figures: def nice_string_output(names, values, extra_spacing = 2): max_values = len(max(values, key=len)) max_names = len(max(names, key=len)) string = "" for name, value in zip(names, values): string += "{0:s} {1:>{spacing}} \n".format(name, value, spacing = extra_spacing + max_values + max_names - len(name)) return string[:-2] # Profile x of the two arrays x and y with defined number of bins and range # returns the x-values of the profile, the means and the standard deviations. def profile_x(x, y, bins=(50, 50), xyrange=[(0, 50), (-1,1)]): H, xedges, yedges = np.histogram2d(x, y, bins=bins, range=xyrange) x_center = 0.5*(xedges[1:] + xedges[:-1]) y_center = 0.5*(yedges[1:] + yedges[:-1]) wsums = H.sum(1) mask = wsums > 0 mean = (H*y_center).sum(1)[mask] / wsums[mask] mean_squared = (H*y_center**2).sum(1)[mask] / wsums[mask] std = np.sqrt( mean_squared - mean**2 ) / np.sqrt(wsums[mask]) return x_center[mask], mean, std # Above function written out with loop: # dim = 0 # Ndim = H.shape[dim] # y = np.zeros(Ndim) # ey = np.zeros(Ndim) # for i in range(Ndim): # weights = H[i, :] # wsumi = weights.sum() # mean = (weights*y_center).sum() / wsumi # mean_squared = (weights*y_center**2).sum() / wsumi # std = np.sqrt( mean_squared - mean**2 ) / np.sqrt(wsumi) # y[i] = mean # ey[i] = std # ----------------------------------------------------------------------------------- # # Define files, lists, counters and read data: # ----------------------------------------------------------------------------------- # filename = "data_calib.txt" dknown = [] dmeas = [] lsig = [] lbkg = [] temp = [] tsky = [] Nread = 0 # Loop over files and open them: with open( filename, 'r' ) as f : # Loop over lines and extract variables of interest for line in f: line = line.strip() columns = line.split() if (len(columns) == 6) : # Put the values read into lists: dknown.append(float(columns[0])) dmeas.append(float(columns[1])) lsig.append(float(columns[2])) lbkg.append(float(columns[3])) temp.append(float(columns[4])) tsky.append(float(columns[5])) if (verbose and Nread < Nverbose) : print(" Distance (known) = {:8.2f} Distance (measured) = {:6.3f} ".format(dknown[-1], dmeas[-1])) Nread += 1 f.close() # The numpy way (without prints, etc.): dknown = np.array(dknown) dmeas = np.array(dmeas) lsig = np.array(lsig) lbkg = np.array(lbkg) temp = np.array(temp) tsky = np.array(tsky) dknown, dmeas, lsig, lbkg, temp, tsky = np.loadtxt(filename, unpack=True) # ------------------------------------------------------------------ # # Make Histograms and vectors: # ------------------------------------------------------------------ # fig_rel, ax_rel = plt.subplots(figsize=(12, 8)) ax_rel.set_title('Hist relative resolution') Nbins = 200 xmin, xmax = -2,2 binwidth = (xmax-xmin) / Nbins fig_lsig2D, ax_lsig2D = plt.subplots() ax_lsig2D.set_title('Hist lsig 2D') # ------------------------------------------------------------------ # # Loop over data and make plots: # ------------------------------------------------------------------ # # Note the initial relative resolution: distrel = (dmeas - dknown) / dknown # Do calibration (expand this yourself!): # --------------------------------------- dmeas_calib = dmeas # So here is so far no calibration - that is what you have to change!!! # Note the final relative resolution: distrel_calib = (dmeas_calib - dknown) / dknown mask_distrel = (xmin < distrel) & (distrel < xmax) mask_distrel_calib = (xmin < distrel_calib) & (distrel_calib < xmax) print(" The initial and final resolutions are: {:6.3f} and {:6.3f} \n".format( distrel[mask_distrel].std(ddof=1), distrel_calib[mask_distrel_calib].std(ddof=1))) # ============================================================================= # Relative distance figure # ============================================================================= ax_rel.hist(distrel, bins=Nbins, range=(xmin, xmax), histtype='step', label='Raw') hist_rel_calib = ax_rel.hist(distrel_calib, bins=Nbins, range=(xmin, xmax), histtype='step', label='Calibration') ax_rel.set_xlim(xmin, xmax) ax_rel.set_xlabel('Realitive precision (dmeas - dknown) / dknown') ax_rel.set_ylabel('Frequency') ax_rel.legend(loc='best') names = ['Entries', 'Mean', 'STD Dev.'] values = ["{:d}".format(len(distrel[mask_distrel])), "{:.3f}".format(distrel[mask_distrel].mean()), "{:.3f}".format(distrel[mask_distrel].std(ddof=1)), ] ax_rel.text(0.02, 0.95, nice_string_output(names, values), family='monospace', transform=ax_rel.transAxes, fontsize=12, verticalalignment='top') fig_rel.tight_layout() if (SaveFigures): fig_rel.savefig('UncalibratedCalibrated.pdf', dpi=600) # ============================================================================= # Calibration 2D histogram # ============================================================================= # Histograms for making calibration: ax_lsig2D.hist2d(lsig, distrel, bins=50, range=[(0, 50), (-1,1)], cmin=1, alpha=0.5) ax_lsig2D.set_xlabel('Signal light from star (lsig)') ax_lsig2D.set_ylabel('Realitive precision (dmeas - dknown) / dknown') x_center_lsig2D, mean_lsig2D, std_lsig2D = profile_x(lsig, distrel, bins=(50, 50), xyrange=[(0, 50), (-1,1)]) x_binwidth_lsig2D = x_center_lsig2D[1] - x_center_lsig2D[0] ax_lsig2D.errorbar(x_center_lsig2D, mean_lsig2D, xerr=x_binwidth_lsig2D/2, yerr=std_lsig2D, fmt='r.', ecolor='r', elinewidth=1, capsize=1, capthick=1) # Fit of the effect from "lsig": names = ['Entries', 'Mean x', 'Mean y', 'RMS x', 'RMS y'] values = ["{:d}".format(len(lsig)), "{:.3f}".format(lbkg.mean()), "{:.5f}".format(distrel.mean()), "{:.3f}".format(lbkg.std(ddof=1)), "{:.5f}".format(distrel.std(ddof=1)) ] ax_lsig2D.text(0.02, 0.2, nice_string_output(names, values), family='monospace', transform=ax_lsig2D.transAxes, fontsize=8, verticalalignment='top') fig_lsig2D.tight_layout() plt.show(block=False) try: __IPYTHON__ except: raw_input('Press Enter to exit') # ----------------------------------------------------------------------------------- # # # As always look at the data and get a feel for each of the variables. A good idea might # be to plot them all to know what range to expect them in. # # Next, consider the distribution of "relative differences" (RD) between the observed # and actual distance: RD = (dist_obs - dist_known) / dist_known # The RMS is 0.27, i.e. a 27% precision, and it is neither centered at zero (as it # should, not to be biased), nor very Gaussian. There is also a long tail towards too # high values. This is what you want to improve upon! # # Finally, there is the distribution of the bias and relative precision as a function of # the signal luminosity (lsig). As you can see, the response does not depend on lsig, # and so there seems to be no (varying) bias from this variable. Check the other three # variables, and if you find some bias, try to correct for it, such that you get the # relative difference to be the most narrow Gaussian centered around 0. # # Note that if you are on average say 50% too high compared to the true values, then # you need to correct by 50%, i.e. divide by (1 + 0.5), and in general, if your # measurement is f(x) off, where f(x) describes the offset, then you need to divide # by (1 + f(T)). Why: # # RD = (d - d_true) / d_true => d_true = d / (1 + f(x)) # # Thus define d_calib = d / (1 + f(x)), and continue using d_calib when considering # other effects. # # # Questions: # ---------- # 1) What corrections do you apply for each of the variables, and how much do each of # them improve on the result? What is the final resolution you obtain? And do you # think that further improvements are possible? # # Advanced Questions: # ------------------- # 2) Try at the end to figure out, which variables the final resolution depends on. # Can you give an estimate of the uncertainty for each single star? # # ----------------------------------------------------------------------------------- #