# ------------------------------------------------------------------------------ #
# @Author: F. Paul Spitzner
# @Email: paul.spitzner@ds.mpg.de
# @Created: 2020-01-20 17:26:55
# @Last Modified: 2020-07-06 12:13:44
# ------------------------------------------------------------------------------ #
# load the merged hdf5 files that contain correlation coefficients
# for different tau and time series lengths. the fitting is done here.
# print an error for datasets that seem incorrect
# ------------------------------------------------------------------------------ #

import os
import sys
import fcntl
import h5py
import glob
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import mrestimator as mre  # v0.1.6b4
from itertools import product
from time import gmtime, strftime
from tqdm import tqdm
from humanize import naturalsize
import pickle

import utility as ut

# ------------------------------------------------------------------------------ #
# helper functions
# ------------------------------------------------------------------------------ #
def load_and_fit(h5pattern):
    """
    Load data from the hdf5 files matching a pattern, e.g.
    h5pattern = "./dat/merged_sm*.hdf5"
    Returns a dict with the fit results and original data.
    Fit-result layout: tau_index, ts_length_index (repetitions aggregated
    via mean/median/std).
    """
    print(f"Processing {h5pattern}")
    # get a file list, one file per target tau value
    h5files = glob.glob(os.path.expanduser(h5pattern))
    path_dict = dict()
    for h5file in h5files:
        tau = ut.h5_load(h5file, "target_tau")
        path_dict[tau] = h5file

    # key = tau, values are numpy arrays (the number of coefficients changes
    # between files); each array's layout: relative_ts_length * repetition * rk
    data_dict = dict()
    # iterate in sorted order by tau, just for good measure
    for tau in sorted(path_dict.keys()):
        h5file = path_dict[tau]
        data = ut.h5_load(h5file, "data")
        num_lens, num_reps, num_steps = data.shape
        data_dict[tau] = data
        len_vals = ut.h5_load(h5file, "relative_length")
    tau_vals = np.array(sorted(path_dict.keys()))
    num_taus = len(tau_vals)

    # check for dodgy data
    for tau in data_dict.keys():
        # entries are NaN by construction when the ts length < tau / 2,
        # so skip the first half of the relative lengths
        min_len = int(len(len_vals) / 2)
        fishy = np.where(np.isnan(data_dict[tau][min_len:, :, :]))
        if len(fishy[0]) > 0:
            print(f"tau = {tau:.2f}, fishy repetitions:\n{np.unique(fishy[1])}")

    shape = (num_taus, num_lens, num_reps)
    fitres_e = np.ones(shape) * np.nan
    fitres_eo = np.ones(shape) * np.nan
    fitres_e_shift = np.ones(shape) * np.nan
    fitres_eo_shift = np.ones(shape) * np.nan
    # do the fits
    print("Fitting ...")
    for tdx, tau in enumerate(tqdm(tau_vals, leave=False)):
        for ldx, lens in enumerate(tqdm(len_vals, leave=False)):
            for rdx in range(num_reps):
                rk = data_dict[tau][ldx][rdx]
                steps = np.arange(1, len(rk) + 1)
                # avoid the NaNs
                valid = np.where(np.isfinite(rk))[0]
                steps = steps[valid]
                rk = rk[valid]
                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="e").tau
                except Exception:
                    tmp = np.nan
                fitres_e[tdx, ldx, rdx] = tmp
                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="eo").tau
                except Exception:
                    tmp = np.nan
                fitres_eo[tdx, ldx, rdx] = tmp
                # analytically motivated bias, not shown in the main figure:
                # shift the coefficients up by (1/T) * (1 + m) / (1 - m)
                # with m = exp(-1/tau), then refit
                tlen = int(lens * tau)
                bias = (1 / tlen) * (1 + np.exp(-1 / tau)) / (1 - np.exp(-1 / tau))
                rk = rk + bias
                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="e").tau
                except Exception:
                    tmp = np.nan
                fitres_e_shift[tdx, ldx, rdx] = tmp
                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="eo").tau
                except Exception:
                    tmp = np.nan
                fitres_eo_shift[tdx, ldx, rdx] = tmp
    res = dict()
    res["tau_vals"] = tau_vals
    res["len_vals"] = len_vals
    res["raw_data"] = data_dict
    res["medi_e"] = np.nanmedian(fitres_e, axis=2)
    res["mean_e"] = np.nanmean(fitres_e, axis=2)
    res["stdd_e"] = np.nanstd(fitres_e, axis=2)
    res["medi_eo"] = np.nanmedian(fitres_eo, axis=2)
    res["mean_eo"] = np.nanmean(fitres_eo, axis=2)
    res["stdd_eo"] = np.nanstd(fitres_eo, axis=2)
    res["medi_e_shift"] = np.nanmedian(fitres_e_shift, axis=2)
    res["mean_e_shift"] = np.nanmean(fitres_e_shift, axis=2)
    res["stdd_e_shift"] = np.nanstd(fitres_e_shift, axis=2)
    res["medi_eo_shift"] = np.nanmedian(fitres_eo_shift, axis=2)
    res["mean_eo_shift"] = np.nanmean(fitres_eo_shift, axis=2)
    res["stdd_eo_shift"] = np.nanstd(fitres_eo_shift, axis=2)
    # keep all the fits so we can debug later
    res["fits_e"] = fitres_e
    res["fits_eo"] = fitres_eo
    res["fits_e_shift"] = fitres_e_shift
    res["fits_eo_shift"] = fitres_eo_shift
    return res
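
# minimal usage sketch (the pattern matches the calls in the main script below):
#   res = load_and_fit("../dat/merged_sm*.hdf5")
#   res["medi_eo"][tdx, ldx]  # median fitted tau for tau_vals[tdx], len_vals[ldx]
#   res["raw_data"][tau]      # coefficients, relative_ts_length * repetition * rk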

# Marriott and Pope (1954), Eq 407
def rk_analytic_407(k_max, m, numsteps):
    """Finite-sample expectation of the coefficients r_k for given m and numsteps."""
    k_range = np.arange(1, int(k_max) + 1)
    geom = np.zeros(int(k_max))
    for idx, k in enumerate(k_range):
        geom[idx] = np.sum(np.power(m, k_range[:k] - 1))
    res = np.power(m, k_range) - 1.0 / numsteps * (
        (1 + m) * geom - 2 * k_range * np.power(m, k_range) / numsteps
    )
    return res
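
# quick sanity check (a minimal sketch): for numsteps -> infinity the
# finite-sample correction vanishes and Eq 407 reduces to the plain
# geometric decay m^k
_m = np.exp(-1.0 / 100.0)
assert np.allclose(
    rk_analytic_407(k_max=10, m=_m, numsteps=int(1e9)),
    np.power(_m, np.arange(1, 11)),
    atol=1e-6,
)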

def rk_analytic_408(k_max, m, numsteps):
    """Simpler approximation, Marriott and Pope (1954), Eq 408."""
    k_range = np.arange(1, int(k_max) + 1)
    return np.power(m, k_range) - 2 * k_range * np.power(m, k_range) / numsteps

# plot the coefficients, rk vs k
def plot_coefficients(ax, data, rel_len_idx=0, **kwargs):
    """
    Plot coefficients r_k vs. steps k into the given axes.
    `data` is one entry of the data_dict returned by load_and_fit, i.e.
    data_dict[tau], with layout relative_ts_length * repetition * rk;
    `rel_len_idx` selects the relative trial length, len_vals[rel_len_idx].
    """
    steps = np.arange(1, data.shape[2] + 1)
    for i in range(data.shape[1]):
        ax.plot(steps, data[rel_len_idx, i, :], **kwargs)
    ax.set_xlim(1, steps[-1])
    ax.set_xlabel(r"Steps $k$")
    ax.set_ylabel(r"Coefficients $r_k$")
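
# usage sketch (mirrors the calls in the figures below): overlay all
# repetitions for one tau at a given relative trial length,
#   fig, ax = plt.subplots()
#   plot_coefficients(ax, sm["raw_data"][100.0], rel_len_idx=10, alpha=0.1, lw=0.5)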

# ------------------------------------------------------------------------------ #
# main script
# ------------------------------------------------------------------------------ #

mre.ut._logstreamhandler.setLevel("ERROR")
mre.ut._logfilehandler.setLevel("ERROR")

# set directory to the location of this script file to use relative paths
os.chdir(os.path.abspath(os.path.dirname(__file__)))
load_pickled = False
if not load_pickled:
    # load, analyze, save as pickle
    sm = load_and_fit("../dat/merged_sm*.hdf5")
    ts = load_and_fit("../dat/merged_ts*.hdf5")
    with open("../dat/sm_fits.pickle", "wb") as handle:
        pickle.dump(sm, handle)
    with open("../dat/ts_fits.pickle", "wb") as handle:
        pickle.dump(ts, handle)
else:
    # load from pickle
    with open("../dat/ts_fits.pickle", "rb") as handle:
        ts = pickle.load(handle)
    with open("../dat/sm_fits.pickle", "rb") as handle:
        sm = pickle.load(handle)

fig, ax = plt.subplots(figsize=(3.5, 2.55), dpi=300)
cdx = 0  # color index
for tdx in [0, 2, 4]:
    tau = ts["tau_vals"][tdx]
    # ts default
    ax.plot(
        ts["len_vals"],
        ts["medi_eo"][tdx] / tau,
        label=f"tau={tau}",
        color=f"C{cdx}",
        ls="-",
        alpha=1.0,
    )
    # sm default
    ax.plot(
        sm["len_vals"],
        sm["medi_eo"][tdx] / tau,
        label=None,
        color=f"C{cdx}",
        ls="--",
        alpha=1.0,
    )
    # error bars (tdx <= 4 is always true here, kept as a quick toggle)
    if tdx <= 4:
        ut.plot_stdd(
            ax,
            ts["len_vals"],
            ts["medi_eo"][tdx] / tau,
            ts["stdd_eo"][tdx] / tau,
            mode="bar",
            color=f"C{cdx}",
        )
        ut.plot_stdd(
            ax,
            sm["len_vals"],
            sm["medi_eo"][tdx] / tau,
            sm["stdd_eo"][tdx] / tau,
            mode="bar",
            eb_ls="--",
            color=f"C{cdx}",
        )
    print(sm["medi_eo"][tdx] / tau)
    print(sm["stdd_eo"][tdx] / tau)
    cdx += 1

# first-order approximation based on Marriott and Pope (1954), Eq 407
targetlength = ts["len_vals"]  # in multiples of tau
ax.plot(
    targetlength,
    1 / (1 + (4 / 1) * (1 / targetlength)),
    color="black",
    label=r"analytic",
)
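# optional cross-check (a sketch, kept commented out): fitting the analytic
# coefficients from Eq 407 should land close to this first-order curve,
# e.g. for tau = 100 and T = 10 * tau:
#   rk = rk_analytic_407(k_max=500, m=np.exp(-1.0 / 100), numsteps=1000)
#   mre.fit(rk, steps=np.arange(1, 501), fitfunc="e").tau  # ~ 100 / (1 + 4 / 10)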

# ------------------------------------------------------------------------------ #
# main figure styling
# ------------------------------------------------------------------------------ #

ax.legend(loc="lower right")
ax.set_title(r"$A=1000$, no subsampling", fontname="Arial")
ax.set_yscale("log")
ax.set_xscale("log")
ax.set_xlabel(r"Relative trial length $\,T/\tau$", fontname="Arial")
ax.set_ylabel(r"Estimated timescale $\,\hat{\tau}/\tau$", fontname="Arial")
for spine in ["top", "right", "left", "bottom"]:
    ax.spines[spine].set_visible(False)
ut.arrowed_spines(ax, ["bottom", "left"])
fig.tight_layout()
ax.set_xlim((1, None))
ax.set_ylim((0.1, 1.5))
fig.savefig("../fig/triallength.pdf", transparent=True)

# ------------------------------------------------------------------------------ #
# the TS shifting issue
# ------------------------------------------------------------------------------ #

tau = 100.0
rel_lens = [10, 2, 1]  # indices into len_vals

fig, ax = plt.subplots(figsize=(3.5, 2.55), dpi=300)
ax.set_title("SM")
ax.set_rasterization_zorder(-1)  # we don't want 100 vector lines
for idx, ldx in enumerate(rel_lens):
    plot_coefficients(
        ax, sm["raw_data"][tau], ldx, alpha=0.1, lw=0.5, color=f"C{idx}", zorder=-2
    )
    # y = rk_analytic_407(k_max=k_max, m=np.exp(-1.0 / tau), numsteps=rel_lens[idx] * tau)
    # ax.plot(x, y, color=f"C{idx}", ls="-")
x = np.arange(1, 2001)
y = np.exp(-x / tau)
ax.plot(x, y, color="black", zorder=1)
ax.set_xlim((1, None))
ax.set_xticks([1, 500, 1000, 1500, 2000])
ax.set_ylim((-0.325, 1.0))
fig.tight_layout()
fig.savefig("../fig/rk_sm.pdf", transparent=True)

fig, ax = plt.subplots(figsize=(3.5, 2.55), dpi=300)
ax.set_title("TS")
ax.set_rasterization_zorder(-1)  # we don't want 100 vector lines
for idx, ldx in enumerate(rel_lens):
    plot_coefficients(
        ax, ts["raw_data"][tau], ldx, alpha=0.1, lw=0.5, color=f"C{idx}", zorder=-2
    )
    # Marriott and Pope solutions
    numsteps = int(ts["len_vals"][ldx] * tau)
    # 20*tau was the parameter of the simulation;
    # 0.5 * numsteps is the safety check implemented in the mrestimator toolbox
    k_max = int(np.clip(a=20 * tau, a_min=None, a_max=0.5 * numsteps))
    x = np.arange(1, k_max + 1)
    outline = [pe.Stroke(linewidth=3, foreground="white"), pe.Normal()]
    y = rk_analytic_407(k_max=k_max, m=np.exp(-1.0 / tau), numsteps=numsteps)
    ax.plot(
        x,
        y,
        color=f"C{idx}",
        ls="-",
        path_effects=outline,
        zorder=2,
        label=f"T/tau={ts['len_vals'][ldx]}",
    )
    # rk from Eq 408 show the same bending as our sim data, but at different
    # numsteps, likely because the 'known mean' is not zero.
    # numsteps = int(numsteps / 4)
    # y = rk_analytic_408(k_max=20 * tau, m=np.exp(-1.0 / tau), numsteps=numsteps)
    # ax.plot(x, y[0:len(x)], color=f"C{idx}", ls="-", alpha=1)
x = np.arange(1, 2001)
y = np.exp(-x / tau)
ax.plot(x, y, color="black", zorder=2)
ax.legend(loc="upper right")
ax.set_xlim((1, None))
ax.set_ylim((-0.325, 1.0))
ax.set_xticks([1, 500, 1000, 1500, 2000])
fig.tight_layout()
fig.savefig("../fig/rk_ts.pdf", transparent=True)