plot_merged.py
# ------------------------------------------------------------------------------ #
# @Author: F. Paul Spitzner
# @Email: paul.spitzner@ds.mpg.de
# @Created: 2020-01-20 17:26:55
# @Last Modified: 2020-07-06 12:13:44
# ------------------------------------------------------------------------------ #
# Load the merged hdf5 files that contain correlation coefficients
# for different tau and time series lengths. The fitting is done here.
# Print an error for datasets that seem incorrect.
# ------------------------------------------------------------------------------ #
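# Expected layout of each merged hdf5 file, inferred from the loading code
# below (an assumption about the writer side, not a guarantee):
#   "target_tau"       scalar, the tau the simulation aimed for
#   "data"             array, relative_ts_length x repetition x r_k
#   "relative_length"  the trial lengths in multiples of tau
# ------------------------------------------------------------------------------ #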
import os
import sys
import fcntl
import h5py
import glob
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import mrestimator as mre  # v0.1.6b4
from itertools import product
from time import gmtime, strftime
from tqdm import tqdm
from humanize import naturalsize
import pickle
import utility as ut

# ------------------------------------------------------------------------------ #
# helper functions
# ------------------------------------------------------------------------------ #
def load_and_fit(h5pattern):
    """
    Load data from the hdf5 files matching a pattern, e.g.
    h5pattern = "./dat/merged_sm*.hdf5"
    Returns a dict with the fit results and the original data.
    Fit-result summary layout: tau_index, ts_length_index; the raw fits
    ("fits_*") keep a third repetition axis.
    """
    print(f"Processing {h5pattern}")
    # get a file list
    h5files = glob.glob(os.path.expanduser(h5pattern))

    # the number of target tau values matches the file list
    path_dict = dict()
    for h5file in h5files:
        tau = ut.h5_load(h5file, "target_tau")
        path_dict[tau] = h5file

    # key = tau, values are numpy arrays because the number of coefficients
    # changes. each array's layout: relative_ts_length * repetition * rk
    data_dict = dict()
    # sorted order by tau, just for good measure
    for tau in sorted(path_dict.keys()):
        h5file = path_dict[tau]
        data = ut.h5_load(h5file, "data")
        num_lens, num_reps, num_steps = data.shape
        data_dict[tau] = data
        len_vals = ut.h5_load(h5file, "relative_length")

    tau_vals = np.array(sorted(path_dict.keys()))
    num_taus = len(tau_vals)

    # check for dodgy data
    for tau in data_dict.keys():
        # we know that entries will be NaN when the ts length < tau / 2,
        # so do not look at the first few values of relative length
        min_len = int(len(len_vals) / 2)
        fishy = np.where(np.isnan(data_dict[tau][min_len:, :, :]))
        if len(fishy[0]) > 0:
            print(f"tau = {tau:.2f}, fishy repetitions:\n{np.unique(fishy[1])}")

    shape = (num_taus, num_lens, num_reps)
    fitres_e = np.ones(shape) * np.nan
    fitres_eo = np.ones(shape) * np.nan
    fitres_e_shift = np.ones(shape) * np.nan
    fitres_eo_shift = np.ones(shape) * np.nan

    # do the fits. fitfunc "e" is a plain exponential, "eo" an exponential
    # with offset.
    print("Fitting ...")
    for tdx, tau in enumerate(tqdm(tau_vals, leave=False)):
        for ldx, lens in enumerate(tqdm(len_vals, leave=False)):
            for rdx in range(num_reps):
                rk = data_dict[tau][ldx][rdx]
                steps = np.arange(1, len(rk) + 1)
                # avoid the NaNs
                valid = np.where(np.isfinite(rk))[0]
                steps = steps[valid]
                rk = rk[valid]
                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="e").tau
                except Exception:
                    tmp = np.nan
                fitres_e[tdx, ldx, rdx] = tmp
                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="eo").tau
                except Exception:
                    tmp = np.nan
                fitres_eo[tdx, ldx, rdx] = tmp

                # analytically motivated bias, not shown in the main figure:
                # shift all coefficients up by (1/T) * (1+m)/(1-m) with
                # m = exp(-1/tau), then fit again
                tlen = int(lens * tau)
                bias = (1 / tlen) * (1 + np.exp(-1 / tau)) / (1 - np.exp(-1 / tau))
                rk = rk + bias
                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="e").tau
                except Exception:
                    tmp = np.nan
                fitres_e_shift[tdx, ldx, rdx] = tmp
                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="eo").tau
                except Exception:
                    tmp = np.nan
                fitres_eo_shift[tdx, ldx, rdx] = tmp

    res = dict()
    res["tau_vals"] = tau_vals
    res["len_vals"] = len_vals
    res["raw_data"] = data_dict
    res["medi_e"] = np.nanmedian(fitres_e, axis=2)
    res["mean_e"] = np.nanmean(fitres_e, axis=2)
    res["stdd_e"] = np.nanstd(fitres_e, axis=2)
    res["medi_eo"] = np.nanmedian(fitres_eo, axis=2)
    res["mean_eo"] = np.nanmean(fitres_eo, axis=2)
    res["stdd_eo"] = np.nanstd(fitres_eo, axis=2)
    res["medi_e_shift"] = np.nanmedian(fitres_e_shift, axis=2)
    res["mean_e_shift"] = np.nanmean(fitres_e_shift, axis=2)
    res["stdd_e_shift"] = np.nanstd(fitres_e_shift, axis=2)
    res["medi_eo_shift"] = np.nanmedian(fitres_eo_shift, axis=2)
    res["mean_eo_shift"] = np.nanmean(fitres_eo_shift, axis=2)
    res["stdd_eo_shift"] = np.nanstd(fitres_eo_shift, axis=2)
    # keep all the fits so we can debug later
    res["fits_e"] = fitres_e
    res["fits_eo"] = fitres_eo
    res["fits_e_shift"] = fitres_e_shift
    res["fits_eo_shift"] = fitres_eo_shift
    return res
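
# A minimal, self-contained sketch of the core fitting step above, using
# assumed values (tau = 100 is illustrative, not taken from the pipeline):
# fitting a plain exponential to noise-free coefficients r_k = exp(-k / tau)
# should recover tau.
#
#   steps = np.arange(1, 501)
#   rk = np.exp(-steps / 100.0)
#   print(mre.fit(rk, steps=steps, fitfunc="e").tau)  # close to 100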
# Marriott and Pope (1954), Eq 407: expected (biased) autocorrelation
# coefficients of a finite series of length numsteps
def rk_analytic_407(k_max, m, numsteps):
    k_range = np.arange(1, int(k_max + 1))
    geom = np.zeros(len(k_range))
    for idx, k in enumerate(k_range):
        geom[idx] = np.sum(np.power(m, k_range[:k] - 1))
    res = np.power(m, k_range) - 1.0 / numsteps * (
        (1 + m) * geom - 2 * k_range * np.power(m, k_range) / numsteps
    )
    return res
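
# The loop in rk_analytic_407 accumulates the geometric series
# sum_{j=0}^{k-1} m^j, which has the closed form (1 - m^k) / (1 - m) for m != 1.
# A vectorized equivalent, offered as a sketch (this helper is not part of the
# original code and is unused below):
def rk_analytic_407_closed(k_max, m, numsteps):
    k_range = np.arange(1, int(k_max + 1))
    m_k = np.power(m, k_range)
    geom = (1.0 - m_k) / (1.0 - m)  # closed-form geometric sum
    return m_k - 1.0 / numsteps * ((1 + m) * geom - 2 * k_range * m_k / numsteps)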

# Marriott and Pope (1954), Eq 408
def rk_analytic_408(k_max, m, numsteps):
    k_range = np.arange(1, int(k_max + 1))
    return np.power(m, k_range) - 2 * k_range * np.power(m, k_range) / numsteps

# plot the coefficients, rk vs k, one line per repetition
def plot_coefficients(ax, data, rel_len_idx=0, **kwargs):
    # data = data_dict[tau], layout: relative_ts_length * repetition * rk
    # print(f"relative length: {len_vals[rel_len_idx]} x tau")
    steps = np.arange(1, data.shape[2] + 1)
    for i in range(data.shape[1]):
        ax.plot(steps, data[rel_len_idx, i, :], **kwargs)
    ax.set_xlim(1, steps[-1])
    ax.set_xlabel(r"Steps $k$")
    ax.set_ylabel(r"Coefficients $r_k$")
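
# Usage sketch with assumed values (tau = 100.0 has to be one of the simulated
# target taus; `sm` is produced by load_and_fit in the main script below):
#
#   fig, ax = plt.subplots()
#   plot_coefficients(ax, sm["raw_data"][100.0], rel_len_idx=0, alpha=0.1, lw=0.5)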

# ------------------------------------------------------------------------------ #
# main script
# ------------------------------------------------------------------------------ #

mre.ut._logstreamhandler.setLevel("ERROR")
mre.ut._logfilehandler.setLevel("ERROR")

# set the working directory to the location of this script file,
# so that relative paths work no matter where it is called from
os.chdir(os.path.dirname(os.path.abspath(__file__)))

load_pickled = False
if not load_pickled:
    # load, analyze, save as pickle
    sm = load_and_fit("../dat/merged_sm*.hdf5")
    ts = load_and_fit("../dat/merged_ts*.hdf5")
    with open("../dat/sm_fits.pickle", "wb") as handle:
        pickle.dump(sm, handle)
    with open("../dat/ts_fits.pickle", "wb") as handle:
        pickle.dump(ts, handle)
else:
    # load from pickle
    with open("../dat/ts_fits.pickle", "rb") as handle:
        ts = pickle.load(handle)
    with open("../dat/sm_fits.pickle", "rb") as handle:
        sm = pickle.load(handle)
fig, ax = plt.subplots(figsize=(3.5, 2.55), dpi=300)

cdx = 0  # color index
for tdx in [0, 2, 4]:
    tau = ts["tau_vals"][tdx]
    # ts default
    ax.plot(
        ts["len_vals"],
        ts["medi_eo"][tdx] / tau,
        label=f"tau={tau}",
        color=f"C{cdx}",
        ls="-",
        alpha=1.0,
    )
    # sm default
    ax.plot(
        sm["len_vals"],
        sm["medi_eo"][tdx] / tau,
        label=None,
        color=f"C{cdx}",
        ls="--",
        alpha=1.0,
    )
    # errors
    if tdx <= 4:
        ut.plot_stdd(
            ax,
            ts["len_vals"],
            ts["medi_eo"][tdx] / tau,
            ts["stdd_eo"][tdx] / tau,
            mode="bar",
            color=f"C{cdx}",
        )
        ut.plot_stdd(
            ax,
            sm["len_vals"],
            sm["medi_eo"][tdx] / tau,
            sm["stdd_eo"][tdx] / tau,
            mode="bar",
            eb_ls="--",
            color=f"C{cdx}",
        )
    print(sm["medi_eo"][tdx] / tau)
    print(sm["stdd_eo"][tdx] / tau)
    cdx += 1

# first-order approximation based on Marriott and Pope (1954), Eq 407:
# tau_hat / tau = 1 / (1 + 4 tau / T), plotted over the relative length T / tau
targetlength = ts["len_vals"]  # in multiples of tau
ax.plot(targetlength, 1 / (1 + (4 / 1) * (1 / targetlength)), color="black", label=r"analytic")
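# Quick sanity check of the analytic curve (illustrative value): at a relative
# length of T/tau = 4 it predicts tau_hat/tau = 1 / (1 + 4/4) = 0.5, i.e.
# trials four taus long underestimate the timescale by about half.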

# ------------------------------------------------------------------------------ #
# main figure styling
# ------------------------------------------------------------------------------ #

ax.legend(loc=4)
ax.set_title(r"$A=1000$, no subsampling", fontname="Arial")
ax.set_yscale("log")
ax.set_xscale("log")
ax.set_xlabel(r"Relative trial length $\,T/\tau$", fontname="Arial")
ax.set_ylabel(r"Estimated timescale $\,\hat{\tau}/\tau$", fontname="Arial")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ut.arrowed_spines(ax, ["bottom", "left"])
fig.tight_layout()
ax.set_xlim((1, None))
ax.set_ylim((0.1, 1.5))
fig.savefig("../fig/triallength.pdf", transparent=True)

# ------------------------------------------------------------------------------ #
# the TS shifting issue
# ------------------------------------------------------------------------------ #

tau = 100.0
rel_lens = [10, 2, 1]  # indices into len_vals, not the lengths themselves

fig, ax = plt.subplots(figsize=(3.5, 2.55), dpi=300)
ax.set_title("SM")
ax.set_rasterization_zorder(-1)  # we don't want 100 vector lines
for idx, i in enumerate(rel_lens):
    plot_coefficients(
        ax, sm["raw_data"][tau], i, alpha=0.1, lw=0.5, color=f"C{idx}", zorder=-2
    )
    # y = rk_analytic_407(k_max=k_max, m=np.exp(-1.0 / tau), numsteps=rel_lens[idx] * tau)
    # ax.plot(x, y, color=f"C{idx}", ls="-")
x = np.arange(1, 2001)
y = np.exp(-x / tau)
ax.plot(x, y, color="black", zorder=1)
ax.set_xlim((1, None))
ax.set_xticks([1, 500, 1000, 1500, 2000])
ax.set_ylim((-0.325, 1.0))
fig.tight_layout()
fig.savefig("../fig/rk_sm.pdf", transparent=True)

fig, ax = plt.subplots(figsize=(3.5, 2.55), dpi=300)
ax.set_title("TS")
ax.set_rasterization_zorder(-1)  # we don't want 100 vector lines
for idx, i in enumerate(rel_lens):
    plot_coefficients(
        ax, ts["raw_data"][tau], i, alpha=0.1, lw=0.5, color=f"C{idx}", zorder=-2
    )
    # Marriott and Pope solutions
    numsteps = int(ts["len_vals"][i] * tau)
    # 20*tau was the parameter of the simulation;
    # 0.5 * numsteps is the safety check implemented in the mrestimator toolbox
    k_max = int(np.clip(a=20 * tau, a_min=None, a_max=0.5 * numsteps))
    x = np.arange(1, k_max + 1)
    outline = [pe.Stroke(linewidth=3, foreground="white"), pe.Normal()]
    y = rk_analytic_407(k_max=k_max, m=np.exp(-1.0 / tau), numsteps=numsteps)
    ax.plot(
        x, y, color=f"C{idx}", ls="-", path_effects=outline, zorder=2,
        label=f"T/tau={ts['len_vals'][i]}",
    )
    # rk from Eq 408 have the same bending as our sim data, but at different
    # numsteps; likely because the 'known mean' is not zero.
    # numsteps = int(numsteps / 4)
    # y = rk_analytic_408(k_max=20*tau, m=np.exp(-1.0 / tau), numsteps=numsteps)
    # ax.plot(x, y[0:len(x)], color=f"C{idx}", ls="-", alpha=1)
x = np.arange(1, 2001)
y = np.exp(-x / tau)
ax.plot(x, y, color="black", zorder=2)
ax.legend(loc=1)
ax.set_xlim((1, None))
ax.set_ylim((-0.325, 1.0))
ax.set_xticks([1, 500, 1000, 1500, 2000])
fig.tight_layout()
fig.savefig("../fig/rk_ts.pdf", transparent=True)