# ------------------------------------------------------------------------------ #
# @Author:        F. Paul Spitzner
# @Email:         paul.spitzner@ds.mpg.de
# @Created:       2020-01-20 17:26:55
# @Last Modified: 2020-07-06 12:13:44
# ------------------------------------------------------------------------------ #
# load the merged hdf5 file that contains correlation coefficients
# for different tau and time series length. the fitting is done here.
# print an error for datasets that seem incorrect
# ------------------------------------------------------------------------------ #

import os
import sys
import fcntl
import h5py
import glob
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import mrestimator as mre  # v0.1.6b4
from itertools import product
from time import gmtime, strftime
from tqdm import tqdm
from humanize import naturalsize
import pickle

import utility as ut

# ------------------------------------------------------------------------------ #
# helper functions
# ------------------------------------------------------------------------------ #


def load_and_fit(h5pattern):
    """
    Load data from the hdf5 files matching a pattern,
    e.g. h5pattern = "./dat/merged_sm*.hdf5"

    Returns a dict with the fit results and original data.
    Fit results layout: tau_index, ts_length_index
    """
    print(f"Processing {h5pattern}")

    # get a file list
    h5files = glob.glob(os.path.expanduser(h5pattern))

    # number of target tau values matches file list
    path_dict = dict()
    for h5file in h5files:
        tau = ut.h5_load(h5file, "target_tau")
        path_dict[tau] = h5file

    # key = tau, then numpy arrays because the number of coefficients changes.
    # each array's layout: relative_ts_length * repetition * rk
    data_dict = dict()

    # sorted order by tau, just for good measure
    for tau in sorted(path_dict.keys()):
        h5file = path_dict[tau]
        data = ut.h5_load(h5file, "data")
        num_lens, num_reps, num_steps = data.shape
        data_dict[tau] = data
        len_vals = ut.h5_load(h5file, "relative_length")

    tau_vals = np.array(sorted(path_dict.keys()))
    num_taus = len(tau_vals)

    # check for dodgy data
    for tau in data_dict.keys():
        # we know that entries will be NaN when the ts length < tau / 2,
        # so do not look at the first few values of relative length
        min_len = int(len(len_vals) / 2)
        fishy = np.where(np.isnan(data_dict[tau][min_len:, :, :]))
        if len(fishy[0]) > 0:
            print(f"tau = {tau:.2f}, fishy repetitions:\n{np.unique(fishy[1])}")

    shape = (num_taus, num_lens, num_reps)
    fitres_e = np.ones(shape) * np.nan
    fitres_eo = np.ones(shape) * np.nan
    fitres_e_shift = np.ones(shape) * np.nan
    fitres_eo_shift = np.ones(shape) * np.nan

    # do the fits
    print("Fitting ...")
    for tdx, tau in enumerate(tqdm(tau_vals, leave=False)):
        for ldx, lens in enumerate(tqdm(len_vals, leave=False)):
            for rdx in range(num_reps):
                rk = data_dict[tau][ldx][rdx]
                steps = np.arange(1, len(rk) + 1)

                # avoid the NaNs
                valid = np.where(np.isfinite(rk))[0]
                steps = steps[valid]
                rk = rk[valid]

                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="e").tau
                except Exception:
                    tmp = np.nan
                fitres_e[tdx, ldx, rdx] = tmp

                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="eo").tau
                except Exception:
                    tmp = np.nan
                fitres_eo[tdx, ldx, rdx] = tmp

                # analytically motivated bias, not shown in the main figure
                tlen = int(lens * tau)
                bias = (1 / tlen) * (1 + np.exp(-1 / tau)) / (1 - np.exp(-1 / tau))
                rk = rk + bias

                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="e").tau
                except Exception:
                    tmp = np.nan
                fitres_e_shift[tdx, ldx, rdx] = tmp

                try:
                    tmp = mre.fit(rk, steps=steps, fitfunc="eo").tau
                except Exception:
                    tmp = np.nan
                fitres_eo_shift[tdx, ldx, rdx] = tmp
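    # collapse the repetition axis: NaN-aware median, mean, and standard deviation
    # for every (tau, relative length) combination, so that failed fits (NaN) are skipped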
    res = dict()
    res["tau_vals"] = tau_vals
    res["len_vals"] = len_vals
    res["raw_data"] = data_dict
    res["medi_e"] = np.nanmedian(fitres_e, axis=2)
    res["mean_e"] = np.nanmean(fitres_e, axis=2)
    res["stdd_e"] = np.nanstd(fitres_e, axis=2)
    res["medi_eo"] = np.nanmedian(fitres_eo, axis=2)
    res["mean_eo"] = np.nanmean(fitres_eo, axis=2)
    res["stdd_eo"] = np.nanstd(fitres_eo, axis=2)
    res["medi_e_shift"] = np.nanmedian(fitres_e_shift, axis=2)
    res["mean_e_shift"] = np.nanmean(fitres_e_shift, axis=2)
    res["stdd_e_shift"] = np.nanstd(fitres_e_shift, axis=2)
    res["medi_eo_shift"] = np.nanmedian(fitres_eo_shift, axis=2)
    res["mean_eo_shift"] = np.nanmean(fitres_eo_shift, axis=2)
    res["stdd_eo_shift"] = np.nanstd(fitres_eo_shift, axis=2)

    # keep all the fits so we can debug later
    res["fits_e"] = fitres_e
    res["fits_eo"] = fitres_eo
    res["fits_e_shift"] = fitres_e_shift
    res["fits_eo_shift"] = fitres_eo_shift

    return res


# Marriott and Pope (1954), Eq. 407
def rk_analytic_407(k_max, m, numsteps):
    k_range = np.arange(1, int(k_max + 1))
    geom = np.zeros(len(k_range))
    for idx, k in enumerate(k_range):
        geom[idx] = np.sum(np.power(m, k_range[:k] - 1))
    res = np.power(m, k_range) - 1.0 / numsteps * (
        (1 + m) * geom - 2 * k_range * np.power(m, k_range) / numsteps
    )
    return res


# Marriott and Pope (1954), Eq. 408
def rk_analytic_408(k_max, m, numsteps):
    k_range = np.arange(1, int(k_max + 1))
    return np.power(m, k_range) - 2 * k_range * np.power(m, k_range) / numsteps


# plot the coefficients, rk vs k
def plot_coefficients(ax, data, rel_len_idx=0, **kwargs):
    # data = data_dict[tau]
    # fig, ax = plt.subplots()
    # print(f"relative length: {len_vals[rel_len_idx]} x tau")
    steps = np.arange(1, data.shape[2] + 1)
    for i in range(data.shape[1]):
        ax.plot(steps, data[rel_len_idx, i, :], **kwargs)
    ax.set_xlim(1, steps[-1])
    ax.set_xlabel(r"Steps $k$")
    ax.set_ylabel(r"Coefficients $r_k$")


# ------------------------------------------------------------------------------ #
# main script
# ------------------------------------------------------------------------------ #

mre.ut._logstreamhandler.setLevel("ERROR")
mre.ut._logfilehandler.setLevel("ERROR")

# set directory to the location of this script file to use relative paths
os.chdir(os.path.dirname(__file__))

load_pickled = False
if not load_pickled:
    # load, analyze, save as pickle
    sm = load_and_fit("../dat/merged_sm*.hdf5")
    ts = load_and_fit("../dat/merged_ts*.hdf5")
    with open("../dat/sm_fits.pickle", "wb") as handle:
        pickle.dump(sm, handle)
    with open("../dat/ts_fits.pickle", "wb") as handle:
        pickle.dump(ts, handle)
else:
    # load from pickle
    with open("../dat/ts_fits.pickle", "rb") as handle:
        ts = pickle.load(handle)
    with open("../dat/sm_fits.pickle", "rb") as handle:
        sm = pickle.load(handle)

fig, ax = plt.subplots(figsize=(3.5, 2.55), dpi=300)
# ax.clear()

cdx = 0  # color index
for tdx in [0, 2, 4]:
    tau = ts["tau_vals"][tdx]

    # ts default
    ax.plot(
        ts["len_vals"],
        ts["medi_eo"][tdx] / tau,
        label=f"tau={tau}",
        color=f"C{cdx}",
        ls="-",
        alpha=1.0,
    )

    # sm default
    ax.plot(
        sm["len_vals"],
        sm["medi_eo"][tdx] / tau,
        label=None,
        color=f"C{cdx}",
        ls="--",
        alpha=1.0,
    )

    # errors
    if tdx <= 4:
        ut.plot_stdd(
            ax,
            ts["len_vals"],
            ts["medi_eo"][tdx] / tau,
            ts["stdd_eo"][tdx] / tau,
            mode="bar",
            color=f"C{cdx}",
        )
        ut.plot_stdd(
            ax,
            sm["len_vals"],
            sm["medi_eo"][tdx] / tau,
            sm["stdd_eo"][tdx] / tau,
            mode="bar",
            eb_ls="--",
            color=f"C{cdx}",
        )
        print(sm["medi_eo"][tdx] / tau)
        print(sm["stdd_eo"][tdx] / tau)

    cdx += 1

# first-order approximation based on Marriott and Pope (1954), Eq. 407
targetlength = ts["len_vals"]  # in multiples of tau
ax.plot(targetlength, 1 / (1 + (4 / 1) * (1 / targetlength)), color="black", label=r"analytic")
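# A quick cross-check (sketch, not part of the original analysis): fitting the
# Marriott-Pope expectation from Eq. 407 directly with the same "eo" fitfunc
# should show a downward bias comparable to the analytic curve above.
# Uncomment to run; the tau and relative-length values below are arbitrary examples.
# for _tau in [10.0, 100.0]:
#     for _rel_len in [2, 5, 10, 50]:
#         _numsteps = int(_rel_len * _tau)
#         _k_max = int(min(20 * _tau, 0.5 * _numsteps))
#         _rk = rk_analytic_407(k_max=_k_max, m=np.exp(-1.0 / _tau), numsteps=_numsteps)
#         _fit = mre.fit(_rk, steps=np.arange(1, _k_max + 1), fitfunc="eo")
#         print(f"tau={_tau:6.1f}, T/tau={_rel_len:3d}: tau_hat/tau = {_fit.tau / _tau:.2f}")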
# ------------------------------------------------------------------------------ #
# main figure styling
# ------------------------------------------------------------------------------ #

ax.legend(loc=4)
ax.set_title(r"$A=1000$, no subsampling", fontname="Arial")
ax.set_yscale("log")
ax.set_xscale("log")
ax.set_xlabel(f"Relative trial length $\\,T/\\tau$", fontname="Arial")
ax.set_ylabel(f"Estimated timescale $\\,\\hat{{\\tau}}/\\tau$", fontname="Arial")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ut.arrowed_spines(ax, ["bottom", "left"])
fig.tight_layout()
ax.set_xlim((1, None))
ax.set_ylim((0.1, 1.5))
fig.savefig("../fig/triallength.pdf", transparent=True)

# ------------------------------------------------------------------------------ #
# the TS shifting issue
# ------------------------------------------------------------------------------ #

tau = 100.0
rel_lens = [10, 2, 1]

fig, ax = plt.subplots(figsize=(3.5, 2.55), dpi=300)
ax.set_title("SM")
ax.set_rasterization_zorder(-1)  # we don't want 100 vector lines
for idx, i in enumerate(rel_lens):
    plot_coefficients(
        ax, sm["raw_data"][tau], rel_lens[idx], alpha=0.1, lw=0.5, color=f"C{idx}", zorder=-2
    )
    # y = rk_analytic_407(k_max=k_max, m=np.exp(-1.0 / tau), numsteps=rel_lens[idx] * tau)
    # ax.plot(x, y, color=f"C{idx}", ls="-")

x = np.arange(1, 2001)
y = np.exp(-x / tau)
ax.plot(x, y, color="black", zorder=1)

ax.set_xlim((1, None))
ax.set_xticks([1, 500, 1000, 1500, 2000])
ax.set_ylim((-0.325, 1.0))
fig.tight_layout()
fig.savefig("../fig/rk_sm.pdf", transparent=True)

fig, ax = plt.subplots(figsize=(3.5, 2.55), dpi=300)
ax.set_title("TS")
ax.set_rasterization_zorder(-1)  # we don't want 100 vector lines
for idx, i in enumerate(rel_lens):
    plot_coefficients(
        ax, ts["raw_data"][tau], rel_lens[idx], alpha=0.1, lw=0.5, color=f"C{idx}", zorder=-2
    )

    # Marriott and Pope solutions
    numsteps = int(ts["len_vals"][i] * tau)
    # 20*tau was the parameter of the simulation,
    # 0.5 * numsteps is the safety check implemented in the mrestimator toolbox
    k_max = int(np.clip(a=20 * tau, a_min=None, a_max=0.5 * numsteps))
    x = np.arange(1, k_max + 1)
    outline = [pe.Stroke(linewidth=3, foreground="white"), pe.Normal()]
    y = rk_analytic_407(k_max=k_max, m=np.exp(-1.0 / tau), numsteps=numsteps)
    ax.plot(
        x, y, color=f"C{idx}", ls="-", path_effects=outline, zorder=2,
        label=f"T/tau={ts['len_vals'][i]}",
    )

    # rk from Eq. 408 have the same bending as our sim data, but at different numsteps.
    # likely due to the 'known mean' not being zero.
    # numsteps = int(numsteps / 4)
    # y = rk_analytic_408(k_max=20*tau, m=np.exp(-1.0 / tau), numsteps=numsteps)
    # ax.plot(x, y[0:len(x)], color=f"C{idx}", ls="-", alpha=1)

x = np.arange(1, 2001)
y = np.exp(-x / tau)
ax.plot(x, y, color="black", zorder=2)

ax.legend(loc=1)
ax.set_xlim((1, None))
ax.set_ylim((-0.325, 1.0))
ax.set_xticks([1, 500, 1000, 1500, 2000])
fig.tight_layout()
fig.savefig("../fig/rk_ts.pdf", transparent=True)
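
# ------------------------------------------------------------------------------ #
# note: the pickled fit results written above can be reloaded in a separate
# session for further inspection, e.g. (minimal sketch, paths relative to the
# directory of this script):
#
#   import pickle
#   with open("../dat/ts_fits.pickle", "rb") as handle:
#       ts = pickle.load(handle)
#   print(ts["medi_eo"].shape)  # (num_taus, num_lens), median fitted tau
# ------------------------------------------------------------------------------ #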