# ------------------------------------------------------------------ #
# merge_hdf5.py
# merge the single-threaded hdf5 files (many repetitions) into one
# ------------------------------------------------------------------ #
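#
# input:  {path_base}/{i}.hdf5, one file per realization, each holding
#         meta/tau, meta/numsteps, meta/repetition and rks/{sm|ts}
# output: {path_base}/merged_{method}_tau_{tau}.hdf5, one file per method
#         and target tau, with all rk values collected in a 3d 'data' array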
import os
import h5py
import numpy as np
# import mrestimator as mre  # v0.1.6b4
from tqdm import tqdm

# set the working directory to the location of this script to use relative paths
os.chdir(os.path.dirname(__file__))

path_base = "../dat"
numfiles = 17000
numreps = 100
targettau = np.logspace(1, 3, 5)
targetlength = np.logspace(1, 4, 25)  # in multiples of tau
targetlength = np.insert(targetlength, 0, [1, 2, 3, 4, 5, 6, 7, 8, 9])
files_processed = np.full(numfiles, False, dtype=bool)

def h5_load(filename, dsetname, raise_ex=False):
    """Load a dataset from an hdf5 file; returns None on failure unless raise_ex is set."""
    try:
        with h5py.File(filename, "r") as file:
            try:
                res = file[dsetname][:]
            except ValueError:
                # slicing failed, maybe it is a scalar
                res = file[dsetname][()]
            return res
    except Exception as e:
        # print(f"failed to load {dsetname} from {filename}")
        if raise_ex:
            raise e
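
# example: h5_load("../dat/1.hdf5", "meta/tau") returns the scalar tau of one
# realization, or None if the file or dataset cannot be read
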
for method in ["sm", "ts"]:
    for tau_tar in tqdm(targettau):
        k_max = int(20 * tau_tar)
        f_tar = h5py.File(path_base + f"/merged_{method}_tau_{tau_tar:.2}.hdf5", "a")
        try:
            # this seems broken due to integer division
            # f_tar.create_dataset("absolute_length", data=absolute_length)
            f_tar.create_dataset("relative_length", data=targetlength)
            f_tar.create_dataset("target_tau", data=tau_tar)
            f_tar.create_dataset("method", data=f"{method}")
        except Exception:
            # the datasets already exist from a previous run, keep them
            pass
        if "data" in f_tar:
            # reuse the existing dset 'data'
            dset = f_tar["data"]
        else:
            dset = f_tar.create_dataset(
                "data", (len(targetlength), numreps, k_max), dtype="f", fillvalue=np.nan
            )
        description = "3-dimensional array where indices are assigned as follows:\n"
        description += "0 = length_index, check the other arrays to retrieve the absolute or relative length matching this index\n"
        description += "1 = repetition\n"
        description += "2 = rk_values, padded with NaN if the trial was too short"
        dset.attrs["description"] = description
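
        # e.g. dset[0, 5, :] holds the rk values of repetition 5 at the shortest
        # trial length; entries that are never written keep the NaN fillvalue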
        for i in range(1, numfiles + 1):
            file_name = f"{path_base}/{i}.hdf5"
            try:
                tau = h5_load(file_name, "meta/tau", raise_ex=True)
            except Exception:
                # print(f"Unable to open {file_name}")
                continue
            if tau != tau_tar:
                continue
            num_steps = h5_load(file_name, "meta/numsteps")
            src_length_index = np.where(targetlength.astype("i") == int(num_steps))[0][0]
            src_rep_index = int(h5_load(file_name, "meta/repetition"))
            src_rks = h5_load(file_name, f"rks/{method}")
            # write to dset; keep in mind that the rks might be shorter than
            # 20 tau due to super short trial lengths
            dset[src_length_index, src_rep_index, 0 : len(src_rks)] = src_rks
            files_processed[i - 1] = True
        f_tar.close()

if files_processed.all():
    print("All files processed")
else:
    print("Missing:")
    # files are numbered starting at 1, hence the shift
    print(np.where(~files_processed)[0] + 1)
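
# minimal sketch of how to read a merged file back, as a sanity check.
# method "ts" and tau = 10.0 are example choices; the filename format
# matches the f-string used when writing above.
with h5py.File(path_base + f"/merged_ts_tau_{10.0:.2}.hdf5", "r") as f_chk:
    print("merged shape:", f_chk["data"].shape)  # (num_lengths, numreps, k_max)
    print(f_chk["data"].attrs["description"])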