prepare_data.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. import pandas as pd
  4. import numpy as np
  5. import scipy.io as sio
  6. import matplotlib as mpl
  7. import matplotlib.pyplot as plt
  8. import os
  9. import re
# Divisor that turns sample indices into seconds (process_file computes
# ``call_start_sample / FS`` and treats the result as seconds).
FS = 192e3
# IDs of the bats that are flagged in the "calling_bat_deaf" column of the
# pup calls table.
DEAF_BATS = {"b3", "b5", "b8"}
  12. def calculate_f0_slope(f0s):
  13. unmasked_indices, = np.where(~f0s.mask)
  14. if len(unmasked_indices) >= 2:
  15. first_index, last_index = unmasked_indices[0], unmasked_indices[-1]
  16. first_f0, last_f0 = f0s[first_index], f0s[last_index]
  17. return (last_f0 - first_f0) / (last_index - first_index) * 6000 # factor 6000: converts to Hz/seconds
  18. else:
  19. return np.nan
  20. def process_file(session_id, animals, session_start_time, filename):
  21. data = sio.loadmat(filename)
  22. if data["result"].size == 0:
  23. return pd.DataFrame()
  24. data = data["result"][0, 0]
  25. start_times = session_start_time + pd.to_timedelta(
  26. np.concatenate(data["call"][0, :]["call_start_sample"], axis=1)[0] / FS,
  27. unit="s")
  28. valid_indices = np.where(list(map(lambda x: x.shape[1] != 0, data["call"][0, :]["call_levels"])))[0]
  29. start_samples = np.concatenate(data["call"][0, valid_indices]["call_start_sample"], axis=1)[0]
  30. call_durations = np.concatenate(data["call"][0, valid_indices]["call_dur"], axis=1)[0]
  31. f_mins = np.concatenate(data["call"][0, valid_indices]["Fmin"], axis=1)[0]
  32. f_maxs = np.concatenate(data["call"][0, valid_indices]["Fmax"], axis=1)[0]
  33. f0s = [np.ma.masked_invalid(it[:, 1]) for it in data["call"][0, valid_indices]["f0"]]
  34. f0s_compressed = [it.compressed() for it in f0s]
  35. f0s_start = [it[np.isfinite(it)][0] if len(it) > 0 else np.nan for it in f0s_compressed]
  36. f0s_end = [it[np.isfinite(it)][-1] if len(it) > 0 else np.nan for it in f0s_compressed]
  37. f0s_slope = [calculate_f0_slope(it) for it in f0s]
  38. aperiodicities = [np.ma.masked_invalid(it[:, 2]).compressed() for it in data["call"][0, valid_indices]["f0"]]
  39. spectral_centroid = np.concatenate(data["call"][0, valid_indices]["SCF"], axis=1)[0]
  40. if animals:
  41. call_levels = np.concatenate(data["call"][0, valid_indices]["call_levels"])
  42. level_differences = np.mean(call_levels[:, [1, 4, 5]], axis=1) - np.mean(call_levels[:, [0, 2, 3]], axis=1)
  43. calling_bat = np.where(level_differences > 0, animals[1], animals[0])
  44. other_bat = np.where(level_differences > 0, animals[0], animals[1])
  45. else:
  46. level_differences = np.nan
  47. calling_bat = other_bat = ""
  48. call_rms = [10*np.log10(np.mean(it**2)) for it in data["call"][0, valid_indices]["loudest_call"]]
  49. return pd.DataFrame(dict(session_id=session_id,
  50. call_id=np.arange(len(start_times)),
  51. start_sample=start_samples,
  52. start_time=start_times,
  53. calling_bat=calling_bat,
  54. other_bat=other_bat,
  55. level_difference=level_differences,
  56. call_rms=call_rms,
  57. call_duration=call_durations,
  58. f_min=f_mins,
  59. f_max=f_maxs,
  60. mean_aperiodicity=[np.mean(it) for it in aperiodicities],
  61. f0_mean=[np.mean(it) for it in f0s],
  62. f0_min=[np.min(it) for it in f0s],
  63. f0_max=[np.max(it) for it in f0s],
  64. f0_start=f0s_start,
  65. f0_end=f0s_end,
  66. f0_slope=f0s_slope,
  67. spectral_centroid=spectral_centroid))
  68. ###
  69. # Pup sessions
  70. ###
  71. PUP_RESULTS_ROOT_DIR = "../raw_data/pups/results"
  72. sessions = []
  73. calls = []
  74. dn_re = re.compile(r"^vpl_(b\d)(..)$")
  75. fn_re = re.compile(r"^vpl_...._(\d\d)-(...)-(\d\d\d\d)_(\d\d)x(\d\d)x(\d\d)_m1.mat$")
  76. session_id = 0
  77. dlist = sorted(os.listdir(PUP_RESULTS_ROOT_DIR))
  78. for i, dn in enumerate(dlist):
  79. dpath = os.path.join(PUP_RESULTS_ROOT_DIR, dn)
  80. dn_mt = dn_re.match(dn)
  81. if not dn_mt:
  82. continue
  83. animals = [dn_mt.group(1), dn_mt.group(2)]
  84. for fn in sorted(os.listdir(dpath)):
  85. fn_mt = fn_re.match(fn)
  86. if not fn_mt:
  87. continue
  88. fpath = os.path.join(dpath, fn)
  89. print("\r{}/{}: {}... ".format(i + 1, len(dlist), fpath), end="")
  90. start_time = pd.to_datetime("{} {} {} {}:{}:{}".format(*[fn_mt.group(1 + i) for i in range(6)]))
  91. df = process_file(session_id, animals, start_time, fpath)
  92. calls.append(df)
  93. sessions.append((session_id, animals[0], animals[1], start_time))
  94. session_id += 1
  95. calls = pd.concat(calls)
  96. sessions = pd.DataFrame(sessions, columns=["session_id", "animal1", "animal2", "start_time"])
  97. sessions.set_index("session_id", inplace=True)
  98. calls.set_index(["session_id", "call_id"], inplace=True)
  99. calls.insert(calls.columns.get_loc("level_difference"),
  100. "calling_bat_deaf",
  101. calls["calling_bat"].isin(DEAF_BATS))
  102. sorted_sessions = sessions[sessions["animal2"].str.startswith("m")].sort_values("start_time")
  103. sessions["before_deafening"] = False
  104. calls.insert(calls.columns.get_loc("level_difference"),
  105. "before_deafening",
  106. False)
  107. for pup, sessions_per_pup in sorted_sessions.groupby("animal1"):
  108. first_id = sessions_per_pup.index[0]
  109. sessions.loc[first_id, "before_deafening"] = True
  110. calls.loc[calls.index.get_level_values(0) == first_id, "before_deafening"] = True
  111. calls.insert(calls.columns.get_loc("level_difference"),
  112. "calling_bat_mother",
  113. calls["calling_bat"].str.startswith("m"))
  114. for bool_column in ["before_deafening"]:
  115. sessions[bool_column] = sessions[bool_column].astype(np.int)
  116. for bool_column in ["calling_bat_deaf", "before_deafening", "calling_bat_mother"]:
  117. calls[bool_column] = calls[bool_column].astype(np.int)
  118. sessions.to_csv("../pup_sessions.csv")
  119. calls.to_csv("../pup_calls.csv")
  120. print("\npups done")
  121. ###
  122. # Adults
  123. ###
  124. ADULT_RESULTS_ROOT_DIR = "../raw_data/adults/"
  125. sessions = []
  126. calls = []
  127. fn_re = re.compile(r"^(\d\d\d\d)_(\d\d)_(\d\d)_(\d\d)_(\d\d)_(\d\d)_call_parameters.mat$")
  128. session_id = 0
  129. for dn in ["deaf", "hearing"]:
  130. dpath = os.path.join(ADULT_RESULTS_ROOT_DIR, dn)
  131. for fn in sorted(os.listdir(dpath)):
  132. fn_mt = fn_re.match(fn)
  133. if not fn_mt:
  134. continue
  135. print("\r{} {}... ".format(session_id, fn), end="")
  136. fpath = os.path.join(dpath, fn)
  137. groups = list(map(int, fn_mt.groups()))
  138. start_time = pd.Timestamp(year=groups[0], month=groups[1], day=groups[2],
  139. hour=groups[3], minute=groups[4], second=groups[5])
  140. df = process_file(session_id, None, start_time, fpath)
  141. if len(df) == 0:
  142. continue
  143. df.insert(df.columns.get_loc("level_difference"),
  144. "calling_bat_deaf",
  145. int(dn == "deaf"))
  146. calls.append(df)
  147. sessions.append((session_id, dn, start_time))
  148. session_id += 1
  149. calls = pd.concat(calls)
  150. calls.set_index(["session_id", "call_id"], inplace=True)
  151. calls.insert(calls.columns.get_loc("level_difference"),
  152. "before_deafening", 0)
  153. calls.insert(calls.columns.get_loc("level_difference"),
  154. "calling_bat_mother", 0)
  155. sessions = pd.DataFrame(sessions, columns=["session_id", "group", "start_time"])
  156. sessions.set_index(["session_id"], inplace=True)
  157. sessions.to_csv("../adult_sessions.csv")
  158. calls.to_csv("../adult_calls.csv")
  159. print("\nadults done")
  160. # vim:sw=4:sts=4:et: