metrics.py 8.3 KB


  1. #!/usr/bin/env python
  2. # -*- coding: utf8 -*-
  3. # -----------------------------------------------------------------------------
  4. # File: metrics.py (as part of project URUMETRICS)
  5. # Created: 28/07/2022 13:58
  6. # Last Modified: 28/07/2022 13:58
  7. # -----------------------------------------------------------------------------
  8. # Author: William N. Havard
  9. # Postdoctoral Researcher
  10. #
  11. # Mail : william.havard@ens.fr / william.havard@gmail.com
  12. #
  13. # Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
  14. #
  15. # ------------------------------------------------------------------------------
  16. # Description:
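  17. # • Extracts the URUMETRICS metrics of a ChildProject data set and saves them to extra/metrics/metrics.csv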
  18. # -----------------------------------------------------------------------------
  19. import logging
  20. import os
  21. import sys
  22. from typing import List, Union
  23. import numpy as np
  24. import pandas as pd
  25. from ChildProject.annotations import AnnotationManager
  26. from ChildProject.pipelines.metrics import Metrics
  27. from ChildProject.projects import ChildProject
  28. from metrics_functions import avg_pr_pm_speaker, avg_sr_pm_speaker, avg_wr_pm_speaker, \
  29. mean_mean_pitch_speaker, mean_pitch_range_speaker, \
  30. mlup_speaker, mlus_speaker, mluw_speaker, \
  31. chi_adu_turn_transitions_ph
  32. logger = logging.getLogger(__name__)
  33. class UruguayMetrics(Metrics):
  34. SUBCOMMAND = "uruguay"
  35. def __init__(
  36. self,
  37. project: ChildProject,
  38. vtc: str = "vtc",
  39. alice: str = "alice",
  40. vcm: str = "vcm",
  41. acoustic: str="acoustic",
  42. recordings: Union[str, List[str], pd.DataFrame] = None,
  43. from_time: str = None,
  44. to_time: str = None,
  45. rec_cols: str = None,
  46. child_cols: str = None,
  47. period: str = None,
  48. by: str = "recording_filename",
  49. threads: int = 1,
  50. ):
  51. self.vtc = vtc
  52. self.alice = alice
  53. self.vcm = vcm
  54. self.acoustic = acoustic
  55. self.alice_vtc = 'alice_vtc'
  56. self.conversations = 'conversations'
  57. # Creating merged alice_vtc set
  58. am = AnnotationManager(project)
  59. am.read()
  60. am.merge_sets(
  61. left_set="vtc",
  62. right_set="alice",
  63. left_columns=["speaker_type"],
  64. right_columns=["phonemes", "syllables", "words"],
  65. output_set=self.alice_vtc,
  66. full_set_merge=False,
  67. skip_existing=True,
  68. )
  69. METRICS = np.array([]).reshape(0, 3)
  70. # VTC
  71. if (expected_set:= self.vtc) not in am.annotations["set"].values:
  72. print(f"The expected set '{expected_set}' was not found in the index.")
  73. else:
  74. METRICS = np.concatenate((METRICS, np.array(
  75. [["voc_speaker_ph", expected_set, 'FEM'],
  76. ["voc_speaker_ph", expected_set, 'CHI'],
  77. ["voc_dur_speaker_ph", expected_set, 'FEM'],
  78. ["voc_dur_speaker_ph", expected_set, 'CHI'],
  79. ["avg_voc_dur_speaker", expected_set, 'FEM'],
  80. ["avg_voc_dur_speaker", expected_set, 'CHI'],
  81. ])))
  82. # VCM
  83. if (expected_set:= self.vcm )not in am.annotations["set"].values:
  84. print(f"The expected set '{expected_set}' was not found in the index.")
  85. else:
  86. METRICS = np.concatenate((METRICS, np.array(
  87. [["cry_voc_speaker_ph", expected_set, 'CHI'],
  88. ["cry_voc_dur_speaker_ph", expected_set, 'CHI'],
  89. ["avg_cry_voc_dur_speaker", expected_set, 'CHI'],
  90. ["can_voc_speaker_ph", expected_set, 'CHI'],
  91. ["can_voc_dur_speaker_ph", expected_set, 'CHI'],
  92. ["avg_can_voc_dur_speaker", expected_set, 'CHI'],
  93. ["non_can_voc_speaker_ph", expected_set, 'CHI'],
  94. ["non_can_voc_dur_speaker_ph", expected_set, 'CHI'],
  95. ["avg_non_can_voc_dur_speaker", expected_set, 'CHI'],
  96. ["lp_n", expected_set, pd.NA],
  97. ["lp_dur", expected_set, pd.NA],
  98. ["cp_n", expected_set, pd.NA],
  99. ["cp_dur", expected_set, pd.NA],
  100. ])))
  101. # ALICE+VTC
  102. if (expected_set := self.alice_vtc) not in am.annotations["set"].values:
  103. print(f"The expected set '{expected_set}' was not found in the index.")
  104. else:
  105. METRICS = np.concatenate((METRICS, np.array(
  106. [
  107. [avg_pr_pm_speaker, expected_set, 'FEM'],
  108. [avg_sr_pm_speaker, expected_set, 'FEM'],
  109. [avg_wr_pm_speaker, expected_set, 'FEM'],
  110. ["wc_speaker_ph", expected_set, 'FEM'],
  111. ["sc_speaker_ph", expected_set, 'FEM'],
  112. ["pc_speaker_ph", expected_set, 'FEM'],
  113. [mluw_speaker, expected_set, 'FEM'],
  114. [mlus_speaker, expected_set, 'FEM'],
  115. [mlup_speaker, expected_set, 'FEM'],
  116. ])))
  117. # Acoustic
  118. if (expected_set := self.acoustic) not in am.annotations["set"].values:
  119. print(f"The expected set '{expected_set}' was not found in the index.")
  120. else:
  121. METRICS = np.concatenate((METRICS, np.array(
  122. [
  123. [mean_mean_pitch_speaker, expected_set, 'FEM'],
  124. [mean_mean_pitch_speaker, expected_set, 'CHI'],
  125. [mean_pitch_range_speaker, expected_set, 'FEM'],
  126. [mean_pitch_range_speaker, expected_set, 'CHI'],
  127. ])))
  128. # Turns
  129. if (expected_set := self.conversations) not in am.annotations["set"].values:
  130. print(f"The expected set '{expected_set}' was not found in the index.")
  131. else:
  132. METRICS = np.concatenate((METRICS, np.array(
  133. [
  134. [chi_adu_turn_transitions_ph, expected_set, None],
  135. ])))
  136. METRICS = pd.DataFrame(METRICS, columns=["callable", "set", "speaker"])
  137. super().__init__(project, METRICS, by=by, recordings=recordings,
  138. period=period, from_time=from_time, to_time=to_time,
  139. rec_cols=rec_cols, child_cols=child_cols, threads=threads)
  140. def get_metrics(project_path, metrics_file):
  141. project = ChildProject(project_path)
  142. project.read()
  143. metrics_path = os.path.dirname(metrics_file)
  144. if not os.path.exists(metrics_path):
  145. os.makedirs(metrics_path)
  146. logger.info('Created {}'.format(metrics_path))
  147. if not os.path.exists(metrics_file):
  148. new_recordings = set(project.recordings['recording_filename'])
  149. existing_metrics = None
  150. else:
  151. existing_metrics = pd.read_csv(metrics_file)
  152. new_recordings = set(project.recordings['recording_filename']) - set(existing_metrics['recording_filename'])
  153. if new_recordings:
  154. urumetrics = UruguayMetrics(project=project, recordings=list(new_recordings))
  155. metrics = urumetrics.extract()
  156. if isinstance(existing_metrics, pd.DataFrame):
  157. metrics = pd.concat([existing_metrics, metrics])
  158. metrics.to_csv(metrics_file, index=False)
  159. def main(project_path):
  160. # Check if running the script from the root of the data set
  161. expected_metrics_path = os.path.join(project_path, 'extra')
  162. assert os.path.exists(expected_metrics_path), \
  163. ValueError('Expected metrics ({}) path not found. Are you sure to be running this '
  164. 'command from the root of the data set?'.format(expected_metrics_path))
  165. metrics_file = os.path.join(project_path, 'extra', 'metrics', 'metrics.csv')
  166. get_metrics(project_path, metrics_file)
  167. logger.info('Saved to {}.'.format(metrics_file))
  168. def _parse_args(argv):
  169. import argparse
  170. parser = argparse.ArgumentParser(description='Create a ChildProject data set.')
  171. parser.add_argument('--project-path', required=False, type=str, default='',
  172. help="Path to a ChildProject/datalad project (useful for debugging purposes).")
  173. args = parser.parse_args(argv)
  174. return vars(args)
  175. if __name__ == '__main__':
  176. import sys
  177. pgrm_name, argv = sys.argv[0], sys.argv[1:]
  178. args = _parse_args(argv)
  179. logging.basicConfig(level=logging.INFO)
  180. try:
  181. main(**args)
  182. sys.exit(0)
  183. except Exception as e:
  184. print(e)
  185. sys.exit(1)