import_recordings.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. #!/usr/bin/env python
  2. # -*- coding: utf8 -*-
  3. # -----------------------------------------------------------------------------
  4. # File: import_recordings.py (as part of project URUMETRICS)
  5. # Created: 20/05/2022 16:25
  6. # Last Modified: 20/05/2022 16:25
  7. # -----------------------------------------------------------------------------
  8. # Author: William N. Havard
  9. # Postdoctoral Researcher
  10. #
  11. # Mail : william.havard@ens.fr / william.havard@gmail.com
  12. #
  13. # Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
  14. #
  15. # ------------------------------------------------------------------------------
  16. # Description:
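  17. # • Imports new WAV recordings into a ChildProject data set by creating or updating the `recordings.csv` and `children.csv` metadata files.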
  18. # -----------------------------------------------------------------------------
  19. import csv
  20. import logging
  21. import os
  22. from datetime import datetime
  23. import pandas as pd
  24. from ChildProject.utils import get_audio_duration
  25. from consts import CHILDREN_DEFAULT_DOB
  26. from utils import walk_dir
  27. logger = logging.getLogger(__name__)
  28. def _get_recordings(recordings_path):
  29. """
  30. Returns a DataFrame of all the recordings already imported or an empty DataFrame if `recordings.csv` does not
  31. exist
  32. :param recordings_path: Path to the `recordings.csv` metadata file
  33. :type recordings_path: str
  34. :return: dataframe of already imported recordings or empty dataframe
  35. :rtype: pandas.DataFrame
  36. """
  37. try:
  38. data = pd.read_csv(recordings_path)
  39. #TODO, check that data has wanted columns?
  40. except:
  41. columns = ['experiment', 'experiment_stage', 'child_id', 'date_iso', 'start_time',
  42. 'recording_device_type', 'recording_filename', 'session_id']
  43. data = pd.DataFrame(columns=columns)
  44. return data
  45. def _get_children(children_path):
  46. """
  47. Returns a DataFrame of all the children already imported or an empty DataFrame if `children.csv` does not
  48. exist
  49. :param recordings_path: Path to the `children.csv` metadata file
  50. :type children_path: str
  51. :return: dataframe of already imported children or empty dataframe
  52. :rtype: pandas.DataFrame
  53. """
  54. try:
  55. data = pd.read_csv(children_path)
  56. #TODO, check that data has wanted columns?
  57. except:
  58. columns = ['experiment', 'child_id', 'child_dob']
  59. data = pd.DataFrame(columns=columns)
  60. return data
  61. #ac2pl
  62. def _get_correspondance(correspondance_path):
  63. """
  64. Returns a DataFrame of correspondances across child ID (phone numbers) or an empty DataFrame if `correspondance.csv` does not
  65. exist
  66. :param recordings_path: Path to the `correspondance.csv` metadata file
  67. :type correspondance_path: str
  68. :return: dataframe of correspondances across child ID (phone numbers) or empty dataframe
  69. :rtype: pandas.DataFrame
  70. """
  71. try:
  72. data = pd.read_csv(correspondance_path)
  73. #TODO, check that data has wanted columns?
  74. except:
  75. columns = ['number', 'original']
  76. data = pd.DataFrame(columns=columns)
  77. return data
  78. def _build_recording_metadata(recordings_path, recording, experiment, recording_device_type):
  79. """
  80. Return the metadata corresponding to a given file (date, time, duration, etc.)
  81. :param recordings_path: path to the directory storing the WAV files
  82. :type recordings_path: str
  83. :param recording: name of the WAV file
  84. :type recording: str
  85. :param experiment: name of the experiment the recording belongs to
  86. :type experiment: str
  87. :param recording_device_type: type of recording device used
  88. :type recording_device_type: str
  89. :return: metadata for the given file (possibly none)
  90. :rtype: dict or bool
  91. """
  92. raw_filename, _ = os.path.splitext(os.path.basename(recording))
  93. try:
  94. child_id_, *experiment_stage, date_iso_, start_time_ = raw_filename.split('_')
  95. child_id = 'chi_{}'.format(child_id_) # coerce ID to be a string (prevent later mistakes)
  96. date_iso = datetime.strptime(date_iso_, '%Y%m%d').strftime('%Y-%m-%d')
  97. start_time = datetime.strptime(start_time_, '%H%M%S').strftime('%H:%M:%S')
  98. session_id = '{}_{}'.format(child_id, date_iso_)
  99. duration = int(get_audio_duration(os.path.join(recordings_path, recording)) * 1000)
  100. return {'experiment': experiment,
  101. 'experiment_stage': '_'.join(experiment_stage),
  102. 'child_id': child_id,
  103. 'date_iso': date_iso,
  104. 'start_time': start_time,
  105. 'recording_device_type': recording_device_type,
  106. 'recording_filename': recording,
  107. 'session_id': session_id,
  108. 'duration': duration,
  109. 'imported_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
  110. }
  111. except Exception as e:
  112. logger.error(f'{raw_filename} raised an error. This file will be discarded. (Exception: {str(e)})')
  113. return False
  114. def import_recordings(project_path, experiment, recording_device_type):
  115. """
  116. This function creates or update the metadata file `recordings.csv`
  117. :param project_path: Path to `dat/data_set` directory:
  118. :type project_path: str
  119. :param experiment: name of the experiment
  120. :type experiment: str
  121. :param recording_device_type: type of device used to record the data
  122. :type recording_device_type: str
  123. :return: None
  124. :rtype: None
  125. """
  126. recordings_metadata_path = os.path.join(project_path, 'metadata', 'recordings.csv')
  127. recordings = _get_recordings(recordings_metadata_path)
  128. recordings_count = len(recordings)
  129. recordings_path = os.path.join(project_path, 'recordings', 'raw')
  130. recording_file_list = walk_dir(recordings_path, ext='wav', return_full_path=False)
  131. for recording_file in recording_file_list:
  132. if recording_file in recordings['recording_filename'].values: continue
  133. recording_metadata = _build_recording_metadata(recordings_path, recording_file,
  134. experiment, recording_device_type)
  135. # Add new recordings only
  136. if not recording_metadata:
  137. continue
  138. else:
  139. recordings = pd.concat([recordings, pd.DataFrame.from_dict([recording_metadata])], ignore_index=True, axis=0)
  140. recordings['duration'] = recordings['duration'].astype(int)
  141. recordings.to_csv(recordings_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
  142. logger.info('{} new recordings imported ({} recordings altogether).'.format(len(recordings) - recordings_count,
  143. len(recordings)))
  144. def import_children(project_path, experiment):
  145. """
  146. This function creates or update the metadata file `children.csv`
  147. :param project_path: Path to `dat/data_set` directory
  148. :type project_path: str
  149. :param experiment: name of the experiment
  150. :type experiment: str
  151. :return: None
  152. :rtype: None
  153. """
  154. recordings_metadata_path = os.path.join(project_path, 'metadata', 'recordings.csv')
  155. children_metadata_path = os.path.join(project_path, 'metadata', 'children.csv')
  156. correspondance_metadata_path = os.path.join(project_path, 'metadata', 'correspondance.csv') #ac2lp
  157. recordings = _get_recordings(recordings_metadata_path)
  158. children = _get_children(children_metadata_path)
  159. children_count = len(children)
  160. children = _get_correspondance(correspondance_metadata_path) #ac2pl
  161. child_id_recordings = set(recordings['child_id'])
  162. missing_children = child_id_recordings - set(children['child_id'])
  163. for child_id in missing_children:
  164. # ac2lp check if children may not correspond to another telephone number
  165. # add: if child_id in correspondance['original'] then redefine child_id as the right cell in correspondance['original'] else what follows
  166. child_metadata = {
  167. 'experiment': experiment,
  168. 'child_id': child_id,
  169. 'child_dob': CHILDREN_DEFAULT_DOB
  170. }
  171. children = pd.concat([children, pd.DataFrame.from_dict([child_metadata])], ignore_index=True, axis=0)
  172. children.to_csv(children_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
  173. logger.info('{} new children imported ({} children altogether).'.format(len(children) - children_count,
  174. len(children)))
  175. def data_importation(project_path, experiment, recording_device_type):
  176. """
  177. This functions imports new recordings and updates `recordings.csv` and updates `children.csv` if necessary.
  178. :param project_path: Path to `dat/data_set` directory
  179. :type project_path: str
  180. :param experiment: name of the experiment
  181. :type experiment: str
  182. :param recording_device_type: type of device used to record the data
  183. :type recording_device_type: str
  184. :return: None
  185. :rtype: None
  186. """
  187. import_recordings(project_path, experiment, recording_device_type)
  188. import_children(project_path, experiment)
  189. def main(project_path, experiment, recording_device_type='unknown'):
  190. """
  191. Import recordings to the current ChildProject dataset
  192. :param experiment: name of the experiment
  193. :type experiment: str
  194. :param recording_device_type: type of recording device used to capture the audio
  195. :type recording_device_type: str
  196. :return: None
  197. :rtype: None
  198. """
  199. # Check if running the script from the root of the data set
  200. expected_recording_path = os.path.join(project_path, 'recordings', 'raw')
  201. expected_metadata_path = os.path.join(project_path, 'metadata')
  202. assert os.path.exists(expected_recording_path) and os.path.exists(expected_metadata_path), \
  203. ValueError('Expected recording ({}) and metadata ({}) path not found. Are you sure to be running this '
  204. 'command from the root of the data set?'.format(expected_recording_path, expected_metadata_path))
  205. data_importation(project_path, experiment, recording_device_type)
  206. def _parse_args(argv):
  207. import argparse
  208. parser = argparse.ArgumentParser(description='Import recordings to a ChildProject data set.')
  209. parser.add_argument('--project-path', required=False, type=str, default='',
  210. help="Path to a ChildProject/datalad project (useful for debugging purposes).")
  211. parser.add_argument('--experiment', required=True, type=str,
  212. help='Name of the experiments.')
  213. parser.add_argument('--recording-device-type', required=False, type=str, default='unknown',
  214. help="Type of recording device used to record the audio files.")
  215. args = parser.parse_args(argv)
  216. return vars(args)
  217. if __name__ == '__main__':
  218. import sys
  219. pgrm_name, argv = sys.argv[0], sys.argv[1:]
  220. args = _parse_args(argv)
  221. logging.basicConfig(level=logging.INFO)
  222. try:
  223. main(**args)
  224. sys.exit(0)
  225. except Exception as e:
  226. logger.exception(e)
  227. sys.exit(1)