import_recordings.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. #!/usr/bin/env python
  2. # -*- coding: utf8 -*-
  3. # -----------------------------------------------------------------------------
  4. # File: import_recordings.py (as part of project URUMETRICS)
  5. # Created: 20/05/2022 16:25
  6. # Last Modified: 20/05/2022 16:25
  7. # -----------------------------------------------------------------------------
  8. # Author: William N. Havard
  9. # Postdoctoral Researcher
  10. #
  11. # Mail : william.havard@ens.fr / william.havard@gmail.com
  12. #
  13. # Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
  14. #
  15. # ------------------------------------------------------------------------------
  16. # Description:
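  17. # • Creates or updates the `recordings.csv` and `children.csv` metadata files from the raw WAV recordings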
  18. # -----------------------------------------------------------------------------
  19. import csv
  20. import logging
  21. import os
  22. import sys
  23. from datetime import datetime
  24. import pandas as pd
  25. from ChildProject.utils import get_audio_duration
  26. from .consts import CHILDREN_DEFAULT_DOB
  27. from .utils import walk_dir
  28. logger = logging.getLogger(__name__)
  29. def _get_recordings(recordings_path):
  30. """
  31. Returns a DataFrame of all the recordings already imported or a empty DataFrame if `recordings.csv` does not
  32. exist
  33. :param recordings_path: Path to the `recordings.csv` metadata file
  34. :type recordings_path: str
  35. :return: dataframe of already imported recordings or empty dataframe
  36. :rtype: pandas.DataFrame
  37. """
  38. try:
  39. data = pd.read_csv(recordings_path)
  40. except:
  41. columns = ['experiment', 'experiment_stage', 'child_id', 'date_iso', 'start_time',
  42. 'recording_device_type', 'recording_filename', 'session_id']
  43. data = pd.DataFrame(columns=columns)
  44. return data
  45. def _get_children(children_path):
  46. """
  47. Returns a DataFrame of all the children already imported or a empty DataFrame if `children.csv` does not
  48. exist
  49. :param recordings_path: Path to the `children.csv` metadata file
  50. :type children_path: str
  51. :return: dataframe of already imported children or empty dataframe
  52. :rtype: pandas.DataFrame
  53. """
  54. try:
  55. data = pd.read_csv(children_path)
  56. except:
  57. columns = ['experiment', 'child_id', 'child_dob']
  58. data = pd.DataFrame(columns=columns)
  59. return data
  60. def _build_recording_metadata(recordings_path, recording, experiment, recording_device_type):
  61. """
  62. Return the metadata corresponding to a given file (date, time, duration, etc.)
  63. :param recordings_path: path to the directory storing the WAV files
  64. :type recordings_path: str
  65. :param recording: name of the WAV file
  66. :type recording: str
  67. :param experiment: name of the experiment the recording belongs to
  68. :type experiment: str
  69. :param recording_device_type: type of recording device used
  70. :type recording_device_type: str
  71. :return: metadata for the given file (possibly none)
  72. :rtype: dict or bool
  73. """
  74. raw_filename, _ = os.path.splitext(recording)
  75. try:
  76. child_id, *experiment_stage, date_iso_, start_time_ = raw_filename.split('_')
  77. date_iso = datetime.strptime(date_iso_, '%Y%m%d').strftime('%Y-%m-%d')
  78. start_time = datetime.strptime(start_time_, '%H%M%S').strftime('%H:%M:%S')
  79. session_id = '{}_{}'.format(child_id, date_iso_)
  80. duration = int(get_audio_duration(os.path.join(recordings_path, recording)) * 1000)
  81. return {'experiment': experiment,
  82. 'experiment_stage': '_'.join(experiment_stage),
  83. 'child_id': child_id,
  84. 'date_iso': date_iso,
  85. 'start_time': start_time,
  86. 'recording_device_type': recording_device_type,
  87. 'recording_filename': recording,
  88. 'session_id': session_id,
  89. 'duration': duration,
  90. 'imported_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
  91. }
  92. except Exception as e:
  93. logger.error(f'{raw_filename} raised an error. This file will be discarded. (Exception: {str(e)})')
  94. return False
  95. def import_recordings(data_path, experiment, recording_device_type):
  96. """
  97. This function creates or update the metadata file `recordings.csv`
  98. :param data_path: Path to `dat/data_set` directory:
  99. :type data_path: str
  100. :param experiment: name of the experiment
  101. :type experiment: str
  102. :param recording_device_type: type of device used to record the data
  103. :type recording_device_type: str
  104. :return: None
  105. :rtype: None
  106. """
  107. recordings_metadata_path = os.path.join(data_path, 'metadata', 'recordings.csv')
  108. recordings = _get_recordings(recordings_metadata_path)
  109. recordings_count = len(recordings)
  110. recordings_path = os.path.join(data_path, 'recordings', 'raw')
  111. recording_file_list = walk_dir(recordings_path, ext='wav', return_full_path=False)
  112. for recording_file in recording_file_list:
  113. if recording_file in recordings['recording_filename'].values: continue
  114. recording_metadata = _build_recording_metadata(recordings_path, recording_file,
  115. experiment, recording_device_type)
  116. # Add new recordings only
  117. if not recording_metadata:
  118. continue
  119. else:
  120. recordings = pd.concat([recordings, pd.DataFrame.from_dict([recording_metadata])], ignore_index=True)
  121. recordings.to_csv(recordings_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
  122. logger.info('{} new recordings imported ({} recordings altogether).'.format(len(recordings) - recordings_count,
  123. len(recordings)))
  124. def import_children(data_path, experiment):
  125. """
  126. This function creates or update the metadata file `children.csv`
  127. :param data_path: Path to `dat/data_set` directory
  128. :type data_path: str
  129. :param experiment: name of the experiment
  130. :type experiment: str
  131. :return: None
  132. :rtype: None
  133. """
  134. recordings_metadata_path = os.path.join(data_path, 'metadata', 'recordings.csv')
  135. children_metadata_path = os.path.join(data_path, 'metadata', 'children.csv')
  136. recordings = _get_recordings(recordings_metadata_path)
  137. children = _get_children(children_metadata_path)
  138. children_count = len(children)
  139. child_id_recordings = set(recordings['child_id'])
  140. missing_children = child_id_recordings - set(children['child_id'])
  141. for child_id in missing_children:
  142. child_metadata = {
  143. 'experiment': experiment,
  144. 'child_id': child_id,
  145. 'child_dob': CHILDREN_DEFAULT_DOB
  146. }
  147. children = pd.concat([children, pd.DataFrame.from_dict([child_metadata])], ignore_index=True)
  148. children.to_csv(children_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
  149. logger.info('{} new children imported ({} children altogether).'.format(len(children) - children_count,
  150. len(children)))
  151. def data_importation(data_path, experiment, recording_device_type):
  152. """
  153. This functions imports new recordings and updates `recordings.csv` and updates `children.csv` if necessary.
  154. :param data_path: Path to `dat/data_set` directory
  155. :type data_path: str
  156. :param experiment: name of the experiment
  157. :type experiment: str
  158. :param recording_device_type: type of device used to record the data
  159. :type recording_device_type: str
  160. :return: None
  161. :rtype: None
  162. """
  163. import_recordings(data_path, experiment, recording_device_type)
  164. import_children(data_path, experiment)
  165. def main(experiment='URU22', recording_device_type='unknown'):
  166. data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..', 'dat', 'data_set'))
  167. data_importation(data_path, experiment, recording_device_type)
  168. if __name__ == '__main__':
  169. logging.basicConfig(level=logging.INFO)
  170. try:
  171. main()
  172. sys.exit(0)
  173. except Exception as e:
  174. print(e)
  175. sys.exit(1)