#!/usr/bin/env python
# -*- coding: utf8 -*-

# -----------------------------------------------------------------------------
#   File: import_recordings.py (as part of project URUMETRICS)
#   Created: 20/05/2022 16:25
#   Last Modified: 20/05/2022 16:25
# -----------------------------------------------------------------------------
#   Author: William N. Havard
#           Postdoctoral Researcher
#
#   Mail  : william.havard@ens.fr / william.havard@gmail.com
#
#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
#
# ------------------------------------------------------------------------------
#   Description:
#       • Creates or updates the `recordings.csv` and `children.csv` metadata
#         files from the WAV files found under `recordings/raw`.
# -----------------------------------------------------------------------------

import csv
import logging
import os
import sys
from datetime import datetime

import pandas as pd
from ChildProject.utils import get_audio_duration

from .consts import CHILDREN_DEFAULT_DOB
from .utils import walk_dir

logger = logging.getLogger(__name__)

# Columns used when a metadata file has to be created from scratch.
_RECORDINGS_COLUMNS = ['experiment', 'experiment_stage', 'child_id', 'date_iso',
                       'start_time', 'recording_device_type',
                       'recording_filename', 'session_id']
_CHILDREN_COLUMNS = ['experiment', 'child_id', 'child_dob']


def _read_metadata(metadata_path, columns):
    """
    Read a metadata CSV file, or build an empty DataFrame if there is nothing
    to read yet

    Only a missing or completely empty file triggers the fallback. Any other
    failure (e.g. a malformed CSV) is propagated: silently falling back to an
    empty DataFrame would make the callers overwrite the existing metadata
    file and lose its content.

    :param metadata_path: Path to the metadata file
    :type metadata_path: str
    :param columns: column names of the empty DataFrame used as a fallback
    :type columns: list
    :return: content of the metadata file or empty dataframe
    :rtype: pandas.DataFrame
    """
    try:
        return pd.read_csv(metadata_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.DataFrame(columns=columns)


def _get_recordings(recordings_path):
    """
    Returns a DataFrame of all the recordings already imported or a empty
    DataFrame if `recordings.csv` does not exist (or is an empty file)

    :param recordings_path: Path to the `recordings.csv` metadata file
    :type recordings_path: str
    :return: dataframe of already imported recordings or empty dataframe
    :rtype: pandas.DataFrame
    """
    return _read_metadata(recordings_path, _RECORDINGS_COLUMNS)


def _get_children(children_path):
    """
    Returns a DataFrame of all the children already imported or a empty
    DataFrame if `children.csv` does not exist (or is an empty file)

    :param children_path: Path to the `children.csv` metadata file
    :type children_path: str
    :return: dataframe of already imported children or empty dataframe
    :rtype: pandas.DataFrame
    """
    return _read_metadata(children_path, _CHILDREN_COLUMNS)


def _build_recording_metadata(recordings_path, recording, experiment,
                              recording_device_type):
    """
    Return the metadata corresponding to a given file (date, time, duration,
    etc.)

    The file name (without extension) is split on `_` and is expected to look
    like `<child_id>[_<experiment_stage>...]_<YYYYMMDD>_<HHMMSS>`. Every
    component found between the child ID and the date is joined back with `_`
    to form the experiment stage (possibly an empty string).

    :param recordings_path: path to the directory storing the WAV files
    :type recordings_path: str
    :param recording: name of the WAV file
    :type recording: str
    :param experiment: name of the experiment the recording belongs to
    :type experiment: str
    :param recording_device_type: type of recording device used
    :type recording_device_type: str
    :return: metadata for the given file, or False if the file name could not
        be parsed or the audio duration could not be read (the error is logged)
    :rtype: dict or bool
    """
    raw_filename, _ = os.path.splitext(recording)

    # Deliberately broad: a single unreadable or badly named file must not
    # abort the whole import, it is logged and skipped by the caller.
    try:
        child_id, *experiment_stage, date_iso_, start_time_ = raw_filename.split('_')
        date_iso = datetime.strptime(date_iso_, '%Y%m%d').strftime('%Y-%m-%d')
        start_time = datetime.strptime(start_time_, '%H%M%S').strftime('%H:%M:%S')
        # NOTE(review): presumably get_audio_duration returns seconds, which
        # would make `duration` milliseconds -- confirm against ChildProject.
        duration = int(get_audio_duration(os.path.join(recordings_path, recording)) * 1000)
    except Exception as e:
        logger.error('%s raised an error. This file will be discarded. (Exception: %s)',
                     raw_filename, e)
        return False

    return {
        'experiment': experiment,
        'experiment_stage': '_'.join(experiment_stage),
        'child_id': child_id,
        'date_iso': date_iso,
        'start_time': start_time,
        'recording_device_type': recording_device_type,
        'recording_filename': recording,
        # The session ID uses the raw (YYYYMMDD) date, not the ISO one
        'session_id': '{}_{}'.format(child_id, date_iso_),
        'duration': duration,
        'imported_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }


def import_recordings(data_path, experiment, recording_device_type):
    """
    This function creates or update the metadata file `recordings.csv`

    Recordings whose file name is already listed in `recordings.csv` are left
    untouched; files whose metadata cannot be built are skipped. The metadata
    file is (re)written even when no new recording was found.

    :param data_path: Path to `dat/data_set` directory
    :type data_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type recording_device_type: str
    :return: None
    :rtype: None
    """
    recordings_metadata_path = os.path.join(data_path, 'metadata', 'recordings.csv')
    recordings = _get_recordings(recordings_metadata_path)
    recordings_count = len(recordings)

    recordings_path = os.path.join(data_path, 'recordings', 'raw')
    recording_file_list = walk_dir(recordings_path, ext='wav', return_full_path=False)

    # Set membership instead of scanning the DataFrame column for every file
    known_filenames = set(recordings['recording_filename'])
    new_recordings = []
    for recording_file in recording_file_list:
        if recording_file in known_filenames:
            continue

        recording_metadata = _build_recording_metadata(recordings_path, recording_file,
                                                       experiment, recording_device_type)
        if not recording_metadata:
            continue

        new_recordings.append(recording_metadata)
        # A file name listed twice by walk_dir is only imported once
        known_filenames.add(recording_file)

    # Single concatenation rather than one (quadratic) concatenation per file
    if new_recordings:
        recordings = pd.concat([recordings, pd.DataFrame(new_recordings)],
                               ignore_index=True)

    recordings.to_csv(recordings_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
    logger.info('%d new recordings imported (%d recordings altogether).',
                len(recordings) - recordings_count, len(recordings))


def import_children(data_path, experiment):
    """
    This function creates or update the metadata file `children.csv`

    Every child ID found in `recordings.csv` but absent from `children.csv`
    is added with `CHILDREN_DEFAULT_DOB` as date of birth. The metadata file
    is (re)written even when no child was added.

    :param data_path: Path to `dat/data_set` directory
    :type data_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :return: None
    :rtype: None
    """
    recordings_metadata_path = os.path.join(data_path, 'metadata', 'recordings.csv')
    children_metadata_path = os.path.join(data_path, 'metadata', 'children.csv')

    recordings = _get_recordings(recordings_metadata_path)
    children = _get_children(children_metadata_path)
    children_count = len(children)

    missing_children = set(recordings['child_id']) - set(children['child_id'])

    # Sorted so that the rows appended to `children.csv` come out in the same
    # order on every run (set iteration order is not stable for strings);
    # `key=str` keeps the sort safe should the IDs be of mixed types.
    new_children = [
        {
            'experiment': experiment,
            'child_id': child_id,
            'child_dob': CHILDREN_DEFAULT_DOB,
        }
        for child_id in sorted(missing_children, key=str)
    ]

    # Single concatenation rather than one concatenation per child
    if new_children:
        children = pd.concat([children, pd.DataFrame(new_children)], ignore_index=True)

    children.to_csv(children_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
    logger.info('%d new children imported (%d children altogether).',
                len(children) - children_count, len(children))


def data_importation(data_path, experiment, recording_device_type):
    """
    This functions imports new recordings and updates `recordings.csv` and
    updates `children.csv` if necessary.

    Recordings are imported first because the list of children is derived
    from the content of `recordings.csv`.

    :param data_path: Path to `dat/data_set` directory
    :type data_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type recording_device_type: str
    :return: None
    :rtype: None
    """
    import_recordings(data_path, experiment, recording_device_type)
    import_children(data_path, experiment)


def main(experiment='URU22', recording_device_type='unknown'):
    """
    Run the importation on the `dat/data_set` directory located two levels
    above the directory of this file

    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type recording_device_type: str
    :return: None
    :rtype: None
    """
    data_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../..', 'dat', 'data_set'))
    data_importation(data_path, experiment, recording_device_type)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    try:
        main()
    except Exception:
        # Top-level boundary: log the full traceback, then signal the failure
        # through the exit status.
        logger.exception('Data importation failed')
        sys.exit(1)
    sys.exit(0)