123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206 |
- #!/usr/bin/env python
- # -*- coding: utf8 -*-
- # -----------------------------------------------------------------------------
- # File: import_recordings.py (as part of project URUMETRICS)
- # Created: 20/05/2022 16:25
- # Last Modified: 20/05/2022 16:25
- # -----------------------------------------------------------------------------
- # Author: William N. Havard
- # Postdoctoral Researcher
- #
- # Mail : william.havard@ens.fr / william.havard@gmail.com
- #
- # Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
- #
- # ------------------------------------------------------------------------------
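- # Description:
- #   • Creates or updates the metadata files `recordings.csv` and `children.csv`
- #     from the WAV files found in the data set's `recordings/raw` directory.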
- # -----------------------------------------------------------------------------
- import csv
- import logging
- import os
- import sys
- from datetime import datetime
- import pandas as pd
- from ChildProject.utils import get_audio_duration
- from .consts import CHILDREN_DEFAULT_DOB
- from .utils import walk_dir
# Module-level logger, named after this module, used by the import functions below.
logger = logging.getLogger(__name__)
def _get_recordings(recordings_path):
    """
    Returns a DataFrame of all the recordings already imported, or an empty DataFrame if `recordings.csv`
    does not exist (or is a completely empty file).

    Only these two "nothing imported yet" situations fall back to an empty DataFrame. Any other failure
    (malformed CSV, unreadable file, ...) is propagated: the frame returned here is written back to
    `recordings.csv` by `import_recordings`, so silently replacing an unreadable file by an empty frame
    would overwrite the existing metadata.

    :param recordings_path: Path to the `recordings.csv` metadata file
    :type recordings_path: str
    :return: dataframe of already imported recordings or empty dataframe
    :rtype: pandas.DataFrame
    :raises pandas.errors.ParserError: if the file exists but cannot be parsed as CSV
    :raises OSError: if the file exists but cannot be read
    """
    try:
        return pd.read_csv(recordings_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No metadata yet: start from an empty table with the expected columns
        columns = ['experiment', 'experiment_stage', 'child_id', 'date_iso', 'start_time',
                   'recording_device_type', 'recording_filename', 'session_id']
        return pd.DataFrame(columns=columns)
def _get_children(children_path):
    """
    Returns a DataFrame of all the children already imported, or an empty DataFrame if `children.csv`
    does not exist (or is a completely empty file).

    Only these two "nothing imported yet" situations fall back to an empty DataFrame. Any other failure
    (malformed CSV, unreadable file, ...) is propagated: the frame returned here is written back to
    `children.csv` by `import_children`, so silently replacing an unreadable file by an empty frame
    would overwrite the existing metadata.

    :param children_path: Path to the `children.csv` metadata file
    :type children_path: str
    :return: dataframe of already imported children or empty dataframe
    :rtype: pandas.DataFrame
    :raises pandas.errors.ParserError: if the file exists but cannot be parsed as CSV
    :raises OSError: if the file exists but cannot be read
    """
    try:
        return pd.read_csv(children_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No metadata yet: start from an empty table with the expected columns
        columns = ['experiment', 'child_id', 'child_dob']
        return pd.DataFrame(columns=columns)
def _build_recording_metadata(recordings_path, recording, experiment, recording_device_type):
    """
    Build the metadata row of a single recording from its file name and its audio duration.

    The file name (extension removed) is split on `_` and read as
    `<child_id>[_<experiment_stage parts>]_<YYYYMMDD>_<HHMMSS>`. Whatever goes wrong (unexpected file
    name, invalid date/time, duration that cannot be read) is logged and the file is discarded.

    :param recordings_path: path to the directory storing the WAV files
    :type recordings_path: str
    :param recording: name of the WAV file
    :type recording: str
    :param experiment: name of the experiment the recording belongs to
    :type experiment: str
    :param recording_device_type: type of recording device used
    :type recording_device_type: str
    :return: metadata for the given file, or False if the file has to be discarded
    :rtype: dict or bool
    """
    stem = os.path.splitext(recording)[0]

    try:
        # First field is the child, the last two are date and time, anything in between is the stage
        child_id, *stage_parts, raw_date, raw_time = stem.split('_')

        date_iso = datetime.strptime(raw_date, '%Y%m%d').strftime('%Y-%m-%d')
        start_time = datetime.strptime(raw_time, '%H%M%S').strftime('%H:%M:%S')
        session_id = f'{child_id}_{raw_date}'

        audio_path = os.path.join(recordings_path, recording)
        # Scaled by 1000 and truncated to an integer (presumably seconds -> milliseconds; TODO confirm)
        duration = int(get_audio_duration(audio_path) * 1000)

        metadata = {
            'experiment': experiment,
            'experiment_stage': '_'.join(stage_parts),
            'child_id': child_id,
            'date_iso': date_iso,
            'start_time': start_time,
            'recording_device_type': recording_device_type,
            'recording_filename': recording,
            'session_id': session_id,
            'duration': duration,
            'imported_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
    except Exception as e:
        logger.error(f'{stem} raised an error. This file will be discarded. (Exception: {str(e)})')
        return False

    return metadata
def import_recordings(data_path, experiment, recording_device_type):
    """
    This function creates or updates the metadata file `recordings.csv`.

    Every WAV file found under `<data_path>/recordings/raw` whose name is not yet listed in
    `<data_path>/metadata/recordings.csv` gets a new row. Files whose metadata cannot be built are
    skipped. The CSV file is rewritten even when no new recording was found.

    :param data_path: Path to `dat/data_set` directory
    :type data_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type recording_device_type: str
    :return: None
    :rtype: None
    """
    recordings_metadata_path = os.path.join(data_path, 'metadata', 'recordings.csv')
    recordings = _get_recordings(recordings_metadata_path)
    recordings_count = len(recordings)

    recordings_path = os.path.join(data_path, 'recordings', 'raw')
    recording_file_list = walk_dir(recordings_path, ext='wav', return_full_path=False)

    # Set built once for O(1) membership tests instead of scanning the column for every file
    known_filenames = set(recordings['recording_filename'])
    new_rows = []
    for recording_file in recording_file_list:
        # Add new recordings only
        if recording_file in known_filenames:
            continue

        recording_metadata = _build_recording_metadata(recordings_path, recording_file,
                                                       experiment, recording_device_type)
        if not recording_metadata:
            continue

        new_rows.append(recording_metadata)
        # Keep the set in sync so a file name listed twice is only imported once
        known_filenames.add(recording_file)

    # Single concatenation: growing a DataFrame row by row copies it on every iteration
    if new_rows:
        recordings = pd.concat([recordings, pd.DataFrame(new_rows)], ignore_index=True)

    recordings.to_csv(recordings_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
    logger.info('{} new recordings imported ({} recordings altogether).'.format(len(recordings) - recordings_count,
                                                                              len(recordings)))
def import_children(data_path, experiment):
    """
    This function creates or updates the metadata file `children.csv`.

    Every child id found in `recordings.csv` that is not yet listed in `children.csv` gets a new row
    with `CHILDREN_DEFAULT_DOB` as date of birth. New children are appended in the order in which they
    first appear in `recordings.csv`, so the resulting file is the same from one run to the next.
    Recordings without a child id (missing value) do not create a child. The CSV file is rewritten
    even when no new child was found.

    :param data_path: Path to `dat/data_set` directory
    :type data_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :return: None
    :rtype: None
    """
    recordings_metadata_path = os.path.join(data_path, 'metadata', 'recordings.csv')
    children_metadata_path = os.path.join(data_path, 'metadata', 'children.csv')

    recordings = _get_recordings(recordings_metadata_path)
    children = _get_children(children_metadata_path)
    children_count = len(children)

    known_children = set(children['child_id'])
    # dropna: a missing id never compares equal to a known one and would be re-added on every run
    # drop_duplicates: keeps the first occurrence, which gives a deterministic order (unlike a set)
    candidate_ids = recordings['child_id'].dropna().drop_duplicates()
    new_rows = [
        {
            'experiment': experiment,
            'child_id': child_id,
            'child_dob': CHILDREN_DEFAULT_DOB,
        }
        for child_id in candidate_ids
        if child_id not in known_children
    ]

    # Single concatenation instead of one DataFrame copy per new child
    if new_rows:
        children = pd.concat([children, pd.DataFrame(new_rows)], ignore_index=True)

    children.to_csv(children_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
    logger.info('{} new children imported ({} children altogether).'.format(len(children) - children_count,
                                                                          len(children)))
def data_importation(data_path, experiment, recording_device_type):
    """
    Run the whole import: first bring `recordings.csv` up to date with the new recordings, then add
    to `children.csv` the children that are still missing from it.

    :param data_path: Path to `dat/data_set` directory
    :type data_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type recording_device_type: str
    :return: None
    :rtype: None
    """
    # Recordings go first: the children are derived from the content of `recordings.csv`
    import_recordings(data_path=data_path,
                      experiment=experiment,
                      recording_device_type=recording_device_type)
    import_children(data_path=data_path, experiment=experiment)
def main(experiment='URU22', recording_device_type='unknown'):
    """
    Import the data of the data set stored in `dat/data_set`, two directory levels above this file.

    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type recording_device_type: str
    :return: None
    :rtype: None
    """
    module_dir = os.path.dirname(__file__)
    relative_data_path = os.path.join(module_dir, '../..', 'dat', 'data_set')
    data_path = os.path.abspath(relative_data_path)

    data_importation(data_path, experiment, recording_device_type)
if __name__ == '__main__':
    # Script entry point: exit status 0 on success, 1 if the import raised an exception
    logging.basicConfig(level=logging.INFO)

    try:
        main()
    except Exception as e:
        print(e)
        sys.exit(1)
    else:
        sys.exit(0)
|