#!/usr/bin/env python
# -*- coding: utf8 -*-
# Source repository: LAAC-LSCP / URUMETRICS-CODE

# -----------------------------------------------------------------------------
#   File: import_recordings.py (as part of project URUMETRICS)
#   Created: 20/05/2022 16:25
#   Last Modified: 20/05/2022 16:25
# -----------------------------------------------------------------------------
#   Author: William N. Havard
#           Postdoctoral Researcher
#
#   Mail  : william.havard@ens.fr / william.havard@gmail.com
#  
#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
#
# ------------------------------------------------------------------------------
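#   Description:
#       • Imports the WAV recordings found under `recordings/raw` into `metadata/recordings.csv`
#         and registers every child seen in those recordings in `metadata/children.csv`.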
# -----------------------------------------------------------------------------

import csv
import logging
import os
from datetime import datetime

import pandas as pd
from ChildProject.utils import get_audio_duration

from consts import CHILDREN_DEFAULT_DOB
from utils import walk_dir

logger = logging.getLogger(__name__)


def _get_recordings(recordings_path):
    """
    Returns a DataFrame of all the recordings already imported or an empty DataFrame if `recordings.csv` does not
    exist
    :param recordings_path: Path to the `recordings.csv` metadata file
    :type recordings_path: str
    :return: dataframe of already imported recordings or empty dataframe
    :rtype: pandas.DataFrame
    """
    try:
        data = pd.read_csv(recordings_path)
        #TODO, check that data has wanted columns?
    except:
        columns = ['experiment', 'experiment_stage', 'child_id', 'date_iso', 'start_time',
                   'recording_device_type', 'recording_filename', 'session_id']
        data = pd.DataFrame(columns=columns)
    return data


def _get_children(children_path):
    """
    Returns a DataFrame of all the children already imported or an empty DataFrame if `children.csv` does not
    exist
    :param recordings_path: Path to the `children.csv` metadata file
    :type children_path: str
    :return: dataframe of already imported children or empty dataframe
    :rtype: pandas.DataFrame
    """
    try:
        data = pd.read_csv(children_path)
        #TODO, check that data has wanted columns?
    except:
        columns = ['experiment', 'child_id', 'child_dob']
        data = pd.DataFrame(columns=columns)
    return data


#ac2pl
def _get_correspondance(correspondance_path):
    """
    Returns a DataFrame of correspondances across child ID (phone numbers) or an empty DataFrame if `correspondance.csv` does not
    exist
    :param recordings_path: Path to the `correspondance.csv` metadata file
    :type correspondance_path: str
    :return: dataframe of correspondances across child ID (phone numbers) or empty dataframe
    :rtype: pandas.DataFrame
    """
    try:
        data = pd.read_csv(correspondance_path)
    except:
        columns = ['new_number', 'original_number']
        data = pd.DataFrame(columns=columns)

    # Change to string
    data = data.astype({cname:'string' for cname in data.columns})

    return dict(data.values.tolist())


def _build_recording_metadata(recordings_path, recording, experiment, recording_device_type, correspondance):
    """
    Build the metadata row (date, time, duration, etc.) describing one WAV file.

    The file name, stripped of its directory and extension, is split on `_`: the first chunk is the child ID,
    the last two chunks are the date (`YYYYMMDD`) and the start time (`HHMMSS`), and whatever lies in between
    is joined back with `_` to form the experiment stage.
    :param recordings_path: path to the directory storing the WAV files
    :type recordings_path: str
    :param recording: name of the WAV file
    :type recording: str
    :param experiment: name of the experiment the recording belongs to
    :type experiment: str
    :param recording_device_type: type of recording device used
    :type recording_device_type: str
    :param correspondance: mapping used to substitute the child ID read from the file name (IDs missing from
                           the mapping are kept unchanged)
    :type correspondance: dict
    :return: metadata for the given file, or False when the file could not be processed
    :rtype: dict or bool
    """
    raw_filename = os.path.splitext(os.path.basename(recording))[0]
    try:
        id_chunk, *stage_chunks, date_chunk, time_chunk = raw_filename.split('_')

        # Formatting the ID into a prefixed string guarantees it is never handled as a number later on
        child_id = f'chi_{correspondance.get(id_chunk, id_chunk)}'

        parsed_date = datetime.strptime(date_chunk, '%Y%m%d')
        parsed_time = datetime.strptime(time_chunk, '%H%M%S')

        # `get_audio_duration` result scaled by 1000 and truncated (milliseconds, assuming it returns seconds)
        wav_path = os.path.join(recordings_path, recording)
        duration_ms = int(get_audio_duration(wav_path) * 1000)

        metadata = {
            'experiment': experiment,
            'experiment_stage': '_'.join(stage_chunks),
            'child_id': child_id,
            'date_iso': parsed_date.strftime('%Y-%m-%d'),
            'start_time': parsed_time.strftime('%H:%M:%S'),
            'recording_device_type': recording_device_type,
            'recording_filename': recording,
            'session_id': f'{child_id}_{date_chunk}',
            'duration': duration_ms,
            'imported_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
    except Exception as e:
        # Any failure (unexpected file name, unreadable audio, ...) only discards this one file
        logger.error(f'{raw_filename} raised an error. This file will be discarded. (Exception: {str(e)})')
        return False

    return metadata


def import_recordings(project_path, experiment, recording_device_type):
    """
    This function creates or updates the metadata file `recordings.csv`.

    Every WAV file found under `<project_path>/recordings/raw` that is not listed yet in
    `<project_path>/metadata/recordings.csv` is described and appended to it. Files whose metadata cannot be
    built are skipped.
    :param project_path: Path to `dat/data_set` directory
    :type project_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type  recording_device_type: str
    :return: None
    :rtype: None
    """
    recordings_metadata_path = os.path.join(project_path, 'metadata', 'recordings.csv')
    correspondance_metadata_path = os.path.join(project_path, 'metadata', 'correspondance.csv')

    recordings = _get_recordings(recordings_metadata_path)
    recordings_count = len(recordings)

    correspondance = _get_correspondance(correspondance_metadata_path)

    recordings_path = os.path.join(project_path, 'recordings', 'raw')
    recording_file_list = walk_dir(recordings_path, ext='wav', return_full_path=False)

    # Built once so that the "already imported?" test does not rescan the whole column for every file
    known_filenames = set(recordings['recording_filename'])
    new_rows = []

    for recording_file in recording_file_list:
        # Add new recordings only
        if recording_file in known_filenames:
            continue

        recording_metadata = _build_recording_metadata(recordings_path, recording_file,
                                                       experiment, recording_device_type,
                                                       correspondance)
        if not recording_metadata:
            continue

        new_rows.append(recording_metadata)
        known_filenames.add(recording_file)

    # Single concatenation instead of one per file (which copies the whole table at every iteration)
    if new_rows:
        recordings = pd.concat([recordings, pd.DataFrame(new_rows)], ignore_index=True, axis=0)

    # The column is absent when there was no metadata file yet and nothing could be imported
    if 'duration' in recordings.columns:
        recordings['duration'] = recordings['duration'].astype(int)

    recordings.to_csv(recordings_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
    logger.info('{} new recordings imported ({} recordings altogether).'.format(len(recordings) - recordings_count,
                                                                                len(recordings)))


def import_children(project_path, experiment):
    """
    This function creates or updates the metadata file `children.csv`.

    Every child ID found in `recordings.csv` but not listed yet in `children.csv` is appended to the latter,
    with `CHILDREN_DEFAULT_DOB` as date of birth. New children are added in the order in which they first
    appear in `recordings.csv`, so that the resulting file is identical from one run to the next.
    :param project_path: Path to `dat/data_set` directory
    :type project_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :return: None
    :rtype: None
    """
    recordings_metadata_path = os.path.join(project_path, 'metadata', 'recordings.csv')
    children_metadata_path = os.path.join(project_path, 'metadata', 'children.csv')

    recordings = _get_recordings(recordings_metadata_path)
    children = _get_children(children_metadata_path)
    children_count = len(children)

    known_children = set(children['child_id'])

    # `unique()` keeps the order of first appearance (iterating over a set would not be reproducible)
    new_rows = [
        {
            'experiment': experiment,
            'child_id': child_id,
            'child_dob': CHILDREN_DEFAULT_DOB,
        }
        for child_id in recordings['child_id'].unique()
        if child_id not in known_children
    ]

    # Single concatenation instead of one per child
    if new_rows:
        children = pd.concat([children, pd.DataFrame(new_rows)], ignore_index=True, axis=0)

    children.to_csv(children_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
    logger.info('{} new children imported ({} children altogether).'.format(len(children) - children_count,
                                                                            len(children)))


def data_importation(project_path, experiment, recording_device_type):
    """
    Run the whole importation: first `import_recordings`, then `import_children`.

    The order matters: the children step reads the `recordings.csv` file written by the recordings step.
    :param project_path: Path to `dat/data_set` directory
    :type project_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type recording_device_type: str
    :return: None
    :rtype: None
    """
    # Step 1: update `recordings.csv` with the recordings not imported yet
    import_recordings(project_path=project_path,
                      experiment=experiment,
                      recording_device_type=recording_device_type)

    # Step 2: update `children.csv` with the children found in the recordings
    import_children(project_path=project_path, experiment=experiment)


def main(project_path, experiment, recording_device_type='unknown'):
    """
    Import recordings to the current ChildProject dataset
    :param project_path: path to the root of the data set (must contain `recordings/raw` and `metadata`)
    :type project_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of recording device used to capture the audio
    :type recording_device_type: str
    :return: None
    :rtype: None
    :raises ValueError: if `recordings/raw` or `metadata` cannot be found under `project_path`
    """

    # Check if running the script from the root of the data set
    expected_recording_path = os.path.join(project_path, 'recordings', 'raw')
    expected_metadata_path = os.path.join(project_path, 'metadata')

    # Explicit raise rather than `assert`: assertions are removed when Python runs with -O, which would
    # silently disable this check
    if not (os.path.exists(expected_recording_path) and os.path.exists(expected_metadata_path)):
        raise ValueError('Expected recording ({}) and metadata ({}) path not found. Are you sure to be running this '
                         'command from the root of the data set?'.format(expected_recording_path,
                                                                         expected_metadata_path))

    data_importation(project_path, experiment, recording_device_type)


def _parse_args(argv):
    import argparse

    parser = argparse.ArgumentParser(description='Import recordings to a ChildProject data set.')
    parser.add_argument('--project-path', required=False, type=str, default='',
                        help="Path to a ChildProject/datalad project (useful for debugging purposes).")
    parser.add_argument('--experiment', required=True, type=str,
                        help='Name of the experiments.')
    parser.add_argument('--recording-device-type', required=False, type=str, default='unknown',
                        help="Type of recording device used to record the audio files.")
    args = parser.parse_args(argv)

    return vars(args)


if __name__ == '__main__':
    import sys

    # Script entry point: everything after the program name is handed over to the argument parser
    cli_options = _parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)

    # Exit status: 0 when the importation went through, 1 (with the traceback logged) otherwise
    try:
        main(**cli_options)
    except Exception as error:
        logger.exception(error)
        sys.exit(1)
    else:
        sys.exit(0)