LAAC-LSCP
/
URUMETRICS-CODE


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
#!/usr/bin/env python
# -*- coding: utf8 -*-
# -----------------------------------------------------------------------------
#   File: compute_derived_annotations.py (as part of project URUMETRICS-CODE)
#   Created: 03/08/2022 17:32
#   Last Modified: 03/08/2022 17:32
# -----------------------------------------------------------------------------
#   Author: William N. Havard
#           Postdoctoral Researcher
#
#   Mail  : william.havard@ens.fr / william.havard@gmail.com
#
#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
#
# ------------------------------------------------------------------------------
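#   Description:
#       • Compute derived annotations for a ChildProject data set from
#         existing base annotation sets and save them as CSV files.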
# -----------------------------------------------------------------------------

import logging
import os
from functools import partial
import csv

import pandas as pd

from ChildProject.annotations import AnnotationManager
from ChildProject.projects import ChildProject

import annotations_functions

# Module-level logger; the logging level is only configured when this file is run as a script.
logger = logging.getLogger(__name__)


def _annotation_function_wrapper(func, parser_args, **kwargs):
    return partial(func,parser_args=parser_args, **kwargs)


def get_available_segments(project_path, set_name, base_sets, raw_recording_available=False):
    """
    Get the annotation segments that will be used to construct new annotations for a set.
    This returns the segments in the sets used (base_sets) to compute the new annotations (for set_name).
    This will exclude segments for which the annotations already exist in the target set.
    :param project_path: path to the dataset
    :type project_path: str
    :param set_name: set to which we will add new annotation files. segments of base sets are excluded if they
        already have annotation files in the target set (the name is lower-cased before the lookup)
    :type set_name: str
    :param base_sets: sets from which to get the segments
    :type base_sets: list[str]
    :param raw_recording_available: if True, exclude annotations for which the actual recording is not present
        (for when the process requires the audio)
    :type raw_recording_available: bool
    :return: segments of the base set annotations that were kept, as returned by AnnotationManager.get_segments
    :raises ValueError: if one of the base sets has no folder in the `annotations` directory of the dataset
    """
    project = ChildProject(project_path)
    am = AnnotationManager(project)
    am.read()

    annotations_root = os.path.join(project.path, 'annotations')
    for base_set in base_sets:
        # Explicit raise instead of `assert`: the check must survive `python -O` and
        # must surface as the ValueError that the message was wrapped in.
        if not os.path.exists(os.path.join(annotations_root, base_set)):
            raise ValueError('BASE_SET `{}` not found!'.format(base_set))

    # Keep the base set annotations whose converted file is actually present on disk.
    # The mask is built as an explicit boolean Series so that the selection stays a
    # row selection even when no annotation belongs to the base sets.
    base_sets_df = am.annotations[am.annotations['set'].isin(base_sets)]
    converted_exists = pd.Series(
        [os.path.exists(os.path.join(annotations_root, ann_set, 'converted', ann_filename))
         for ann_set, ann_filename in zip(base_sets_df['set'], base_sets_df['annotation_filename'])],
        index=base_sets_df.index, dtype=bool)
    available_base_sets_anns = base_sets_df[converted_exists]

    # Recordings that already have annotations in the target set are skipped, so that
    # annotations are only computed for the recordings that do not have them yet
    set_name = set_name.lower()
    if set_name in set(am.annotations['set']):
        target_set_anns = am.annotations[am.annotations['set'] == set_name]
        already_annotated = available_base_sets_anns['recording_filename'].isin(
            target_set_anns['recording_filename'])
        available_base_sets_anns = available_base_sets_anns[~already_annotated]

    # We check that the recording is available if the user wants
    if raw_recording_available:
        raw_recordings_root = os.path.join(project.path, 'recordings', 'raw')
        # astype(bool) keeps the mask boolean when there is no row left to test
        recording_exists = available_base_sets_anns['recording_filename'].apply(
            lambda fn: os.path.exists(os.path.join(raw_recordings_root, fn))).astype(bool)
        available_base_sets_anns = available_base_sets_anns[recording_exists]

    # Get the segments that are left
    return am.get_segments(available_base_sets_anns)


def _compute_annotations(project_path, annotation_type, annotation_function, base_sets, raw_recording_available):
    """
    Computes annotations for the ChildProject in directory project_path, of a specific set, from a list of sets
    :param project_path: path to ChildProject dataset
    :type project_path: str
    :param annotation_type: name of the set to compute for
    :type annotation_type: str
    :param annotation_function: callable that creates the annotations (stored in annotations_functions). It is
        called once per recording with the keywords `recording_filename`, `segments` and `project_path`
    :type annotation_function: callable
    :param base_sets: sets that are required to compute the new annotations
    :type base_sets: list[str]
    :param raw_recording_available: is the actual recording file needed
    :type raw_recording_available: bool
    :return: annotations (an empty DataFrame when there is nothing to compute)
    :rtype: pd.DataFrame
    """

    data = get_available_segments(project_path,
                                  set_name=annotation_type,
                                  base_sets=base_sets,
                                  raw_recording_available=raw_recording_available)
    if not len(data):
        return pd.DataFrame()

    # Segments without a speaker type are not used
    data = data[~data['speaker_type'].isnull()]

    annotations = [
        annotation_function(recording_filename=recording_filename, segments=recording_segments,
                            project_path=project_path)
        for recording_filename, recording_segments in data.groupby('recording_filename')
    ]

    # pd.concat raises ValueError on an empty list, which happens when every
    # segment was discarded by the speaker type filter above
    if not annotations:
        return pd.DataFrame()

    return pd.concat(annotations, axis=0)


def save_annotations(save_path, annotations, annotation_type):
    """
    Save the computed annotations, one CSV file per value of the `raw_filename` column.
    Files that already exist are never overwritten: a warning is logged and the file is skipped.
    :param save_path: path where to save the annotations (use annotation raw folder)
    :type save_path: str
    :param annotations: annotations to be saved, must have a `raw_filename` column
    :type annotations: pd.DataFrame
    :param annotation_type: annotation type, only used to name the raw file
    :type annotation_type: str
    :return: None
    :rtype: None
    """

    # Columns that are not written to the output files
    # (presumably bookkeeping columns of the ChildProject annotation index -- TODO confirm)
    excluded_columns = [
        'raw_filename',
        'set',
        'time_seek',
        'range_onset',
        'range_offset',
        'format',
        'filter',
        'annotation_filename',
        'imported_at',
        'package_version',
        'error',
        'merged_from',
    ]

    for raw_filename, raw_file_annotations in annotations.groupby('raw_filename'):
        output_filename = '{}_{}'.format(annotation_type.upper(), raw_filename.replace('.rttm', '.csv'))

        full_save_path = os.path.join(save_path, output_filename)
        if os.path.exists(full_save_path):
            logger.warning('File {} already exists! If you want to recompute annotations for this file, '
                           'please delete it first!'.format(full_save_path))
            continue

        # errors='ignore': a column missing from the frame must not abort the export with a KeyError
        raw_file_annotations = raw_file_annotations.drop(columns=excluded_columns, errors='ignore')

        raw_file_annotations.to_csv(full_save_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
        logger.info('Saved to {}.'.format(full_save_path))


def main(project_path, annotation_type, save_path, unknown_args):
    """
    Compute the annotations of type annotation_type for a data set and save them.
    :param project_path: path to the root of the data set (must contain `annotations` and `recordings`)
    :type project_path: str
    :param annotation_type: type of annotations to compute; the function `<annotation_type>_annotations`
        (lower-cased) of the annotations_functions module is used to compute them
    :type annotation_type: str
    :param save_path: existing directory where the annotations are saved
    :type save_path: str
    :param unknown_args: extra arguments forwarded to the annotation function as `parser_args`
    :type unknown_args: list[str]
    :return: None
    :rtype: None
    :raises ValueError: if project_path is not the root of a data set or if the annotation function is not found
    :raises IOError: if save_path does not exist
    """
    # Check if running the script from the root of the data set
    expected_annotation_path = os.path.join(project_path, 'annotations')
    expected_recordings_path = os.path.join(project_path, 'recordings')

    # Explicit raises instead of `assert`: the checks must survive `python -O` and
    # must surface as the exception types the messages were wrapped in.
    if not (os.path.exists(expected_annotation_path) and os.path.exists(expected_recordings_path)):
        raise ValueError('Expected annotation ({}) or recording path ({}) not found. Are you sure to be running this '
                         'command from the root of the data set?'.format(expected_annotation_path,
                                                                         expected_recordings_path))

    if not os.path.exists(os.path.abspath(save_path)):
        raise IOError('Path {} does not exist!'.format(save_path))

    function_name = '{}_annotations'.format(annotation_type.lower())
    if not hasattr(annotations_functions, function_name):
        raise ValueError('Annotation function {} not found.'.format(function_name))

    annotation_function = getattr(annotations_functions, function_name)
    # BASE_SETS is mandatory on the annotation function, RAW_RECORDING_AVAILABLE is optional
    annotation_function_base_sets = getattr(annotation_function, 'BASE_SETS')
    raw_recording_available = getattr(annotation_function, 'RAW_RECORDING_AVAILABLE', False)

    annotations = _compute_annotations(project_path=project_path,
                                       annotation_type=annotation_type,
                                       annotation_function=_annotation_function_wrapper(
                                           func=annotation_function,
                                           parser_args=unknown_args),
                                       base_sets=annotation_function_base_sets,
                                       raw_recording_available=raw_recording_available)

    if not len(annotations):
        logger.warning('Apparently nothing needs to be computed!')
        return

    save_annotations(save_path, annotations, annotation_type)


def _parse_args(argv):
    import argparse

    parser = argparse.ArgumentParser(description='Compute acoustic annotations.')
    parser.add_argument('--project-path', required=False, type=str, default='',
                        help="Path to a ChildProject/datalad project (useful for debugging purposes).")
    parser.add_argument('--annotation-type', required=True,
                        help='Which type of annotations should be computed.')
    parser.add_argument('--save-path', required=True,
                        help='Path were the annotations should be saved.')
    args, unknown_args = parser.parse_known_args(argv)

    return vars(args), unknown_args


if __name__ == '__main__':
    import sys

    # Everything after the program name is handed to the argument parser
    args, unknown_args = _parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)

    # Exit status: 0 on success, 1 if main raised (the traceback is logged)
    exit_status = 0
    try:
        main(unknown_args=unknown_args, **args)
    except Exception as e:
        logger.exception(e)
        exit_status = 1
    sys.exit(exit_status)