#!/usr/bin/env python
# -*- coding: utf8 -*-

# -----------------------------------------------------------------------------
#   File: compute_derived_annotations.py (as part of project URUMETRICS-CODE)
#   Created: 03/08/2022 17:32
#   Last Modified: 03/08/2022 17:32
# -----------------------------------------------------------------------------
#   Author: William N. Havard
#           Postdoctoral Researcher
#
#   Mail  : william.havard@ens.fr / william.havard@gmail.com
#
#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
#
# ------------------------------------------------------------------------------
#   Description:
#       • Compute a new annotation set from the segments of existing annotation
#         sets of a ChildProject dataset and save the result as CSV files.
# -----------------------------------------------------------------------------

import csv
import logging
import os
from functools import partial

import pandas as pd
from ChildProject.annotations import AnnotationManager
from ChildProject.projects import ChildProject

import annotations_functions

logger = logging.getLogger(__name__)


def _annotation_function_wrapper(func, parser_args, **kwargs):
    """
    Pre-bind the extra command line arguments (and any other keyword argument) to an annotation function

    :param func: annotation function to wrap
    :type func: callable
    :param parser_args: arguments that were not consumed by the main argument parser
    :type parser_args: list[str]
    :param kwargs: additional keyword arguments bound to the function
    :return: partial object that forwards parser_args and kwargs to func
    :rtype: functools.partial
    """
    return partial(func, parser_args=parser_args, **kwargs)


def _boolean_mask(frame, flags):
    """
    Build a boolean Series aligned on the index of frame

    The dtype is forced to bool so that an empty mask still selects rows (and not columns) when used to index
    the frame.

    :param frame: DataFrame (or Series) the mask will be applied to
    :type frame: pd.DataFrame
    :param flags: one truth value per row of frame
    :type flags: iterable[bool]
    :return: boolean mask sharing the index of frame
    :rtype: pd.Series
    """
    return pd.Series(list(flags), index=frame.index, dtype=bool)


def get_available_segments(project_path, set_name, base_sets, raw_recording_available=False):
    """
    Get the annotation segments that will be used to construct new annotations for a set.
    This returns the segments in the sets used (base_sets) to compute the new annotations (for set_name).
    This will exclude segments for which the annotations already exist in the target set.

    :param project_path: path to the dataset
    :type project_path: str
    :param set_name: set to which we will add new annotation files. segments of base sets are excluded if they
        already have annotation files in the target set
    :type set_name: str
    :param base_sets: sets from which to get the segments
    :type base_sets: list[str]
    :param raw_recording_available: if True, exclude annotations for which the actual recording is not present
        (for when the process requires the audio)
    :type raw_recording_available: bool
    :return: segments of the remaining annotations, as returned by AnnotationManager.get_segments
    :rtype: pd.DataFrame
    :raises AssertionError: if the directory of one of the base sets does not exist
    """
    project = ChildProject(project_path)
    am = AnnotationManager(project)
    am.read()

    for base_set in base_sets:
        assert os.path.exists(os.path.join(project.path, 'annotations', base_set)), \
            ValueError('BASE_SET `{}` not found!'.format(base_set))

    # Keep the annotations of the base sets whose converted file actually exists on disk
    base_sets_df = am.annotations[am.annotations['set'].isin(base_sets)]
    converted_exists = _boolean_mask(base_sets_df, (
        os.path.exists(os.path.join(project.path, 'annotations', ann_set, 'converted', ann_filename))
        for ann_set, ann_filename in zip(base_sets_df['set'], base_sets_df['annotation_filename'])
    ))
    available_base_sets_anns = base_sets_df[converted_exists]

    # Recordings that already have annotations in the target set are skipped, so that only the missing
    # annotations get computed
    if set_name in set(am.annotations['set']):
        target_set_anns = am.annotations[am.annotations['set'] == set_name]
        already_done = available_base_sets_anns['recording_filename'].isin(target_set_anns['recording_filename'])
        available_base_sets_anns = available_base_sets_anns[~already_done]

    # We check that the recording is available if the user wants
    if raw_recording_available:
        recording_exists = _boolean_mask(available_base_sets_anns, (
            os.path.exists(os.path.join(project.path, 'recordings', 'raw', recording_filename))
            for recording_filename in available_base_sets_anns['recording_filename']
        ))
        available_base_sets_anns = available_base_sets_anns[recording_exists]

    # Get the segments that are left
    return am.get_segments(available_base_sets_anns)


def _compute_annotations(project_path, annotation_type, annotation_function, base_sets, raw_recording_available):
    """
    Computes annotations for the ChildProject in directory project_path, of a specific set, from a list of sets

    :param project_path: path to ChildProject dataset
    :type project_path: str
    :param annotation_type: name of the set to compute for
    :type annotation_type: str
    :param annotation_function: callable that creates the annotations (stored in annotations_functions)
    :type annotation_function: callable
    :param base_sets: sets that are required to compute the new annotations
    :type base_sets: list[str]
    :param raw_recording_available: is the actual recording file needed
    :type raw_recording_available: bool
    :return: annotations (empty DataFrame when there is nothing to compute)
    :rtype: pd.DataFrame
    """
    data = get_available_segments(project_path, set_name=annotation_type, base_sets=base_sets,
                                  raw_recording_available=raw_recording_available)
    if not len(data):
        return pd.DataFrame()

    # Segments without a speaker type are ignored
    data = data[~data['speaker_type'].isnull()]

    # The annotation function is called once per recording, with all the segments of that recording
    annotations = [
        annotation_function(recording_filename=recording_filename,
                            segments=recording_segments,
                            project_path=project_path)
        for recording_filename, recording_segments in data.groupby('recording_filename')
    ]

    # pd.concat raises ValueError on an empty list, which happens when every segment was filtered out above
    if not annotations:
        return pd.DataFrame()

    return pd.concat(annotations, axis=0)


def save_annotations(save_path, annotations, annotation_type):
    """
    Save the computed annotations

    One CSV file is written per value of the `raw_filename` column. Files that already exist are never
    overwritten: a warning is logged and the corresponding group is skipped.

    :param save_path: path where to save the annotations (use annotation raw folder)
    :type save_path: str
    :param annotations: annotations to be saved
    :type annotations: pd.DataFrame
    :param annotation_type: annotation type, only used to name the raw file
    :type annotation_type: str
    :return: None
    :rtype: None
    """
    # Bookkeeping columns that are removed from the annotations before they are written to disk
    dropped_columns = ['raw_filename',
                       'set',
                       'time_seek',
                       'range_onset',
                       'range_offset',
                       'format',
                       'filter',
                       'annotation_filename',
                       'imported_at',
                       'package_version',
                       'error',
                       'merged_from',
                       ]

    for annotation_group_name, annotation_group_data in annotations.groupby('raw_filename'):
        output_filename = '{}_{}'.format(annotation_type.upper(), annotation_group_name.replace('.rttm', '.csv'))
        full_save_path = os.path.join(save_path, output_filename)

        if os.path.exists(full_save_path):
            logger.warning('File {} already exists! If you want to recompute annotations for this file, '
                           'please delete it first!'.format(full_save_path))
            # Skip only this file: returning here would leave the remaining groups unwritten, depending on
            # the iteration order
            continue

        annotation_group_data = annotation_group_data.drop(columns=dropped_columns)
        annotation_group_data.to_csv(full_save_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
        logger.info('Saved to {}.'.format(full_save_path))


def main(project_path, annotation_type, save_path, unknown_args):
    """
    Compute the annotations of type annotation_type for a dataset and save them in save_path

    The annotation function is looked up in the `annotations_functions` module under the name
    `<annotation_type in lower case>_annotations`. Its `BASE_SETS` attribute gives the sets it is computed from
    and its optional `RAW_RECORDING_AVAILABLE` attribute (default False) tells whether the raw recordings are
    required.

    :param project_path: path to the dataset (its `annotations` and `recordings` directories must exist)
    :type project_path: str
    :param annotation_type: type of the annotations to compute
    :type annotation_type: str
    :param save_path: existing directory where the annotations are saved
    :type save_path: str
    :param unknown_args: remaining command line arguments, forwarded to the annotation function as parser_args
    :type unknown_args: list[str]
    :return: None
    :rtype: None
    :raises AssertionError: if the dataset directories, the save path or the annotation function is not found
    """
    # Check if running the script from the root of the data set
    expected_annotation_path = os.path.join(project_path, 'annotations')
    expected_recordings_path = os.path.join(project_path, 'recordings')

    assert os.path.exists(expected_annotation_path) and os.path.exists(expected_recordings_path), \
        ValueError('Expected annotation ({}) or recording path ({}) not found. Are you sure to be running this '
                   'command from the root of the data set?'.format(expected_annotation_path,
                                                                   expected_recordings_path))

    assert os.path.exists(os.path.abspath(save_path)), IOError('Path {} does not exist!'.format(save_path))

    annotation_function_name = '{}_annotations'.format(annotation_type.lower())
    assert hasattr(annotations_functions, annotation_function_name), \
        ValueError('Annotation function {}_annotations not found.'.format(annotation_type.lower()))

    annotation_function = getattr(annotations_functions, annotation_function_name)
    annotation_function_base_sets = getattr(annotation_function, 'BASE_SETS')
    raw_recording_available = getattr(annotation_function, 'RAW_RECORDING_AVAILABLE', False)

    annotations = _compute_annotations(project_path=project_path,
                                       annotation_type=annotation_type,
                                       annotation_function=_annotation_function_wrapper(
                                           func=annotation_function,
                                           parser_args=unknown_args),
                                       base_sets=annotation_function_base_sets,
                                       raw_recording_available=raw_recording_available)

    if not len(annotations):
        logger.warning('Apparently nothing needs to be computed!')
        return

    save_annotations(save_path, annotations, annotation_type)


def _parse_args(argv):
    """
    Parse the command line arguments

    :param argv: command line arguments, without the program name
    :type argv: list[str]
    :return: known arguments (as a dictionary) and the list of arguments that were not recognised
    :rtype: tuple[dict, list[str]]
    """
    import argparse

    parser = argparse.ArgumentParser(description='Compute acoustic annotations.')
    parser.add_argument('--project-path', required=False, type=str, default='',
                        help="Path to a ChildProject/datalad project (useful for debugging purposes).")
    parser.add_argument('--annotation-type', required=True,
                        help='Which type of annotations should be computed.')
    parser.add_argument('--save-path', required=True,
                        help='Path were the annotations should be saved.')
    args, unknown_args = parser.parse_known_args(argv)

    return vars(args), unknown_args


if __name__ == '__main__':
    import sys

    args, unknown_args = _parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)

    # Top-level boundary: any failure is logged with its traceback and turned into a non-zero exit code
    try:
        main(unknown_args=unknown_args, **args)
        sys.exit(0)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)