123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226 |
- #!/usr/bin/env python
- # -*- coding: utf8 -*-
- # -----------------------------------------------------------------------------
- # File: compute_derived_annotations.py (as part of project URUMETRICS-CODE)
- # Created: 03/08/2022 17:32
- # Last Modified: 03/08/2022 17:32
- # -----------------------------------------------------------------------------
- # Author: William N. Havard
- # Postdoctoral Researcher
- #
- # Mail : william.havard@ens.fr / william.havard@gmail.com
- #
- # Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
- #
- # ------------------------------------------------------------------------------
- # Description:
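- # • Computes derived annotations for a ChildProject data set from existing annotation sets (using the
- #   functions defined in annotations_functions) and saves them as one CSV file per raw annotation file.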
- # -----------------------------------------------------------------------------
- import logging
- import os
- from functools import partial
- import csv
- import pandas as pd
- from ChildProject.annotations import AnnotationManager
- from ChildProject.projects import ChildProject
- import annotations_functions
- logger = logging.getLogger(__name__)
def _annotation_function_wrapper(func, parser_args, **kwargs):
    """
    Pre-bind the parser arguments (and any extra keyword arguments) to an annotation function.

    :param func: annotation function to wrap
    :type func: callable
    :param parser_args: value bound to the ``parser_args`` keyword of ``func``
    :param kwargs: additional keyword arguments bound to ``func``
    :return: partial object calling ``func`` with the bound keywords
    :rtype: functools.partial
    """
    # ``parser_args`` goes first so the keyword order matches a direct partial(...) call
    bound_keywords = {'parser_args': parser_args}
    bound_keywords.update(kwargs)
    return partial(func, **bound_keywords)
def get_available_segments(project_path, set_name, base_sets, raw_recording_available=False):
    """
    Get the annotation segments that will be used to construct new annotations for a set.

    This returns the segments in the sets used (base_sets) to compute the new annotations (for set_name).
    This will exclude segments for which the annotations already exist in the target set.

    :param project_path: path to the dataset
    :type project_path: str
    :param set_name: set to which we will add new annotation files. segments of base sets are excluded if they
        already have annotation files in the target set
    :type set_name: str
    :param base_sets: sets from which to get the segments
    :type base_sets: list[str]
    :param raw_recording_available: if True, exclude annotations for which the actual recording is not present
        (for when the process requires the audio)
    :type raw_recording_available: bool
    :return: segments of the remaining base-set annotations, as returned by ``AnnotationManager.get_segments``
    :raises ValueError: if a base set has no directory under ``<project>/annotations``
    """
    project = ChildProject(project_path)
    am = AnnotationManager(project)
    am.read()

    annotations_root = os.path.join(project.path, 'annotations')

    # Raise explicitly instead of using ``assert``: assertions are removed under ``python -O`` and
    # would surface as AssertionError rather than the intended ValueError.
    for base_set in base_sets:
        if not os.path.exists(os.path.join(annotations_root, base_set)):
            raise ValueError('BASE_SET `{}` not found!'.format(base_set))

    # Keep only the base-set annotations whose converted file exists on disk.
    # The mask is built as an explicit boolean Series so that row selection also works when
    # the frame is empty (a row-wise ``apply`` does not return a boolean Series in that case).
    base_sets_df = am.annotations[am.annotations['set'].isin(base_sets)]
    converted_exists = pd.Series(
        [os.path.exists(os.path.join(annotations_root, ann_set, 'converted', ann_filename))
         for ann_set, ann_filename in zip(base_sets_df['set'], base_sets_df['annotation_filename'])],
        index=base_sets_df.index, dtype=bool)
    available_base_sets_anns = base_sets_df[converted_exists]

    # Skip the recordings that already have annotations in the target set
    if set_name in set(am.annotations['set']):
        target_set_anns = am.annotations[am.annotations['set'] == set_name]
        already_done = available_base_sets_anns['recording_filename'].isin(target_set_anns['recording_filename'])
        available_base_sets_anns = available_base_sets_anns[~already_done]

    # We check that the recording is available if the user wants
    if raw_recording_available:
        raw_recordings_root = os.path.join(project.path, 'recordings', 'raw')
        recording_exists = pd.Series(
            [os.path.exists(os.path.join(raw_recordings_root, recording_filename))
             for recording_filename in available_base_sets_anns['recording_filename']],
            index=available_base_sets_anns.index, dtype=bool)
        available_base_sets_anns = available_base_sets_anns[recording_exists]

    # Get the segments that are left
    return am.get_segments(available_base_sets_anns)
def _compute_annotations(project_path, annotation_type, annotation_function, base_sets, raw_recording_available):
    """
    Computes annotations for the ChildProject in directory project_path, of a specific set, from a list of sets

    The annotation function is called once per recording with the keyword arguments
    ``recording_filename``, ``segments`` and ``project_path``; the per-recording results are concatenated.

    :param project_path: path to ChildProject dataset
    :type project_path: str
    :param annotation_type: name of the set to compute for
    :type annotation_type: str
    :param annotation_function: callable that creates the annotations (stored in annotations_functions)
    :type annotation_function: callable
    :param base_sets: sets that are required to compute the new annotations
    :type base_sets: list[str]
    :param raw_recording_available: is the actual recording file needed
    :type raw_recording_available: bool
    :return: annotations (an empty DataFrame when there is nothing to compute)
    :rtype: pd.DataFrame
    """
    data = get_available_segments(project_path,
                                  set_name=annotation_type,
                                  base_sets=base_sets,
                                  raw_recording_available=raw_recording_available)

    if not len(data):
        return pd.DataFrame()

    # Discard the segments that have no speaker type
    data = data[data['speaker_type'].notnull()]

    annotations = [
        annotation_function(recording_filename=recording_filename,
                            segments=recording_segments,
                            project_path=project_path)
        for recording_filename, recording_segments in data.groupby('recording_filename')
    ]

    # pd.concat raises ValueError on an empty list, which happens when every segment
    # was discarded by the speaker type filter above.
    if not annotations:
        return pd.DataFrame()

    return pd.concat(annotations, axis=0)
def save_annotations(save_path, annotations, annotation_type):
    """
    Save the computed annotations, one CSV file per value of the ``raw_filename`` column.

    Each file is named ``<ANNOTATION_TYPE>_<raw_filename>``, with ``.rttm`` replaced by ``.csv`` in the
    raw filename. An existing file is never overwritten: it is skipped with a warning and the
    remaining files are still written.

    :param save_path: path where to save the annotations (use annotation raw folder)
    :type save_path: str
    :param annotations: annotations to be saved
    :type annotations: pd.DataFrame
    :param annotation_type: annotation type, only used to name the raw file
    :type annotation_type: str
    :return: None
    :rtype: None
    """
    # Same logger object as the module-level one (loggers are unique per name)
    log = logging.getLogger(__name__)

    # Columns removed before writing the CSV
    dropped_columns = [
        'raw_filename',
        'set',
        'time_seek',
        'range_onset',
        'range_offset',
        'format',
        'filter',
        'annotation_filename',
        'imported_at',
        'package_version',
        'error',
        'merged_from',
    ]

    for raw_filename, raw_file_annotations in annotations.groupby('raw_filename'):
        output_filename = '{}_{}'.format(annotation_type.upper(), raw_filename.replace('.rttm', '.csv'))
        full_save_path = os.path.join(save_path, output_filename)

        if os.path.exists(full_save_path):
            log.warning('File {} already exists! If you want to recompute annotations for this file, '
                        'please delete it first!'.format(full_save_path))
            # Skip only this file; returning here would silently leave the remaining files unwritten
            continue

        raw_file_annotations.drop(columns=dropped_columns).to_csv(
            full_save_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
        log.info('Saved to {}.'.format(full_save_path))
def main(project_path, annotation_type, save_path, unknown_args):
    """
    Compute the annotations of type ``annotation_type`` for a data set and save them in ``save_path``.

    The annotation function is looked up in ``annotations_functions`` under the name
    ``<annotation_type in lower case>_annotations``. Its ``BASE_SETS`` attribute gives the sets it is
    computed from and its optional ``RAW_RECORDING_AVAILABLE`` attribute (False when absent) tells
    whether the raw recordings must be present.

    :param project_path: path to the root of the data set (must contain ``annotations`` and ``recordings``)
    :type project_path: str
    :param annotation_type: type of annotations to compute
    :type annotation_type: str
    :param save_path: existing path where the annotations are saved
    :type save_path: str
    :param unknown_args: extra command-line arguments, passed to the annotation function as ``parser_args``
    :type unknown_args: list[str]
    :raises ValueError: if the data set directories or the annotation function cannot be found
    :raises IOError: if ``save_path`` does not exist
    :return: None
    :rtype: None
    """
    # Validation uses explicit raises (not ``assert``) so it still runs under ``python -O``
    # and reports the intended exception types.

    # Check if running the script from the root of the data set
    expected_annotation_path = os.path.join(project_path, 'annotations')
    expected_recordings_path = os.path.join(project_path, 'recordings')
    if not (os.path.exists(expected_annotation_path) and os.path.exists(expected_recordings_path)):
        raise ValueError('Expected annotation ({}) or recording path ({}) not found. Are you sure to be running this '
                         'command from the root of the data set?'.format(expected_annotation_path,
                                                                         expected_recordings_path))

    if not os.path.exists(os.path.abspath(save_path)):
        raise IOError('Path {} does not exist!'.format(save_path))

    function_name = '{}_annotations'.format(annotation_type.lower())
    if not hasattr(annotations_functions, function_name):
        raise ValueError('Annotation function {} not found.'.format(function_name))

    annotation_function = getattr(annotations_functions, function_name)
    annotation_function_base_sets = getattr(annotation_function, 'BASE_SETS')
    raw_recording_available = getattr(annotation_function, 'RAW_RECORDING_AVAILABLE', False)

    annotations = _compute_annotations(project_path=project_path,
                                       annotation_type=annotation_type,
                                       annotation_function=_annotation_function_wrapper(
                                           func=annotation_function,
                                           parser_args=unknown_args),
                                       base_sets=annotation_function_base_sets,
                                       raw_recording_available=raw_recording_available)

    if not len(annotations):
        logger.warning('Apparently nothing needs to be computed!')
        return

    save_annotations(save_path, annotations, annotation_type)
def _parse_args(argv):
    """
    Parse the command-line arguments of the script.

    Arguments that the parser does not know are not rejected; they are returned separately.

    :param argv: command-line arguments (without the program name)
    :type argv: list[str]
    :return: known arguments as a dict, and the list of unrecognised arguments
    :rtype: tuple[dict, list[str]]
    """
    import argparse

    # (flag, add_argument keyword arguments) for every option the script knows
    option_specs = (
        ('--project-path', dict(required=False, type=str, default='',
                                help="Path to a ChildProject/datalad project (useful for debugging purposes).")),
        ('--annotation-type', dict(required=True,
                                   help='Which type of annotations should be computed.')),
        ('--save-path', dict(required=True,
                             help='Path were the annotations should be saved.')),
    )

    cli = argparse.ArgumentParser(description='Compute acoustic annotations.')
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    known, extras = cli.parse_known_args(argv)
    return vars(known), extras
if __name__ == '__main__':
    import sys

    # Split the command line into the program name and its arguments
    pgrm_name = sys.argv[0]
    argv = sys.argv[1:]
    args, unknown_args = _parse_args(argv)

    logging.basicConfig(level=logging.INFO)

    # Exit status: 0 on success, 1 when main() raised (the exception is logged with its traceback)
    try:
        main(unknown_args=unknown_args, **args)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)
    else:
        sys.exit(0)
|