|
@@ -0,0 +1,229 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- coding: utf8 -*-
|
|
|
+
|
|
|
+# -----------------------------------------------------------------------------
|
|
|
+# File: import_annotations.py (as part of project URUMETRICS)
|
|
|
+# Created: 23/05/2022 11:28
|
|
|
+# Last Modified: 23/05/2022 11:28
|
|
|
+# -----------------------------------------------------------------------------
|
|
|
+# Author: William N. Havard
|
|
|
+# Postdoctoral Researcher
|
|
|
+#
|
|
|
+# Mail : william.havard@ens.fr / william.havard@gmail.com
|
|
|
+#
|
|
|
+# Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
|
|
|
+#
|
|
|
+# ------------------------------------------------------------------------------
|
|
|
+# Description:
|
|
|
+# •
|
|
|
+# -----------------------------------------------------------------------------
|
|
|
+
|
|
|
+# !usr/bin/env python
|
|
|
+# -*- coding: utf8 -*-
|
|
|
+
|
|
|
+#
|
|
|
+# Author: William N. Havard (based on various files by Lucas Gautheron)
|
|
|
+#
|
|
|
+
|
|
|
+import logging
|
|
|
+import os
|
|
|
+import sys
|
|
|
+
|
|
|
+import pandas as pd
|
|
|
+from ChildProject.annotations import AnnotationManager
|
|
|
+from ChildProject.projects import ChildProject
|
|
|
+
|
|
|
+from .consts import ANNOTATION_TYPES
|
|
|
+from .utils import get_raw_filename
|
|
|
+from .acoustic_converter import AcousticConverter # /!\ Do not remove
|
|
|
+
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+pd.set_option('mode.chained_assignment', None) # Silences pandas' complaints
|
|
|
+
|
|
|
+
|
|
|
def _filter_missing_annotation_files(path_project, input):
    """
    Checks whether some annotation files are missing or not and returns only the rows for which the annotation
    file could be found.

    The expected location of an annotation file is `<path_project>/annotations/<set>/raw/<raw_filename>`.
    A boolean `exists` column is added to `input` (in place) and is therefore also part of the returned rows.
    When at least one file is missing, a warning listing the (sorted) expected paths is logged.

    :param path_project: path to ChildProject project directory
    :type path_project: str
    :param input: dataframe containing the list of recordings and their corresponding annotation file
                  (the `set` and `raw_filename` columns are read)
    :type input: pandas.DataFrame
    :return: dataframe containing only the rows for which an annotation file was found
    :rtype: pandas.DataFrame
    """
    def _expected_path(row):
        return os.path.join(path_project, 'annotations', row['set'], 'raw', row['raw_filename'])

    # The paths are computed with a plain loop rather than `DataFrame.apply(..., axis=1)`: on an empty frame
    # `apply` hands back a DataFrame instead of a Series, which cannot be stored in the single `exists` column.
    expected_paths = [_expected_path(row) for _, row in input.iterrows()]
    input['exists'] = pd.Series([os.path.exists(path) for path in expected_paths],
                                index=input.index, dtype=bool)

    existing_annotations = input[input['exists']]

    missing_annotations_path = sorted(
        path for path, found in zip(expected_paths, input['exists']) if not found)
    if missing_annotations_path:
        missing_message = "Some annotations you expected to have are missing.\n" \
                          "Check whether these annotations exist and if so, if their expected path " \
                          "reflect their true path.\n\t - {}".format('\n\t - '.join(missing_annotations_path))
        logger.warning(missing_message)
    return existing_annotations
|
|
|
+
|
|
|
+
|
|
|
def _check_importation(am, imported, expected_ann_number):
    """
    Checks whether the importation was carried out successfully or not. Returns the number of annotation files
    imported and the number of segments they contain.

    A warning is logged when the number of imported annotation files differs from `expected_ann_number`, and
    another one when `am.get_segments(imported)` yields no segment at all.

    :param am: ChildProject annotation manager object
    :type am: ChildProject.AnnotationManager
    :param imported: DataFrame object containing the annotation files that were successfully imported
    :type imported: pandas.DataFrame
    :param expected_ann_number: expected number of annotation files that should have been imported
    :type expected_ann_number: int
    :return: number of imported annotations, number of imported segments
    :rtype: tuple of int
    """
    imported_ann_number = len(imported)
    if imported_ann_number != expected_ann_number:
        # Argument order matters: the expected count comes first, the count actually found second.
        logger.warning('Expected to import {} annotations, only found {}!'.format(
            expected_ann_number, imported_ann_number))

    annotations_segments = am.get_segments(imported)
    segment_number = len(annotations_segments)
    if segment_number == 0:
        logger.warning('Annotations were imported, but they either contain no segments '
                       'or the segments were not imported properly!')

    return imported_ann_number, segment_number
|
|
|
+
|
|
|
+
|
|
|
def _get_recordings(project, annotation_set, annotation_format):
    """
    Builds the importation table from the recordings of `project`.

    Rows of `project.recordings` with a missing `recording_filename`, `duration` or `child_id`, as well as rows
    whose `recording_filename` is the string 'NA', are discarded. Each remaining recording is tagged with the
    annotation set and format to import, and is set up so that its annotations cover it entirely
    (`time_seek` = 0, `range_onset` = 0, `range_offset` = duration of the recording).

    :param project: project object exposing a `recordings` dataframe
    :type project: ChildProject.projects.ChildProject
    :param annotation_set: set of annotation to import
    :type annotation_set: str
    :param annotation_format: format of the annotations that will be imported
    :type annotation_format: str
    :return: dataframe with the columns `recording_filename`, `child_id`, `set`, `format`, `time_seek`,
             `range_onset` and `range_offset`
    :rtype: pandas.DataFrame
    """
    wanted_columns = ['recording_filename', 'duration', 'child_id']
    recordings = project.recordings[wanted_columns].dropna()
    recordings = recordings[recordings['recording_filename'] != 'NA']

    # Column names are given through a dict so that they stay plain strings
    importation_columns = {
        'set': annotation_set,
        'format': annotation_format,
        'time_seek': 0,
        'range_onset': 0,
        'range_offset': recordings['duration'],
    }
    return recordings.assign(**importation_columns).drop(columns=['duration'])
|
|
|
+
|
|
|
+
|
|
|
def _build_raw_filename(input, annotation_format, filename='', extension=''):
    """
    Build the expected annotation filename path containing the annotation corresponding to a given recording.

    The `raw_filename` column (and, for VTC/VCM/ALICE/acoustic annotations, the `filter` column) is added to
    `input` in place; the same dataframe is returned.

    :param input: DataFrame containing a list of recordings (the `recording_filename` column is read)
    :type input: pandas.DataFrame
    :param annotation_format: format of the annotation that will be imported (a trailing `_rttm` is ignored)
    :type annotation_format: str
    :param filename: filename of the annotation file where to look for the annotations for a specific recording.
                     Only used for VTC/VCM/ALICE/acoustic annotations; when empty, the name is derived from
                     the recording filename
    :type filename: str
    :param extension: file extension of the annotation file, with or without its leading dot. When empty, the
                      first item registered for the format in `ANNOTATION_TYPES` is used
    :type extension: str
    :return: dataframe containing the expected name the annotation file should have
    :rtype: pandas.DataFrame
    :raises ValueError: if the annotation format is not handled
    """
    # Set up 'raw_filename' and 'filter' depending on the annotation set to import
    annotation_format = annotation_format.removesuffix('_rttm')

    # Extensions obtained through `os.path.splitext` keep their leading dot. It is stripped here as the dot is
    # already part of the filename template below (otherwise names such as `recording..its` would be built).
    annotation_format_extension = extension.lstrip('.') if extension \
        else ANNOTATION_TYPES.asdict()[annotation_format.upper()][0]

    def _default_raw_filename(recording_filename):
        return '{}.{}'.format(get_raw_filename(recording_filename), annotation_format_extension)

    if annotation_format in ('vtc', 'vcm', 'alice', 'acoustic'):
        # Annotations have the same name as the recording filename. Update if it's not the case for you.
        input['raw_filename'] = filename if filename \
            else input['recording_filename'].apply(_default_raw_filename)
        # We only keep lines for which the 'file' column is equal to filter
        input['filter'] = input['recording_filename'].apply(lambda f: os.path.basename(get_raw_filename(f)))
    elif annotation_format in ('cha', 'its', 'eaf'):
        # CHA/ITS/EAF files do not need filtering as they only contain annotations for the file they are linked to
        input['raw_filename'] = input['recording_filename'].apply(_default_raw_filename)
    else:
        raise ValueError('Unknown annotation format `{}`!'.format(annotation_format))

    return input
|
|
|
+
|
|
|
+
|
|
|
def import_annotations(data_path):
    """
    Imports all the new annotations files that are found in the `annotations` directory.

    Every sub-directory of `<data_path>/annotations` whose (upper-cased) name is a key of `ANNOTATION_TYPES` is
    considered, and the files found in its `raw` sub-directory are imported through the annotation manager.
    Annotation files already listed in `<data_path>/metadata/annotations.csv` are not imported again.

    :param data_path: Path to a ChildProject project directory
    :type data_path: str
    :return: None
    :rtype: None
    """
    annotation_path = os.path.join(data_path, 'annotations')
    annotation_metadata_path = os.path.join(data_path, 'metadata', 'annotations.csv')
    merge_keys = ['recording_filename', 'set', 'raw_filename']

    # Load project
    project = ChildProject(data_path)
    am = AnnotationManager(project)
    am.read()

    known_annotation_types = ANNOTATION_TYPES.asdict()

    for annotation_type in os.listdir(annotation_path):
        if annotation_type.upper() not in known_annotation_types:
            continue

        # A known annotation type without a `raw` directory has nothing to import: skip it instead of crashing
        annotation_raw_files = os.path.join(annotation_path, annotation_type, 'raw')
        if not os.path.isdir(annotation_raw_files):
            logger.warning('No raw annotation directory found at {}, skipping!'.format(annotation_raw_files))
            continue

        # Neither the set nor the format depend on the raw file being processed
        annotation_set = annotation_type.lower()
        annotation_format = known_annotation_types[annotation_set.upper()][0]

        for annotation_raw_file in os.listdir(annotation_raw_files):
            _, extension = os.path.splitext(annotation_raw_file)

            # Get recordings and set up df
            input = _get_recordings(project, annotation_set, annotation_format)

            # Build raw file names
            input = _build_raw_filename(input, annotation_format, annotation_raw_file, extension)

            # Filter out rows for which we do not find the matching annotation file
            input = _filter_missing_annotation_files(data_path, input)

            # We make sure we remove annotation files from the DataFrame if they were already previously imported.
            # The metadata file is read again for each raw file as it may have changed since the last importation.
            if os.path.exists(annotation_metadata_path):
                already_imported_metadata = pd.read_csv(annotation_metadata_path)
                already_imported_metadata = already_imported_metadata[merge_keys]

                # Drop annotation files that were already imported
                input = (input.merge(already_imported_metadata, how='left', indicator=True, on=merge_keys)
                         .loc[lambda x: x['_merge'] == 'left_only']
                         .drop(columns="_merge"))

            # Do importation
            if len(input) > 0:
                imported = am.import_annotations(input)

                expected_ann_number = len(input)
                len_ann, len_seg = _check_importation(am, imported, expected_ann_number)
                logger.info('Imported {} new annotation files resulting '
                            'in {} new {} segments!'.format(len_ann, len_seg, annotation_type))
            else:
                logger.warning('Nothing to import for annotation type {}!'.format(annotation_format))
|
|
|
+
|
|
|
+
|
|
|
def main():
    """
    Runs the importation of the annotations on the data set stored in `dat/data_set`, two directory levels
    above the directory containing this file.

    :return: None
    :rtype: None
    """
    module_directory = os.path.dirname(__file__)
    data_set_directory = os.path.join(module_directory, '../..', 'dat', 'data_set')
    import_annotations(os.path.abspath(data_set_directory))
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    try:
        main()
    except Exception:
        # Top-level boundary: log the exception together with its traceback (a bare `print` would only show
        # its message) and signal the failure through the exit status.
        logger.exception('The importation of the annotations failed!')
        sys.exit(1)

    sys.exit(0)
|