"""Import retrieved classifications as annotations into the VanDam dataset.

The script reads a CSV of classifications (one row per answer), keeps a single
answer per (recording, onset, offset, task) by majority vote, pivots the tasks
into columns, writes one raw CSV per recording under ``annotations/zoo/raw``
and, when run as a script, re-imports the ``zoo`` annotation set.
"""

import argparse
import os

import pandas as pd

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

parser = argparse.ArgumentParser(
    description='import retrieved classifications as annotations into VanDam'
)
parser.add_argument('classifications', help='classifications file')
args = parser.parse_args()

# load VanDam dataset
project = ChildProject('vandam-data')
am = AnnotationManager(project)

# read classifications
classifications = pd.read_csv(args.classifications)


# recover the majority choice
def majority_choice(values):
    """Return the most frequent value of the Series ``values``.

    A single-element Series returns its only element. When several values are
    tied for the highest count, one of them is drawn at random with
    ``Series.sample`` (unseeded, so the pick is not reproducible across runs).
    """
    if len(values) == 1:
        return values.iloc[0]

    counts = values.value_counts(sort=True)
    # value_counts() is indexed by the answer values themselves, so the top
    # count must be read by position: ``counts[0]`` would be a label lookup on
    # an integer index and a deprecated positional fallback on a string index.
    counts = counts[counts == counts.iloc[0]]

    # return the majority choice if it is unambiguous
    # otherwise, pick randomly among the candidates
    if len(counts) <= 1:
        return counts.index[0]
    return counts.sample(1).index[0]


classifications = (
    classifications
    .groupby(['recording_filename', 'onset', 'offset', 'task_id'])
    .agg(answer=('answer', majority_choice))
).reset_index()

# combine all tasks into one row per chunk
classifications = classifications.pivot(
    index=['recording_filename', 'onset', 'offset'],
    columns=['task_id'],
    values=['answer']
)

# rename the columns: the pivot yields ('answer', <task_id>) column tuples,
# which are flattened to 'answer_<task_id>' before being given final names
classifications.columns = [
    '_'.join(col).strip() for col in classifications.columns.values
]
classifications.reset_index(inplace=True)
classifications.rename(columns={
    'onset': 'segment_onset',
    'offset': 'segment_offset',
    'answer_T1': 'speaker_age',
    'answer_T2': 'speaker_gender',
    'answer_T3': 'vcm_type'
}, inplace=True)

# standardized vcm_type (answers absent from this mapping become NaN)
classifications['vcm_type'] = classifications['vcm_type'].map({
    'Crying': 'Y',
    'Laughing': 'L',
    'Canonical': 'C',
    'Non-Canonical': 'N',
    'Junk': 'J'
})

# save and import annotations into vandam-data
raw_dir = os.path.join(project.path, 'annotations/zoo/raw')
# to_csv does not create missing parent directories
os.makedirs(raw_dir, exist_ok=True)

annotations = []
for recording_filename, segments in classifications.groupby('recording_filename'):
    filename = os.path.splitext(recording_filename)[0]
    segments.to_csv(os.path.join(raw_dir, filename + '.csv'))

    annotations.append({
        'recording_filename': recording_filename,
        'raw_filename': filename + '.csv',
        'set': 'zoo',
        'format': 'csv',
        'time_seek': 0,
        'range_onset': 0
    })

annotations = pd.DataFrame(annotations)
# attach each recording's duration so the annotated range spans 0..duration
annotations = annotations.merge(
    project.recordings[['recording_filename', 'duration']]
)
annotations['range_offset'] = annotations['duration']

if __name__ == '__main__':
    # drop the existing 'zoo' set before importing the new annotations
    am.remove_set('zoo')
    am.import_annotations(annotations)