12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
- import argparse
- import os
- import pandas as pd
- from ChildProject.projects import ChildProject
- from ChildProject.annotations import AnnotationManager
# command-line interface: one positional argument, the classifications file
parser = argparse.ArgumentParser(
    description='import retrieved classifications as annotations into VanDam'
)
parser.add_argument('classifications', help='classifications file')
args = parser.parse_args()

# open the VanDam dataset and the annotation manager attached to it
project = ChildProject('vandam-data')
am = AnnotationManager(project)

# load the classifications table given on the command line
classifications = pd.read_csv(args.classifications)
- # recover the majority choice
def majority_choice(values):
    """Return the most frequent entry of ``values``.

    Args:
        values: a pandas Series holding the answers given for one task.

    Returns:
        The single entry when ``values`` has exactly one element; otherwise
        the most frequent non-missing entry. When several entries share the
        highest count, one of them is drawn at random. ``None`` is returned
        when there is no non-missing entry to choose from.
    """
    if len(values) == 1:
        return values.iloc[0]

    # value_counts() ignores missing values and sorts by decreasing count
    counts = values.value_counts(sort=True)
    if counts.empty:
        return None

    # Positional access is required here: ``counts[0]`` would be a *label*
    # lookup whenever the answers themselves are integers.
    candidates = counts[counts == counts.iloc[0]]

    # return the majority choice if it is unambiguous
    # otherwise, pick randomly among the candidates
    if len(candidates) == 1:
        return candidates.index[0]
    return candidates.sample(1).index[0]
# keep a single answer per (chunk, task) pair: the majority choice
classifications = (
    classifications
    .groupby(['recording_filename', 'onset', 'offset', 'task_id'])
    .agg(answer=('answer', majority_choice))
    .reset_index()
)

# one row per chunk, with the answers of all tasks side by side
classifications = classifications.pivot(
    index=['recording_filename', 'onset', 'offset'],
    columns=['task_id'],
    values=['answer'],
)

# flatten the column labels produced by the pivot by joining their parts
# with '_' (e.g. 'answer' and 'T1' become 'answer_T1'), then give the
# columns their final names
classifications.columns = [
    '_'.join(levels).strip() for levels in classifications.columns
]
classifications = classifications.reset_index().rename(columns={
    'onset': 'segment_onset',
    'offset': 'segment_offset',
    'answer_T1': 'speaker_age',
    'answer_T2': 'speaker_gender',
    'answer_T3': 'vcm_type',
})

# translate the vcm_type answers into one-letter codes; answers missing
# from the mapping end up as NaN
classifications['vcm_type'] = classifications['vcm_type'].map({
    'Crying': 'Y',
    'Laughing': 'L',
    'Canonical': 'C',
    'Non-Canonical': 'N',
    'Junk': 'J',
})
# save and import annotations into vandam-data
annotations = []
for recording_filename, segments in classifications.groupby('recording_filename'):
    filename = os.path.splitext(recording_filename)[0]
    raw_path = os.path.join(project.path, 'annotations/zoo/raw', filename + '.csv')

    # the destination folder may not exist yet (first import, or recordings
    # stored in sub-folders); to_csv() does not create it by itself
    os.makedirs(os.path.dirname(raw_path), exist_ok=True)
    segments.to_csv(raw_path)

    annotations.append({
        'recording_filename': recording_filename,
        'raw_filename': filename + '.csv',
        'set': 'zoo',
        'format': 'csv',
        'time_seek': 0,
        'range_onset': 0,
    })

annotations = pd.DataFrame(annotations)

# each annotation covers its whole recording: fetch the recording duration
# and use it as the end of the annotated range
annotations = annotations.merge(
    project.recordings[['recording_filename', 'duration']],
    on='recording_filename',
)
annotations['range_offset'] = annotations['duration']

if __name__ == '__main__':
    # drop the existing 'zoo' set before importing the new annotations
    am.remove_set('zoo')
    am.import_annotations(annotations)
|