LAAC-LSCP
/
zoo-campaign


			
			
				
					
						
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
							import argparse
import os
import pandas as pd

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

# command-line interface: one positional argument, the path to the CSV file
# holding the retrieved classifications
parser = argparse.ArgumentParser(
    description = 'import retrieved classifications as annotations into VanDam'
)
parser.add_argument(
    'classifications',
    help = 'classifications file'
)
args = parser.parse_args()

# open the VanDam dataset and attach an annotation manager to it
project = ChildProject('vandam-data')
am = AnnotationManager(project)

# load the classifications from the file given on the command line
classifications = pd.read_csv(args.classifications)

# recover the majority choice
def majority_choice(values):
    """Return the most frequent value among a group of answers.

    :param values: answers to aggregate
    :type values: pd.Series
    :return: the value with the highest count. When several values are tied
        for the highest count, one of them is drawn at random. NaN is
        returned when the series holds no countable (non-missing) value.
    """
    if len(values) == 1:
        return values.iloc[0]

    counts = values.value_counts(sort = True)

    # value_counts() drops missing values, so an empty or all-NaN group
    # leaves nothing to vote on
    if counts.empty:
        return float('nan')

    # keep only the values tied for the highest count. The top count is read
    # by position (.iloc): counts is indexed by the answers themselves, so
    # counts[0] would look up an answer equal to 0 instead of the first row
    candidates = counts[counts == counts.iloc[0]]

    # return the majority choice if it is unambiguous
    # otherwise, pick randomly among the candidates
    if len(candidates) == 1:
        return candidates.index[0]
    return candidates.sample(1).index[0]

# reduce the answers sharing the same chunk and task to their majority choice
chunk_keys = ['recording_filename', 'onset', 'offset']
classifications = classifications.groupby(chunk_keys + ['task_id']).agg(
    answer = pd.NamedAgg(column = 'answer', aggfunc = majority_choice)
)
classifications = classifications.reset_index()

# spread the tasks over columns, leaving one row per chunk
classifications = pd.pivot(
    classifications,
    index = chunk_keys,
    columns = ['task_id'],
    values = ['answer']
)

# flatten the two-level column labels produced by the pivot into single
# strings such as 'answer_T1'
classifications.columns = [
    '_'.join(levels).strip()
    for levels in classifications.columns.values
]

# bring the chunk keys back as regular columns, then switch to the
# annotation column names
classifications = classifications.reset_index().rename(columns = {
    'onset': 'segment_onset',
    'offset': 'segment_offset',
    'answer_T1': 'speaker_age',
    'answer_T2': 'speaker_gender',
    'answer_T3': 'vcm_type'
})

# standardize vcm_type: replace each answer label with its one-letter code
# (labels missing from the table become NaN)
vcm_codes = {
    'Crying': 'Y',
    'Laughing': 'L',
    'Canonical': 'C',
    'Non-Canonical': 'N',
    'Junk': 'J'
}
classifications['vcm_type'] = classifications['vcm_type'].map(vcm_codes)

# save and import annotations into vandam-data
# one raw CSV file is written per recording, and one index row describing
# that file is collected for the annotation manager
raw_path = os.path.join(project.path, 'annotations/zoo/raw')

annotations = []
for recording_filename, segments in classifications.groupby('recording_filename'):
    filename = os.path.splitext(recording_filename)[0]
    raw_file = os.path.join(raw_path, filename + '.csv')

    # to_csv() does not create missing parent directories, so make sure the
    # destination exists (first run, or recordings stored in subfolders)
    os.makedirs(os.path.dirname(raw_file), exist_ok = True)

    # index = False: the positional index left over from the reshaping
    # carries no information and would be written as an unnamed column
    segments.to_csv(raw_file, index = False)

    annotations.append({
        'recording_filename': recording_filename,
        'raw_filename': filename + '.csv',
        'set': 'zoo',
        'format': 'csv',
        'time_seek': 0,
        'range_onset': 0
    })

annotations = pd.DataFrame(annotations)

# each annotation covers its whole recording: take the range offset from the
# recording duration. The merge is an inner join, so recordings absent from
# the project's recordings table are left out.
annotations = annotations.merge(
    project.recordings[['recording_filename', 'duration']],
    on = 'recording_filename'
)
annotations['range_offset'] = annotations['duration']

# the annotation manager calls below only run when this file is executed as a
# script; every statement above this guard runs at module level
if __name__ == '__main__':
    # drop the existing 'zoo' set first, then import the annotations built above
    am.remove_set('zoo')
    am.import_annotations(annotations)