# Source repository: doi/zoo-campaign (forked from LAAC-LSCP/zoo-campaign)
							import argparse
import os
import pandas as pd

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

# command-line interface: a single positional argument naming the
# classifications file to import
parser = argparse.ArgumentParser(
    description='import retrieved classifications as annotations into VanDam'
)
parser.add_argument(
    'classifications',
    help='classifications file',
)
args = parser.parse_args()

# open the VanDam dataset and create the annotation manager attached to it
project = ChildProject('vandam-data')
am = AnnotationManager(project)

# load the classifications table from the CSV file given on the command line
classifications = pd.read_csv(args.classifications)

# recover the majority choice
def majority_choice(values):
    """Return the most frequent value found in a pandas Series.

    When several values share the highest count, one of them is picked at
    random, so the result is not deterministic in case of a tie.

    Parameters
    ----------
    values : pandas.Series
        The answers to choose from. Missing values are not counted
        (default behaviour of ``Series.value_counts``).

    Returns
    -------
    The value with the highest count, or ``None`` when ``values`` contains
    no countable entry (empty or all-missing input).
    """
    counts = values.value_counts(sort=True)

    # nothing to vote on: report a missing answer instead of raising
    if counts.empty:
        return None

    # value_counts() is indexed by the answers themselves, so the highest
    # count must be read by position; counts[0] would look up the *label* 0
    # and fail or return the wrong count when answers are integers
    top = counts[counts == counts.iloc[0]]

    # return the majority choice if it exists
    if len(top) == 1:
        return top.index[0]

    # otherwise, do a random pick among the tied values
    return top.sample(1).index[0]

# columns that together identify one chunk of audio
chunk_keys = ['recording_filename', 'onset', 'offset']

# keep a single answer per chunk and task: the one selected by majority_choice
per_task = classifications.groupby(chunk_keys + ['task_id'])
classifications = per_task.agg(answer=('answer', majority_choice))
classifications = classifications.reset_index()

# combine all tasks into one row per chunk,
# with one 'answer' column for each task_id
classifications = classifications.pivot(
    index=chunk_keys,
    columns=['task_id'],
    values=['answer'],
)

# flatten the multi-level column labels into single strings by joining
# the levels with '_' (e.g. 'answer_T1')
classifications.columns = [
    '_'.join(levels).strip() for levels in classifications.columns.values
]

# turn the index levels back into regular columns, then give every column
# its final name
column_names = {
    'onset': 'segment_onset',
    'offset': 'segment_offset',
    'answer_T1': 'speaker_age',
    'answer_T2': 'speaker_gender',
    'answer_T3': 'vcm_type',
}
classifications = classifications.reset_index().rename(columns=column_names)

# standardized vcm_type: replace each answer by its one-letter code
# (answers that are not listed below become NaN)
vcm_codes = {
    'Crying': 'J',
    'Laughing': 'L',
    'Canonical': 'C',
    'Non-Canonical': 'N',
}
classifications['vcm_type'] = classifications['vcm_type'].map(vcm_codes)

# save and import annotations into vandam-data
# one raw CSV file is written per recording under annotations/zoo/raw
raw_dir = os.path.join(project.path, 'annotations/zoo/raw')

annotations = []
for recording_filename, segments in classifications.groupby('recording_filename'):
    filename = os.path.splitext(recording_filename)[0]
    raw_filename = filename + '.csv'
    raw_path = os.path.join(raw_dir, raw_filename)

    # to_csv does not create missing directories: make sure the target
    # directory exists (this also covers recording filenames that contain
    # sub-directories)
    os.makedirs(os.path.dirname(raw_path), exist_ok=True)
    segments.to_csv(raw_path)

    # index entry describing the raw file that was just written
    annotations.append({
        'recording_filename': recording_filename,
        'raw_filename': raw_filename,
        'set': 'zoo',
        'format': 'csv',
        'time_seek': 0,
        'range_onset': 0
    })

annotations = pd.DataFrame(annotations)

# fetch the duration of each recording so that range_onset..range_offset
# spans from 0 to the end of the recording; this is an inner join, so
# recordings that are not listed in project.recordings are left out
annotations = annotations.merge(
    project.recordings[['recording_filename', 'duration']],
    on='recording_filename'
)
annotations['range_offset'] = annotations['duration']

if __name__ == '__main__':
    # remove the 'zoo' set, then import the annotations built above into it
    am.remove_set('zoo')
    am.import_annotations(annotations)