# feed-annotations.py (2.6 KB)

  1. import argparse
  2. import os
  3. import pandas as pd
  4. from ChildProject.projects import ChildProject
  5. from ChildProject.annotations import AnnotationManager
  6. parser = argparse.ArgumentParser(description = 'import retrieved classifications as annotations into VanDam')
  7. parser.add_argument('classifications', help = 'classifications file')
  8. args = parser.parse_args()
  9. # load VanDam dataset
  10. project = ChildProject('vandam-data')
  11. am = AnnotationManager(project)
  12. # read classifications
  13. classifications = pd.read_csv(args.classifications)
  14. # recover the majority choice
  15. def majority_choice(values):
  16. if len(values) == 1:
  17. return values.iloc[0]
  18. counts = values.value_counts(sort = True)
  19. counts = counts[counts == counts[0]]
  20. # return the majority choice if it is unambiguous
  21. # otherwise, pick randomly among the candidates
  22. if len(counts) <= 1:
  23. return counts.index[0]
  24. else:
  25. return counts.sample(1).index[0]
  26. classifications = (
  27. classifications.groupby(['recording_filename', 'onset', 'offset', 'task_id'])
  28. .agg(answer = ('answer', majority_choice))
  29. ).reset_index()
  30. # combine all tasks into one row per chunk
  31. classifications = classifications.pivot(
  32. index = ['recording_filename', 'onset', 'offset'],
  33. columns = ['task_id'],
  34. values = ['answer']
  35. )
  36. # rename the columns
  37. classifications.columns = ['_'.join(col).strip() for col in classifications.columns.values]
  38. classifications.reset_index(inplace = True)
  39. classifications.rename(columns = {
  40. 'onset': 'segment_onset',
  41. 'offset': 'segment_offset',
  42. 'answer_T1': 'speaker_age',
  43. 'answer_T2': 'speaker_gender',
  44. 'answer_T3': 'vcm_type'
  45. }, inplace = True)
  46. # standardized vcm_type
  47. classifications['vcm_type'] = classifications['vcm_type'].map({
  48. 'Crying': 'Y',
  49. 'Laughing': 'L',
  50. 'Canonical': 'C',
  51. 'Non-Canonical': 'N',
  52. 'Junk': 'J'
  53. })
  54. # save and import annotations into vandam-data
  55. annotations = []
  56. for recording_filename, segments in classifications.groupby('recording_filename'):
  57. filename = os.path.splitext(recording_filename)[0]
  58. segments.to_csv(os.path.join(project.path, 'annotations/zoo/raw', filename + '.csv'))
  59. annotations.append({
  60. 'recording_filename': recording_filename,
  61. 'raw_filename': filename + '.csv',
  62. 'set': 'zoo',
  63. 'format': 'csv',
  64. 'time_seek': 0,
  65. 'range_onset': 0
  66. })
  67. annotations = pd.DataFrame(annotations)
  68. annotations = annotations.merge(project.recordings[['recording_filename', 'duration']])
  69. annotations['range_offset'] = annotations['duration']
  70. if __name__ == '__main__':
  71. am.remove_set('zoo')
  72. am.import_annotations(annotations)