feed-annotations.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. import argparse
  2. import os
  3. import pandas as pd
  4. from ChildProject.projects import ChildProject
  5. from ChildProject.annotations import AnnotationManager
  6. parser = argparse.ArgumentParser(description = 'import retrieved classifications as annotations into VanDam')
  7. parser.add_argument('classifications', help = 'classifications file')
  8. args = parser.parse_args()
  9. # load VanDam dataset
  10. project = ChildProject('vandam-data')
  11. am = AnnotationManager(project)
  12. # read classifications
  13. classifications = pd.read_csv(args.classifications)
  14. # recover the majority choice
  15. def majority_choice(values):
  16. counts = values.value_counts(sort = True)
  17. counts = counts[counts == counts[0]]
  18. # return the majority choice if it exists
  19. # otherwise, do a random pick
  20. if len(counts) <= 1:
  21. return counts.index[0]
  22. else:
  23. return counts.sample(1).index[0]
  24. classifications = (
  25. classifications.groupby(['recording_filename', 'onset', 'offset', 'task_id'])
  26. .agg(answer = ('answer', majority_choice))
  27. ).reset_index()
  28. # combine all tasks into one row per chunk
  29. classifications = classifications.pivot(
  30. index = ['recording_filename', 'onset', 'offset'],
  31. columns = ['task_id'],
  32. values = ['answer']
  33. )
  34. # rename the columns
  35. classifications.columns = ['_'.join(col).strip() for col in classifications.columns.values]
  36. classifications.reset_index(inplace = True)
  37. classifications.rename(columns = {
  38. 'onset': 'segment_onset',
  39. 'offset': 'segment_offset',
  40. 'answer_T1': 'speaker_age',
  41. 'answer_T2': 'speaker_gender',
  42. 'answer_T3': 'vcm_type'
  43. }, inplace = True)
  44. # standardized vcm_type
  45. classifications['vcm_type'] = classifications['vcm_type'].map({
  46. 'Crying': 'J',
  47. 'Laughing': 'L',
  48. 'Canonical': 'C',
  49. 'Non-Canonical': 'N'
  50. })
  51. # save and import annotations into vandam-data
  52. annotations = []
  53. for recording_filename, segments in classifications.groupby('recording_filename'):
  54. filename = os.path.splitext(recording_filename)[0]
  55. segments.to_csv(os.path.join(project.path, 'annotations/zoo/raw', filename + '.csv'))
  56. annotations.append({
  57. 'recording_filename': recording_filename,
  58. 'raw_filename': filename + '.csv',
  59. 'set': 'zoo',
  60. 'format': 'csv',
  61. 'time_seek': 0,
  62. 'range_onset': 0
  63. })
  64. annotations = pd.DataFrame(annotations)
  65. annotations = annotations.merge(project.recordings[['recording_filename', 'duration']])
  66. annotations['range_offset'] = annotations['duration']
  67. if __name__ == '__main__':
  68. am.remove_set('zoo')
  69. am.import_annotations(annotations)