
feeding annotations and comparing them

Lucas Gautheron committed 2 years ago · commit ee4027a852
4 changed files with 183 additions and 0 deletions
  1. README.md (+35, −0)
  2. annotations/compare.py (+74, −0)
  3. annotations/comparison.png (+1, −0)
  4. annotations/feed-annotations.py (+73, −0)

README.md (+35, −0)

@@ -182,3 +182,38 @@ Now, only relevant chunks are returned, and they are associated to all correspon
 |346474475|2202359.0|64210728  |T1     |0        |17576      |Baby         |70   |BN32_010007.mp3   |37978055|37978555|37978100     |37979011      |BN32_010007_37978055_37978555.wav|BN32_010007_37978055_37978555.mp3|2021-07-16 19:19:44|True    |14957     |vandam_chi_fem|64210728     |chi_fem|
 |346474475|2202359.0|64210728  |T3     |1        |17576      |Non-Canonical|70   |BN32_010007.mp3   |37978055|37978555|37978100     |37979011      |BN32_010007_37978055_37978555.wav|BN32_010007_37978055_37978555.mp3|2021-07-16 19:19:44|True    |14957     |vandam_chi_fem|64210728     |chi_fem|
 
+## Importing classifications into the source dataset
+
+Once the classifications have been recovered, they can be used to enrich the source dataset with more annotations.
+While this step will depend a lot on the type of annotations you are collecting, this repository provides an example.
+
+The [feed-annotations](https://gin.g-node.org/LAAC-LSCP/zoo-campaign/src/master/annotations/feed-annotations.py) script does just that. It can be run with:
+
+```bash
+python annotations/feed-annotations.py classifications/classifications.csv
+```
+
+The classifications are then imported into the `vandam-data` subdataset using ChildProject:
+
+```bash
+$ tail -n 1 vandam-data/metadata/annotations.csv
+zoo,BN32_010007.mp3,0,0,50464512,BN32_010007.csv,csv,,BN32_010007_0_50464512.csv,2021-07-19 11:14:58,,0.0.1
+```
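+
+If you want to double-check the import, the dataset can also be validated from Python. Here is a minimal sketch (`validate()` returns the lists of errors and warnings found in the dataset):
+
+```python
+from ChildProject.projects import ChildProject
+
+# load the dataset and make sure it still passes validation after the import
+project = ChildProject('vandam-data')
+errors, warnings = project.validate()
+print(len(errors), 'error(s),', len(warnings), 'warning(s)')
+```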
+
+In case several users have classified the same chunks, the majority choice is retained. You can have a look at the [source of the script](https://gin.g-node.org/LAAC-LSCP/zoo-campaign/src/master/annotations/feed-annotations.py) to see how that works, or to adapt it to your needs! A minimal sketch of the majority vote is given below.
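+
+To illustrate the idea, here is a minimal, self-contained sketch of the majority vote with pandas (the toy data below is hypothetical, but the column names match the classifications file used above):
+
+```python
+import pandas as pd
+
+# toy classifications: two users say 'Baby', one says 'Child', for the same chunk and task
+df = pd.DataFrame({
+    'recording_filename': ['BN32_010007.mp3'] * 3,
+    'onset': [37978055] * 3,
+    'offset': [37978555] * 3,
+    'task_id': ['T1'] * 3,
+    'answer': ['Baby', 'Baby', 'Child'],
+})
+
+# for each chunk and task, keep the most frequent answer
+majority = (
+    df.groupby(['recording_filename', 'onset', 'offset', 'task_id'])
+    .agg(answer = ('answer', lambda x: x.value_counts().index[0]))
+    .reset_index()
+)
+print(majority)  # the retained answer is 'Baby'
+```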
+
+### Comparing Zooniverse annotations with other annotations
+
+Once the annotations have been imported into the original dataset, you can use all the functionality of the ChildProject package, e.g. for reliability estimation.
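+
+One building block for such comparisons is retrieving the segments from both annotators at once. A minimal sketch, assuming the `vtc` and `zoo` sets imported above (`AnnotationManager.intersection` restricts the comparison to the portions of the recordings covered by both sets):
+
+```python
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+
+# load the dataset and its annotation index
+project = ChildProject('vandam-data')
+am = AnnotationManager(project)
+
+# keep only the portions of the recordings covered by both sets
+intersection = AnnotationManager.intersection(am.annotations, ['vtc', 'zoo'])
+segments = am.get_collapsed_segments(intersection)
+print(segments.groupby('set').size())
+```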
+
+For instance, let's say we'd like to test the ability of the VTC to distinguish children from adults, based on the classifications retrieved from Zooniverse.
+
+The [compare](https://gin.g-node.org/LAAC-LSCP/zoo-campaign/src/master/annotations/compare.py) script does just that (look at the [source](https://gin.g-node.org/LAAC-LSCP/zoo-campaign/src/master/annotations/compare.py) and try it yourself!):
+
+```bash
+python annotations/compare.py
+```
+
+This will output:
+
+![Comparing the VTC and Zooniverse classifications](annotations/comparison.png)

annotations/compare.py (+74, −0)

@@ -0,0 +1,74 @@
+import numpy as np
+import pandas as pd
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+from ChildProject.metrics import segments_to_grid, conf_matrix
+
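+# speaker categories to compare between the two annotation sets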
+categories = ['Adult', 'Youngster', 'Junk']
+
+# load VanDam dataset
+project = ChildProject('vandam-data')
+am = AnnotationManager(project)
+
+annotations = am.annotations[am.annotations['set'].isin(['vtc', 'zoo'])]
+segments = am.get_collapsed_segments(annotations)
+
+vtc_segments = segments.loc[segments['set'] == 'vtc'].copy()
+vtc_segments['speaker_age'] = vtc_segments['speaker_type'].replace({
+    'MAL': 'Adult',
+    'FEM': 'Adult',
+    'CHI': 'Youngster',
+    'OCH': 'Youngster'
+})
+
+zoo_segments = segments.loc[segments['set'] == 'zoo'].copy()
+zoo_segments['speaker_age'] = zoo_segments['speaker_age'].replace({
+    'Baby': 'Youngster',
+    'Child': 'Youngster',
+    'Adolescent': 'Adult'
+})
+
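+# project both sets onto a grid of 50 ms time frames ('speaker_age' is the compared
+# column; with none = True, a trailing column flags frames with no annotation)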
+vtc = segments_to_grid(
+    vtc_segments, 0, segments['segment_offset'].max(), 50, 'speaker_age', categories, none = True
+)
+zoo = segments_to_grid(
+    zoo_segments, 0, segments['segment_offset'].max(), 50, 'speaker_age', categories, none = True
+)
+
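+# restrict the comparison to frames classified on Zooniverse: keep frames where the
+# trailing 'none' column of the zoo grid is 0, then drop that column from both grids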
+vtc = vtc[zoo[:,-1] == 0][:,:-1]
+zoo = zoo[zoo[:,-1] == 0][:,:-1]
+
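+# count co-occurrences of VTC categories (rows) and Zooniverse categories (columns)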
+confusion_counts = conf_matrix(vtc, zoo)
+
+print(confusion_counts)
+
+plt.rcParams.update({'font.size': 12})
+plt.rc('xtick', labelsize = 10)
+plt.rc('ytick', labelsize = 10)
+
+fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize=(6.4*2, 4.8))
+
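+# normalize rows by the number of frames assigned to each category by the VTC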
+confusion = confusion_counts/np.sum(vtc, axis = 0)[:,None]
+
+sns.heatmap(confusion, annot = True, fmt = '.2f', ax = axes[0], cmap = 'Reds')
+axes[0].set_xlabel('zoo')
+axes[0].set_ylabel('vtc')
+axes[0].xaxis.set_ticklabels(categories)
+axes[0].yaxis.set_ticklabels(categories)
+
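+# transpose and normalize by the Zooniverse counts to get the converse proportions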
+confusion_counts = np.transpose(confusion_counts)
+confusion = confusion_counts/np.sum(zoo, axis = 0)[:,None]
+
+sns.heatmap(confusion, annot = True, fmt = '.2f', ax = axes[1], cmap = 'Reds')
+axes[1].set_xlabel('vtc')
+axes[1].set_ylabel('zoo')
+axes[1].xaxis.set_ticklabels(categories)
+axes[1].yaxis.set_ticklabels(categories)
+
+plt.savefig('annotations/comparison.png', bbox_inches = 'tight')

annotations/comparison.png (+1, −0)

@@ -0,0 +1 @@
+../.git/annex/objects/v3/KP/MD5E-s35527--0d17854096fb9df1d49b5ba4554416d2.png/MD5E-s35527--0d17854096fb9df1d49b5ba4554416d2.png

annotations/feed-annotations.py (+73, −0)

@@ -0,0 +1,73 @@
+import argparse
+import os
+import pandas as pd
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+
+parser = argparse.ArgumentParser(description = 'import retrieved classifications as annotations into VanDam')
+parser.add_argument('classifications', help = 'classifications file')
+args = parser.parse_args()
+
+# load VanDam dataset
+project = ChildProject('vandam-data')
+am = AnnotationManager(project)
+
+# read classifications
+classifications = pd.read_csv(args.classifications)
+
+# recover the majority choice
+classifications = (
+    classifications.groupby(['recording_filename', 'onset', 'offset', 'task_id'])
+    .agg(answer = ('answer', lambda x:x.value_counts().index[0]))
+).reset_index()
+
+# combine all tasks into one row per chunk
+classifications = classifications.pivot(
+    index = ['recording_filename', 'onset', 'offset'],
+    columns = ['task_id'],
+    values = ['answer']
+)
+
+# rename the columns
+classifications.columns = ['_'.join(col).strip() for col in classifications.columns.values]
+classifications.reset_index(inplace = True)
+classifications.rename(columns = {
+    'onset': 'segment_onset',
+    'offset': 'segment_offset',
+    'answer_T1': 'speaker_age',
+    'answer_T2': 'speaker_gender',
+    'answer_T3': 'vcm_type'
+}, inplace = True)
+
+# standardize vcm_type (ChildProject codes: C = canonical, N = non-canonical, Y = crying, L = laughing)
+classifications['vcm_type'] = classifications['vcm_type'].map({
+    'Crying': 'Y',
+    'Laughing': 'L',
+    'Canonical': 'C',
+    'Non-Canonical': 'N'
+})
+
+# save and import annotations into vandam-data
+os.makedirs(os.path.join(project.path, 'annotations/zoo/raw'), exist_ok = True)
+
+annotations = []
+for recording_filename, segments in classifications.groupby('recording_filename'):
+    filename = os.path.splitext(recording_filename)[0]
+    segments.to_csv(os.path.join(project.path, 'annotations/zoo/raw', filename + '.csv'), index = False)
+
+    annotations.append({
+        'recording_filename': recording_filename,
+        'raw_filename': filename + '.csv',
+        'set': 'zoo',
+        'format': 'csv',
+        'time_seek': 0,
+        'range_onset': 0
+    })
+
+annotations = pd.DataFrame(annotations)
+
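+# the annotations cover each recording in full: set range_offset to the recording duration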
+annotations = annotations.merge(project.recordings[['recording_filename', 'duration']])
+annotations['range_offset'] = annotations['duration']
+
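+# when run as a script, replace any previous 'zoo' set and import the new annotations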
+if __name__ == '__main__':
+    am.remove_set('zoo')
+    am.import_annotations(annotations)