
feeding annotations and comparing them

Lucas Gautheron committed 2 years ago · commit ee4027a852
4 changed files with 183 additions and 0 deletions
  1. README.md (+35, −0)
  2. annotations/compare.py (+74, −0)
  3. annotations/comparison.png (+1, −0)
  4. annotations/feed-annotations.py (+73, −0)

README.md (+35, −0)

@@ -182,3 +182,38 @@ Now, only relevant chunks are returned, and they are associated to all correspon
 |346474475|2202359.0|64210728  |T1     |0        |17576      |Baby         |70   |BN32_010007.mp3   |37978055|37978555|37978100     |37979011      |BN32_010007_37978055_37978555.wav|BN32_010007_37978055_37978555.mp3|2021-07-16 19:19:44|True    |14957     |vandam_chi_fem|64210728     |chi_fem|
 |346474475|2202359.0|64210728  |T3     |1        |17576      |Non-Canonical|70   |BN32_010007.mp3   |37978055|37978555|37978100     |37979011      |BN32_010007_37978055_37978555.wav|BN32_010007_37978055_37978555.mp3|2021-07-16 19:19:44|True    |14957     |vandam_chi_fem|64210728     |chi_fem|
 
+## Importing classifications into the source dataset
+
+Once the classifications have been recovered, they can be used to enrich the source dataset with more annotations.
+While this step will depend a lot on the type of annotations you are collecting, this repository provides an example.
+
+The [feed-annotations](https://gin.g-node.org/LAAC-LSCP/zoo-campaign/src/master/annotations/feed-annotations.py) script does just that. It can be run with:
+
+```bash
+python annotations/feed-annotations.py classifications/classifications.csv
+```
+
+The classifications are then imported into the `vandam-data` subdataset using ChildProject:
+
+```bash
+$ tail -n 1 vandam-data/metadata/annotations.csv
+zoo,BN32_010007.mp3,0,0,50464512,BN32_010007.csv,csv,,BN32_010007_0_50464512.csv,2021-07-19 11:14:58,,0.0.1
+```
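+
+If you want to double-check the import, the dataset can also be validated from Python. Here is a minimal sketch (`validate()` returns the lists of errors and warnings found in the dataset):
+
+```python
+from ChildProject.projects import ChildProject
+
+# load the dataset and make sure it still passes validation after the import
+project = ChildProject('vandam-data')
+errors, warnings = project.validate()
+print(len(errors), 'error(s),', len(warnings), 'warning(s)')
+```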
+
+In case several users have classified the same chunks, the majority choice is retained. You can have a look at the [source of the script](https://gin.g-node.org/LAAC-LSCP/zoo-campaign/src/master/annotations/feed-annotations.py) to see how that works, or to adapt it to your needs! A minimal sketch of the majority vote is given below.
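+
+To illustrate the idea, here is a minimal, self-contained sketch of the majority vote with pandas (the toy data below is hypothetical, but the column names match the classifications file used above):
+
+```python
+import pandas as pd
+
+# toy classifications: two users say 'Baby', one says 'Child', for the same chunk and task
+df = pd.DataFrame({
+    'recording_filename': ['BN32_010007.mp3'] * 3,
+    'onset': [37978055] * 3,
+    'offset': [37978555] * 3,
+    'task_id': ['T1'] * 3,
+    'answer': ['Baby', 'Baby', 'Child'],
+})
+
+# for each chunk and task, keep the most frequent answer
+majority = (
+    df.groupby(['recording_filename', 'onset', 'offset', 'task_id'])
+    .agg(answer = ('answer', lambda x: x.value_counts().index[0]))
+    .reset_index()
+)
+print(majority)  # the retained answer is 'Baby'
+```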
+
+### Comparing Zooniverse annotations with other annotations
+
+Once the annotations have been imported into the original dataset, you can use all the functionality of the ChildProject package, e.g. for reliability estimation.
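+
+One building block for such comparisons is retrieving the segments from both annotators at once. A minimal sketch, assuming the `vtc` and `zoo` sets imported above (`AnnotationManager.intersection` restricts the comparison to the portions of the recordings covered by both sets):
+
+```python
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+
+# load the dataset and its annotation index
+project = ChildProject('vandam-data')
+am = AnnotationManager(project)
+
+# keep only the portions of the recordings covered by both sets
+intersection = AnnotationManager.intersection(am.annotations, ['vtc', 'zoo'])
+segments = am.get_collapsed_segments(intersection)
+print(segments.groupby('set').size())
+```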
+
+For instance, let's say we'd like to test the ability of the VTC to distinguish children from adults, based on the classifications retrieved from Zooniverse.
+
+The [compare](https://gin.g-node.org/LAAC-LSCP/zoo-campaign/src/master/annotations/compare.py) script does just that (look at the [source](https://gin.g-node.org/LAAC-LSCP/zoo-campaign/src/master/annotations/compare.py) and try it yourself!):
+
+```bash
+python annotations/compare.py
+```
+
+This will output:
+
+![Comparing the VTC and Zooniverse classifications](annotations/comparison.png)

annotations/compare.py (+74, −0)

@@ -0,0 +1,74 @@
+import numpy as np
+import pandas as pd
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+from ChildProject.metrics import segments_to_grid, conf_matrix
+
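+# speaker categories to compare between the two annotation sets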
+categories = ['Adult', 'Youngster', 'Junk']
+
+# load VanDam dataset
+project = ChildProject('vandam-data')
+am = AnnotationManager(project)
+
+annotations = am.annotations[am.annotations['set'].isin(['vtc', 'zoo'])]
+segments = am.get_collapsed_segments(annotations)
+
+vtc_segments = segments.loc[segments['set'] == 'vtc'].copy()
+vtc_segments['speaker_age'] = vtc_segments['speaker_type'].replace({
+    'MAL': 'Adult',
+    'FEM': 'Adult',
+    'CHI': 'Youngster',
+    'OCH': 'Youngster'
+})
+
+zoo_segments = segments.loc[segments['set'] == 'zoo'].copy()
+zoo_segments['speaker_age'] = zoo_segments['speaker_age'].replace({
+    'Baby': 'Youngster',
+    'Child': 'Youngster',
+    'Adolescent': 'Adult'
+})
+
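+# project both sets onto a grid of 50 ms time frames ('speaker_age' is the compared
+# column; with none = True, a trailing column flags frames with no annotation)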
+vtc = segments_to_grid(
+    vtc_segments, 0, segments['segment_offset'].max(), 50, 'speaker_age', categories, none = True
+)
+zoo = segments_to_grid(
+    zoo_segments, 0, segments['segment_offset'].max(), 50, 'speaker_age', categories, none = True
+)
+
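+# restrict the comparison to frames classified on Zooniverse: keep frames where the
+# trailing 'none' column of the zoo grid is 0, then drop that column from both grids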
+vtc = vtc[zoo[:,-1] == 0][:,:-1]
+zoo = zoo[zoo[:,-1] == 0][:,:-1]
+
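+# count co-occurrences of VTC categories (rows) and Zooniverse categories (columns)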
+confusion_counts = conf_matrix(vtc, zoo)
+
+print(confusion_counts)
+
+plt.rcParams.update({'font.size': 12})
+plt.rc('xtick', labelsize = 10)
+plt.rc('ytick', labelsize = 10)
+
+fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize=(6.4*2, 4.8))
+
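+# normalize rows by the number of frames assigned to each category by the VTC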
+confusion = confusion_counts/np.sum(vtc, axis = 0)[:,None]
+
+sns.heatmap(confusion, annot = True, fmt = '.2f', ax = axes[0], cmap = 'Reds')
+axes[0].set_xlabel('zoo')
+axes[0].set_ylabel('vtc')
+axes[0].xaxis.set_ticklabels(categories)
+axes[0].yaxis.set_ticklabels(categories)
+
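+# transpose and normalize by the Zooniverse counts to get the converse proportions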
+confusion_counts = np.transpose(confusion_counts)
+confusion = confusion_counts/np.sum(zoo, axis = 0)[:,None]
+
+sns.heatmap(confusion, annot = True, fmt = '.2f', ax = axes[1], cmap = 'Reds')
+axes[1].set_xlabel('vtc')
+axes[1].set_ylabel('zoo')
+axes[1].xaxis.set_ticklabels(categories)
+axes[1].yaxis.set_ticklabels(categories)
+
+plt.savefig('annotations/comparison.png', bbox_inches = 'tight')

annotations/comparison.png (+1, −0)

@@ -0,0 +1 @@
+../.git/annex/objects/v3/KP/MD5E-s35527--0d17854096fb9df1d49b5ba4554416d2.png/MD5E-s35527--0d17854096fb9df1d49b5ba4554416d2.png

annotations/feed-annotations.py (+73, −0)

@@ -0,0 +1,73 @@
+import argparse
+import os
+import pandas as pd
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+
+parser = argparse.ArgumentParser(description = 'import retrieved classifications as annotations into VanDam')
+parser.add_argument('classifications', help = 'classifications file')
+args = parser.parse_args()
+
+# load VanDam dataset
+project = ChildProject('vandam-data')
+am = AnnotationManager(project)
+
+# read classifications
+classifications = pd.read_csv(args.classifications)
+
+# recover the majority choice
+classifications = (
+    classifications.groupby(['recording_filename', 'onset', 'offset', 'task_id'])
+    .agg(answer = ('answer', lambda x:x.value_counts().index[0]))
+).reset_index()
+
+# combine all tasks into one row per chunk
+classifications = classifications.pivot(
+    index = ['recording_filename', 'onset', 'offset'],
+    columns = ['task_id'],
+    values = ['answer']
+)
+
+# rename the columns
+classifications.columns = ['_'.join(col).strip() for col in classifications.columns.values]
+classifications.reset_index(inplace = True)
+classifications.rename(columns = {
+    'onset': 'segment_onset',
+    'offset': 'segment_offset',
+    'answer_T1': 'speaker_age',
+    'answer_T2': 'speaker_gender',
+    'answer_T3': 'vcm_type'
+}, inplace = True)
+
+# standardize vcm_type (ChildProject codes: C = canonical, N = non-canonical, Y = crying, L = laughing)
+classifications['vcm_type'] = classifications['vcm_type'].map({
+    'Crying': 'Y',
+    'Laughing': 'L',
+    'Canonical': 'C',
+    'Non-Canonical': 'N'
+})
+
+# save and import annotations into vandam-data
+os.makedirs(os.path.join(project.path, 'annotations/zoo/raw'), exist_ok = True)
+
+annotations = []
+for recording_filename, segments in classifications.groupby('recording_filename'):
+    filename = os.path.splitext(recording_filename)[0]
+    segments.to_csv(os.path.join(project.path, 'annotations/zoo/raw', filename + '.csv'), index = False)
+
+    annotations.append({
+        'recording_filename': recording_filename,
+        'raw_filename': filename + '.csv',
+        'set': 'zoo',
+        'format': 'csv',
+        'time_seek': 0,
+        'range_onset': 0
+    })
+
+annotations = pd.DataFrame(annotations)
+
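+# the annotations cover each recording in full: set range_offset to the recording duration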
+annotations = annotations.merge(project.recordings[['recording_filename', 'duration']])
+annotations['range_offset'] = annotations['duration']
+
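+# when run as a script, replace any previous 'zoo' set and import the new annotations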
+if __name__ == '__main__':
+    am.remove_set('zoo')
+    am.import_annotations(annotations)