
using more meaningful annotations for the illustrations

Lucas Gautheron, 2 years ago · commit 00ccf6dde6
9 changed files with 40 additions and 46 deletions:

1. Fig4.pdf (+1 −1)
2. Fig5.pdf (+1 −1)
3. Makefile (+8 −2)
4. code/confusion_matrix.py (+6 −6)
5. code/recall.py (+9 −31)
6. main.pdf (+1 −1)
7. main.tex (+3 −3)
8. references.bib (+10 −0)
9. vandam-data (+1 −1)

Fig4.pdf (+1 −1)

@@ -1 +1 @@
-.git/annex/objects/Qq/g4/MD5E-s19440--0e343483be93bc292ef16b5c6c194fd4.pdf/MD5E-s19440--0e343483be93bc292ef16b5c6c194fd4.pdf
+.git/annex/objects/QG/pj/MD5E-s17834--919b6b31222728bc516a01d08a660dde.pdf/MD5E-s17834--919b6b31222728bc516a01d08a660dde.pdf

Fig5.pdf (+1 −1)

@@ -1 +1 @@
-.git/annex/objects/v0/Jf/MD5E-s15636--6e365c33a40b5f870ccd610b355c6b15.pdf/MD5E-s15636--6e365c33a40b5f870ccd610b355c6b15.pdf
+.git/annex/objects/3v/pM/MD5E-s15625--bf6d10d87c8366ebb3da2ec266102093.pdf/MD5E-s15625--bf6d10d87c8366ebb3da2ec266102093.pdf

Makefile (+8 −2)

@@ -9,10 +9,10 @@ main.pdf: main.tex references.bib Fig4.pdf Fig5.pdf
 Fig4.pdf: code/recall.py scores.csv
 	code/recall.py vandam-data
 
-Fig5.pdf: code/confusion_matrix.py vandam-data/annotations/its/converted/*.csv vandam-data/annotations/vtc/converted/*.csv
+Fig5.pdf: code/confusion_matrix.py vandam-data/annotations/eaf/converted/*.csv vandam-data/annotations/vtc/converted/*.csv
 	code/confusion_matrix.py vandam-data
 
-scores.csv: vandam-data/annotations/its/converted/*.csv vandam-data/annotations/vtc/converted/*.csv
+scores.csv: vandam-data/annotations/its/converted/*.csv vandam-data/annotations/vtc/converted/*.csv vandam-data/annotations/eaf/converted/*.csv vandam-data/annotations/cha/aligned/converted/*.csv
 	code/recall.py vandam-data
 
 vandam-data/annotations/its/converted/*.csv:
@@ -21,6 +21,12 @@ vandam-data/annotations/its/converted/*.csv:
 vandam-data/annotations/vtc/converted/*.csv:
 	datalad get vandam-data/annotations/vtc/converted
 
+vandam-data/annotations/cha/aligned/converted/*.csv:
+	datalad get vandam-data/annotations/cha/aligned/converted
+
+vandam-data/annotations/eaf/converted/*.csv:
+	datalad get vandam-data/annotations/eaf/converted
+
 # This rule cleans up temporary LaTeX files, and result and PDF files
 clean:
 	rm -f main.bbl main.aux main.blg main.log main.out main.pdf main.tdo main.fls main.fdb_latexmk texput.log *-eps-converted-to.pdf scores.csv

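For reference, the four datalad get rules above can also be driven from Python. The sketch below assumes DataLad's Python API (datalad.api.get, mirroring the CLI calls in the Makefile); the list and loop are illustrative, not part of the repository:

    import datalad.api as dl

    # the four annotation sets fetched on demand by the Makefile rules
    ANNOTATION_SETS = [
        "annotations/its/converted",
        "annotations/vtc/converted",
        "annotations/eaf/converted",
        "annotations/cha/aligned/converted",
    ]

    for subdir in ANNOTATION_SETS:
        # equivalent to `datalad get vandam-data/<subdir>` on the shell
        dl.get(path=subdir, dataset="vandam-data")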
code/confusion_matrix.py (+6 −6)

@@ -23,16 +23,16 @@ if __name__ == '__main__':
     am = AnnotationManager(project)
     am.read()
 
-    intersection = AnnotationManager.intersection(am.annotations, ['vtc', 'its'])
+    intersection = AnnotationManager.intersection(am.annotations, ['vtc', 'eaf'])
     segments = am.get_collapsed_segments(intersection)
     segments = segments[segments['speaker_type'].isin(speakers)]
 
     vtc = segments_to_grid(segments[segments['set'] == 'vtc'], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers)
-    its = segments_to_grid(segments[segments['set'] == 'its'], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers)
+    eaf = segments_to_grid(segments[segments['set'] == 'eaf'], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers)
 
     speakers.extend(['none'])
 
-    confusion_counts = conf_matrix(vtc, its)
+    confusion_counts = conf_matrix(vtc, eaf)
 
     plt.rcParams.update({'font.size': 12})
     plt.rc('xtick', labelsize = 10)
@@ -43,17 +43,17 @@ if __name__ == '__main__':
     confusion = confusion_counts/np.sum(vtc, axis = 0)[:,None]
 
     sns.heatmap(confusion, annot = True, fmt = '.2f', ax = axes[0], cmap = 'Reds')
-    axes[0].set_xlabel('its')
+    axes[0].set_xlabel('eaf')
     axes[0].set_ylabel('vtc')
     axes[0].xaxis.set_ticklabels(speakers)
     axes[0].yaxis.set_ticklabels(speakers)
 
     confusion_counts = np.transpose(confusion_counts)
-    confusion = confusion_counts/np.sum(its, axis = 0)[:,None]
+    confusion = confusion_counts/np.sum(eaf, axis = 0)[:,None]
 
     sns.heatmap(confusion, annot = True, fmt = '.2f', ax = axes[1], cmap = 'Reds')
     axes[1].set_xlabel('vtc')
-    axes[1].set_ylabel('its')
+    axes[1].set_ylabel('eaf')
     axes[1].xaxis.set_ticklabels(speakers)
     axes[1].yaxis.set_ticklabels(speakers)
 

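The grids compared here come from segments_to_grid and the counts from conf_matrix (presumably the ChildProject metrics helpers; the imports fall outside this hunk). As a minimal numpy sketch of the underlying computation, assuming each grid is a binary array of shape (n_frames, n_classes) marking which speaker classes are active in each time bin (toy_conf_matrix is an illustrative stand-in, not the library function):

    import numpy as np

    def toy_conf_matrix(ref, hyp):
        # counts[i, j] = number of time bins in which reference class i
        # and hypothesis class j are active simultaneously
        return ref.T @ hyp

    rng = np.random.default_rng(0)
    # binary activity grids: 600 bins x 5 classes (CHI, OCH, FEM, MAL, none)
    vtc = (rng.random((600, 5)) < 0.2).astype(int)
    eaf = (rng.random((600, 5)) < 0.2).astype(int)

    counts = toy_conf_matrix(vtc, eaf)
    # same row normalization as the script, plus a divide-by-zero guard:
    # each row is divided by the total number of bins in which the
    # reference class is active
    rates = counts / np.maximum(vtc.sum(axis=0)[:, None], 1)
    print(rates.round(2))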
code/recall.py (+9 −31)

@@ -12,16 +12,12 @@ import random
 import sys
 
 speakers = ['CHI', 'OCH', 'FEM', 'MAL']
-sets = ['its', 'vtc (conf 50%)', 'vtc (drop 50%)', 'vtc (conf 75%)', 'vtc (drop 75%)']
 
-def confusion(segments, prob):
-    segments['speaker_type'] = segments['speaker_type'].apply(
-        lambda s: random.choice(speakers) if random.random() < prob else s
-    )
-    return segments
-
-def drop(segments, prob):
-    return segments.sample(frac = 1-prob)
+sets = {
+    'vtc': 'VTC',
+    'its': 'LENA',
+    'cha/aligned': 'chat+mfa'
+}
 
 if __name__ == '__main__':
     if not os.path.exists('scores.csv'):
@@ -32,33 +28,15 @@ if __name__ == '__main__':
         am = AnnotationManager(project)
         am.read()
 
-        intersection = AnnotationManager.intersection(am.annotations, ['vtc', 'its'])
+        intersection = AnnotationManager.intersection(am.annotations, ['eaf'] + list(sets.keys()))
         segments = am.get_collapsed_segments(intersection)
         segments = segments[segments['speaker_type'].isin(speakers)]
 
-        conf50 = segments[segments['set'] == 'vtc'].copy()
-        conf50 = confusion(conf50, 0.5)
-        conf50['set'] = 'vtc (conf 50%)'
-
-        conf75 = segments[segments['set'] == 'vtc'].copy()
-        conf75 = confusion(conf75, 0.75)
-        conf75['set'] = 'vtc (conf 75%)'
-
-        drop50 = segments[segments['set'] == 'vtc'].copy()
-        drop50 = drop(drop50, 0.5)
-        drop50['set'] = 'vtc (drop 50%)'
-
-        drop75 = segments[segments['set'] == 'vtc'].copy()
-        drop75 = drop(drop75, 0.75)
-        drop75['set'] = 'vtc (drop 75%)'
-
-        segments = pd.concat([segments, conf50, conf75, drop50, drop75])
-
         metric = DetectionPrecisionRecallFMeasure()
 
         scores = []
         for speaker in speakers:
-            ref = segments_to_annotation(segments[(segments['set'] == 'vtc') & (segments['speaker_type'] == speaker)], 'speaker_type')
+            ref = segments_to_annotation(segments[(segments['set'] == 'eaf') & (segments['speaker_type'] == speaker)], 'speaker_type')
 
             for s in sets:
                 hyp = segments_to_annotation(segments[(segments['set'] == s) & (segments['speaker_type'] == speaker)], 'speaker_type')
@@ -106,7 +84,7 @@ if __name__ == '__main__':
 
         if i >= 2:
             ax.set_xticks(range(len(sets)))
-            ax.set_xticklabels(sets, rotation = 45, horizontalalignment = 'right')
+            ax.set_xticklabels(sets.values(), rotation = 45, horizontalalignment = 'right')
         else:
             ax.set_xticklabels(['' for i in range(len(sets))])
 
@@ -118,7 +96,7 @@ if __name__ == '__main__':
         _scores = scores[scores['speaker'] == speaker]
         for metric in ['recall', 'precision', 'f']:
             ax.scatter(
-                x = _scores['set'].apply(lambda s: sets.index(s)),
+                x = _scores['set'].apply(lambda s: list(sets.keys()).index(s)),
                 y = _scores[metric],
                 label = labels[metric],
                 s = 15,

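Scoring in the script is delegated to pyannote.metrics' DetectionPrecisionRecallFMeasure. As a plain illustration of what the three reported numbers measure on time segments, here is a duration-based sketch; the helper names are hypothetical, and it assumes segments within each set do not overlap:

    def total(segs):
        # summed duration of (onset, offset) segments, in seconds
        return sum(off - on for on, off in segs)

    def intersection(ref, hyp):
        # total duration over which reference and hypothesis segments overlap
        return sum(
            max(0.0, min(r_off, h_off) - max(r_on, h_on))
            for r_on, r_off in ref
            for h_on, h_off in hyp
        )

    ref = [(0.0, 2.0), (5.0, 7.5)]  # e.g. human 'eaf' CHI segments
    hyp = [(0.5, 2.5), (6.0, 7.0)]  # e.g. 'vtc' CHI segments

    tp = intersection(ref, hyp)
    precision = tp / total(hyp)  # share of detected speech that is correct
    recall = tp / total(ref)     # share of true speech that is detected
    f1 = 2 * precision * recall / (precision + recall)
    print(f"precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")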
main.pdf (+1 −1)

@@ -1 +1 @@
-.git/annex/objects/2M/kf/MD5E-s367487--425211da1dc40679d1b9947d8358e114.pdf/MD5E-s367487--425211da1dc40679d1b9947d8358e114.pdf
+.git/annex/objects/wg/j3/MD5E-s366723--fd2b7a18493d831ad28a3dc950fab4b2.pdf/MD5E-s366723--fd2b7a18493d831ad28a3dc950fab4b2.pdf

main.tex (+3 −3)

@@ -409,7 +409,7 @@ It should be noted that these measures are most useful in the absence of ground
 \centering
 \includegraphics[width=0.8\textwidth]{Fig4.pdf}
 
-\caption{\label{fig:precision}\textbf{Examples of diarization performance evaluation using recall, precision and F1 score}. Audio from the the public VanDam corpus \citep{vandam-day} is annotated according to who-speaks-when, using both the LENA diarizer (its) and the Voice Type Classifier (VTC) by \citet{lavechin2020opensource}. Speech segments are classified among four speaker types: the key child (CHI), other children (OCH), male adults (MAL) and female adults (FEM). For illustration purposes, fake annotations are generated from that of the VTC. Two are computed by randomly assigning the speaker type to 50\% or 75\% (conf) of the VTC's speech segments. Two are computed by dropping 50\% or 75\% of speech segments from the VTC (drop). Recall, precision and F1 score are calculated for each of these annotations, by comparing them to the VTC. The worst F-score for the LENA is reached for OCH segments. Dropping segments does not alter precision; however, as expected, it has a substantially negative impact on recall.
+\caption{\label{fig:precision}\textbf{Examples of diarization performance evaluation using recall, precision and F1 score}. Audio from the public VanDam corpus \citep{vandam-day} is annotated according to who-speaks-when using: the LENA diarizer; the Voice Type Classifier (VTC) by \citet{lavechin2020opensource}; and manual CHAT transcriptions \citep{MacWhinney2000} time-aligned with the Montreal Forced Aligner \citep{mfa} (``cha''). Speech segments are classified among four speaker types: the key child (CHI), other children (OCH), male adults (MAL) and female adults (FEM). Recall, precision and F1 score are calculated for each of these annotations by comparing them to a gold standard of 5 $\times$ 1 minute clips annotated by a human using ELAN (``eaf''; \citealt{wittenburg2006elan}). The clips with the most adult words were targeted.
 }
 
 \end{figure*}
@@ -420,8 +420,8 @@ It should be noted that these measures are most useful in the absence of ground
 \centering
 \includegraphics[width=\textwidth]{Fig5.pdf}
 
-\caption{\label{fig:confusion}\textbf{Example of diarization performance comparison using confusion matrices}
-LENA's annotations (its) of the public VanDam corpus \citep{vandam-day} are compared to the VTC's. The first coefficient of the left side matrix should be read as: ``38\% of CHI segments from the VTC are also labelled as CHI by the LENA's''. The first coefficient of the right side matrix should be read as: ``82\% of the CHI segments of the LENA are also labelled as CHI by the VTC''. The sum of each row of the right-hand plot may exceed one, since the VTC, unlike the LENA, can detect overlapping speakers.
+\caption{\label{fig:confusion}\textbf{Example of diarization performance evaluation using confusion matrices}
+VTC annotations of the public VanDam corpus \citep{vandam-day} are compared to a gold standard manually annotated using ELAN (eaf). The first coefficient of the left-side matrix should be read as: ``43\% of CHI segments from the VTC were also labelled as CHI by the human annotator'' (i.e., the precision). The first coefficient of the right-side matrix should be read as: ``95\% of the portions labelled as CHI speech by the annotator were also labelled as CHI by the VTC'' (i.e., the recall). The sum of each row of the right-hand plot may exceed one due to overlapping speech; the diagonal, however, should ideally contain only ones.
 }
 
 \end{figure*}

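The two captions read the matrix cells as per-class precision and recall. For reference, in notation introduced here (not taken from main.tex): for a speaker class $k$, let $H_k$ be the set of time frames labelled $k$ by the hypothesis (e.g. the VTC) and $R_k$ the set labelled $k$ in the reference (the human eaf annotation). Then:

    \begin{align*}
      \mathrm{precision}_k &= \frac{|H_k \cap R_k|}{|H_k|}, &
      \mathrm{recall}_k &= \frac{|H_k \cap R_k|}{|R_k|}, &
      F_{1,k} &= \frac{2\,\mathrm{precision}_k\,\mathrm{recall}_k}{\mathrm{precision}_k + \mathrm{recall}_k}.
    \end{align*}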
references.bib (+10 −0)

@@ -617,6 +617,16 @@ journal = {}
   url = {http://pyannote.github.io/pyannote-metrics},
 }
 
+@inproceedings{mfa,
+  author={Michael McAuliffe and Michaela Socolof and Sarah Mihuc and Michael Wagner and Morgan Sonderegger},
+  title={Montreal Forced Aligner: Trainable Text-Speech Alignment Using Kaldi},
+  year=2017,
+  booktitle={Proc. Interspeech 2017},
+  pages={498--502},
+  doi={10.21437/Interspeech.2017-1386},
+  url={http://dx.doi.org/10.21437/Interspeech.2017-1386}
+}
+
 @article{Wu2018,
   doi = {10.2196/10046},
   url = {https://doi.org/10.2196/10046},

vandam-data (+1 −1)

@@ -1 +1 @@
-Subproject commit 9c39475ffd287d7ebd40ba37b2bc43159d1a73e3
+Subproject commit 1dcc4650d722cefa231fff9bd89c32bb5e54677b