Browse Source

Round1 updates

Lucas Gautheron 2 years ago
parent
commit
46d751dc86
22 changed files with 278 additions and 60 deletions
  1. 103 0
      Fig1.tex
  2. 1 0
      Fig2.pdf
  3. 0 0
      Fig3a.eps
  4. 1 1
      Fig1a.tex
  5. 0 0
      Fig3b.eps
  6. 1 1
      Fig1b.tex
  7. 0 1
      Fig4.pdf
  8. 0 0
      Fig4.tex
  9. 0 1
      Fig5.pdf
  10. 0 0
      Fig5a.jpg
  11. 0 0
      Fig5b.jpg
  12. 1 0
      Fig6.pdf
  13. 1 0
      Fig7.pdf
  14. 4 4
      Makefile
  15. 10 2
      code/confusion_matrix.py
  16. 10 1
      code/recall.py
  17. 3 3
      code/sample.py
  18. 0 1
      main.pdf
  19. 1 0
      main.pdf
  20. 74 44
      main.tex
  21. 68 0
      references.bib
  22. 0 1
      sample.pdf

+ 103 - 0
Fig1.tex

@@ -0,0 +1,103 @@
+\tikzset{%
+  >={Latex[width=2mm,length=2mm]},
+            base/.style = {rectangle, rounded corners, draw=black,
+                           minimum width=4cm, minimum height=1cm,
+                           text centered, font=\sffamily},
+  recorder/.style = {base, fill=blue!30},
+       meta/.style = {base, fill=red!30},
+    annotator/.style = {base, fill=green!30},
+          txt/.style = {font=\sffamily, text centered},
+         title/.style = {txt, font=\sffamily\Large},
+}
+
+% Drawing part, node distance is 1.5 cm and every node
+% is prefilled with white background
+\begin{tikzpicture}[node distance=5cm,
+    every node/.style={fill=white, font=\sffamily}, align=center]
+  % Specification of nodes (position, etc.)
+%   \node (lena) [recorder] {LENA recorder};
+%   \node (babylogger) [recorder, right of=lena, xshift=5em] {BabyLogger};
+%   \node (others) [recorder, right of=babylogger] {Other alternatives\\
+%   \footnotesize{USB, Olympus...}};
+
+%   \node (lena_software) [classifier, below of=lena, yshift = 5em] {LENA software\\
+%   \footnotesize{Speaker type, Adult Word Count,}\\
+%   \footnotesize{Child Vocalization Count, Conversational Turn Count}};
+  
+%   \node [txt, below of=lena_software, yshift=10em] {\large{LENA commercial environment}};
+
+%   \node (vtc) [classifier, below of=babylogger, xshift=8em, yshift = 7em] {Voice Type Classifier (VTC)\\
+%   \footnotesize {speech detection, speaker type classification}};
+  
+%   \node (alice) [classifier, below of=vtc, yshift = 8em] {Automatic LInguistic Unit Count Estimator (ALICE)\\
+%   \footnotesize {phoneme, syllable and word counts}};
+
+
+%   \node (seshat) [annotator, below of=lena_software, xshift=15em, yshift = 2em]  {Seshat\\
+%   \footnotesize{web-based annotator}\\
+%   \footnotesize{inter-rater reliability}};
+   
+%   \node (zooniverse) [annotator,right of=seshat]  {Zooniverse\\
+%   \footnotesize{crowd-sourced classification tasks}};
+   
+%   \node (elan) [annotator, left of=seshat]  {ELAN\\
+%   \footnotesize{annotation software}};
+   
+%   \node (das) [annotator, below of=elan, yshift = 10em]  {ACLEW DAS\\
+%   \footnotesize{annotation scheme}};
+
+%     \node (recorders) [title,right of=others] {Recording device};
+%     \node (classifiers) [title,below of=recorders, yshift=3em] {Automatic annotation};
+%     \node (annotators) [title,below of=classifiers, yshift=3em] {Manual annotation};
+
+
+     
+%   % Specification of lines between nodes specified above
+%   % with aditional nodes for description 
+%   \draw[->]             (lena) -- (lena_software);
+%   \draw[->]             (vtc) -- (alice);
+%   \draw[->]             (elan) -- (das);
+
+% \draw [draw=black,dashed] ($(lena.north west) + (-2,0.5)$) rectangle ($(lena_software.south east) + (1,-1.5)$);
+
+\node (media) [recorder] {
+\Large \textbf{Media} \normalsize \\
+($\sim 10^2$ to $10^4$ hours) \\
+\framebox{
+    {\begin{varwidth}{\linewidth}\begin{itemize}
+        \item Audio (up to 24 hours \\ per recording)
+        \item Video (up to 30 minutes \\ per recording)
+        \item Accelerometer data (xyz)
+        \item etc.
+    \end{itemize}\end{varwidth}}
+}};
+
+\node (annotations) [annotator,shape=rectangle,draw,right of=media] {
+\Large \textbf{Annotations} \normalsize \\
+($\sim 10^5$ to $10^7$ segments)\\ \\
+            \begin{tabular}{l r}
+             \textbf{manual} & \textbf{automated}\\
+              \multicolumn{2}{c}{who-speaks-when} \\
+              \multicolumn{2}{c}{linguistic units} \\
+              \multicolumn{2}{c}{vocal maturity} \\
+               speech directedness &  \\
+               transcriptions &  \\
+
+            \end{tabular}
+        };
+        
+\node (metadata) at ($(media)!0.5!(annotations)-(0,3)$) [meta,shape=rectangle,draw] {
+\Large \textbf{Metadata} \normalsize \\ \\
+            \framebox{
+    {\begin{varwidth}{\linewidth}\begin{itemize}
+        \item Recordings date and time, type of device, etc.
+        \item Child date of birth, gender, normativity, etc.
+        \item Socio-economic status, location, language(s), household size, etc.
+        \item Questionnaires
+    \end{itemize}\end{varwidth}}
+}};
+        
+
+
+  \end{tikzpicture}
+  

+ 1 - 0
Fig2.pdf

@@ -0,0 +1 @@
+/annex/objects/MD5E-s99210--e2a3942abc0c080a207078724b3bfae2.pdf

Fig1a.eps → Fig3a.eps


+ 1 - 1
Fig1a.tex

@@ -102,7 +102,7 @@
       \put(209,2629){\rotatebox{-270}{\makebox(0,0){\strut{}DataLad versions released}}}%
     }%
     \gplbacktext
-    \put(0,0){\includegraphics[width={360.00bp},height={252.00bp}]{Fig1a}}%
+    \put(0,0){\includegraphics[width={360.00bp},height={252.00bp}]{Fig3a}}%
     \gplfronttext
   \end{picture}%
 \endgroup

Fig1b.eps → Fig3b.eps


+ 1 - 1
Fig1b.tex

@@ -97,7 +97,7 @@
       \put(3940,154){\makebox(0,0){\strut{}Top contributors}}%
     }%
     \gplbacktext
-    \put(0,0){\includegraphics[width={360.00bp},height={252.00bp}]{Fig1b}}%
+    \put(0,0){\includegraphics[width={360.00bp},height={252.00bp}]{Fig3b}}%
     \gplfronttext
   \end{picture}%
 \endgroup

+ 0 - 1
Fig4.pdf

@@ -1 +0,0 @@
-.git/annex/objects/QG/pj/MD5E-s17834--919b6b31222728bc516a01d08a660dde.pdf/MD5E-s17834--919b6b31222728bc516a01d08a660dde.pdf

Fig2.tex → Fig4.tex


+ 0 - 1
Fig5.pdf

@@ -1 +0,0 @@
-.git/annex/objects/3v/pM/MD5E-s15625--bf6d10d87c8366ebb3da2ec266102093.pdf/MD5E-s15625--bf6d10d87c8366ebb3da2ec266102093.pdf

Fig3a.jpg → Fig5a.jpg


Fig3b.jpg → Fig5b.jpg


+ 1 - 0
Fig6.pdf

@@ -0,0 +1 @@
+/annex/objects/MD5E-s34898--09dc2245e5fad57d4cc29955d8aae32a.pdf

+ 1 - 0
Fig7.pdf

@@ -0,0 +1 @@
+/annex/objects/MD5E-s37730--8e4ed85b5df1400785cc187a83145f13.pdf

+ 4 - 4
Makefile

@@ -3,13 +3,13 @@ all: main.pdf
 # This rule is executed last, and renders the full PDF from the manuscript with latexmk.
 # The -g flag is used to *always* process the document, even if no changes have been made to it.
 
-main.pdf: main.tex references.bib Fig4.pdf Fig5.pdf
+main.pdf: main.tex references.bib Fig2.pdf Fig6.pdf Fig7.pdf
 	latexmk -pdf -g $<
 
-Fig4.pdf: code/recall.py scores.csv
+Fig6.pdf: code/recall.py scores.csv
 	code/recall.py vandam-data
 
-Fig5.pdf: code/confusion_matrix.py vandam-data/annotations/eaf/converted/*.csv vandam-data/annotations/vtc/converted/*.csv
+Fig7.pdf: code/confusion_matrix.py vandam-data/annotations/eaf/converted/*.csv vandam-data/annotations/vtc/converted/*.csv
 	code/confusion_matrix.py vandam-data
 
 scores.csv: vandam-data/annotations/its/converted/*.csv vandam-data/annotations/vtc/converted/*.csv vandam-data/annotations/eaf/converted/*.csv vandam-data/annotations/cha/aligned/converted/*.csv
@@ -27,7 +27,7 @@ vandam-data/annotations/cha/aligned/converted/*.csv:
 vandam-data/annotations/eaf/converted/*.csv:
 	datalad get vandam-data/annotations/eaf/converted
 
-sample.pdf: code/sample.py vandam-data/recordings/converted/standard
+Fig2.pdf: code/sample.py vandam-data/recordings/converted/standard
 	python code/sample.py
 
 vandam-data/recordings/converted/standard:

+ 10 - 2
code/confusion_matrix.py

@@ -10,8 +10,16 @@ from sklearn.metrics import confusion_matrix
 from sklearn.preprocessing import normalize
 
 import seaborn as sns
+import matplotlib
 import matplotlib.pyplot as plt
-
+matplotlib.use("pgf")
+matplotlib.rcParams.update({
+    "pgf.texsystem": "pdflatex",
+    'font.family': 'serif',
+    "font.serif" : "Times New Roman",
+    'text.usetex': True,
+    'pgf.rcfonts': False,
+})
 import sys
 
 speakers = ['CHI', 'OCH', 'FEM', 'MAL']
@@ -57,4 +65,4 @@ if __name__ == '__main__':
     axes[1].xaxis.set_ticklabels(speakers)
     axes[1].yaxis.set_ticklabels(speakers)
 
-    plt.savefig('Fig5.pdf', bbox_inches = 'tight')
+    plt.savefig('Fig7.pdf', bbox_inches = 'tight')

+ 10 - 1
code/recall.py

@@ -4,7 +4,16 @@ from ChildProject.projects import ChildProject
 from ChildProject.annotations import AnnotationManager
 from ChildProject.metrics import segments_to_annotation
 
+import matplotlib
 import matplotlib.pyplot as plt
+matplotlib.use("pgf")
+matplotlib.rcParams.update({
+    "pgf.texsystem": "pdflatex",
+    'font.family': 'serif',
+    "font.serif" : "Times New Roman",
+    'text.usetex': True,
+    'pgf.rcfonts': False,
+})
 import numpy as np
 import os
 import pandas as pd
@@ -107,4 +116,4 @@ if __name__ == '__main__':
     ax.legend(loc = "upper right", borderaxespad = 0.1, bbox_to_anchor=(1, 1.25), ncol = 3)
 
     plt.subplots_adjust(wspace = 0.15)
-    plt.savefig('Fig4.pdf', bbox_inches = 'tight')
+    plt.savefig('Fig6.pdf', bbox_inches = 'tight')

+ 3 - 3
code/sample.py

@@ -58,9 +58,9 @@ if __name__ == "__main__":
         1 / sr,
     )
 
-    plt.plot(time, signal, color = 'black')
+    plt.plot(time, .5*signal, color = 'black')
 
-    positions = {"eaf": -0.6, "cha": -0.9, "its": -1.2}
+    positions = {"eaf": -0.4, "cha": -0.6, "its": -0.8}
     annotators = {"its": '\\textbf{LENA}', 'cha': '\\textbf{Annotator 2}\n\\textbf{(CHAT)}', 'eaf': '\\textbf{Annotator 1}\n\\textbf{(ELAN)}'}
     colors = {"MAL": "red", "FEM": "blue", "CHI": "green"}
     speakers = {"MAL": "male adult", "FEM": "female adult", "CHI": "key child"}
@@ -103,4 +103,4 @@ if __name__ == "__main__":
 
 
     plt.axis("off")
-    plt.savefig("sample.pdf", bbox_inches = 'tight')
+    plt.savefig("Fig2.pdf", bbox_inches = 'tight')

+ 0 - 1
main.pdf

@@ -1 +0,0 @@
-.git/annex/objects/wg/j3/MD5E-s366723--fd2b7a18493d831ad28a3dc950fab4b2.pdf/MD5E-s366723--fd2b7a18493d831ad28a3dc950fab4b2.pdf

+ 1 - 0
main.pdf

@@ -0,0 +1 @@
+/annex/objects/MD5E-s470526--97726c8157984ba440543dfc1dfa4c52.pdf

+ 74 - 44
main.tex

@@ -28,6 +28,9 @@
 \newcommand{\inputTikZ}[2]{%  
      \scalebox{#1}{\input{#2}}
 }
+\usepackage{varwidth}
+\usepackage{pgfplots}
+
 \usepackage{subfig}
 \usepackage{epstopdf}
 \usepackage{textcomp}
@@ -57,7 +60,7 @@ Laboratoire de Sciences Cognitives et de Psycholinguistique, Département d'Etud
 \maketitle
 
 \abstract{
-The technique of  long-form recordings via wearables is gaining momentum in different fields of research, notably linguistics and pathology. This technique, however, poses several technical challenges, some of which are amplified by the peculiarities of the data, including their sensitivity and their volume. In this paper, we begin by outlining key problems related to the management, storage, and sharing of the corpora  that emerge when using this technique. We continue by proposing a multi-component solution to these problems, specifically in the case of daylong recordings of children. As part of this solution, we release \emph{ChildProject}, a python package for performing the operations typically required by such datasets and for evaluating the reliability of annotations using a number of measures commonly used in speech processing and linguistics. Our proposal could be generalized to other  populations. 
+The technique of long-form recordings via wearables is gaining momentum in different fields of research, notably linguistics and pathology. This technique, however, poses several technical challenges, some of which are amplified by the peculiarities of the data, including their sensitivity and their volume. In this paper, we begin by outlining key problems related to the management, storage, and sharing of the corpora that emerge when using this technique. We continue by proposing a multi-component solution to these problems, specifically in the case of daylong recordings of children. As part of this solution, we release \emph{ChildProject}, a Python package for performing the operations typically required by such datasets and for evaluating the reliability of annotations using a number of measures commonly used in speech processing and linguistics. This package builds upon an annotation management system that allows the importation of annotations from a wide range of existing formats, as well as data validation procedures to assert the conformity of the data, or, alternatively, produce detailed and explicit error reports. Our proposal could be generalized to populations other than children. 
 }
 
 \keywords{daylong recordings, speech data management, data distribution, annotation evaluation, inter-rater reliability, reproducible research}
@@ -65,17 +68,31 @@ The technique of  long-form recordings via wearables is gaining momentum in diff
 
 \section{Introduction}
 
-Long-form recordings are those collected over extended periods of time, typically via a wearable. Although the technique was used with normotypical adults decades ago \citep{ear1,ear2}, it became widespread in the study of early childhood over the last decade since the publication of a seminal white paper by the LENA Foundation \citep{gilkerson2008power}. The LENA Foundation created a hardware-software combination that illuminated the potential of this technique for theoretical and applied purposes (e.g., \citealt{christakis2009audible,warlaumont2014social}). More recently, long-form data is being discussed in the context of neurological disorders (e.g., \citealt{riad2020vocal}). In this article, we define the unique space of difficulties surrounding long-form recordings, and introduce a set of packages that provides practical solutions, with a focus on child-centered recordings.  We end by discussing ways in which these solutions could be generalized to other populations. In order to demonstrate how our proposal could foster reproducible research on day-long recordings of children, we have released the source of the paper and the code used to build the figures which illustrate the capabilities of our python package in Section \ref{section:application}.
+Long-form recordings are those collected over extended periods of time, typically via a wearable. Although the technique was used with normotypical adults decades ago \citep{ear1,ear2}, it became widespread in the study of early childhood over the last decade since the publication of a seminal white paper by the LENA Foundation \citep{gilkerson2008power}. The LENA Foundation created a hardware-software combination that illuminated the potential of this technique for theoretical and applied purposes (e.g., \citealt{christakis2009audible,warlaumont2014social}). Fig. \ref{fig:data} summarizes which data are typically found in corpora of day-long recordings used for child language acquisition studies, while Fig. \ref{fig:annotations} illustrates annotations drawn from a public corpus.
+More recently, long-form data is also being discussed in the context of neurological disorders (e.g., \citealt{riad2020vocal}). In this article, we define the unique space of difficulties surrounding long-form recordings, and introduce a set of packages that provides practical solutions, with a focus on child-centered recordings. Put briefly, we provide a solution that is compatible with a wide range of annotation and storage approaches through a package that builds on a common standard to integrate functionalities for data processing and continuous validation, and which is combined with extant solutions allowing collaborative work and striking a balance between privacy on the one hand, and reproducibility, findability, and long-term archiving on the other. We end by discussing ways in which these solutions could be generalized to other populations.\footnote{In order to demonstrate how our proposal could foster reproducible research on day-long recordings of children, we have released the source of the paper and the code used to build the figures which illustrate the capabilities of our Python package in Section \ref{section:application}.}
+
+\begin{figure}
+    \centering
+    \input{Fig1.tex}
+    \caption{\textbf{Data typically encountered in corpora of child-centered day-long recordings}. Media files (usually only audio recordings) are annotated by either humans or automated tools. Metadata often contain information about both the subject and his or her environment.}
+    \label{fig:data}
+\end{figure}
+    
+\begin{figure}
+    \centering
+    \includegraphics[width=0.8\linewidth]{Fig2.pdf}
+    \caption{\label{fig:annotations}\textbf{Example of annotations derived from \cite{vandam-day}}. Annotator 1 positioned and labelled segments according to who speaks when, using the ELAN software \citep{wittenburg2006elan}; Annotator 2 transcribed speech using CHAT \citep{MacWhinney2000}; the LENA software \citep{gilkerson2008power} performed voice activity detection, speaker classification and count estimation.}
+\end{figure}
 
 \section{Problem space}\label{section:problemspace}
 
-Management of scientific data is a long-standing issue which has been the subject of substantial progress in the recent years. For instance, FAIR principles (Findability, Accessibility, Interoperability, and Reusability; see \citealt{Wilkinson2016}) have been proposed to help improve the usefulness of data and data analysis pipelines. Similarly, databases implementing these practices have emerged, such as Dataverse \citep{dataverse} and Zenodo \citep{zenodo}. The method of daylong recordings should incorporate such methodological advances. It should be noted, however, that some of the difficulties surrounding the management of corpora of daylong recordings are more idiosyncratic to this technique and therefore require specific solutions. Below, we list some of the challenges that researchers are likely to face while employing long-form recordings in naturalistic environments.
+Management of scientific data is a long-standing issue which has been the subject of substantial progress in recent years. For instance, FAIR principles (Findability, Accessibility, Interoperability, and Reusability; see \citealt{Wilkinson2016}) have been proposed to help improve the usefulness of data and data analysis pipelines. Similarly, databases implementing these practices have emerged, such as Dataverse \citep{dataverse} and Zenodo \citep{zenodo}. Daylong recordings cannot be treated in precisely the same way \citep{Cychosz2020}, and therefore require specific solutions. Below, we list some of the challenges that researchers are likely to face while employing long-form recordings in naturalistic environments.
 
 \subsubsection*{The need for standards}
 
-Extant datasets rely on a wide variety of metadata structures, file formats, and naming conventions. For instance, some data from long-form recordings have been archived publicly on Databrary (such as the ACLEW starter set \citep{starter}) and HomeBank (including the VanDam Daylong corpus from \citealt{vandam-day}). Table \ref{table:datasets} shows some divergence across the two, which is simply the result of researchers working in parallel. As a result of this divergence, however, each lab finds itself re-inventing the wheel. For instance, the HomeBankCode organization \footnote{\url{https://github.com/homebankcode/}} contains at least 4 packages that do more or less the same operations, such as aggregating how much speech was produced in each recording, but implemented in different languages (MatLab,  R, perl, and Python). This divergence may also hide different operationalizations, rendering comparisons across labs fraught, effectively diminishing replicability.\footnote{\textit{Replicability} is typically defined as the effort to re-do a study with a new sample, whereas \textit{reproducibility} relates to re-doing the exact same analyses with the exact same data. Reproducibility is addressed in another section.} 
+Extant datasets rely on a wide variety of metadata structures, file formats, and naming conventions. For instance, some data from long-form recordings have been archived publicly on Databrary (such as the ACLEW starter set \citep{starter}) and HomeBank (including the VanDam Daylong corpus from \citealt{vandam-day}). Table \ref{table:datasets} shows some divergence across the two, which is simply the result of researchers working in parallel. As a result of this divergence, however, each lab finds itself re-inventing the wheel. For instance, the HomeBankCode organization \footnote{\url{https://github.com/homebankcode/}} contains at least 4 packages that do more or less the same operations, such as aggregating how much speech was produced in each recording, but implemented in different languages (MatLab,  R, perl, and Python). This divergence may also hide different operationalizations, rendering comparisons across labs fraught, effectively diminishing replicability.\footnote{\textit{Replicability} is typically defined as the effort to re-do a study with a new sample, whereas \textit{reproducibility} relates to re-doing the exact same analyses with the exact same data. Reproducibility is addressed in another section.} The variety of annotation formats (as illustrated in Fig. \ref{fig:annotations} for instance) has also led to duplication of efforts, as very similar tasks were implemented for one specific format and then later re-developed for another format.
 
-Designing pipelines and analyses that are consistent across datasets requires standards for how the datasets are structured. Although this may represent an initial investment, such standards facilitate the pooling of research efforts, by allowing labs to benefit from code developed in other labs. Additionally, this field operates increasingly via collaborative cross-lab efforts. For instance, the ACLEW project\footnote{\url{sites.google.com/site/aclewdid}} involved nine principal investigators (PIs) from five different countries, who needed a substantive initial investment to agree on a standard organization for the six corpora used in the project. We expect even larger collaborations to emerge in the future, a move that would benefit from standardization, as exemplified by the community that emerged around CHILDES for short-form recordings \citep{macwhinney2000childes}.
+Designing pipelines and analyses that are consistent across datasets requires standards for how the datasets are structured. Although this may represent an initial investment, such standards facilitate the pooling of research efforts, by allowing labs to benefit from code developed in other labs. Additionally, this field operates increasingly via collaborative cross-lab efforts. For instance, the ACLEW project\footnote{\url{https://sites.google.com/view/aclewdid/home}} involved nine principal investigators (PIs) from five different countries, who needed a substantive initial investment to agree on a standard organization for the six corpora used in the project. We expect even larger collaborations to emerge in the future, a move that would benefit from standardization, as exemplified by the community that emerged around CHILDES for short-form recordings \citep{macwhinney2000childes}. We show how building on the standards described in Section \ref{sec:format} allows our proposed python package to accomplish a wide variety of tasks summarized in Section \ref{section:childproject}.
 
 \begin{table}
 \centering
@@ -95,12 +112,11 @@ Metadata                & none           & excel \\ \bottomrule
 
 \subsubsection*{Keeping up with updates and contributions}
 
-Datasets are not frozen. Rather, they are continuously enriched through annotations provided by humans or new algorithms. Human annotations may also undergo corrections as errors are discovered. The process of collecting the recordings may also require a certain amount of time, as they are progressively returned by the field workers or the participants themselves. In the case of longitudinal studies, supplementary audio data may accumulate over several years. Researchers should be able to keep track of these changes while also upgrading their analyses. Moreover, several collaborators may be brought to contribute work to the same dataset simultaneously. To take the example of ACLEW, PIs first annotated a random selection of 2-minute clips for 10 children in-house. They then exchanged some of these audio clips so that the annotators in another lab could re-annotate the same data, for the purposes of inter-rater reliability. This revealed divergences in definitions, and all datasets needed to be revised. Finally, a second sample of 2-minute clips with high levels of speech activity were annotated, and another process of reliability was performed.
+Datasets are not frozen. Rather, they are continuously enriched through annotations provided by humans or new algorithms. Human annotations may also undergo corrections as errors are discovered. The process of collecting the recordings may also require a certain amount of time, as they are progressively returned by the field workers or the participants themselves. In the case of longitudinal studies, supplementary audio data may accumulate over several years. Researchers should be able to keep track of these changes while also upgrading their analyses. Moreover, several collaborators may be brought to contribute work to the same dataset simultaneously. To take the example of ACLEW, PIs first annotated a random selection of 2-minute clips for 10 children in-house. They then exchanged some of these audio clips so that the annotators in another lab could re-annotate the same data, for the purposes of inter-rater reliability. This revealed divergences in definitions, and all datasets needed to be revised. Finally, a second sample of 2-minute clips with high levels of speech activity was annotated, and another process of reliability was performed. We suggest solving these problems through the use of DataLad \citep{datalad_paper}, an extension of git and one of the components of our proposed design, as explained in Section \ref{section:datalad}.
 
 \subsubsection*{Delivering large amounts of data}
 
-Considering typical values for the bit depth and sampling rates of the recordings -- 16 bits and 16 kilohertz respectively -- yields a throughput of approximately three gigabytes per day of audio. Although there is a great deal of variation, past studies often involved at least 30 recording days (e.g., three days for each of ten children). The trend, however, is for datasets to be larger; for instance, last year, we collaborated in the collection of a single dataset, in which 200 children each contributed two recordings. Such datasets may exceed one terabyte. Moreover, these recordings can be associated with annotations spread across thousands of files. In the ACLEW example discussed above, there was one .eaf file per human annotator per type of annotation (i.e., random, high speech, random reliability, high speech reliability). In addition, the full day was analyzed with between one and four automated routines. Thus, for each recording day there were 8 annotation files, leading to 5 corpora $\times$ 10 children $\times$ 8 annotation = 400 annotation files. Other researchers will use one annotation file per clip selected for annotation, which quickly adds up to thousands of files. Even a small processing latency may result in significant overheads while gathering so many files. 
-
+Considering typical values for the bit depth and sampling rates of the recordings -- 16 bits and 16 kilohertz respectively -- yields a throughput of approximately three gigabytes per day of audio. Although there is a great deal of variation, past studies often involved at least 30 recording days (e.g., three days for each of ten children). The trend, however, is for datasets to be larger; for instance, last year, we collaborated in the collection of a single dataset, in which 200 children each contributed two recordings. Such datasets may exceed one terabyte. Moreover, these recordings can be associated with annotations spread across thousands of files. In the ACLEW example discussed above, there was one .eaf file per human annotator per each of four types of annotation (i.e., random, high speech, random reliability, high speech reliability). In addition, the full day was analyzed with between one and four automated routines. Thus, for each recording day there were 8 annotation files, leading to 6 corpora $\times$ 10 children $\times \ (4+4)$  annotations = 480 annotation files. Other researchers will use one annotation file per clip selected for annotation, which quickly adds up to thousands of files\footnote{For most ongoing research projects we know of, there is no central annotation system and instead annotators work in parallel on separate files. Some researchers may prefer to have the "final" version of the annotations in a merged format that represents the "current best guess". For transparency and clarity, however, such merged formats will emerge at a secondary stage, with a first stage represented by independent files including information about the independent listeners' judgments. Our package provides a solution that considers the current practice of working in parallel, but will adapt easily to alternative habits based on merged or collaborative formats.}. 
Even a small processing latency may result in significant overheads while gathering so many files. As a result of these constraints, data access performance is a key aspect of the management of daylong recordings corpora. Our proposal addresses this matter by using DataLad (see Section \ref{section:datalad}), which is specifically designed to handle large files.
 
 
 \subsubsection*{Privacy}
@@ -109,44 +125,44 @@ Long-form recordings are sensitive; they contain identifying and personal inform
 
 However, although  long-form recordings are sensitive, many of the data types derived from them are not. With appropriate file-naming and meta-data practices, it is effectively possible to completely deidentify automated annotations (which at present never include automatic speech recognition). It is also often possible to deidentify human annotations, except when these involve transcribing what participants said, since participants will use personal names and reveal other personal details. Nonetheless, since this particular case involves a human doing the annotation, this human can be trained to modify the record (e.g., replace personal names with foils) and/or tag the annotation as sensitive and not to be openly shared. This is a practice called vetting, and it is one area in which the community working with long-form recordings has started to create standardized procedures, currently available from the HomeBank landing site (\url{homebank.talkbank.org}; e.g., \citealt{vandam2018vetting}).
 
-Therefore, the ideal storing-and-sharing strategy should naturally enforce security and privacy safeguards by implementing access restrictions adapted to the level of confidentiality of the data. Data-access should be doable programmatically, and users should be able to download only the data that they need for their analysis.
+Therefore, the ideal storing-and-sharing strategy should naturally enforce security and privacy safeguards by implementing access restrictions adapted to the level of confidentiality of the data. Data-access should be doable programmatically, and users should be able to download only the data that they need for their analysis. As argued in Sections \ref{section:datalad} and \ref{section:gin}, our proposal can solve this problem by enabling multi-tier data access.
 
 \subsubsection*{Long-term availability}
 
-The collection of long-form recordings requires a considerable level of investment to explain the technique to families and communities, to ensure a secure data management system, and, in the case of remote populations, to access the site. In our experience, one data collection trip to a field site costs about 15 thousand US\$.\footnote{This grossly underestimates overall costs, because the best way to do any kind of field research is through maintaining strong bonds with the community and helping them in other ways throughout the year, not only during our visits (read more about ethical fieldwork on \citealt{broesch2020navigating}). A successful example for this is that of the UNM-UCSB Tsimane' Project (\url{http://tsimane.anth.ucsb.edu/}), which has been collaborating with the Tsimane' population since 2001. They are currently funded by a 5-year, 3-million US\$ NIH grant \url{https://reporter.nih.gov/project-details/9538306}. } These data are precious not only because of the investment that has gone into them, but also because they capture slices of life at a given point in time, which is particularly informative in the case of populations that are experiencing market integration or other forms of societal change -- which today is most or all populations. Moreover, some communities who are collaborating in such research speak languages that are minority languages in the local context, and thus at a potential risk for being lost in the future. The conservation of naturalistic speech samples of children's language acquisition throughout a normal day could be precious for fueling future efforts of language revitalization \citep{Nee2021}. It would therefore be particularly damaging to lose such data prematurely, from  financial,  scientific, and  human standpoints.
+The collection of long-form recordings requires a considerable level of investment to explain the technique to families and communities, to ensure a secure data management system, and, in the case of remote populations, to access the site. In our experience, we have spent up to 15 thousand USD to complete one round of data collection, including the cost of travel.\footnote{This grossly underestimates overall costs, because the best way to do any kind of field research is through maintaining strong bonds with the community and helping them in other ways throughout the year, not only during our visits (read more about ethical fieldwork on \citealt{broesch2020navigating}). A successful example for this is that of the UNM-UCSB Tsimane' Project (\url{http://tsimane.anth.ucsb.edu/}), which has been collaborating with the Tsimane' population since 2001. They are currently funded by a 5-year, 3-million US\$ NIH grant \url{https://reporter.nih.gov/project-details/9538306}. } These data are precious not only because of the investment that has gone into them, but also because they capture slices of life at a given point in time, which is particularly informative in the case of populations that are experiencing market integration or other forms of societal change -- which today is most or all populations. Moreover, some communities who are collaborating in such research speak languages that are minority languages in the local context, and thus at a potential risk for being lost in the future. The conservation of naturalistic speech samples of children's language acquisition throughout a normal day could be precious for fueling future efforts of language revitalization \citep{Nee2021}. It would therefore be particularly damaging to lose such data prematurely, from  financial,  scientific, and  human standpoints.
 
-In addition, one advantage of daylong recordings over other observational methods such as parental reports is that they can be re-exploited at later times to observe behaviors that had not been foreseen at the time of data collection. This implies that their interest partly lies in long-term re-usability.
+In addition, one advantage of daylong recordings over other observational methods such as parental reports is that they can be re-analyzed at later times to observe behaviors that had not been foreseen at the time of data collection. This implies that their interest partly lies in long-term re-usability.
 
-Moreover, even state-of-the-art speech processing tools still perform poorly on daylong recordings, due to their intrinsic noisy nature \citep{casillas2019step}. As a result, taking full advantage of present data will necessitate new or improved computational models, which may take years to develop. For example, the DIHARD Challenge series has been running for three consecutive years, and documents the difficulty of making headway with complex audio data \citep{ryant2018first,ryant2019second,ryant2020third}. For instance, the best submission for speaker diarization in their meeting subcorpus achieved about 35\% Diarization Error Rate in 2018 and 2019, with improvements seen only in 2020, when the best system scored a 20\% Diarization Error Rate (Neville Ryant, personal communication, 2021-04-09). Other tasks are progressing much more slowly. For instance, the best performance in a classifier for deciding whether adult speech was addressed to the child or to an adult scored about 70\% correct in 2017 \citep{schuller2017interspeech} -- but nobody has been able to beat this record since. Recordings should therefore remain available for long periods of time -- potentially decades --, thus increasing the risk for data loss to occur at some point in their lifespan. For these reasons, the reliability of the storage design is critical, and redundancy is most certainly required. Likewise, persistent URLs may be needed in order to ensure the long-term accessibility of the datasets.
+Moreover, even state-of-the-art speech processing tools still perform poorly on daylong recordings, due to their intrinsic noisy nature \citep{casillas2019step}. As a result, taking full advantage of present data will necessitate new or improved computational models, which may take years to develop. For example, the DIHARD Challenge series has been running for three consecutive years, and documents the difficulty of making headway with complex audio data \citep{ryant2018first,ryant2019second,ryant2020third}. For instance, the best submission for speaker diarization in their meeting subcorpus achieved about 35\% Diarization Error Rate in 2018 and 2019, with improvements seen only in 2020, when the best system scored a 20\% Diarization Error Rate (Neville Ryant, personal communication, 2021-04-09). Other tasks are progressing much more slowly. For instance, the best performance in a classifier for deciding whether adult speech was addressed to the child or to an adult scored about 70\% correct in 2017 \citep{schuller2017interspeech} -- but nobody has been able to beat this record since. Recordings should therefore remain available for long periods of time -- potentially decades --, thus increasing the risk for data loss to occur at some point in their lifespan. For these reasons, the reliability of the storage design is critical, and redundancy is most certainly required. Likewise, persistent URLs may be needed in order to ensure the long-term accessibility of the datasets. These are key features of our proposal, as argued in sections \ref{section:datalad} and \ref{section:gin}.
 
 \subsubsection*{Findability}
 
-FAIR Principles include findability and accessibility. A crucial aspect of findability of datasets involves their being indexed in ways that potential re-users can discover them. Although we elaborate on it below, we want to already highlight HomeBank (\url{homebank.talkabank.org}) as one archiving option exists which is specific for long-form recordings, thus making any corpora hosted  there easily discoverable by other researchers using the technique. Also of relevance is Databrary (\url{databrary.org}), an archive specialized on child development, which can thus make the data visible to the developmental science community. However, the current standard practice  is archiving data in either one or another of these repositories, despite the fact that if a copy of the corpus were visible from one of these archives, the dataset would be overall more easily discovered. Additionally, it is uncertain whether these highly re-usable long-form recordings are visible to researchers who are more broadly interested in spoken corpora and/or naturalistic human behavior and/or other topics that could be studied in such data. In fact, one can conceive of a future in which the technique is used with people of different ages, in which case a system that allows users to discover other datasets based on relevant metadata would be ideal. For some research purposes (e.g., trying to stream overlapping voices and noise, technically referred to as "source separation") any recording may be useful, whereas for others (neurodegenerative disorders, early language acquisition) only some ages would. In any case, options exist to allow accessibility once a dataset is archived in one of those databases.
+FAIR Principles include findability and accessibility. A crucial aspect of findability of datasets involves their being indexed in ways that potential re-users can discover them.  We would like to emphasize that findability of daylong recordings, especially those from under-represented populations, is of particular importance. Indeed, although one of the many strengths of such recordings is that they can theoretically be sampled from any environment outside the lab, current corpora are still heavily biased in favor of WEIRD (Western, Educated, Industrialized, Rich, Democratic) populations: \cite{cychosz2021using} report that 81\% of samples (and 82\% of first authors) in papers included in systematic reviews of daylong recordings come from North America (with a further 12\% of samples, and 14\% of authors, based in Europe; see Figure 2). Not only should more data be collected from more diverse populations, but these data should also be at least equally findable and accessible in order to overcome the current representativeness bias. We would also like to stress again that the needs for Privacy and for Findability/Accessibility are not mutually exclusive. Although some of the data are of course sensitive, some of them could be made available to a broad audience without any harm to the privacy of the participants -- for instance, annotations that contain no transcription and parts of the metadata -- as discussed in the \textit{Privacy} section above.
 
-\subsubsection*{Reproducibility}
+Although we elaborate on it below, we want to already highlight HomeBank (\url{homebank.talkbank.org}; part of TalkBank, a recognized CLARIN Knowledge Centre) as one archiving option exists which is specific for long-form recordings, thus making any corpora hosted  there easily discoverable by other researchers using the technique. Also of relevance is Databrary (\url{databrary.org}), an archive specialized on child development, which can thus make the data visible to the developmental science community. However, the current standard practice  is archiving data in either one or another of these repositories, despite the fact that if a copy of the corpus were visible from one of these archives, the dataset would be overall more easily discovered. Additionally, it is uncertain whether these highly re-usable long-form recordings are visible to researchers who are more broadly interested in spoken corpora and/or naturalistic human behavior and/or other topics that could be studied in such data. In fact, one can conceive of a future in which the technique is used with people of different ages, in which case a system that allows users to discover other datasets based on relevant metadata would be ideal. For some research purposes (e.g., trying to stream overlapping voices and noise, technically referred to as "source separation") any recording may be useful, whereas for others (neurodegenerative disorders, early language acquisition) only some ages would. In any case, options exist to allow accessibility once a dataset is archived in one of those databases. We show how our proposed solution can be used to improve the findability of datasets in Sections \ref{section:datalad} and \ref{section:gin}.
 
-Independent verification of results by a third party can be facilitated by improving the \emph{reproducibility} of the analyses, i.e. by providing third-parties with enough data and information to re-derive claimed results. This itself maybe be challenging for a number of reasons, including the variety of software requirements, unclear data dependencies, or insufficiently documented steps. Sharing data sets and analyses is more complex than delivering a collection of static files; all the information that is necessary in order to re-execute any intermediate step of the analysis should also be adequately conveyed.
- 
+\subsubsection*{Reproducibility}
 
+Independent verification of results by a third party can be facilitated by improving the \emph{reproducibility} of the analyses, i.e. by providing third-parties with enough data and information to re-derive claimed results. This itself may be challenging for a number of reasons, including the variety of software requirements, unclear data dependencies, or insufficiently documented steps. Sharing data sets and analyses is more complex than delivering a collection of static files; all the information that is necessary in order to re-execute any intermediate step of the analysis should also be adequately conveyed. DataLad, which is one of the four components of our proposal, is specifically designed to increase reproducibility (see Section \ref{section:datalad}).
 
 \subsubsection*{Current archiving options}
 
-The field of child-centered long-form recordings has benefited from a purpose-built scientific archive from an early stage. HomeBank \cite{vandam2016homebank} builds on the same architecture as CHILDES \cite{MacWhinney2000} and other TalkBank corpora. Although this architecture served the purposes of the language-oriented community well for short recordings, there are numerous issues when using it for long-form recordings. To begin with, curators do not directly control their datasets' contents and structures, and if a curator wants to make a modification, they need to ask the HomeBank management team to make it for them. Similarly, other collaborators who spot errors cannot correct them directly, but again must request changes be made by the HomeBank management team.  Only one type of annotation is innately managed, and that is CHAT \cite{MacWhinney2000}, which is ideal for transcriptions of  recordings. However, transcription is less central to studies of long-form audio.
+The field of child-centered long-form recordings has benefited from a purpose-built scientific archive from an early stage. HomeBank \cite{vandam2016homebank} builds on the same architecture as CHILDES \cite{MacWhinney2000} and other TalkBank corpora. Although this architecture served the purposes of the language-oriented community well for short recordings, there are numerous issues when using it for long-form recordings. To begin with, curators do not directly control their datasets' contents and structures, and if a curator wants to make a modification, they need to ask the HomeBank management team to make it for them. Similarly, other collaborators who spot errors cannot correct them directly, but again must request changes be made by the HomeBank management team.  Only one type of annotation is innately managed, and that is CHAT \cite{MacWhinney2000}, which is ideal for transcriptions of  recordings. However, transcriptions are of a lesser interest in child-centered daylong recordings because the amounts of audio they generate are such that humans would not be able to transcribe them to their full extent, and automatic transcription of such recordings -- which are very noisy -- is out of the reach of present models.
 
 As briefly noted above, Databrary \url{databrary.org} also already hosts some long-form recording data. The aforementioned ACLEW project actually committed to archiving data there, rather than on HomeBank, because it allowed direct control and update (without needing to ask the HomeBank management).  As re-users, one of the most useful features of Databrary is the possibility to search the full archive for data pertaining to children of specific ages or origins. Using this archiving option led us to realize there were some limitations, including the fact that there is no API system, meaning that all updates need to be done via a graphical browser-based interface.
 
-Additional options have been considered by researchers in the community, including OSF \footnote{\url{osf.io}}, and the Language Archive \footnote{\url{https://archive.mpi.nl/tla/}}. Detailing all their features is beyond the scope of the present paper, but some discussion can be found in \cite{casillas2019step}. 
+Additional options have been considered by researchers in the community, including OSF \footnote{\url{osf.io}}, and the Language Archive \footnote{\url{https://archive.mpi.nl/tla/}, which holds a CLARIN certificate B}. Detailing all their features is beyond the scope of the present paper, but some discussion can be found in \cite{casillas2019step}. As a way of explaining why we think they are insufficient solutions, OSF provides very limited storage capacities and requires no structure or metadata, thus does not solve problems of storage or standards. As for the Language Archive, it does not currently have an API for allowing updates of the data, nor automatic tests for its continued integrity.
 
-Without denying their usefulness and importance, none of these archives provides perfect solutions to all of the problems we laid out above -- and notably, in our vision, researchers should not have to choose among them when archiving their data. These limitations have brought us to envision a new strategy for sharing these datasets, which we detail next. 
+Without denying their usefulness and importance, none of these archives provides perfect solutions to all of the problems we laid out above -- and notably, in our vision, researchers should not have to choose among them when archiving their data.   These limitations have brought us to envision a new strategy for sharing these datasets, which is detailed in Sections \ref{section:datalad} and \ref{section:gin}.
 
  \subsubsection*{Our proposal}
  
 We propose a storing-and-sharing method designed to address the challenges outlined above simultaneously. It can be noted that these problems are, in many respects, similar to those faced by researchers in neuroimaging, a field which has long been confronting the need for reproducible analyses on large datasets of potentially sensitive data \citep{Poldrack2014}.
 Their experience may, therefore, provide precious insight for linguists, psychologists, and developmental scientists engaging with the big-data approach of long-form recordings.
-For instance, in the context of neuroimaging, \citet{Gorgolewski2016} have argued in favor of ``machine-readable metadata'', standard file structures and metadata, as well as consistency tests. Similarly, \citet{Eglen2017} have recommended the application of formatting standards, version control, and continuous testing.\footnote{Note that these concepts are all used in the key archiving options we evoked: HomeBank, Databrary, and the Language Archive all have defined metadata and file structures. However, they are {\it different} standards, which cannot be translated to each other, and which have not considered all the features that are relevant for long-form recordings, such as having multiple layers of annotations, with some based on sparse sampling. Additionally, the use of dataset versioning, automated consistency tests, and analyses based on subsumed datasets are less widespread in the language acquisition community.} In the following, we will demonstrate how all of these practices have been implemented in our proposed design.
+For instance, in the context of neuroimaging, \citet{Gorgolewski2016} have argued in favor of ``machine-readable metadata'', standard file structures and metadata, as well as consistency tests. Similarly, \citet{Eglen2017} have recommended the application of formatting standards, version control, and continuous testing. Before moving on, we would like to note that these concepts are all used in the key archiving options we evoked: HomeBank, Databrary, and the Language Archive all have defined metadata and file structures. However, they are {\it different} standards, which cannot be translated to each other, and which have not considered all the features that are relevant for long-form recordings, such as having multiple layers of annotations, with some based on sparse sampling. Additionally, the use of dataset versioning, automated consistency tests, and analyses based on subsumed datasets are less widespread in the language acquisition community. In the following, we will demonstrate how these practices have been implemented in our proposed design.
 Albeit designed for child-centered daylong recordings, we believe our solution could be replicated across a wider range of datasets with constraints similar to those exposed above.
 
-This solution relies on four main components, each of which is conceptually separable from the others: i) a standardized data format optimized for child-centered long-form recordings; ii) ChildProject, a python package to perform basic operations on these datasets; iii) DataLad, ``a decentralized system for integrated discovery, management, and publication of digital objects of science'' \citep{hanke_defense_2021} iv) GIN, a live archiving option for storage and distribution. Our choice for each one of these components can be revisited based on the needs of a project and/or as other options appear. Table \ref{table:components} summarizes which of these components helps address each of the challenges listed in Section \ref{section:problemspace}.
+This solution relies on four main components, each of which is conceptually separable from the others: i) a standardized data format optimized for child-centered long-form recordings; ii) ChildProject, a python package to perform basic operations on these datasets; iii) DataLad, ``a decentralized system for integrated discovery, management, and publication of digital objects of science'' \citep{hanke_defense_2021,datalad_paper} iv) GIN, a live archiving option for storage and distribution. Our choice for each one of these components can be revisited based on the needs of a project and/or as other options appear. Table \ref{table:components} summarizes which of these components helps address each of the challenges listed in Section \ref{section:problemspace}.
 
 \begin{table*}[ht]
 \centering
@@ -170,7 +186,7 @@ The need for standards &
   git-annex &
   \begin{tabular}[t]{@{}l@{}}git-annex compatible;\\ high storage capacity;\\ parallelised operations\end{tabular}
    \\ \midrule
-Ensuring privacy &
+Ensuring privacy & \begin{tabular}[t]{@{}l@{}}Optional metadata\\detection;\end{tabular}
    &
   \begin{tabular}[t]{@{}l@{}}private sub-datasets;\\ private remotes;\\
   path-based\\or metadata-based\\
@@ -210,14 +226,16 @@ Reproducibility &
 
 \begin{figure}[ht]
     \centering
-    \inputTikZ{0.8}{Fig2.tex}
+    \inputTikZ{0.8}{Fig4.tex}
     \caption{\textbf{Structure of a dataset}. Metadata, recordings and annotations each belong to their own folder. Raw recordings (i.e., the audio files as they have been collected, before post-processing) are separated from their post-processed counterparts (in this case: standardized and vetted recordings). Similarly, raw annotations (in this case, LENA's .its annotations) are set apart from the corresponding CSV version.}
     \label{fig:tree}
 \end{figure}
 
 To begin with, we propose a set of proven standards which we use in the LAAC Team \url{https://lscp.dec.ens.fr/en/research/teams-lscp/language-acquisition-across-cultures} and which build on previous experience in several collaborative projects including ACLEW. It must be emphasized, however, that standards should be elaborated collaboratively by the community and that the following is merely a starting point.
 
-Data that are part of the same collection effort are bundled together within one folder\footnote{We believe a reasonable unit of bundling is the collection effort, for instance a single field trip,  a full bout of data collection for a cross-sectional sample, or a set of recordings done more or less at the same time in a longitudinal sample. Given the possibilities of versioning, some users may decide they want to keep all data from a longitudinal sample in the same dataset, adding to it progressively over months and years, to avoid having duplicate children.csv files. That said, given DataLad's system of subdatasets (see Section \ref{section:datalad}), one can always define different datasets, each of which contains the recordings collected in subsequent time periods.}, preferably a DataLad dataset (see Section \ref{section:datalad}). Datasets are packaged according to the structure given in fig. \ref{fig:tree}. The \path{metadata} folder contains at least three dataframes in CSV format: (i) \path{children.csv} contains information about the participants, such as their age or the language(s) they speak. (ii) \path{recordings.csv} contains the metadata for each recording, such as when the recording started, which device was used, or its relative path in the dataset. (iii) \path{annotations.csv} contains information concerning the annotations provided in the dataset, how they were produced, or which range they cover, etc. The dataframes are standardized according to guidelines which set conventional names for the columns and the range of allowed values. The guidelines are enforced through tests which print all the errors and inconsistencies in a dataset implemented in the ChildProject package introduced below.
+Data that are part of the same collection effort are bundled together within one folder\footnote{We believe a reasonable unit of bundling is the collection effort, for instance a single field trip,  a full bout of data collection for a cross-sectional sample, or a set of recordings done more or less at the same time in a longitudinal sample. Given the possibilities of versioning, some users may decide they want to keep all data from a longitudinal sample in the same dataset, adding to it progressively over months and years, to avoid having duplicate children.csv files. That said, given DataLad's system of subdatasets (see Section \ref{section:datalad}), one can always define different datasets, each of which contains the recordings collected in subsequent time periods.}, preferably a DataLad dataset (see Section \ref{section:datalad}). Datasets are packaged according to the structure given in fig. \ref{fig:tree}. The \path{metadata} folder contains at least three dataframes in CSV format: (i) \path{children.csv} contains information about the participants, such as their age or the language(s) they speak. (ii) \path{recordings.csv} contains the metadata for each recording, such as when the recording started, which device was used, or its relative path in the dataset. (iii) \path{annotations.csv} contains information concerning the annotations provided in the dataset, how they were produced, or which range they cover, etc. Metadata that are sensitive (e.g. names) and which should only be shared with a limited audience can be placed into \path{metadata/recordings/} or \path{metadata/children/}; the package will automatically detect their presence and import them if they are available.
+
+The dataframes are standardized according to guidelines which set conventional names for the columns and the range of allowed values. The guidelines are enforced through tests which print all the errors and inconsistencies in a dataset implemented in the ChildProject package introduced below.
 
 The \path{recordings} folder contains two subfolders: \path{raw}, which stores the recordings as delivered by the experimenters, and \path{converted}, which contains processed copies of the recordings. All the audio files in \path{recordings/raw} are indexed in the recordings dataframe. Thus, there is no need for naming conventions for the audio files themselves, and maintainers can decide how they want to organize them.
 
@@ -226,7 +244,9 @@ The \path{annotations} folder contains all sets of annotations. Each set itself
 
 \subsection{ChildProject}\label{section:childproject}
 
-The ChildProject package is a Python 3.6+ package that performs common operations on a dataset of child-centered recordings. It can be used from the command-line or by importing the modules from within Python. Assuming the target datasets are packaged according to the standards summarized in Section \ref{sec:format}, the package supports the functions listed below.
+The ChildProject package is a Python 3.6+ package that performs common operations on a dataset of child-centered recordings. It can be used from the command-line or by importing the modules from within Python. It should be noted that the Python API stores metadata and annotations as Pandas dataframes \citep{pandas-software,pandas-paper}. As a result of relying on such a widely used scientific library, it is not necessary to learn new data types in order to use this package. Moreover, most operations are thus naturally vectorized, which contributes to better performance.
+
+Assuming the target datasets are packaged according to the standards summarized in Section \ref{sec:format}, the package supports the functions listed below.
 
 \subsubsection*{Listing errors and inconsistencies in a dataset}
 
@@ -234,7 +254,9 @@ We provide a validation script that returns a detailed reporting of all the erro
 
 \subsubsection*{Converting and indexing annotations}\label{section:annotations}
 
-The package converts input annotations to standardized, wide-table CSV dataframes. The columns in these wide-table formats have been determined based on previous work, and are largely specific to the goal of studying infants' language environment and production.
+Fig. \ref{fig:annotations} shows that, whatever their format, annotations are always conceptually segments delimited by an onset and an offset timestamp, to which are attached a number of properties such as a speaker's identity or a transcription. Therefore, annotations can almost always be represented as Pandas dataframes, with one row per segment and one column per property.
+
+Taking advantage of this, the package converts input annotations to standardized, wide-table CSV dataframes\footnote{\url{https://childproject.readthedocs.io/en/latest/annotations.html}}. The columns in these wide-table formats have been determined based on previous work, and are largely specific to the goal of studying infants' language environment and production. However, users can introduce custom columns if required.
 
 Annotations are indexed into a unique CSV dataframe which stores their location in the dataset, the set of annotations they belong to, and the recording and time interval they cover. The index, therefore, allows an easy retrieval of all the annotations that cover any given segment of audio, regardless of their original format and the naming conventions that were used. The system interfaces well with extant annotation standards. Currently, ChildProject supports: LENA annotations in .its \citep{xu2008lenatm}; ELAN annotations following the ACLEW DAS template  (\citealt{Casillas2017}, imported using Pympi: \citealt{pympi-1.70}); CHAT annotations \citep{MacWhinney2000}; as well as rttm files outputted by ACLEW tools, namely the Voice Type Classifier (VTC) by \citet{lavechin2020opensource}, the Linguistic Unit Count Estimator (ALICE) by \citet{rasanen2020}, and the VoCalisation Maturity Network (VCMNet) by \citet{AlFutaisi2019}. Users can also adapt routines for file types or conventions that vary. For instance, users can adapt the ELAN import developed for the ACLEW DAS template for their own template (e.g., \url{https://github.com/LAAC-LSCP/ChildProject/discussions/204}); and examples are also provided for Praat's .TextGrid files \citep{boersma2006praat}. The package also supports custom, user-defined conversion routines.
 
@@ -244,24 +266,28 @@ Relying on the annotations index, the package can also calculate the intersectio
 
 As noted in the Introduction, recordings are too extensive to be manually annotated in their entirety. We and colleagues have typically annotated manually clips of 0.5-5 minutes in length, and the way these clips are extracted and annotated varies (as illustrated in Table \ref{table:datasets}).
 
-The package allows the use of predefined or custom sampling algorithms. Samples' timestamps are exported to CSV dataframes. In order to keep track of the sample generating process, input parameters are simultaneously saved into a YAML file. Predefined samplers include a periodic sampler, a sampler targeting specific speakers' vocalizations, a sampler targeting regions of high-volubility according to input annotations, and a more agnostic sampler targeting high-energy regions. In all cases, the user can define the number of regions and their duration, as well as the context that may be inspected by human annotators. These options cover all documented sampling strategies.
+The package allows the use of predefined or custom sampling algorithms\footnote{\url{https://childproject.readthedocs.io/en/latest/samplers.html}}. Samples' timestamps are exported to CSV dataframes. In order to keep track of the sample generating process, input parameters are simultaneously saved into a YAML file. Predefined samplers include a periodic sampler, a sampler targeting specific speakers' vocalizations, a sampler targeting regions of high-volubility according to input annotations, and a more agnostic sampler targeting high-energy regions. In all cases, the user can define the number of regions and their duration, as well as the context that may be inspected by human annotators. These options cover all documented sampling strategies. Evaluations of the performance of some of these samplers can be found in \citet[Chapter 15, ``Human annotation'']{exelang-book}.
 
 \subsubsection*{Generating ELAN files ready to be annotated}
 
-Although there was some variability in terms of the program used for human annotation, the field has now by and large settled on ELAN \citep{wittenburg2006elan}. ELAN employs xml files with a hierarchical structure which are both customizable and flexible. The ChildProject can be used to generate .eaf files which can be annotated with the ELAN software based on samples of the recordings drawn using the package, as described in Section \ref{section:choosing}.
+Although there was some variability in terms of the program used for human annotation, the field has now by and large settled on ELAN \citep{wittenburg2006elan}. ELAN employs xml files with a hierarchical structure which are both customizable and flexible. The ChildProject can be used to generate .eaf files which can be annotated with the ELAN software\footnote{\url{https://childproject.readthedocs.io/en/latest/elan.html}} based on samples of the recordings drawn using the package, as described in Section \ref{section:choosing}.
 
 \subsubsection*{Extracting and uploading audio samples to Zooniverse}
 
-The crowd-sourcing platform Zooniverse \citep{zooniverse} has been extensively employed in both natural \citep{gravityspy} and social sciences. More recently, researchers have been investigating its potential to classify samples of audio extracted from daylong recordings of children and the results have been encouraging  \citep{semenzin2020a,semenzin2020b}. We provide tools interfacing with Zooniverse's API for preparing and uploading audio samples to the platform and for retrieving the results, while protecting the privacy of the participants.
+The crowd-sourcing platform Zooniverse \citep{zooniverse} has been extensively employed in both natural \citep{gravityspy} and social sciences. More recently, researchers have been investigating its potential to classify samples of audio extracted from daylong recordings of children and the results have been encouraging  \citep{semenzin2020a,semenzin2020b}. We provide tools interfacing with Zooniverse's API for preparing and uploading audio samples to the platform and for retrieving the results, while protecting the privacy of the participants\footnote{\url{https://childproject.readthedocs.io/en/latest/zooniverse.html}}. A step-by-step tutorial including re-usable code is also provided \citep{zooniverse_example}.
 
 \subsubsection*{Audio processing}
 
-ChildProject allows the batch-conversion of the recordings to any target audio format (thanks to \citealt{ffmpeg}).
+ChildProject allows the batch-conversion of the recordings to any target audio format (thanks to \citealt{ffmpeg})\footnote{\url{https://childproject.readthedocs.io/en/latest/processors.html}}.
 
 The package also implements a ``vetting'' \citep{vandam2018vetting,Cychosz2020} pipeline, which mutes segments of the recordings previously annotated by humans as confidential while preserving the duration of the audio files. After being processed, the recordings can safely be shared with other researchers or annotators.
 
 Another pipeline makes it possible to perform filtering or linear combinations of audio channels for multi-channel recordings such as those produced with the BabyLogger\footnote{\url{https://docs.babycloudlab.com/}}; if necessary, users can easily design custom audio converters suiting more specific needs.
 
+\subsubsection*{Metrics extraction}
+
+The package includes a pipeline to extract metrics that are commonly used in this research area -- such as the speech rates of each speaker -- by aggregating annotations at the desired level, e.g. per recording or per child\footnote{\url{https://childproject.readthedocs.io/en/latest/metrics.html}}. Metrics can also be aggregated depending on the time of the day, where the width of the time bins is chosen by the user.
+
 \subsubsection*{Other functionalities}
 
 The package offers additional functions such as a pipeline that strips LENA's annotations from data that could be used to identify the participants, built upon previous code by \citet{eaf-anonymizer-original}.
@@ -279,11 +305,11 @@ The present effort is led by one research team, and thus with personnel and fund
 \centering
 \begin{minipage}{.5\linewidth}
 \centering
-\subfloat[]{\label{datalad:a}\resizebox{!}{0.70\linewidth}{\large\input{Fig1a.tex}\normalsize}}
+\subfloat[]{\label{datalad:a}\resizebox{!}{0.70\linewidth}{\large\input{Fig3a.tex}\normalsize}}
 \end{minipage}%
 \begin{minipage}{.5\linewidth}
 \centering
-\subfloat[]{\label{datalad:b}\resizebox{!}{0.70\linewidth}{\large\input{Fig1b.tex}\normalsize}}
+\subfloat[]{\label{datalad:b}\resizebox{!}{0.70\linewidth}{\large\input{Fig3b.tex}\normalsize}}
 \end{minipage}\par\medskip
 
 
@@ -293,7 +319,7 @@ The present effort is led by one research team, and thus with personnel and fund
 
 The combination of standards and the ChildProject package allows us to solve some of the problems laid out in the Introduction, but they do not directly provide solutions to the problems of data sharing and collaborative work. DataLad, however, has been specifically designed to address such needs.
 
-DataLad \citep{datalad_handbook} was initially developed by researchers from the computational neuroscience community for the sharing of neuroimaging datasets. It has been under active development at a steady pace for at least six years (fig. \ref{datalad:a}). It is co-funded by the United States NSF and the German Federal Ministry of Education and Research and has several major code developers (fig. \ref{datalad:b}).% thereby lowering its bus-factor\footnote{\url{https://en.wikipedia.org/wiki/Bus_factor}} :D.
+DataLad \citep{datalad_paper} was initially developed by researchers from the computational neuroscience community for the sharing of neuroimaging datasets. It has been under active development at a steady pace for at least six years (fig. \ref{datalad:a}). It is co-funded by the United States NSF and the German Federal Ministry of Education and Research and has several major code developers (fig. \ref{datalad:b}).% thereby lowering its bus-factor\footnote{\url{https://en.wikipedia.org/wiki/Bus_factor}} :D.
 
 DataLad relies on git-annex, a software designed to manage large files with git. Over the years, git has rapidly overcome competitors such as Subversion, and it has been popularized by platforms such as GitLab and GitHub. However, git does not natively handle large binary files, our recordings included. Git-annex circumvents this issue by only versioning pointers to the large files. The actual content of the files is stored in an ``annex''. Annexes can be stored remotely on a variety of supports, including Amazon Glacier, Amazon S3, Backblaze B2, Box.com, Dropbox, FTP/SFTP, Google Cloud Storage, Google Drive, Internet Archive via S3, Microsoft Azure Blob Storage, Microsoft OneDrive, OpenDrive, OwnCloud, SkyDrive, Usenet, and Yandex Disk.
 
@@ -303,9 +329,9 @@ In using git-annex, DataLad enables users to download only the files that they n
 
 DataLad improves upon git-annex by adding a number of functionalities. One of them, dataset nesting, is built upon git submodules. A DataLad dataset can include sub-datasets, with as many levels of recursion as needed. This provides a natural solution to the question of how to document analyses, as an analysis repository can have the dataset on which it depends embedded as a subdataset. It also provides a good solution for the issue of different levels of data containing more or less identifying information, via the possibility of restricting permissions to different levels of the hierarchy.
 
-Like git, DataLad is a decentralized system, meaning that data can be stored and replicated across several ``remotes''. DataLad authors have argued in favor of decentralized research data management, as it simplifies infrastructure migrations, and helps improve the scalibility of the data storage and distribution design \cite{decentralization_hanke}. Additionally, decentralization is notably useful in that it facilitates redundancy; files can be pushed simultaneously to several storage supports (e.g.: an external hard-drive, a cloud provider), thereby reducing the risk of data loss. In addition to that, when deleting large files from your local repository, DataLad will automatically make sure that more than a certain amount of remotes still own a copy the data, which by default is set to one.
+Like git, DataLad is a decentralized system, meaning that data can be stored and replicated across several ``remotes''. DataLad authors have argued in favor of decentralized research data management, as it simplifies infrastructure migrations, and helps improve the scalability of the data storage and distribution design \cite{decentralization_hanke}. Additionally, decentralization is notably useful in that it facilitates redundancy; files can be pushed simultaneously to several storage supports (e.g.: an external hard-drive, a cloud provider), thereby reducing the risk of data loss. In addition to that, by default, DataLad refuses to delete a local copy of a large file unless a certain number of remotes -- which can be configured by the user -- still own a copy of the data. Of course, a user may still remove every copy of a file by using forced deletion or by setting the minimum number of copies to zero.
 
-Many of the \emph{remotes} supported by DataLad require user-authentication, thus allowing for fine-grained access permissions management, such as Access-Control Lists (ACL). There are at least two ways to implement multiple levels of access within a dataset. One involves using sub-datasets with stricter access requirements. It is also possible to store data across several git-annex remotes with varying access permissions, depending on their sensitivity. Path-based pattern matching rules may configured in order to automatically select which remote the files should be pushed to. More flexible selection rules can be implemented using git-annex metadata, which allows to label files with \texttt{key=value} pairs. For instance, one could tag confidential files as \texttt{confidential=yes} and exclude these from certain remotes (blacklist). Alternatively, some files could be pushed to a certain remote provided they are labelled \texttt{public=yes} (whitelist).
+Many of the \emph{remotes} supported by DataLad require user-authentication, thus allowing for fine-grained access permissions management, such as Access-Control Lists (ACL). There are at least two ways to implement multiple levels of access within a dataset. One involves using sub-datasets with stricter access requirements. It is also possible to store data across several git-annex remotes with varying access permissions, depending on their sensitivity. Path-based pattern matching rules may be configured in order to automatically select which remote the files should be pushed to. More flexible selection rules can be implemented using git-annex metadata, which allows to label files with \texttt{key=value} pairs. For instance, one could tag confidential files as \texttt{confidential=yes} and exclude these from certain remotes (blacklist). Alternatively, some files could be pushed to a certain remote provided they are labelled \texttt{public=yes} (whitelist).
 
 DataLad's metadata\footnote{\url{http://docs.datalad.org/en/stable/metadata.html}} system can extract and aggregate information describing the contents of a collection of datasets. A search function then allows the discovery of datasets based on these metadata. We have developed a DataLad extension to extract meaningful metadata from datasets into DataLad's metadata system \citep{datalad_extension}. This makes it possible, for instance, to search for datasets containing a given language. Moreover, DataLad's metadata can natively incorporate DataCite \citep{brase2009datacite} descriptions into its own metadata.
 
@@ -320,7 +346,7 @@ DataLad does not provide, by itself, the infrastructure to share data. However,
 
 Table \ref{table:providers} sums up the most relevant characteristics of a few providers that are appropriate for our research, although many more could be considered. Datasets can only be cloned from providers that support git, and the large files can only be stored on those that support git-annex. Platforms that only support the former, such as GitHub, should therefore be used in tandem with providers that support the latter, like Amazon S3.
 
-Among criteria of special interest are: the provider's ability to handle complex permissions; how much data it can accept; its ability to assign permanent URLs and identifiers to the datasets; and of course, whether it complies with the legislation regarding privacy. For our purposes, Table \ref{table:providers} suggests GIN is the best option, handling well large files, with highly customizable permissions, and Git-based version control and access (see Appendix \ref{appendix:gin} for a practical use-case of GIN). That said, private projects are limited in space, although at the time of writing this limit can be raised by contributions to the GIN administrators. The next best option may be S3, and some users may prefer S3 when they do not have access to a local cluster, since S3 allows both easy storage and processing. 
+Among criteria of special interest are: the provider's ability to handle complex permissions; how much data it can accept; its ability to assign permanent URLs and identifiers to the datasets; and of course, whether it complies with the legislation regarding privacy. For our purposes, Table \ref{table:providers} suggests GIN is the best option, handling large files well, with highly customizable permissions, and Git-based version control and access (see Appendix \ref{appendix:gin} for a practical use-case of GIN). That said, private projects are limited in space, although at the time of writing this limit can be raised by contributions to the GIN administrators. Moreover, there is no long-term guarantee that GIN will keep operating as it currently does. However, GIN's software is open-source, enabling users to run their own instance, to which they could move their data at any time -- which is very straightforward with DataLad. The next best option may be S3, and some users may prefer S3 when they do not have access to a local cluster, since S3 allows both easy storage and processing.
 
 To render comparison of options easier, detailed examples of storage designs taken from real datasets are listed in Appendix \ref{appendix:examples}. Scripts to implement these strategies can be found on our GitHub and OSF \citep{datalad_procedures}. We also provide a tutorial based on a public corpus \citep{vandam-day} to convert existing data to our standards and then publish it with DataLad\footnote{\url{https://childproject.readthedocs.io/en/latest/vandam.html}}.
 We would like to emphasize that the flexibility of DataLad makes it very easy to migrate from one architecture to another. The underlying infrastructure may change, with little to no impact on the users, and little efforts from the maintainers.
@@ -386,7 +412,7 @@ In real datasets with many recordings and several human and automatic annotators
 \centering
 \subfloat[]{%
 \centering
-  \includegraphics[trim=0 250 100 25, clip, width=0.8\textwidth]{Fig3a.jpg}
+  \includegraphics[trim=0 250 100 25, clip, width=0.8\textwidth]{Fig5a.jpg}
   \label{Annotation:1}%
 }
 
@@ -407,7 +433,7 @@ It should be noted that these measures are most useful in the absence of ground
 \begin{figure*}[htb]
 
 \centering
-\includegraphics[width=0.8\textwidth]{Fig4.pdf}
+\includegraphics[width=0.8\textwidth]{Fig6.pdf}
 
 \caption{\label{fig:precision}\textbf{Examples of diarization performance evaluation using recall, precision and F1 score}. Audio from the public VanDam corpus \citep{vandam-day} is annotated automatically according to who-speaks-when, using: the LENA diarizer; the Voice Type Classifier (VTC) by \citet{lavechin2020opensource}; and manual CHAT transcriptions \citep{MacWhinney2000} adjusted with the Montreal Forced Aligner \citep{mfa} (``cha''). Speech segments are classified among four speaker types: the key child (CHI), other children (OCH), male adults (MAL) and female adults (FEM). Recall, precision and F1 score are calculated for each of these annotations, by comparing them to annotations of 5 $\times$ 1 minute clips annotated by a human annotator using ELAN (``eaf''; \citealt{wittenburg2006elan}). The clips with the most adult words were targeted.
 }
@@ -418,10 +444,10 @@ It should be noted that these measures are most useful in the absence of ground
 \begin{figure*}[htb]
 
 \centering
-\includegraphics[width=\textwidth]{Fig5.pdf}
+\includegraphics[width=\textwidth]{Fig7.pdf}
 
 \caption{\label{fig:confusion}\textbf{Example of diarization performance evaluation using confusion matrices}
-VTC annotations of the public VanDam corpus \citep{vandam-day} are compared to a gold standard manually annotated using ELAN (eaf). The first coefficient of the left side matrix should be read as: ``43\% of CHI segments from the VTC were also labelled as CHI by the human annotator'' (i.e. as the precision). The first coefficient of the right side matrix should be read as: ``95\% of the portions labelled as CHI speech by the annotator were also labelled as CHI by the VTC'' (i.e. as the recall). The sum of each row of the right-hand plot may exceed one due to overlapping speech. However, the diagonal should ideally be only ones.
+VTC annotations of the public VanDam corpus \citep{vandam-day} are compared to a gold standard manually annotated using ELAN (eaf). The first coefficient of the left side matrix should be read as: ``43\% of CHI segments from the VTC were also labelled as CHI by the human annotator'' (i.e. as the precision of the algorithm). The first coefficient of the right side matrix should be read as: ``95\% of the portions labelled as CHI speech by the annotator were also labelled as CHI by the VTC'' (i.e. as the recall of the algorithm). The sum of each row of the right-hand plot may exceed one due to overlapping speech. However, the diagonal should ideally be only ones.
 }
 
 \end{figure*}
@@ -440,10 +466,12 @@ The third use case requires further adaptation, in addition to those just mentio
 \section{Limitations}
 
 DataLad and git-annex are well-documented, and, on the user's end, little knowledge beyond that of git is needed. Maintainers and resource administrators, however, will need a certain level of understanding in order to take full advantage of these tools.
-Recently, \citet{Powell2021} has emphasized the shortcomings of decentralization and the inconveniences of a proliferation of databases with different access protocols. In the future, sharing data could be made even easier if off-the-shelf solutions compatible with DataLad were made readily available to linguists, psychologists, and developmental scientists. To this effect, we especially call for the attention of our colleagues working on linguistic databases. We are pleased to have found a solution on GIN -- but it is possible that GIN administrators agreed to host our data because there is some potential connection with neuroimaging, whereas they may not be able to justify their use of resources for under-resourced languages and/or other projects that bear little connection to neuroimaging.
+Recently, \citet{Powell2021} has emphasized the shortcomings of decentralization and the inconveniences of a proliferation of databases with different access protocols. In the future, sharing data could be made even easier if off-the-shelf solutions compatible with DataLad were made readily available to linguists, psychologists, and developmental scientists. To this effect, we especially call for the attention of our colleagues working on linguistic databases. We are pleased to have found a solution on GIN -- but it is possible that GIN administrators agreed to host our data because there is some potential connection with neuroimaging, whereas they may not be able to justify their use of resources for under-resourced languages and/or other projects that bear little connection to neuroimaging; also, there is no guarantee that the service will last for as long as our corpora may require it.
 
 We should stress again that the use of the ChildProject package does not require the datasets to be managed with DataLad. They do need, however, to follow certain standards. Standards, of course, do not come without their own issues, especially in the present case of a maturing technique. They may be challenged by ever-evolving software, hardware, and practices. However, we believe that the benefits of standardization outweigh its costs provided that it remains reasonably flexible. Such standards will further help to combine efforts from different teams across institutions. More procedures and scripts that solve recurrent tasks can be integrated into the ChildProject package, which might also speed up the development of future tools. 
-One could argue that new proposed standards most usually end up increasing the amount of competing standards instead of bringing consensus. Nonetheless, if one were to eventually impose itself, well-structured datasets would still be easier to adapt than disordered data representations. Meanwhile, we look forward to discussing standards collaboratively with other teams via the GitHub platform, where anyone can create issues for improvements or bugs, submit pull-requests to integrate an improvement they have made, and/or have relevant conversations in the forum.
+One could argue that new proposed standards most usually end up increasing the number of competing standards instead of bringing consensus. Nonetheless, if one were to eventually impose itself, well-structured datasets would still be easier to adapt than disordered data representations. Meanwhile, we look forward to discussing standards collaboratively with other teams via the GitHub platform, where anyone can create issues for improvements or bugs, submit pull-requests to integrate an improvement they have made, and/or have relevant conversations in the forum. Also, it should be emphasized once again that this package does not attempt to enforce specific annotation software or formats, which would be impossible and undesirable, but rather aims at facilitating analyses by incorporating this wide variety of formats into consistent data structures.
+
+Finally, this paper has laid out technical solutions to key issues surrounding long-form recordings, but we have remained silent about how the community can address these while bearing in mind broader ethical issues. For example, speech and language technology has an unfortunate history of focusing on speech and language data over the communities who really own these data \citep{bird2020decolonising}. These ethical and justice issues are crucial and we hope there is more discussion about them in the community using long-form recordings. A case has recently been made about the possibility of building a secure and distributed ``World Behavior Bank'' \citep{cychosz2021using}, whose governance would include representatives of such populations to make sure that re-use considers potential benefits and harms for participants, participants' families (current and future), as well as more broadly their communities (see also discussions surrounding a Heritage Data Reuse Charter, \url{https://www.dariah.eu/activities/open-science/data-re-use/}).
 
 \section{Summary}
 
@@ -473,7 +501,9 @@ This paper does not directly rely on specific data or material.
 \subsubsection*{Code availability}
 
 The present paper can be reproduced from its source, which is hosted on GIN at \url{https://gin.g-node.org/LAAC-LSCP/managing-storing-sharing-paper}.
-The ChildProject package is available on GitHub at \url{https://github.com/LAAC-LSCP/ChildProject}. We provide scripts and templates for DataLad managed datasets at \url{http://doi.org/10.17605/OSF.IO/6VCXK} \citep{datalad_procedures}. We also provide a DataLad extension to extract metadata from corpora of daylong recordings \citep{datalad_extension}.
+The ChildProject package is available on GitHub at \url{https://github.com/LAAC-LSCP/ChildProject}. 
+A step-by-step tutorial to launch annotation campaigns on Zooniverse is published along with the source code at \url{https://doi.gin.g-node.org/10.12751/g-node.k2h9az} \citep{zooniverse_example}.
+We provide scripts and templates for DataLad managed datasets at \url{http://doi.org/10.17605/OSF.IO/6VCXK} \citep{datalad_procedures}. We also provide a DataLad extension to extract metadata from corpora of daylong recordings \citep{datalad_extension}.
 
 \appendix
 
@@ -531,7 +561,7 @@ s3 & Amazon S3  &  recordings; annotations  & Collaborators  & AES-128 \\ \botto
 \caption{\label{table:storage2}Example 2 - Storage strategy example relying on GitHub and Amazon S3.}
 \end{table*}
 
-Amazon is superior to most alternatives for a number of reasons, including that it is highly tested, developed by engineers with a high-level of knowledge of the platform, and widely used. This means that the code is robust even before it is released, and it is widely tested once it is released. The fact that there are many users also entails that issues or questions can be looked up online. In addition, in the context of data durability, Amazon is a good choice because it is too big to fail, and thus probably available for the long-term. In addition, in sheer terms of flexibility and coverage, Amazon provides a whole suite of tools (for data sharing, backups, and processing), which may be useful for researchers with little access to high-capacity infrastructures.
+Amazon is superior to most alternatives for a number of reasons, including that it is highly tested, developed by engineers with a high-level of knowledge of the platform, and widely used. This means that the code is robust even before it is released, and it is widely tested once it is released. The fact that there are many users also entails that issues or questions can be looked up online. In addition, in the context of data durability, Amazon is a good choice because it is ``too big to fail'', and thus probably available for the long-term. Moreover, in sheer terms of flexibility and coverage, Amazon provides a whole suite of tools (for data sharing, backups, and processing), which may be useful for researchers with little access to high-capacity infrastructures. Additionally, it is not very costly (see comparison table on \url{https://childproject.readthedocs.io/en/latest/vandam.html?highlight=amazon#where-to-publish-my-dataset}).
 
 \subsection{Example 3 - sharing large datasets with outside collaborators  and multi-tier access (GIN)}\label{appendix:gin}
 

+ 68 - 0
references.bib

@@ -175,6 +175,21 @@ journal={Interspeech}
 	note={Version 1.70}
 }
 
+@inproceedings{bird2020decolonising,
+  title={Decolonising speech and language technology},
+  author={Bird, Steven},
+  booktitle={Proceedings of the 28th International Conference on Computational Linguistics},
+  pages={3504--3519},
+  year={2020}
+}
+
+@misc{cychosz2021using,
+  title={Using big data from long-form recordings to study development and optimize societal impact},
+  author={Cychosz, Meg and Cristia, Alejandrina},
+  year={2021},
+  howpublished={OSF Preprints}
+}
+
 @article{Cychosz2020,
   doi = {10.3758/s13428-020-01365-9},
   url = {https://doi.org/10.3758/s13428-020-01365-9},
@@ -321,6 +336,27 @@ journal={Interspeech}
 	abstract = {We introduce a set of integrated developments in web application software, networking, data citation standards, and statistical methods designed to put some of the universe of data and data sharing practices on somewhat firmer ground. We have focused on social science data, but aspects of what we have developed may apply more widely. The idea is to facilitate the public distribution of persistent, authorized, and verifiable data, with powerful but easy-to-use technology, even when the data are confidential or proprietary. We intend to solve some of the sociological problems of data sharing via technological means, with the result intended to benefit both the scientific community and the sometimes apparently contradictory goals of individual researchers.},
 	author = {Gary King}
 }
+
+@misc{pandas-software,
+    author       = {{The pandas development team}},
+    title        = {pandas-dev/pandas: Pandas},
+    month        = feb,
+    year         = 2020,
+    publisher    = {Zenodo},
+    version      = {latest},
+    doi          = {10.5281/zenodo.3509134},
+    url          = {https://doi.org/10.5281/zenodo.3509134}
+}
+
+@inproceedings{pandas-paper,
+  author    = {McKinney, Wes},
+  title     = {Data Structures for Statistical Computing in {Python}},
+  booktitle = {Proceedings of the 9th {Python} in Science Conference},
+  pages     = {56--61},
+  year      = {2010},
+  editor    = {van der Walt, St{\'e}fan and Millman, Jarrod},
+  doi       = {10.25080/Majora-92bf1922-00a}
+}
+
 @misc{gilkerson2008power,
   title={The power of talk (LENA Foundation Technical Report LTR-01-2)},
   author={Gilkerson, J and Richards, JA},
@@ -448,6 +484,13 @@ url={https://doi.org/10.1038/nn.4550}
   year = {2015}
 }
 
+@misc{exelang-book,
+  url = {https://laac-lscp.github.io/exelang-book/},
+  author = {Cristia, Alejandrina and Pisani, Sara},
+  title = {Long-form recordings: From A to Z},
+  year = {2021}
+}
+
 @INPROCEEDINGS{zooniverse,
        author = {{Borne}, K.~D. and {Zooniverse Team}},
         title = "{The Zooniverse: A Framework for Knowledge Discovery from Citizen Science Data}",
@@ -495,6 +538,20 @@ url={https://doi.org/10.1038/nn.4550}
   copyright = {Creative Commons Attribution Share Alike 4.0 International}
 }
 
+@article{datalad_paper,
+  doi = {10.21105/joss.03262},
+  url = {https://doi.org/10.21105/joss.03262},
+  year = {2021},
+  month = jul,
+  publisher = {The Open Journal},
+  volume = {6},
+  number = {63},
+  pages = {3262},
+  author = {Yaroslav Halchenko and Kyle Meyer and Benjamin Poldrack and Debanjum Solanky and Adina Wagner and Jason Gors and Dave MacFarlane and Dorian Pustina and Vanessa Sochat and Satrajit Ghosh and Christian M\"{o}nch and Christopher Markiewicz and Laura Waite and Ilya Shlyakhter and Alejandro de la Vega and Soichi Hayashi and Christian H\"{a}usler and Jean-Baptiste Poline and Tobias Kadelka and Kusti Skyt{\'{e}}n and Dorota Jarecka and David Kennedy and Ted Strauss and Matt Cieslak and Peter Vavra and Horea-Ioan Ioanas and Robin Schneider and Mika Pfl\"{u}ger and James Haxby and Simon Eickhoff and Michael Hanke},
+  title = {{DataLad}: distributed system for joint management of code,  data,  and their relationship},
+  journal = {Journal of Open Source Software}
+}
+
 @article{decentralization_hanke,
   doi = {10.1515/nf-2020-0037},
   url = {https://doi.org/10.1515/nf-2020-0037},
@@ -714,6 +771,17 @@ journal = {}
 }
 
 % code
+@misc{zooniverse_example,
+  doi = {10.12751/G-NODE.K2H9AZ},
+  url = {https://doi.gin.g-node.org/10.12751/g-node.k2h9az},
+  author = {Gautheron,  Lucas},
+  keywords = {daylong recordings,  crowd-sourcing,  speech data management,  annotation campaigns},
+  language = {en},
+  title = {Launching a campaign of annotations on Zooniverse with ChildProject},
+  publisher = {G-Node},
+  year = {2021},
+  copyright = {MIT License}
+}
 
 @article{datalad_procedures,
   doi = {10.17605/OSF.IO/6VCXK},

+ 0 - 1
sample.pdf

@@ -1 +0,0 @@
-/annex/objects/MD5E-s100403--baaa8b3c1bfb17b98b9f4804d2248bf7.pdf