
initial commit

Lucas Gautheron, 3 years ago
commit 57350fb10e
24 changed files with 9357 additions and 0 deletions
  1. .gitattributes (+4 -0)
  2. .gitignore (+14 -0)
  3. .gitmodules (+5 -0)
  4. Fig1a.eps (+720 -0)
  5. Fig1a.tex (+108 -0)
  6. Fig1b.eps (+642 -0)
  7. Fig1b.tex (+103 -0)
  8. Fig2.tex (+56 -0)
  9. Fig3a.jpg (BIN)
  10. Fig3b.jpg (BIN)
  11. Fig4.pdf (BIN)
  12. Fig5.pdf (BIN)
  13. Makefile (+28 -0)
  14. code/confusion_matrix.py (+78 -0)
  15. code/recall.py (+132 -0)
  16. main.tex (+603 -0)
  17. references.bib (+704 -0)
  18. requirements.txt (+3 -0)
  19. spbasic.bst (+1658 -0)
  20. spmpsci.bst (+1512 -0)
  21. spphys.bst (+1442 -0)
  22. svglov3.clo (+113 -0)
  23. svjour3.cls (+1431 -0)
  24. vandam-data (+1 -0)

+ 4 - 0
.gitattributes

@@ -0,0 +1,4 @@
+
+* annex.backend=MD5E
+**/.git* annex.largefiles=nothing
+* annex.largefiles=((mimeencoding=binary)and(largerthan=0))
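
These git-annex attributes route content: annexed files use the MD5E key backend, anything matching .git* stays in git proper, and any non-empty file whose MIME encoding is binary is treated as a large file and annexed. A rough, hedged Python illustration of that last rule follows (git-annex evaluates the expression with its own matcher; this sketch only mimics the idea with file(1)):

# Hedged sketch, not part of the commit: approximate the largefiles rule above.
import os
import subprocess

def would_be_annexed(path: str) -> bool:
    # file(1) reports "binary" for non-text content, mirroring mimeencoding=binary
    encoding = subprocess.run(
        ["file", "--brief", "--mime-encoding", path],
        capture_output=True, text=True, check=True
    ).stdout.strip()
    return encoding == "binary" and os.path.getsize(path) > 0

print(would_be_annexed("Fig3a.jpg"))  # expected True: binary content goes to the annex
print(would_be_annexed("main.tex"))   # expected False: text sources stay in git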

+ 14 - 0
.gitignore

@@ -0,0 +1,14 @@
+*.swp
+*.aux
+*.bbl
+*.blg
+*.log
+*.out
+*.fls
+*.tdo
+*.fdb_latexmk
+fglabels
+main.pdf
+example.eps
+img/*eps-converted-to.pdf
+*-stamp

+ 5 - 0
.gitmodules

@@ -0,0 +1,5 @@
+[submodule "vandam-data"]
+	path = vandam-data
+	url = git@gin.g-node.org:/LAAC-LSCP/vandam-data.git
+	datalad-id = 5b235222-b7ba-47ec-951f-73a0bcc59886
+	datalad-url = git@gin.g-node.org:/LAAC-LSCP/vandam-data.git
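
The vandam-data submodule is a DataLad subdataset hosted on GIN; the datalad-id and datalad-url fields make it recognizable to DataLad. A minimal, hedged sketch of obtaining it with DataLad's Python API (the SSH URL is the one declared above and assumes configured GIN access):

# Hedged sketch, not part of the commit: install the subdataset with DataLad's Python API.
import datalad.api as dl

ds = dl.install(source="git@gin.g-node.org:/LAAC-LSCP/vandam-data.git", path="vandam-data")
# The annotation files needed by the analysis are only fetched later,
# by the Makefile's `datalad get` rules (see below).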

+ 720 - 0
Fig1a.eps

@@ -0,0 +1,720 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: versions.tex
+%%Creator: gnuplot 5.4 patchlevel 0
+%%CreationDate: Wed Feb 10 10:43:04 2021
+%%DocumentFonts: 
+%%BoundingBox: 50 50 410 302
+%%EndComments
+%%BeginProlog
+/gnudict 256 dict def
+gnudict begin
+%
+% The following true/false flags may be edited by hand if desired.
+% The unit line width and grayscale image gamma correction may also be changed.
+%
+/Color false def
+/Blacktext true def
+/Solid false def
+/Dashlength 1 def
+/Landscape false def
+/Level1 false def
+/Level3 false def
+/Rounded false def
+/ClipToBoundingBox false def
+/SuppressPDFMark false def
+/TransparentPatterns false def
+/gnulinewidth 5.000 def
+/userlinewidth gnulinewidth def
+/Gamma 1.0 def
+/BackgroundColor {-1.000 -1.000 -1.000} def
+%
+/vshift -73 def
+/dl1 {
+  10.0 Dashlength userlinewidth gnulinewidth div mul mul mul
+  Rounded { currentlinewidth 0.75 mul sub dup 0 le { pop 0.01 } if } if
+} def
+/dl2 {
+  10.0 Dashlength userlinewidth gnulinewidth div mul mul mul
+  Rounded { currentlinewidth 0.75 mul add } if
+} def
+/hpt_ 31.5 def
+/vpt_ 31.5 def
+/hpt hpt_ def
+/vpt vpt_ def
+/doclip {
+  ClipToBoundingBox {
+    newpath 50 50 moveto 410 50 lineto 410 302 lineto 50 302 lineto closepath
+    clip
+  } if
+} def
+%
+% Gnuplot Prolog Version 5.2 (Dec 2017)
+%
+%/SuppressPDFMark true def
+%
+/M {moveto} bind def
+/L {lineto} bind def
+/R {rmoveto} bind def
+/V {rlineto} bind def
+/N {newpath moveto} bind def
+/Z {closepath} bind def
+/C {setrgbcolor} bind def
+/f {rlineto fill} bind def
+/g {setgray} bind def
+/Gshow {show} def   % May be redefined later in the file to support UTF-8
+/vpt2 vpt 2 mul def
+/hpt2 hpt 2 mul def
+/Lshow {currentpoint stroke M 0 vshift R 
+	Blacktext {gsave 0 setgray textshow grestore} {textshow} ifelse} def
+/Rshow {currentpoint stroke M dup stringwidth pop neg vshift R
+	Blacktext {gsave 0 setgray textshow grestore} {textshow} ifelse} def
+/Cshow {currentpoint stroke M dup stringwidth pop -2 div vshift R 
+	Blacktext {gsave 0 setgray textshow grestore} {textshow} ifelse} def
+/UP {dup vpt_ mul /vpt exch def hpt_ mul /hpt exch def
+  /hpt2 hpt 2 mul def /vpt2 vpt 2 mul def} def
+/DL {Color {setrgbcolor Solid {pop []} if 0 setdash}
+ {pop pop pop 0 setgray Solid {pop []} if 0 setdash} ifelse} def
+/BL {stroke userlinewidth 2 mul setlinewidth
+	Rounded {1 setlinejoin 1 setlinecap} if} def
+/AL {stroke userlinewidth 2 div setlinewidth
+	Rounded {1 setlinejoin 1 setlinecap} if} def
+/UL {dup gnulinewidth mul /userlinewidth exch def
+	dup 1 lt {pop 1} if 10 mul /udl exch def} def
+/PL {stroke userlinewidth setlinewidth
+	Rounded {1 setlinejoin 1 setlinecap} if} def
+3.8 setmiterlimit
+% Classic Line colors (version 5.0)
+/LCw {1 1 1} def
+/LCb {0 0 0} def
+/LCa {0 0 0} def
+/LC0 {1 0 0} def
+/LC1 {0 1 0} def
+/LC2 {0 0 1} def
+/LC3 {1 0 1} def
+/LC4 {0 1 1} def
+/LC5 {1 1 0} def
+/LC6 {0 0 0} def
+/LC7 {1 0.3 0} def
+/LC8 {0.5 0.5 0.5} def
+% Default dash patterns (version 5.0)
+/LTB {BL [] LCb DL} def
+/LTw {PL [] 1 setgray} def
+/LTb {PL [] LCb DL} def
+/LTa {AL [1 udl mul 2 udl mul] 0 setdash LCa setrgbcolor} def
+/LT0 {PL [] LC0 DL} def
+/LT1 {PL [2 dl1 3 dl2] LC1 DL} def
+/LT2 {PL [1 dl1 1.5 dl2] LC2 DL} def
+/LT3 {PL [6 dl1 2 dl2 1 dl1 2 dl2] LC3 DL} def
+/LT4 {PL [1 dl1 2 dl2 6 dl1 2 dl2 1 dl1 2 dl2] LC4 DL} def
+/LT5 {PL [4 dl1 2 dl2] LC5 DL} def
+/LT6 {PL [1.5 dl1 1.5 dl2 1.5 dl1 1.5 dl2 1.5 dl1 6 dl2] LC6 DL} def
+/LT7 {PL [3 dl1 3 dl2 1 dl1 3 dl2] LC7 DL} def
+/LT8 {PL [2 dl1 2 dl2 2 dl1 6 dl2] LC8 DL} def
+/SL {[] 0 setdash} def
+/Pnt {stroke [] 0 setdash gsave 1 setlinecap M 0 0 V stroke grestore} def
+/Dia {stroke [] 0 setdash 2 copy vpt add M
+  hpt neg vpt neg V hpt vpt neg V
+  hpt vpt V hpt neg vpt V closepath stroke
+  Pnt} def
+/Pls {stroke [] 0 setdash vpt sub M 0 vpt2 V
+  currentpoint stroke M
+  hpt neg vpt neg R hpt2 0 V stroke
+ } def
+/Box {stroke [] 0 setdash 2 copy exch hpt sub exch vpt add M
+  0 vpt2 neg V hpt2 0 V 0 vpt2 V
+  hpt2 neg 0 V closepath stroke
+  Pnt} def
+/Crs {stroke [] 0 setdash exch hpt sub exch vpt add M
+  hpt2 vpt2 neg V currentpoint stroke M
+  hpt2 neg 0 R hpt2 vpt2 V stroke} def
+/TriU {stroke [] 0 setdash 2 copy vpt 1.12 mul add M
+  hpt neg vpt -1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt 1.62 mul V closepath stroke
+  Pnt} def
+/Star {2 copy Pls Crs} def
+/BoxF {stroke [] 0 setdash exch hpt sub exch vpt add M
+  0 vpt2 neg V hpt2 0 V 0 vpt2 V
+  hpt2 neg 0 V closepath fill} def
+/TriUF {stroke [] 0 setdash vpt 1.12 mul add M
+  hpt neg vpt -1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt 1.62 mul V closepath fill} def
+/TriD {stroke [] 0 setdash 2 copy vpt 1.12 mul sub M
+  hpt neg vpt 1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt -1.62 mul V closepath stroke
+  Pnt} def
+/TriDF {stroke [] 0 setdash vpt 1.12 mul sub M
+  hpt neg vpt 1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt -1.62 mul V closepath fill} def
+/DiaF {stroke [] 0 setdash vpt add M
+  hpt neg vpt neg V hpt vpt neg V
+  hpt vpt V hpt neg vpt V closepath fill} def
+/Pent {stroke [] 0 setdash 2 copy gsave
+  translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+  closepath stroke grestore Pnt} def
+/PentF {stroke [] 0 setdash gsave
+  translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+  closepath fill grestore} def
+/Circle {stroke [] 0 setdash 2 copy
+  hpt 0 360 arc stroke Pnt} def
+/CircleF {stroke [] 0 setdash hpt 0 360 arc fill} def
+/C0 {BL [] 0 setdash 2 copy moveto vpt 90 450 arc} bind def
+/C1 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 90 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C2 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 90 180 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C3 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 180 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C4 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 180 270 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C5 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 90 arc
+	2 copy moveto
+	2 copy vpt 180 270 arc closepath fill
+	vpt 0 360 arc} bind def
+/C6 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 90 270 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C7 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 270 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C8 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 270 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C9 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 270 450 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C10 {BL [] 0 setdash 2 copy 2 copy moveto vpt 270 360 arc closepath fill
+	2 copy moveto
+	2 copy vpt 90 180 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C11 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 180 arc closepath fill
+	2 copy moveto
+	2 copy vpt 270 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C12 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 180 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C13 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 90 arc closepath fill
+	2 copy moveto
+	2 copy vpt 180 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C14 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 90 360 arc closepath fill
+	vpt 0 360 arc} bind def
+/C15 {BL [] 0 setdash 2 copy vpt 0 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/Rec {newpath 4 2 roll moveto 1 index 0 rlineto 0 exch rlineto
+	neg 0 rlineto closepath} bind def
+/Square {dup Rec} bind def
+/Bsquare {vpt sub exch vpt sub exch vpt2 Square} bind def
+/S0 {BL [] 0 setdash 2 copy moveto 0 vpt rlineto BL Bsquare} bind def
+/S1 {BL [] 0 setdash 2 copy vpt Square fill Bsquare} bind def
+/S2 {BL [] 0 setdash 2 copy exch vpt sub exch vpt Square fill Bsquare} bind def
+/S3 {BL [] 0 setdash 2 copy exch vpt sub exch vpt2 vpt Rec fill Bsquare} bind def
+/S4 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt Square fill Bsquare} bind def
+/S5 {BL [] 0 setdash 2 copy 2 copy vpt Square fill
+	exch vpt sub exch vpt sub vpt Square fill Bsquare} bind def
+/S6 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill Bsquare} bind def
+/S7 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill
+	2 copy vpt Square fill Bsquare} bind def
+/S8 {BL [] 0 setdash 2 copy vpt sub vpt Square fill Bsquare} bind def
+/S9 {BL [] 0 setdash 2 copy vpt sub vpt vpt2 Rec fill Bsquare} bind def
+/S10 {BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt Square fill
+	Bsquare} bind def
+/S11 {BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt2 vpt Rec fill
+	Bsquare} bind def
+/S12 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill Bsquare} bind def
+/S13 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+	2 copy vpt Square fill Bsquare} bind def
+/S14 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+	2 copy exch vpt sub exch vpt Square fill Bsquare} bind def
+/S15 {BL [] 0 setdash 2 copy Bsquare fill Bsquare} bind def
+/D0 {gsave translate 45 rotate 0 0 S0 stroke grestore} bind def
+/D1 {gsave translate 45 rotate 0 0 S1 stroke grestore} bind def
+/D2 {gsave translate 45 rotate 0 0 S2 stroke grestore} bind def
+/D3 {gsave translate 45 rotate 0 0 S3 stroke grestore} bind def
+/D4 {gsave translate 45 rotate 0 0 S4 stroke grestore} bind def
+/D5 {gsave translate 45 rotate 0 0 S5 stroke grestore} bind def
+/D6 {gsave translate 45 rotate 0 0 S6 stroke grestore} bind def
+/D7 {gsave translate 45 rotate 0 0 S7 stroke grestore} bind def
+/D8 {gsave translate 45 rotate 0 0 S8 stroke grestore} bind def
+/D9 {gsave translate 45 rotate 0 0 S9 stroke grestore} bind def
+/D10 {gsave translate 45 rotate 0 0 S10 stroke grestore} bind def
+/D11 {gsave translate 45 rotate 0 0 S11 stroke grestore} bind def
+/D12 {gsave translate 45 rotate 0 0 S12 stroke grestore} bind def
+/D13 {gsave translate 45 rotate 0 0 S13 stroke grestore} bind def
+/D14 {gsave translate 45 rotate 0 0 S14 stroke grestore} bind def
+/D15 {gsave translate 45 rotate 0 0 S15 stroke grestore} bind def
+/DiaE {stroke [] 0 setdash vpt add M
+  hpt neg vpt neg V hpt vpt neg V
+  hpt vpt V hpt neg vpt V closepath stroke} def
+/BoxE {stroke [] 0 setdash exch hpt sub exch vpt add M
+  0 vpt2 neg V hpt2 0 V 0 vpt2 V
+  hpt2 neg 0 V closepath stroke} def
+/TriUE {stroke [] 0 setdash vpt 1.12 mul add M
+  hpt neg vpt -1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt 1.62 mul V closepath stroke} def
+/TriDE {stroke [] 0 setdash vpt 1.12 mul sub M
+  hpt neg vpt 1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt -1.62 mul V closepath stroke} def
+/PentE {stroke [] 0 setdash gsave
+  translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+  closepath stroke grestore} def
+/CircE {stroke [] 0 setdash 
+  hpt 0 360 arc stroke} def
+/Opaque {gsave closepath 1 setgray fill grestore 0 setgray closepath} def
+/DiaW {stroke [] 0 setdash vpt add M
+  hpt neg vpt neg V hpt vpt neg V
+  hpt vpt V hpt neg vpt V Opaque stroke} def
+/BoxW {stroke [] 0 setdash exch hpt sub exch vpt add M
+  0 vpt2 neg V hpt2 0 V 0 vpt2 V
+  hpt2 neg 0 V Opaque stroke} def
+/TriUW {stroke [] 0 setdash vpt 1.12 mul add M
+  hpt neg vpt -1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt 1.62 mul V Opaque stroke} def
+/TriDW {stroke [] 0 setdash vpt 1.12 mul sub M
+  hpt neg vpt 1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt -1.62 mul V Opaque stroke} def
+/PentW {stroke [] 0 setdash gsave
+  translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+  Opaque stroke grestore} def
+/CircW {stroke [] 0 setdash 
+  hpt 0 360 arc Opaque stroke} def
+/BoxFill {gsave Rec 1 setgray fill grestore} def
+/Density {
+  /Fillden exch def
+  currentrgbcolor
+  /ColB exch def /ColG exch def /ColR exch def
+  /ColR ColR Fillden mul Fillden sub 1 add def
+  /ColG ColG Fillden mul Fillden sub 1 add def
+  /ColB ColB Fillden mul Fillden sub 1 add def
+  ColR ColG ColB setrgbcolor} def
+/BoxColFill {gsave Rec PolyFill} def
+/PolyFill {gsave Density fill grestore grestore} def
+/h {rlineto rlineto rlineto closepath gsave fill grestore stroke} bind def
+%
+% PostScript Level 1 Pattern Fill routine for rectangles
+% Usage: x y w h s a XX PatternFill
+%	x,y = lower left corner of box to be filled
+%	w,h = width and height of box
+%	  a = angle in degrees between lines and x-axis
+%	 XX = 0/1 for no/yes cross-hatch
+%
+/PatternFill {gsave /PFa [ 9 2 roll ] def
+  PFa 0 get PFa 2 get 2 div add PFa 1 get PFa 3 get 2 div add translate
+  PFa 2 get -2 div PFa 3 get -2 div PFa 2 get PFa 3 get Rec
+  TransparentPatterns {} {gsave 1 setgray fill grestore} ifelse
+  clip
+  currentlinewidth 0.5 mul setlinewidth
+  /PFs PFa 2 get dup mul PFa 3 get dup mul add sqrt def
+  0 0 M PFa 5 get rotate PFs -2 div dup translate
+  0 1 PFs PFa 4 get div 1 add floor cvi
+	{PFa 4 get mul 0 M 0 PFs V} for
+  0 PFa 6 get ne {
+	0 1 PFs PFa 4 get div 1 add floor cvi
+	{PFa 4 get mul 0 2 1 roll M PFs 0 V} for
+ } if
+  stroke grestore} def
+%
+/languagelevel where
+ {pop languagelevel} {1} ifelse
+dup 2 lt
+	{/InterpretLevel1 true def
+	 /InterpretLevel3 false def}
+	{/InterpretLevel1 Level1 def
+	 2 gt
+	    {/InterpretLevel3 Level3 def}
+	    {/InterpretLevel3 false def}
+	 ifelse }
+ ifelse
+%
+% PostScript level 2 pattern fill definitions
+%
+/Level2PatternFill {
+/Tile8x8 {/PaintType 2 /PatternType 1 /TilingType 1 /BBox [0 0 8 8] /XStep 8 /YStep 8}
+	bind def
+/KeepColor {currentrgbcolor [/Pattern /DeviceRGB] setcolorspace} bind def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 0 0 M 8 8 L 0 8 M 8 0 L stroke} 
+>> matrix makepattern
+/Pat1 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 0 0 M 8 8 L 0 8 M 8 0 L stroke
+	0 4 M 4 8 L 8 4 L 4 0 L 0 4 L stroke}
+>> matrix makepattern
+/Pat2 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 0 0 M 0 8 L
+	8 8 L 8 0 L 0 0 L fill}
+>> matrix makepattern
+/Pat3 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop -4 8 M 8 -4 L
+	0 12 M 12 0 L stroke}
+>> matrix makepattern
+/Pat4 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop -4 0 M 8 12 L
+	0 -4 M 12 8 L stroke}
+>> matrix makepattern
+/Pat5 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop -2 8 M 4 -4 L
+	0 12 M 8 -4 L 4 12 M 10 0 L stroke}
+>> matrix makepattern
+/Pat6 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop -2 0 M 4 12 L
+	0 -4 M 8 12 L 4 -4 M 10 8 L stroke}
+>> matrix makepattern
+/Pat7 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 8 -2 M -4 4 L
+	12 0 M -4 8 L 12 4 M 0 10 L stroke}
+>> matrix makepattern
+/Pat8 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 0 -2 M 12 4 L
+	-4 0 M 12 8 L -4 4 M 8 10 L stroke}
+>> matrix makepattern
+/Pat9 exch def
+/Pattern1 {PatternBgnd KeepColor Pat1 setpattern} bind def
+/Pattern2 {PatternBgnd KeepColor Pat2 setpattern} bind def
+/Pattern3 {PatternBgnd KeepColor Pat3 setpattern} bind def
+/Pattern4 {PatternBgnd KeepColor Landscape {Pat5} {Pat4} ifelse setpattern} bind def
+/Pattern5 {PatternBgnd KeepColor Landscape {Pat4} {Pat5} ifelse setpattern} bind def
+/Pattern6 {PatternBgnd KeepColor Landscape {Pat9} {Pat6} ifelse setpattern} bind def
+/Pattern7 {PatternBgnd KeepColor Landscape {Pat8} {Pat7} ifelse setpattern} bind def
+} def
+%
+%
+%End of PostScript Level 2 code
+%
+/PatternBgnd {
+  TransparentPatterns {} {gsave 1 setgray fill grestore} ifelse
+} def
+%
+% Substitute for Level 2 pattern fill codes with
+% grayscale if Level 2 support is not selected.
+%
+/Level1PatternFill {
+/Pattern1 {0.250 Density} bind def
+/Pattern2 {0.500 Density} bind def
+/Pattern3 {0.750 Density} bind def
+/Pattern4 {0.125 Density} bind def
+/Pattern5 {0.375 Density} bind def
+/Pattern6 {0.625 Density} bind def
+/Pattern7 {0.875 Density} bind def
+} def
+%
+% Now test for support of Level 2 code
+%
+Level1 {Level1PatternFill} {Level2PatternFill} ifelse
+%
+/Symbol-Oblique /Symbol findfont [1 0 .167 1 0 0] makefont
+dup length dict begin {1 index /FID eq {pop pop} {def} ifelse} forall
+currentdict end definefont pop
+%
+Level1 SuppressPDFMark or 
+{} {
+/SDict 10 dict def
+systemdict /pdfmark known not {
+  userdict /pdfmark systemdict /cleartomark get put
+} if
+SDict begin [
+  /Title (versions.tex)
+  /Subject (gnuplot plot)
+  /Creator (gnuplot 5.4 patchlevel 0)
+%  /Producer (gnuplot)
+%  /Keywords ()
+  /CreationDate (Wed Feb 10 10:43:04 2021)
+  /DOCINFO pdfmark
+end
+} ifelse
+%
+% Support for boxed text - Ethan A Merritt Sep 2016
+%
+/InitTextBox { userdict /TBy2 3 -1 roll put userdict /TBx2 3 -1 roll put
+           userdict /TBy1 3 -1 roll put userdict /TBx1 3 -1 roll put
+	   /Boxing true def } def
+/ExtendTextBox { dup type /stringtype eq
+    { Boxing { gsave dup false charpath pathbbox
+      dup TBy2 gt {userdict /TBy2 3 -1 roll put} {pop} ifelse
+      dup TBx2 gt {userdict /TBx2 3 -1 roll put} {pop} ifelse
+      dup TBy1 lt {userdict /TBy1 3 -1 roll put} {pop} ifelse
+      dup TBx1 lt {userdict /TBx1 3 -1 roll put} {pop} ifelse
+      grestore } if }
+    {} ifelse} def
+/PopTextBox { newpath TBx1 TBxmargin sub TBy1 TBymargin sub M
+               TBx1 TBxmargin sub TBy2 TBymargin add L
+	       TBx2 TBxmargin add TBy2 TBymargin add L
+	       TBx2 TBxmargin add TBy1 TBymargin sub L closepath } def
+/DrawTextBox { PopTextBox stroke /Boxing false def} def
+/FillTextBox { gsave PopTextBox fill grestore /Boxing false def} def
+0 0 0 0 InitTextBox
+/TBxmargin 20 def
+/TBymargin 20 def
+/Boxing false def
+/textshow { ExtendTextBox Gshow } def
+%
+end
+%%EndProlog
+%%Page: 1 1
+gnudict begin
+gsave
+doclip
+50 50 translate
+0.050 0.050 scale
+0 setgray
+newpath
+1.000 UL
+LTb
+LCb setrgbcolor
+[] 0 setdash
+814 440 M
+63 0 V
+5926 0 R
+-63 0 V
+stroke
+LTb
+814 1170 M
+63 0 V
+5926 0 R
+-63 0 V
+stroke
+814 1900 M
+63 0 V
+5926 0 R
+-63 0 V
+stroke
+814 2630 M
+63 0 V
+5926 0 R
+-63 0 V
+stroke
+814 3359 M
+63 0 V
+5926 0 R
+-63 0 V
+stroke
+814 4089 M
+63 0 V
+5926 0 R
+-63 0 V
+stroke
+814 4819 M
+63 0 V
+5926 0 R
+-63 0 V
+stroke
+814 440 M
+0 63 V
+0 4316 R
+0 -63 V
+stroke
+1061 440 M
+0 31 V
+0 4348 R
+0 -31 V
+1310 440 M
+0 31 V
+0 4348 R
+0 -31 V
+1562 440 M
+0 31 V
+0 4348 R
+0 -31 V
+1811 440 M
+0 63 V
+0 4316 R
+0 -63 V
+stroke
+2061 440 M
+0 31 V
+0 4348 R
+0 -31 V
+2310 440 M
+0 31 V
+0 4348 R
+0 -31 V
+2560 440 M
+0 31 V
+0 4348 R
+0 -31 V
+2812 440 M
+0 63 V
+0 4316 R
+0 -63 V
+stroke
+3058 440 M
+0 31 V
+0 4348 R
+0 -31 V
+3305 440 M
+0 31 V
+0 4348 R
+0 -31 V
+3557 440 M
+0 31 V
+0 4348 R
+0 -31 V
+3809 440 M
+0 63 V
+0 4316 R
+0 -63 V
+stroke
+4053 440 M
+0 31 V
+0 4348 R
+0 -31 V
+4302 440 M
+0 31 V
+0 4348 R
+0 -31 V
+4555 440 M
+0 31 V
+0 4348 R
+0 -31 V
+4804 440 M
+0 63 V
+0 4316 R
+0 -63 V
+stroke
+5051 440 M
+0 31 V
+0 4348 R
+0 -31 V
+5300 440 M
+0 31 V
+0 4348 R
+0 -31 V
+5552 440 M
+0 31 V
+0 4348 R
+0 -31 V
+5801 440 M
+0 63 V
+0 4316 R
+0 -63 V
+stroke
+6051 440 M
+0 31 V
+0 4348 R
+0 -31 V
+6300 440 M
+0 31 V
+0 4348 R
+0 -31 V
+6549 440 M
+0 31 V
+0 4348 R
+0 -31 V
+6802 440 M
+0 63 V
+0 4316 R
+0 -63 V
+stroke
+1.000 UL
+LTb
+814 4819 N
+814 440 L
+5989 0 V
+0 4379 V
+-5989 0 V
+Z stroke
+1.000 UP
+1.000 UL
+LTb
+% Begin plot #1
+2.000 UL
+LTb
+0.00 0.00 0.00 C
+1043 513 M
+80 73 V
+311 73 V
+115 73 V
+46 73 V
+598 73 V
+57 73 V
+28 73 V
+22 73 V
+237 73 V
+22 73 V
+55 73 V
+54 73 V
+355 73 V
+14 73 V
+221 73 V
+30 73 V
+96 73 V
+35 73 V
+104 73 V
+30 73 V
+421 73 V
+35 73 V
+232 73 V
+22 73 V
+60 73 V
+178 73 V
+112 73 V
+93 73 V
+196 73 V
+36 72 V
+120 73 V
+136 73 V
+183 73 V
+104 73 V
+96 73 V
+251 73 V
+11 73 V
+35 73 V
+132 73 V
+8 73 V
+38 73 V
+57 73 V
+80 73 V
+87 73 V
+65 73 V
+66 73 V
+49 73 V
+107 73 V
+65 73 V
+123 73 V
+52 66 V
+stroke
+LTw
+% End plot #1
+2.000 UL
+LTb
+LCb setrgbcolor
+[] 0 setdash
+1.000 UL
+LTb
+814 4819 N
+814 440 L
+5989 0 V
+0 4379 V
+-5989 0 V
+Z stroke
+1.000 UP
+1.000 UL
+LTb
+stroke
+grestore
+end
+showpage
+%%Trailer

+ 108 - 0
Fig1a.tex

@@ -0,0 +1,108 @@
+% GNUPLOT: LaTeX picture with Postscript
+\begingroup
+  \makeatletter
+  \providecommand\color[2][]{%
+    \GenericError{(gnuplot) \space\space\space\@spaces}{%
+      Package color not loaded in conjunction with
+      terminal option `colourtext'%
+    }{See the gnuplot documentation for explanation.%
+    }{Either use 'blacktext' in gnuplot or load the package
+      color.sty in LaTeX.}%
+    \renewcommand\color[2][]{}%
+  }%
+  \providecommand\includegraphics[2][]{%
+    \GenericError{(gnuplot) \space\space\space\@spaces}{%
+      Package graphicx or graphics not loaded%
+    }{See the gnuplot documentation for explanation.%
+    }{The gnuplot epslatex terminal needs graphicx.sty or graphics.sty.}%
+    \renewcommand\includegraphics[2][]{}%
+  }%
+  \providecommand\rotatebox[2]{#2}%
+  \@ifundefined{ifGPcolor}{%
+    \newif\ifGPcolor
+    \GPcolorfalse
+  }{}%
+  \@ifundefined{ifGPblacktext}{%
+    \newif\ifGPblacktext
+    \GPblacktexttrue
+  }{}%
+  % define a \g@addto@macro without @ in the name:
+  \let\gplgaddtomacro\g@addto@macro
+  % define empty templates for all commands taking text:
+  \gdef\gplbacktext{}%
+  \gdef\gplfronttext{}%
+  \makeatother
+  \ifGPblacktext
+    % no textcolor at all
+    \def\colorrgb#1{}%
+    \def\colorgray#1{}%
+  \else
+    % gray or color?
+    \ifGPcolor
+      \def\colorrgb#1{\color[rgb]{#1}}%
+      \def\colorgray#1{\color[gray]{#1}}%
+      \expandafter\def\csname LTw\endcsname{\color{white}}%
+      \expandafter\def\csname LTb\endcsname{\color{black}}%
+      \expandafter\def\csname LTa\endcsname{\color{black}}%
+      \expandafter\def\csname LT0\endcsname{\color[rgb]{1,0,0}}%
+      \expandafter\def\csname LT1\endcsname{\color[rgb]{0,1,0}}%
+      \expandafter\def\csname LT2\endcsname{\color[rgb]{0,0,1}}%
+      \expandafter\def\csname LT3\endcsname{\color[rgb]{1,0,1}}%
+      \expandafter\def\csname LT4\endcsname{\color[rgb]{0,1,1}}%
+      \expandafter\def\csname LT5\endcsname{\color[rgb]{1,1,0}}%
+      \expandafter\def\csname LT6\endcsname{\color[rgb]{0,0,0}}%
+      \expandafter\def\csname LT7\endcsname{\color[rgb]{1,0.3,0}}%
+      \expandafter\def\csname LT8\endcsname{\color[rgb]{0.5,0.5,0.5}}%
+    \else
+      % gray
+      \def\colorrgb#1{\color{black}}%
+      \def\colorgray#1{\color[gray]{#1}}%
+      \expandafter\def\csname LTw\endcsname{\color{white}}%
+      \expandafter\def\csname LTb\endcsname{\color{black}}%
+      \expandafter\def\csname LTa\endcsname{\color{black}}%
+      \expandafter\def\csname LT0\endcsname{\color{black}}%
+      \expandafter\def\csname LT1\endcsname{\color{black}}%
+      \expandafter\def\csname LT2\endcsname{\color{black}}%
+      \expandafter\def\csname LT3\endcsname{\color{black}}%
+      \expandafter\def\csname LT4\endcsname{\color{black}}%
+      \expandafter\def\csname LT5\endcsname{\color{black}}%
+      \expandafter\def\csname LT6\endcsname{\color{black}}%
+      \expandafter\def\csname LT7\endcsname{\color{black}}%
+      \expandafter\def\csname LT8\endcsname{\color{black}}%
+    \fi
+  \fi
+    \setlength{\unitlength}{0.0500bp}%
+    \ifx\gptboxheight\undefined%
+      \newlength{\gptboxheight}%
+      \newlength{\gptboxwidth}%
+      \newsavebox{\gptboxtext}%
+    \fi%
+    \setlength{\fboxrule}{0.5pt}%
+    \setlength{\fboxsep}{1pt}%
+\begin{picture}(7200.00,5040.00)%
+    \gplgaddtomacro\gplbacktext{%
+      \csname LTb\endcsname%%
+      \put(682,440){\makebox(0,0)[r]{\strut{}$0$}}%
+      \put(682,1170){\makebox(0,0)[r]{\strut{}$10$}}%
+      \put(682,1900){\makebox(0,0)[r]{\strut{}$20$}}%
+      \put(682,2630){\makebox(0,0)[r]{\strut{}$30$}}%
+      \put(682,3359){\makebox(0,0)[r]{\strut{}$40$}}%
+      \put(682,4089){\makebox(0,0)[r]{\strut{}$50$}}%
+      \put(682,4819){\makebox(0,0)[r]{\strut{}$60$}}%
+      \put(814,220){\makebox(0,0){\strut{}01/15}}%
+      \put(1811,220){\makebox(0,0){\strut{}01/16}}%
+      \put(2812,220){\makebox(0,0){\strut{}01/17}}%
+      \put(3809,220){\makebox(0,0){\strut{}01/18}}%
+      \put(4804,220){\makebox(0,0){\strut{}01/19}}%
+      \put(5801,220){\makebox(0,0){\strut{}01/20}}%
+      \put(6802,220){\makebox(0,0){\strut{}01/21}}%
+    }%
+    \gplgaddtomacro\gplfronttext{%
+      \csname LTb\endcsname%%
+      \put(209,2629){\rotatebox{-270}{\makebox(0,0){\strut{}DataLad versions released}}}%
+    }%
+    \gplbacktext
+    \put(0,0){\includegraphics[width={360.00bp},height={252.00bp}]{Fig1a}}%
+    \gplfronttext
+  \end{picture}%
+\endgroup

+ 642 - 0
Fig1b.eps

@@ -0,0 +1,642 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: authors.tex
+%%Creator: gnuplot 5.4 patchlevel 0
+%%CreationDate: Tue Feb  9 14:39:20 2021
+%%DocumentFonts: 
+%%BoundingBox: 50 50 410 302
+%%EndComments
+%%BeginProlog
+/gnudict 256 dict def
+gnudict begin
+%
+% The following true/false flags may be edited by hand if desired.
+% The unit line width and grayscale image gamma correction may also be changed.
+%
+/Color false def
+/Blacktext true def
+/Solid false def
+/Dashlength 1 def
+/Landscape false def
+/Level1 false def
+/Level3 false def
+/Rounded false def
+/ClipToBoundingBox false def
+/SuppressPDFMark false def
+/TransparentPatterns false def
+/gnulinewidth 5.000 def
+/userlinewidth gnulinewidth def
+/Gamma 1.0 def
+/BackgroundColor {-1.000 -1.000 -1.000} def
+%
+/vshift -73 def
+/dl1 {
+  10.0 Dashlength userlinewidth gnulinewidth div mul mul mul
+  Rounded { currentlinewidth 0.75 mul sub dup 0 le { pop 0.01 } if } if
+} def
+/dl2 {
+  10.0 Dashlength userlinewidth gnulinewidth div mul mul mul
+  Rounded { currentlinewidth 0.75 mul add } if
+} def
+/hpt_ 31.5 def
+/vpt_ 31.5 def
+/hpt hpt_ def
+/vpt vpt_ def
+/doclip {
+  ClipToBoundingBox {
+    newpath 50 50 moveto 410 50 lineto 410 302 lineto 50 302 lineto closepath
+    clip
+  } if
+} def
+%
+% Gnuplot Prolog Version 5.2 (Dec 2017)
+%
+%/SuppressPDFMark true def
+%
+/M {moveto} bind def
+/L {lineto} bind def
+/R {rmoveto} bind def
+/V {rlineto} bind def
+/N {newpath moveto} bind def
+/Z {closepath} bind def
+/C {setrgbcolor} bind def
+/f {rlineto fill} bind def
+/g {setgray} bind def
+/Gshow {show} def   % May be redefined later in the file to support UTF-8
+/vpt2 vpt 2 mul def
+/hpt2 hpt 2 mul def
+/Lshow {currentpoint stroke M 0 vshift R 
+	Blacktext {gsave 0 setgray textshow grestore} {textshow} ifelse} def
+/Rshow {currentpoint stroke M dup stringwidth pop neg vshift R
+	Blacktext {gsave 0 setgray textshow grestore} {textshow} ifelse} def
+/Cshow {currentpoint stroke M dup stringwidth pop -2 div vshift R 
+	Blacktext {gsave 0 setgray textshow grestore} {textshow} ifelse} def
+/UP {dup vpt_ mul /vpt exch def hpt_ mul /hpt exch def
+  /hpt2 hpt 2 mul def /vpt2 vpt 2 mul def} def
+/DL {Color {setrgbcolor Solid {pop []} if 0 setdash}
+ {pop pop pop 0 setgray Solid {pop []} if 0 setdash} ifelse} def
+/BL {stroke userlinewidth 2 mul setlinewidth
+	Rounded {1 setlinejoin 1 setlinecap} if} def
+/AL {stroke userlinewidth 2 div setlinewidth
+	Rounded {1 setlinejoin 1 setlinecap} if} def
+/UL {dup gnulinewidth mul /userlinewidth exch def
+	dup 1 lt {pop 1} if 10 mul /udl exch def} def
+/PL {stroke userlinewidth setlinewidth
+	Rounded {1 setlinejoin 1 setlinecap} if} def
+3.8 setmiterlimit
+% Classic Line colors (version 5.0)
+/LCw {1 1 1} def
+/LCb {0 0 0} def
+/LCa {0 0 0} def
+/LC0 {1 0 0} def
+/LC1 {0 1 0} def
+/LC2 {0 0 1} def
+/LC3 {1 0 1} def
+/LC4 {0 1 1} def
+/LC5 {1 1 0} def
+/LC6 {0 0 0} def
+/LC7 {1 0.3 0} def
+/LC8 {0.5 0.5 0.5} def
+% Default dash patterns (version 5.0)
+/LTB {BL [] LCb DL} def
+/LTw {PL [] 1 setgray} def
+/LTb {PL [] LCb DL} def
+/LTa {AL [1 udl mul 2 udl mul] 0 setdash LCa setrgbcolor} def
+/LT0 {PL [] LC0 DL} def
+/LT1 {PL [2 dl1 3 dl2] LC1 DL} def
+/LT2 {PL [1 dl1 1.5 dl2] LC2 DL} def
+/LT3 {PL [6 dl1 2 dl2 1 dl1 2 dl2] LC3 DL} def
+/LT4 {PL [1 dl1 2 dl2 6 dl1 2 dl2 1 dl1 2 dl2] LC4 DL} def
+/LT5 {PL [4 dl1 2 dl2] LC5 DL} def
+/LT6 {PL [1.5 dl1 1.5 dl2 1.5 dl1 1.5 dl2 1.5 dl1 6 dl2] LC6 DL} def
+/LT7 {PL [3 dl1 3 dl2 1 dl1 3 dl2] LC7 DL} def
+/LT8 {PL [2 dl1 2 dl2 2 dl1 6 dl2] LC8 DL} def
+/SL {[] 0 setdash} def
+/Pnt {stroke [] 0 setdash gsave 1 setlinecap M 0 0 V stroke grestore} def
+/Dia {stroke [] 0 setdash 2 copy vpt add M
+  hpt neg vpt neg V hpt vpt neg V
+  hpt vpt V hpt neg vpt V closepath stroke
+  Pnt} def
+/Pls {stroke [] 0 setdash vpt sub M 0 vpt2 V
+  currentpoint stroke M
+  hpt neg vpt neg R hpt2 0 V stroke
+ } def
+/Box {stroke [] 0 setdash 2 copy exch hpt sub exch vpt add M
+  0 vpt2 neg V hpt2 0 V 0 vpt2 V
+  hpt2 neg 0 V closepath stroke
+  Pnt} def
+/Crs {stroke [] 0 setdash exch hpt sub exch vpt add M
+  hpt2 vpt2 neg V currentpoint stroke M
+  hpt2 neg 0 R hpt2 vpt2 V stroke} def
+/TriU {stroke [] 0 setdash 2 copy vpt 1.12 mul add M
+  hpt neg vpt -1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt 1.62 mul V closepath stroke
+  Pnt} def
+/Star {2 copy Pls Crs} def
+/BoxF {stroke [] 0 setdash exch hpt sub exch vpt add M
+  0 vpt2 neg V hpt2 0 V 0 vpt2 V
+  hpt2 neg 0 V closepath fill} def
+/TriUF {stroke [] 0 setdash vpt 1.12 mul add M
+  hpt neg vpt -1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt 1.62 mul V closepath fill} def
+/TriD {stroke [] 0 setdash 2 copy vpt 1.12 mul sub M
+  hpt neg vpt 1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt -1.62 mul V closepath stroke
+  Pnt} def
+/TriDF {stroke [] 0 setdash vpt 1.12 mul sub M
+  hpt neg vpt 1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt -1.62 mul V closepath fill} def
+/DiaF {stroke [] 0 setdash vpt add M
+  hpt neg vpt neg V hpt vpt neg V
+  hpt vpt V hpt neg vpt V closepath fill} def
+/Pent {stroke [] 0 setdash 2 copy gsave
+  translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+  closepath stroke grestore Pnt} def
+/PentF {stroke [] 0 setdash gsave
+  translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+  closepath fill grestore} def
+/Circle {stroke [] 0 setdash 2 copy
+  hpt 0 360 arc stroke Pnt} def
+/CircleF {stroke [] 0 setdash hpt 0 360 arc fill} def
+/C0 {BL [] 0 setdash 2 copy moveto vpt 90 450 arc} bind def
+/C1 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 90 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C2 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 90 180 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C3 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 180 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C4 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 180 270 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C5 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 90 arc
+	2 copy moveto
+	2 copy vpt 180 270 arc closepath fill
+	vpt 0 360 arc} bind def
+/C6 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 90 270 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C7 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 270 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C8 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 270 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C9 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 270 450 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C10 {BL [] 0 setdash 2 copy 2 copy moveto vpt 270 360 arc closepath fill
+	2 copy moveto
+	2 copy vpt 90 180 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C11 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 180 arc closepath fill
+	2 copy moveto
+	2 copy vpt 270 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C12 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 180 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C13 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 0 90 arc closepath fill
+	2 copy moveto
+	2 copy vpt 180 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/C14 {BL [] 0 setdash 2 copy moveto
+	2 copy vpt 90 360 arc closepath fill
+	vpt 0 360 arc} bind def
+/C15 {BL [] 0 setdash 2 copy vpt 0 360 arc closepath fill
+	vpt 0 360 arc closepath} bind def
+/Rec {newpath 4 2 roll moveto 1 index 0 rlineto 0 exch rlineto
+	neg 0 rlineto closepath} bind def
+/Square {dup Rec} bind def
+/Bsquare {vpt sub exch vpt sub exch vpt2 Square} bind def
+/S0 {BL [] 0 setdash 2 copy moveto 0 vpt rlineto BL Bsquare} bind def
+/S1 {BL [] 0 setdash 2 copy vpt Square fill Bsquare} bind def
+/S2 {BL [] 0 setdash 2 copy exch vpt sub exch vpt Square fill Bsquare} bind def
+/S3 {BL [] 0 setdash 2 copy exch vpt sub exch vpt2 vpt Rec fill Bsquare} bind def
+/S4 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt Square fill Bsquare} bind def
+/S5 {BL [] 0 setdash 2 copy 2 copy vpt Square fill
+	exch vpt sub exch vpt sub vpt Square fill Bsquare} bind def
+/S6 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill Bsquare} bind def
+/S7 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill
+	2 copy vpt Square fill Bsquare} bind def
+/S8 {BL [] 0 setdash 2 copy vpt sub vpt Square fill Bsquare} bind def
+/S9 {BL [] 0 setdash 2 copy vpt sub vpt vpt2 Rec fill Bsquare} bind def
+/S10 {BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt Square fill
+	Bsquare} bind def
+/S11 {BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt2 vpt Rec fill
+	Bsquare} bind def
+/S12 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill Bsquare} bind def
+/S13 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+	2 copy vpt Square fill Bsquare} bind def
+/S14 {BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+	2 copy exch vpt sub exch vpt Square fill Bsquare} bind def
+/S15 {BL [] 0 setdash 2 copy Bsquare fill Bsquare} bind def
+/D0 {gsave translate 45 rotate 0 0 S0 stroke grestore} bind def
+/D1 {gsave translate 45 rotate 0 0 S1 stroke grestore} bind def
+/D2 {gsave translate 45 rotate 0 0 S2 stroke grestore} bind def
+/D3 {gsave translate 45 rotate 0 0 S3 stroke grestore} bind def
+/D4 {gsave translate 45 rotate 0 0 S4 stroke grestore} bind def
+/D5 {gsave translate 45 rotate 0 0 S5 stroke grestore} bind def
+/D6 {gsave translate 45 rotate 0 0 S6 stroke grestore} bind def
+/D7 {gsave translate 45 rotate 0 0 S7 stroke grestore} bind def
+/D8 {gsave translate 45 rotate 0 0 S8 stroke grestore} bind def
+/D9 {gsave translate 45 rotate 0 0 S9 stroke grestore} bind def
+/D10 {gsave translate 45 rotate 0 0 S10 stroke grestore} bind def
+/D11 {gsave translate 45 rotate 0 0 S11 stroke grestore} bind def
+/D12 {gsave translate 45 rotate 0 0 S12 stroke grestore} bind def
+/D13 {gsave translate 45 rotate 0 0 S13 stroke grestore} bind def
+/D14 {gsave translate 45 rotate 0 0 S14 stroke grestore} bind def
+/D15 {gsave translate 45 rotate 0 0 S15 stroke grestore} bind def
+/DiaE {stroke [] 0 setdash vpt add M
+  hpt neg vpt neg V hpt vpt neg V
+  hpt vpt V hpt neg vpt V closepath stroke} def
+/BoxE {stroke [] 0 setdash exch hpt sub exch vpt add M
+  0 vpt2 neg V hpt2 0 V 0 vpt2 V
+  hpt2 neg 0 V closepath stroke} def
+/TriUE {stroke [] 0 setdash vpt 1.12 mul add M
+  hpt neg vpt -1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt 1.62 mul V closepath stroke} def
+/TriDE {stroke [] 0 setdash vpt 1.12 mul sub M
+  hpt neg vpt 1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt -1.62 mul V closepath stroke} def
+/PentE {stroke [] 0 setdash gsave
+  translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+  closepath stroke grestore} def
+/CircE {stroke [] 0 setdash 
+  hpt 0 360 arc stroke} def
+/Opaque {gsave closepath 1 setgray fill grestore 0 setgray closepath} def
+/DiaW {stroke [] 0 setdash vpt add M
+  hpt neg vpt neg V hpt vpt neg V
+  hpt vpt V hpt neg vpt V Opaque stroke} def
+/BoxW {stroke [] 0 setdash exch hpt sub exch vpt add M
+  0 vpt2 neg V hpt2 0 V 0 vpt2 V
+  hpt2 neg 0 V Opaque stroke} def
+/TriUW {stroke [] 0 setdash vpt 1.12 mul add M
+  hpt neg vpt -1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt 1.62 mul V Opaque stroke} def
+/TriDW {stroke [] 0 setdash vpt 1.12 mul sub M
+  hpt neg vpt 1.62 mul V
+  hpt 2 mul 0 V
+  hpt neg vpt -1.62 mul V Opaque stroke} def
+/PentW {stroke [] 0 setdash gsave
+  translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+  Opaque stroke grestore} def
+/CircW {stroke [] 0 setdash 
+  hpt 0 360 arc Opaque stroke} def
+/BoxFill {gsave Rec 1 setgray fill grestore} def
+/Density {
+  /Fillden exch def
+  currentrgbcolor
+  /ColB exch def /ColG exch def /ColR exch def
+  /ColR ColR Fillden mul Fillden sub 1 add def
+  /ColG ColG Fillden mul Fillden sub 1 add def
+  /ColB ColB Fillden mul Fillden sub 1 add def
+  ColR ColG ColB setrgbcolor} def
+/BoxColFill {gsave Rec PolyFill} def
+/PolyFill {gsave Density fill grestore grestore} def
+/h {rlineto rlineto rlineto closepath gsave fill grestore stroke} bind def
+%
+% PostScript Level 1 Pattern Fill routine for rectangles
+% Usage: x y w h s a XX PatternFill
+%	x,y = lower left corner of box to be filled
+%	w,h = width and height of box
+%	  a = angle in degrees between lines and x-axis
+%	 XX = 0/1 for no/yes cross-hatch
+%
+/PatternFill {gsave /PFa [ 9 2 roll ] def
+  PFa 0 get PFa 2 get 2 div add PFa 1 get PFa 3 get 2 div add translate
+  PFa 2 get -2 div PFa 3 get -2 div PFa 2 get PFa 3 get Rec
+  TransparentPatterns {} {gsave 1 setgray fill grestore} ifelse
+  clip
+  currentlinewidth 0.5 mul setlinewidth
+  /PFs PFa 2 get dup mul PFa 3 get dup mul add sqrt def
+  0 0 M PFa 5 get rotate PFs -2 div dup translate
+  0 1 PFs PFa 4 get div 1 add floor cvi
+	{PFa 4 get mul 0 M 0 PFs V} for
+  0 PFa 6 get ne {
+	0 1 PFs PFa 4 get div 1 add floor cvi
+	{PFa 4 get mul 0 2 1 roll M PFs 0 V} for
+ } if
+  stroke grestore} def
+%
+/languagelevel where
+ {pop languagelevel} {1} ifelse
+dup 2 lt
+	{/InterpretLevel1 true def
+	 /InterpretLevel3 false def}
+	{/InterpretLevel1 Level1 def
+	 2 gt
+	    {/InterpretLevel3 Level3 def}
+	    {/InterpretLevel3 false def}
+	 ifelse }
+ ifelse
+%
+% PostScript level 2 pattern fill definitions
+%
+/Level2PatternFill {
+/Tile8x8 {/PaintType 2 /PatternType 1 /TilingType 1 /BBox [0 0 8 8] /XStep 8 /YStep 8}
+	bind def
+/KeepColor {currentrgbcolor [/Pattern /DeviceRGB] setcolorspace} bind def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 0 0 M 8 8 L 0 8 M 8 0 L stroke} 
+>> matrix makepattern
+/Pat1 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 0 0 M 8 8 L 0 8 M 8 0 L stroke
+	0 4 M 4 8 L 8 4 L 4 0 L 0 4 L stroke}
+>> matrix makepattern
+/Pat2 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 0 0 M 0 8 L
+	8 8 L 8 0 L 0 0 L fill}
+>> matrix makepattern
+/Pat3 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop -4 8 M 8 -4 L
+	0 12 M 12 0 L stroke}
+>> matrix makepattern
+/Pat4 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop -4 0 M 8 12 L
+	0 -4 M 12 8 L stroke}
+>> matrix makepattern
+/Pat5 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop -2 8 M 4 -4 L
+	0 12 M 8 -4 L 4 12 M 10 0 L stroke}
+>> matrix makepattern
+/Pat6 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop -2 0 M 4 12 L
+	0 -4 M 8 12 L 4 -4 M 10 8 L stroke}
+>> matrix makepattern
+/Pat7 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 8 -2 M -4 4 L
+	12 0 M -4 8 L 12 4 M 0 10 L stroke}
+>> matrix makepattern
+/Pat8 exch def
+<< Tile8x8
+ /PaintProc {0.5 setlinewidth pop 0 -2 M 12 4 L
+	-4 0 M 12 8 L -4 4 M 8 10 L stroke}
+>> matrix makepattern
+/Pat9 exch def
+/Pattern1 {PatternBgnd KeepColor Pat1 setpattern} bind def
+/Pattern2 {PatternBgnd KeepColor Pat2 setpattern} bind def
+/Pattern3 {PatternBgnd KeepColor Pat3 setpattern} bind def
+/Pattern4 {PatternBgnd KeepColor Landscape {Pat5} {Pat4} ifelse setpattern} bind def
+/Pattern5 {PatternBgnd KeepColor Landscape {Pat4} {Pat5} ifelse setpattern} bind def
+/Pattern6 {PatternBgnd KeepColor Landscape {Pat9} {Pat6} ifelse setpattern} bind def
+/Pattern7 {PatternBgnd KeepColor Landscape {Pat8} {Pat7} ifelse setpattern} bind def
+} def
+%
+%
+%End of PostScript Level 2 code
+%
+/PatternBgnd {
+  TransparentPatterns {} {gsave 1 setgray fill grestore} ifelse
+} def
+%
+% Substitute for Level 2 pattern fill codes with
+% grayscale if Level 2 support is not selected.
+%
+/Level1PatternFill {
+/Pattern1 {0.250 Density} bind def
+/Pattern2 {0.500 Density} bind def
+/Pattern3 {0.750 Density} bind def
+/Pattern4 {0.125 Density} bind def
+/Pattern5 {0.375 Density} bind def
+/Pattern6 {0.625 Density} bind def
+/Pattern7 {0.875 Density} bind def
+} def
+%
+% Now test for support of Level 2 code
+%
+Level1 {Level1PatternFill} {Level2PatternFill} ifelse
+%
+/Symbol-Oblique /Symbol findfont [1 0 .167 1 0 0] makefont
+dup length dict begin {1 index /FID eq {pop pop} {def} ifelse} forall
+currentdict end definefont pop
+%
+Level1 SuppressPDFMark or 
+{} {
+/SDict 10 dict def
+systemdict /pdfmark known not {
+  userdict /pdfmark systemdict /cleartomark get put
+} if
+SDict begin [
+  /Title (authors.tex)
+  /Subject (gnuplot plot)
+  /Creator (gnuplot 5.4 patchlevel 0)
+%  /Producer (gnuplot)
+%  /Keywords ()
+  /CreationDate (Tue Feb  9 14:39:20 2021)
+  /DOCINFO pdfmark
+end
+} ifelse
+%
+% Support for boxed text - Ethan A Merritt Sep 2016
+%
+/InitTextBox { userdict /TBy2 3 -1 roll put userdict /TBx2 3 -1 roll put
+           userdict /TBy1 3 -1 roll put userdict /TBx1 3 -1 roll put
+	   /Boxing true def } def
+/ExtendTextBox { dup type /stringtype eq
+    { Boxing { gsave dup false charpath pathbbox
+      dup TBy2 gt {userdict /TBy2 3 -1 roll put} {pop} ifelse
+      dup TBx2 gt {userdict /TBx2 3 -1 roll put} {pop} ifelse
+      dup TBy1 lt {userdict /TBy1 3 -1 roll put} {pop} ifelse
+      dup TBx1 lt {userdict /TBx1 3 -1 roll put} {pop} ifelse
+      grestore } if }
+    {} ifelse} def
+/PopTextBox { newpath TBx1 TBxmargin sub TBy1 TBymargin sub M
+               TBx1 TBxmargin sub TBy2 TBymargin add L
+	       TBx2 TBxmargin add TBy2 TBymargin add L
+	       TBx2 TBxmargin add TBy1 TBymargin sub L closepath } def
+/DrawTextBox { PopTextBox stroke /Boxing false def} def
+/FillTextBox { gsave PopTextBox fill grestore /Boxing false def} def
+0 0 0 0 InitTextBox
+/TBxmargin 20 def
+/TBymargin 20 def
+/Boxing false def
+/textshow { ExtendTextBox Gshow } def
+%
+end
+%%EndProlog
+%%Page: 1 1
+gnudict begin
+gsave
+doclip
+50 50 translate
+0.050 0.050 scale
+0 setgray
+newpath
+1.000 UL
+LTb
+LCb setrgbcolor
+[] 0 setdash
+1078 374 M
+63 0 V
+5662 0 R
+-63 0 V
+stroke
+LTb
+1078 1009 M
+63 0 V
+5662 0 R
+-63 0 V
+stroke
+1078 1644 M
+63 0 V
+5662 0 R
+-63 0 V
+stroke
+1078 2279 M
+63 0 V
+5662 0 R
+-63 0 V
+stroke
+1078 2914 M
+63 0 V
+5662 0 R
+-63 0 V
+stroke
+1078 3549 M
+63 0 V
+5662 0 R
+-63 0 V
+stroke
+1078 4184 M
+63 0 V
+5662 0 R
+-63 0 V
+stroke
+1078 4819 M
+63 0 V
+5662 0 R
+-63 0 V
+stroke
+1.000 UL
+LTb
+1078 4819 N
+0 -4445 V
+5725 0 V
+0 4445 V
+-5725 0 V
+Z stroke
+1.000 UP
+1.000 UL
+LTb
+% Begin plot #1
+1.000 UL
+LTb
+0.00 0.00 0.00 C
+0.300 1078 374 521 3951 BoxColFill
+1078 374 N
+0 3950 V
+520 0 V
+0 -3950 V
+-520 0 V
+Z stroke
+0.300 1598 374 522 3728 BoxColFill
+1598 374 N
+0 3727 V
+521 0 V
+0 -3727 V
+-521 0 V
+Z stroke
+0.300 2119 374 521 3302 BoxColFill
+2119 374 N
+0 3301 V
+520 0 V
+0 -3301 V
+-520 0 V
+Z stroke
+0.300 2639 374 522 942 BoxColFill
+2639 374 N
+0 941 V
+521 0 V
+0 -941 V
+-521 0 V
+Z stroke
+0.300 3160 374 521 299 BoxColFill
+3160 374 N
+0 298 V
+520 0 V
+0 -298 V
+-520 0 V
+Z stroke
+0.300 3680 374 522 261 BoxColFill
+3680 374 N
+0 260 V
+521 0 V
+0 -260 V
+-521 0 V
+Z stroke
+0.300 4201 374 521 65 BoxColFill
+4201 374 N
+0 64 V
+520 0 V
+0 -64 V
+-520 0 V
+Z stroke
+0.300 4721 374 522 54 BoxColFill
+4721 374 N
+0 53 V
+521 0 V
+0 -53 V
+-521 0 V
+Z stroke
+0.300 5242 374 521 49 BoxColFill
+5242 374 N
+0 48 V
+520 0 V
+0 -48 V
+-520 0 V
+Z stroke
+0.300 5762 374 522 28 BoxColFill
+5762 374 N
+0 27 V
+521 0 V
+0 -27 V
+-521 0 V
+Z stroke
+0.300 6283 374 521 12 BoxColFill
+6283 374 N
+0 11 V
+520 0 V
+0 -11 V
+-520 0 V
+Z stroke
+LTw
+% End plot #1
+2.000 UL
+LTb
+LCb setrgbcolor
+[] 0 setdash
+1.000 UL
+LTb
+1078 4819 N
+0 -4445 V
+5725 0 V
+0 4445 V
+-5725 0 V
+Z stroke
+1.000 UP
+1.000 UL
+LTb
+stroke
+grestore
+end
+showpage
+%%Trailer

+ 103 - 0
Fig1b.tex

@@ -0,0 +1,103 @@
+% GNUPLOT: LaTeX picture with Postscript
+\begingroup
+  \makeatletter
+  \providecommand\color[2][]{%
+    \GenericError{(gnuplot) \space\space\space\@spaces}{%
+      Package color not loaded in conjunction with
+      terminal option `colourtext'%
+    }{See the gnuplot documentation for explanation.%
+    }{Either use 'blacktext' in gnuplot or load the package
+      color.sty in LaTeX.}%
+    \renewcommand\color[2][]{}%
+  }%
+  \providecommand\includegraphics[2][]{%
+    \GenericError{(gnuplot) \space\space\space\@spaces}{%
+      Package graphicx or graphics not loaded%
+    }{See the gnuplot documentation for explanation.%
+    }{The gnuplot epslatex terminal needs graphicx.sty or graphics.sty.}%
+    \renewcommand\includegraphics[2][]{}%
+  }%
+  \providecommand\rotatebox[2]{#2}%
+  \@ifundefined{ifGPcolor}{%
+    \newif\ifGPcolor
+    \GPcolorfalse
+  }{}%
+  \@ifundefined{ifGPblacktext}{%
+    \newif\ifGPblacktext
+    \GPblacktexttrue
+  }{}%
+  % define a \g@addto@macro without @ in the name:
+  \let\gplgaddtomacro\g@addto@macro
+  % define empty templates for all commands taking text:
+  \gdef\gplbacktext{}%
+  \gdef\gplfronttext{}%
+  \makeatother
+  \ifGPblacktext
+    % no textcolor at all
+    \def\colorrgb#1{}%
+    \def\colorgray#1{}%
+  \else
+    % gray or color?
+    \ifGPcolor
+      \def\colorrgb#1{\color[rgb]{#1}}%
+      \def\colorgray#1{\color[gray]{#1}}%
+      \expandafter\def\csname LTw\endcsname{\color{white}}%
+      \expandafter\def\csname LTb\endcsname{\color{black}}%
+      \expandafter\def\csname LTa\endcsname{\color{black}}%
+      \expandafter\def\csname LT0\endcsname{\color[rgb]{1,0,0}}%
+      \expandafter\def\csname LT1\endcsname{\color[rgb]{0,1,0}}%
+      \expandafter\def\csname LT2\endcsname{\color[rgb]{0,0,1}}%
+      \expandafter\def\csname LT3\endcsname{\color[rgb]{1,0,1}}%
+      \expandafter\def\csname LT4\endcsname{\color[rgb]{0,1,1}}%
+      \expandafter\def\csname LT5\endcsname{\color[rgb]{1,1,0}}%
+      \expandafter\def\csname LT6\endcsname{\color[rgb]{0,0,0}}%
+      \expandafter\def\csname LT7\endcsname{\color[rgb]{1,0.3,0}}%
+      \expandafter\def\csname LT8\endcsname{\color[rgb]{0.5,0.5,0.5}}%
+    \else
+      % gray
+      \def\colorrgb#1{\color{black}}%
+      \def\colorgray#1{\color[gray]{#1}}%
+      \expandafter\def\csname LTw\endcsname{\color{white}}%
+      \expandafter\def\csname LTb\endcsname{\color{black}}%
+      \expandafter\def\csname LTa\endcsname{\color{black}}%
+      \expandafter\def\csname LT0\endcsname{\color{black}}%
+      \expandafter\def\csname LT1\endcsname{\color{black}}%
+      \expandafter\def\csname LT2\endcsname{\color{black}}%
+      \expandafter\def\csname LT3\endcsname{\color{black}}%
+      \expandafter\def\csname LT4\endcsname{\color{black}}%
+      \expandafter\def\csname LT5\endcsname{\color{black}}%
+      \expandafter\def\csname LT6\endcsname{\color{black}}%
+      \expandafter\def\csname LT7\endcsname{\color{black}}%
+      \expandafter\def\csname LT8\endcsname{\color{black}}%
+    \fi
+  \fi
+    \setlength{\unitlength}{0.0500bp}%
+    \ifx\gptboxheight\undefined%
+      \newlength{\gptboxheight}%
+      \newlength{\gptboxwidth}%
+      \newsavebox{\gptboxtext}%
+    \fi%
+    \setlength{\fboxrule}{0.5pt}%
+    \setlength{\fboxsep}{1pt}%
+\begin{picture}(7200.00,5040.00)%
+    \gplgaddtomacro\gplbacktext{%
+      \csname LTb\endcsname%%
+      \put(946,374){\makebox(0,0)[r]{\strut{}$0$}}%
+      \put(946,1009){\makebox(0,0)[r]{\strut{}$0.05$}}%
+      \put(946,1644){\makebox(0,0)[r]{\strut{}$0.1$}}%
+      \put(946,2279){\makebox(0,0)[r]{\strut{}$0.15$}}%
+      \put(946,2914){\makebox(0,0)[r]{\strut{}$0.2$}}%
+      \put(946,3549){\makebox(0,0)[r]{\strut{}$0.25$}}%
+      \put(946,4184){\makebox(0,0)[r]{\strut{}$0.3$}}%
+      \put(946,4819){\makebox(0,0)[r]{\strut{}$0.35$}}%
+    }%
+    \gplgaddtomacro\gplfronttext{%
+      \csname LTb\endcsname%%
+      \put(209,2596){\rotatebox{-270}{\makebox(0,0){\strut{}Fraction of commits}}}%
+      \put(3940,154){\makebox(0,0){\strut{}Top contributors}}%
+    }%
+    \gplbacktext
+    \put(0,0){\includegraphics[width={360.00bp},height={252.00bp}]{Fig1b}}%
+    \gplfronttext
+  \end{picture}%
+\endgroup

+ 56 - 0
Fig2.tex

@@ -0,0 +1,56 @@
+\tikzset{pics/folder/.style={code={%
+    \node[inner sep=0pt, minimum size=#1](-foldericon){};
+    \node[folder style, inner sep=0pt, minimum width=0.3*#1, minimum height=0.6*#1, above right, xshift=0.05*#1] at (-foldericon.west){};
+    \node[folder style, inner sep=0pt, minimum size=#1] at (-foldericon.center){};}
+    },
+    pics/folder/.default={20pt},
+    folder style/.style={draw=foldercolor!80!black,top color=foldercolor!40,bottom color=foldercolor}
+}
+
+\forestset{is file/.style={edge path'/.expanded={%
+        ([xshift=\forestregister{folder indent}]!u.parent anchor) |- (.child anchor)},
+        inner sep=1pt},
+    this folder size/.style={edge path'/.expanded={%
+        ([xshift=\forestregister{folder indent}]!u.parent anchor) |- (.child anchor) pic[solid]{folder=#1}}, inner ysep=0.3*#1},
+    folder tree indent/.style={before computing xy={l=#1}},
+    folder icons/.style={folder, this folder size=#1, folder tree indent=2*#1, inner xsep=10pt},
+    folder icons/.default={11pt},
+}
+
+
+  \begin{forest}
+    for tree={font=\sffamily, grow'=0,
+    folder indent=.7em, folder icons}
+    [dataset
+      [metadata%, this folder size=20pt
+          [children.csv, is file]
+          [recordings.csv, is file]
+          [annotations.csv, is file]]
+      [recordings
+          [raw
+            [recording1.wav, is file]
+%            [recording2.wav, is file]
+            ]
+          [converted
+            [standard
+                [recording1.wav, is file]]
+%                [recording2.wav, is file]
+            [vetted
+                [recording1.wav, is file]]
+%                [recording2.wav, is file]
+          ]
+        ]
+      [annotations
+        [its
+            [raw
+                [recording1.its, is file]
+%                [recording2.its, is file]
+            ]
+            [converted,
+                [recording1\_0\_0.csv, is file]
+%                [recording2\_0\_0.csv, is file]
+            ]
+        ]
+      ]
+    ]
+  \end{forest}

BIN
Fig3a.jpg


BIN
Fig3b.jpg


BIN
Fig4.pdf


BIN
Fig5.pdf


+ 28 - 0
Makefile

@@ -0,0 +1,28 @@
+all: main.pdf
+
+# This rule is executed last, and renders the full PDF from the manuscript with latexmk.
+# The -g flag is used to *always* process the document, even if no changes have been made to it.
+
+main.pdf: main.tex references.bib Fig4.pdf Fig5.pdf
+	latexmk -pdf -g $<
+
+Fig4.pdf: code/recall.py scores.csv
+	code/recall.py vandam-data
+
+Fig5.pdf: code/confusion_matrix.py vandam-data/annotations/its/converted/*.csv vandam-data/annotations/vtc/converted/*.csv
+	code/confusion_matrix.py vandam-data
+
+scores.csv: vandam-data/annotations/its/converted/*.csv vandam-data/annotations/vtc/converted/*.csv
+	code/recall.py vandam-data
+
+vandam-data/annotations/its/converted/*.csv:
+	datalad get vandam-data/annotations/its/converted
+
+vandam-data/annotations/vtc/converted/*.csv:
+	datalad get vandam-data/annotations/vtc/converted
+
+# This rule cleans up temporary LaTeX files, the generated results and PDF files, and drops the annotations fetched with datalad
+clean:
+	rm -f main.bbl main.aux main.blg main.log main.out main.pdf main.tdo main.fls main.fdb_latexmk texput.log *-eps-converted-to.pdf scores.csv
+	datalad drop vandam-data/annotations/its/converted
+	datalad drop vandam-data/annotations/vtc/converted
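
Taken together, these rules fetch the converted its and vtc annotations from the vandam-data subdataset, run the two analysis scripts to produce scores.csv, Fig4.pdf and Fig5.pdf, and finally build main.pdf with latexmk. The following hedged sketch drives the same pipeline from Python instead of make, using only the paths and commands that appear in the Makefile:

# Hedged sketch, not part of the commit: the `make` pipeline driven from Python.
import subprocess
import datalad.api as dl

for subset in ("its", "vtc"):
    dl.get(f"vandam-data/annotations/{subset}/converted")  # fetch the annexed CSVs

subprocess.run(["code/recall.py", "vandam-data"], check=True)             # scores.csv, Fig4.pdf
subprocess.run(["code/confusion_matrix.py", "vandam-data"], check=True)   # Fig5.pdf
subprocess.run(["latexmk", "-pdf", "-g", "main.tex"], check=True)         # main.pdf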

+ 78 - 0
code/confusion_matrix.py

@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+from ChildProject.metrics import gamma, segments_to_grid
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import confusion_matrix
+from sklearn.preprocessing import normalize
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+import sys
+
+speakers = ['CHI', 'OCH', 'FEM', 'MAL']
+
+path = sys.argv[1]
+project = ChildProject(path)
+am = AnnotationManager(project)
+am.read()
+
+intersection = AnnotationManager.intersection(am.annotations, ['vtc', 'its'])
+segments = am.get_collapsed_segments(intersection)
+segments = segments[segments['speaker_type'].isin(speakers)]
+
+vtc = segments_to_grid(segments[segments['set'] == 'vtc'], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers)
+its = segments_to_grid(segments[segments['set'] == 'its'], 0, segments['segment_offset'].max(), 100, 'speaker_type', speakers)
+
+speakers.extend(['overlap', 'none'])
+
+def get_pick(row):
+    for cat in reversed(speakers):
+        if row[cat]:
+            return cat
+
+def conf_matrix(horizontal, vertical, categories):
+    vertical = pd.DataFrame(vertical, columns = categories)
+    vertical['pick'] = vertical.apply(
+        get_pick,
+        axis = 1
+    )
+    vertical = vertical['pick'].values
+
+    horizontal = pd.DataFrame(horizontal, columns = categories)
+    horizontal['pick'] = horizontal.apply(
+        get_pick,
+        axis = 1
+    )
+    horizontal = horizontal['pick'].values
+
+    confusion = confusion_matrix(vertical, horizontal, labels = categories)
+    confusion = normalize(confusion, axis = 1, norm = 'l1')
+
+    return confusion
+
+plt.rcParams.update({'font.size': 12})
+plt.rc('xtick', labelsize = 10)
+plt.rc('ytick', labelsize = 10)
+
+fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize=(6.4*2, 4.8))
+
+confusion = conf_matrix(its, vtc, speakers)
+sns.heatmap(confusion, annot = True, fmt = '.2f', ax = axes[0], cmap = 'Reds')
+axes[0].set_xlabel('its')
+axes[0].set_ylabel('vtc')
+axes[0].xaxis.set_ticklabels(speakers)
+axes[0].yaxis.set_ticklabels(speakers)
+
+confusion = conf_matrix(vtc, its, speakers)
+sns.heatmap(confusion, annot = True, fmt = '.2f', ax = axes[1], cmap = 'Reds')
+axes[1].set_xlabel('vtc')
+axes[1].set_ylabel('its')
+axes[1].xaxis.set_ticklabels(speakers)
+axes[1].yaxis.set_ticklabels(speakers)
+
+plt.savefig('Fig5.pdf', bbox_inches = 'tight')
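
For readers unfamiliar with the helpers above: segments_to_grid yields one row per time frame with a 0/1 column per speaker category, get_pick collapses each row to a single label (last matching category wins), and conf_matrix row-normalizes the resulting confusion matrix. A small, hedged, self-contained illustration on synthetic grids:

# Hedged sketch, not part of the commit: the conf_matrix() idea on synthetic data.
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize

categories = ["CHI", "FEM", "none"]
ref = pd.DataFrame([[1, 0, 0], [0, 1, 0], [0, 0, 1]], columns=categories)  # reference grid
hyp = pd.DataFrame([[1, 0, 0], [1, 0, 0], [0, 0, 1]], columns=categories)  # hypothesis grid

def pick(row):
    # last matching category wins, mirroring get_pick() above
    for cat in reversed(categories):
        if row[cat]:
            return cat

cm = confusion_matrix(ref.apply(pick, axis=1), hyp.apply(pick, axis=1), labels=categories)
print(normalize(cm, axis=1, norm="l1"))  # each row sums to 1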

+ 132 - 0
code/recall.py

@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+from ChildProject.metrics import segments_to_annotation
+
+from pyannote.metrics.detection import DetectionPrecisionRecallFMeasure
+
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pandas as pd
+import random
+import sys
+
+speakers = ['CHI', 'OCH', 'FEM', 'MAL']
+sets = ['its', 'vtc (conf 50%)', 'vtc (drop 50%)', 'vtc (conf 75%)', 'vtc (drop 75%)']
+
+def confusion(segments, prob):
+    segments['speaker_type'] = segments['speaker_type'].apply(
+        lambda s: random.choice(speakers) if random.random() < prob else s
+    )
+    return segments
+
+def drop(segments, prob):
+    return segments.sample(frac = 1-prob)
+
+if not os.path.exists('scores.csv'):
+    path = sys.argv[1]
+    project = ChildProject(path)
+    am = AnnotationManager(project)
+    am.read()
+
+    intersection = AnnotationManager.intersection(am.annotations, ['vtc', 'its'])
+    segments = am.get_collapsed_segments(intersection)
+    segments = segments[segments['speaker_type'].isin(speakers)]
+    segments.sort_values(['segment_onset', 'segment_offset']).to_csv('test.csv', index = False)
+
+    conf50 = segments[segments['set'] == 'vtc'].copy()
+    conf50 = confusion(conf50, 0.5)
+    conf50['set'] = 'vtc (conf 50%)'
+
+    conf75 = segments[segments['set'] == 'vtc'].copy()
+    conf75 = confusion(conf75, 0.75)
+    conf75['set'] = 'vtc (conf 75%)'
+
+    drop50 = segments[segments['set'] == 'vtc'].copy()
+    drop50 = drop(drop50, 0.5)
+    drop50['set'] = 'vtc (drop 50%)'
+
+    drop75 = segments[segments['set'] == 'vtc'].copy()
+    drop75 = drop(drop75, 0.75)
+    drop75['set'] = 'vtc (drop 75%)'
+
+    segments = pd.concat([segments, conf50, conf75, drop50, drop75])
+
+    metric = DetectionPrecisionRecallFMeasure()
+
+    scores = []
+    for speaker in speakers:
+        ref = segments_to_annotation(segments[(segments['set'] == 'vtc') & (segments['speaker_type'] == speaker)], 'speaker_type')
+
+        for s in sets:
+            hyp = segments_to_annotation(segments[(segments['set'] == s) & (segments['speaker_type'] == speaker)], 'speaker_type')
+            detail = metric.compute_components(ref, hyp)
+            precision, recall, f = metric.compute_metrics(detail)
+
+            scores.append({
+                'set': s,
+                'speaker': speaker,
+                'recall': recall,
+                'precision': precision,
+                'f': f
+            })
+
+    scores = pd.DataFrame(scores)
+    scores.to_csv('scores.csv', index = False)
+
+scores = pd.read_csv('scores.csv')
+
+plt.rcParams.update({'font.size': 12})
+plt.rc('xtick', labelsize = 10)
+plt.rc('ytick', labelsize = 10)
+
+print(scores)
+
+styles = {
+    'recall': 's',
+    'precision': 'D',
+    'f': 'o'
+}
+
+labels = {
+    'recall': 'recall',
+    'precision': 'precision',
+    'f': 'F-measure'
+}
+
+plt.figure(figsize = (6.4*1, 4.8*1+0.25*4.8))
+
+for speaker in speakers:
+    i = speakers.index(speaker)
+    ax = plt.subplot(2, 2, i+1)
+    ax.set_xlim(-0.5,len(sets)-0.5)
+    ax.set_ylim(0, 1)
+
+    if i >= 2:
+        ax.set_xticks(range(len(sets)))
+        ax.set_xticklabels(sets, rotation = 45, horizontalalignment = 'right')
+    else:
+        ax.set_xticklabels(['' for i in range(len(sets))])
+
+    if i%2 == 1:
+        ax.set_yticklabels(['' for i in range(6)])
+
+    ax.set_xlabel(speaker)
+
+    _scores = scores[scores['speaker'] == speaker]
+    for metric in ['recall', 'precision', 'f']:
+        ax.scatter(
+            x = _scores['set'].apply(lambda s: sets.index(s)),
+            y = _scores[metric],
+            label = labels[metric],
+            s = 15,
+            marker = styles[metric]
+        )
+
+ax = plt.subplot(2, 2, 2)
+ax.legend(loc = "upper right", borderaxespad = 0.1, bbox_to_anchor=(1, 1.25), ncol = 3)
+
+plt.subplots_adjust(wspace = 0.15)
+plt.savefig('Fig4.pdf', bbox_inches = 'tight')

+ 603 - 0
main.tex

@@ -0,0 +1,603 @@
+\RequirePackage{fix-cm}
+%\documentclass{article}
+%\documentclass{svjour3}                     % onecolumn (standard format)
+%\documentclass[smallcondensed]{svjour3}     % onecolumn (ditto)
+\documentclass[smallextended]{svjour3}       % onecolumn (second format)
+%\documentclass[twocolumn]{svjour3}          % twocolumn
+\usepackage[utf8]{inputenc}
+
+
+\usepackage[margin=1in]{geometry}
+\usepackage[toc]{appendix}
+\usepackage{natbib}
+
+\usepackage{booktabs}
+\usepackage{hyperref}
+
+
+\makeatletter
+\newcommand\footnoteref[1]{\protected@xdef\@thefnmark{\ref{#1}}\@footnotemark}
+\makeatother
+
+
+\usepackage{tikz}
+\usetikzlibrary{arrows.meta,positioning,calc,shapes}
+\usepackage[edges]{forest}
+\definecolor{foldercolor}{RGB}{124,166,198}
+\newcommand{\inputTikZ}[2]{%  
+     \scalebox{#1}{\input{#2}}
+}
+\usepackage{subfig}
+\usepackage[outdir=./plots]{epstopdf}
+\usepackage{textcomp}
+
+
+\usepackage[Symbol]{upgreek}
+
+\graphicspath{.}
+
+\title{Managing, storing and sharing long-form recordings and their annotations}
+
+\author{%
+Lucas Gautheron \and Nicolas Rochat \and Alejandrina Cristia
+}
+
+\institute{
+Laboratoire de Sciences Cognitives et de Psycholinguistique, Département d'Etudes cognitives, ENS, EHESS, CNRS, PSL University, Paris, France. 
+\email{lucas.gautheron@gmail.com}
+}
+
+\journalname{Language Resources and Evaluation}
+
+\date{}
+
+\begin{document}
+
+\maketitle
+
+\abstract{
+The technique of \textit{in situ}, long-form recordings is gaining momentum in different fields of research, notably linguistics and pathology. This method, however, poses several technical challenges, some of which are amplified by the peculiarities of the data, including their sensitivity and their volume. In this paper, we begin by outlining the problems related to the management, storage, and sharing of the corpora produced using this technique. We then go on to propose a multi-component solution to these problems, in the specific case of daylong recordings of children. As part of this solution, we release \emph{ChildProject}, a Python package to perform the operations typically required to work with such datasets. The package also provides built-in functions to evaluate the annotations using a number of measures commonly used in speech processing and linguistics. Our proposal, as we argue, could be generalized to broader populations.
+}
+
+\keywords{daylong recordings, speech data management, data distribution, annotation evaluation, inter-rater reliability}
+
+
+%\tableofcontents
+
+
+%\begin{itemize}
+%    \item adding the large amounts of data into the problem space ? (because it means high storage costs, delivery difficulties etc.)
+%    \item More emphasis on reproducibility and how DataLad is helpful with that
+%    \item Find some rationale to decide when to refer to git-annex or when to refer to DataLad, repository vs dataset etc.
+%    \item Referring to git files (as opposed to git annex files) as "text files" is ambiguous as annotations are text files, but usually stored in the annex. But "git files" is ambiguous to as it could mean everything inside of .git...
+%    \item OSF integration: example 3 ?
+%    \item Language Archive
+%    \item FAIR
+%    \item coin  \citep{Gorgolewski2016}
+%\end{itemize}
+
+\section{Introduction}
+
+Long-form recordings are those collected over extended periods of time, typically via a wearable. Although the technique was used with normotypical adults decades ago \citep{ear1,ear2}, it became widespread in the study of early childhood over the last 15 years or so. The LENA Foundation created a hardware-software combination that illuminated the potential of this technique for theoretical and applied purposes (e.g., \citealt{christakis2009audible,warlaumont2014social}). More recently, such data are being discussed in the context of neurological disorders (e.g., \citealt{riad2020vocal}). In this article, we define the unique space of difficulties surrounding long-form recordings, and introduce a Python package that provides practical solutions, with a focus on child-centered recordings. We end by discussing ways in which these solutions could be generalized to other populations.
+
+\section{Problem space}\label{section:problemspace}
+
+Management of scientific data is a long-standing issue, which has seen substantial progress in recent years. For instance, the FAIR principles \citep{Wilkinson2016} (Findability, Accessibility, Interoperability, and Reusability) have been proposed to help increase the usefulness of data and data analysis pipelines. Similarly, databases implementing these practices have emerged, such as Dataverse \citep{dataverse} and Zenodo \citep{zenodo}. The method of daylong recordings should incorporate such methodological advances. It should be noted, however, that some of the difficulties surrounding the management of corpora of daylong recordings are more idiosyncratic to this technique, and therefore require specific solutions to be developed. Below, we list some of the challenges that researchers engaging in the technique of long-form recordings in naturalistic environments are likely to face.
+
+\subsubsection*{The need for standards}
+
+Extant datasets rely on a wide variety of metadata structures, file formats, and naming conventions. For instance, some data from long-form recordings have been archived publicly on Databrary (such as the ACLEW starter set \citep{starter}) and HomeBank (including the VanDam Daylong corpus from \citealt{vandam-day}). Table \ref{table:datasets} shows some divergence across the two. As a result of this divergence, each lab finds itself re-inventing the wheel. For instance, the HomeBankCode organization\footnote{\url{https://github.com/homebankcode/}} contains at least 4 packages that perform more or less the same operations (e.g., aggregating how much speech was produced in each recording), implemented in different languages (MATLAB, R, Perl, and Python). This divergence may also hide different operationalizations, rendering comparisons across labs fraught, effectively diminishing replicability.\footnote{\textit{Replicability} is typically defined as the effort to re-do a study on a new sample, whereas \textit{reproducibility} relates to re-doing the exact same analyses on the exact same data. Reproducibility is addressed in another section.}
+
+Designing pipelines and analyses that are consistent across datasets requires standards in how the datasets are structured. Although this may represent an initial investment, such standards facilitate the pooling of research efforts, by allowing labs to benefit from code developed in other labs. Additionally, this field operates increasingly via collaborative cross-lab efforts. For instance, the ACLEW project\footnote{\url{sites.google.com/site/aclewdid}} involved 9 principal investigators (PIs) from 5 different countries, who needed a substantial initial investment to agree on a standard organization for their 6 corpora. We expect even larger collaborations to emerge in the future, a move that would benefit from standardization, as exemplified by the community that emerged around CHILDES for short-form recordings \citep{macwhinney2000childes}.
+
+\begin{table}
+\centering
+\begin{tabular}{@{}lll@{}}
+\toprule
+                        & ACLEW starter  & Van Dam \\ \midrule
+\begin{tabular}[t]{@{}l@{}}Audio's scope\end{tabular}             & 5-minute clips & Full day       \\
+\begin{tabular}[t]{@{}l@{}}Automated annotations'\\format\end{tabular}    & none         & LENA           \\
+\begin{tabular}[t]{@{}l@{}}Human annotations'\\format\end{tabular} & .eaf           & .cha           \\
+Annotations' scope        & only clips     & Full day       \\
+Metadata                & none           & excel \\ \bottomrule
+\end{tabular}
+\caption{\textbf{Divergences between the \cite{starter} and \cite{vandam-day} datasets}. Audios' scope indicates the size of the audio that has been archived: all recordings last for a full day, but for ACLEW starter, three five-minute clips were selected from each child. Automated annotations' format indicates what software was used to annotate the audio automatically. Human annotations' format indicates the file format used for manual annotations. Annotations' scope shows the scope of human annotation. Metadata indicates whether information about the children and recordings was shared, and in what format.}
+\label{table:datasets}
+\end{table}
+
+
+\subsubsection*{Keeping up with updates and contributions}
+
+Datasets are not frozen. Rather, they are continuously enriched through annotations provided by humans or new algorithms. Human annotations may also undergo corrections, as errors are discovered. The process of collecting the recordings may also require a certain amount of time, as they are progressively returned by the field workers or the participants themselves. In the case of longitudinal studies, supplementary audio data may accumulate over several years. Researchers should be able to keep track of these changes while also updating their analyses. Moreover, several collaborators may need to contribute work to the same dataset simultaneously.
+
+To take the example of ACLEW, PIs first annotated in-house a random selection of 2-minute clips for 10 children. They then exchanged some of these audios so that the annotators in another lab re-annotated the same data, for the purposes of inter-rater reliability. This revealed divergences in definitions, and all datasets needed to be revised. Finally, a second sample of 2-minute clips with high levels of speech activity was annotated -- and another reliability assessment was performed.
+
+\subsubsection*{Delivering large amounts of data}
+
+Typical values for the bit depth and sampling rate of the recordings -- 16 bits and 16 kilohertz respectively -- yield a throughput of approximately three gigabytes per day of audio (16 bits $\times$ 16,000 samples per second $=$ 32 kilobytes per second, or about 2.8 gigabytes per 24 hours). Although there is a great deal of variation, past studies often involved at least 30 recording days (e.g., 3 days for each of 10 children). The trend, however, is for datasets to be larger; for instance, last year, we collected 2 recordings from about 200 children. Such datasets may exceed one terabyte. Moreover, these recordings can be associated with annotations spread across thousands of files. In the ACLEW example just discussed, there was one .eaf file per human annotator per type of annotation (i.e., random, high speech, random reliability, high speech reliability). In addition, the full day was analyzed with between 1 and 4 automated routines. Thus, for each recording day there were 8 annotation files, leading to 5 corpora $\times$ 10 children $\times$ 8 annotation files = 400 annotation files. Other researchers will use one annotation file per clip selected for annotation, which quickly adds up to thousands of files. Even a small processing latency may add up to significant overheads while gathering so many files.
+Data access should be possible programmatically, and users should be able to download only the data that they need for their analysis.
+
+
+\subsubsection*{Privacy}
+
+Long-form recordings are sensitive; they contain identifying and personal information about the participating family. In some cases, for instance if the family goes shopping and forgets to notify those around them, recordings could capture conversations which involve people who are unaware that they are being recorded. In addition, they may be subject to regulations, such as the European GDPR, the American HIPAA, and, depending on the place of collection and/or storage, laws on biometric data.
+
+However, although the long-form recordings are sensitive, many of the data types derived from them are not. With appropriate file-naming and meta-data practices, it is effectively possible to completely deidentify automated annotations (which at present never include automatic speech recognition). It is also often possible to deidentify human annotations, except when these involve transcribing what participants said, since participants will use personal names and reveal other personal details. Nonetheless, since this particular case involves a human doing the annotation, this human can be trained to modify the record (e.g., replace personal names with foils) and/or tag the annotation as sensitive and not to be openly shared (a practice called vetting; \citealt{Cychosz2020}).
+
+Therefore, the ideal storing-and-sharing strategy should naturally enforce security and privacy safeguards by implementing access restrictions adapted to the level of confidentiality of the data.
+
+\subsubsection*{Long-term availability}
+
+The collection of long-form recordings requires a considerable level of investment to explain the technique to families and communities, ensure a secure data management system, and, in the case of remote populations, access to and from the site. In our experience, one data collection trip to a field site costs about 15 thousand US\$.\footnote{This grossly underestimates overall costs, because the best way to do any kind of field research is by maintaining strong bonds with the community and helping them in other ways throughout the year, rather than only during our visits. A successful example of this is that of the UNM-UCSB Tsimane' Project (\url{http://tsimane.anth.ucsb.edu/}), which has been collaborating with the Tsimane' population since 2001. They are currently funded by a 5-year, 3-million US\$ NIH grant (\url{https://reporter.nih.gov/project-details/9538306}).} These data are precious not only because of the investment that has gone into them, but because they capture slices of life at a given point in time, which is particularly informative in the case of populations that are experiencing market integration or other forms of societal change -- which today is most or all populations. Moreover, some communities who are collaborating in such research speak languages that are minority languages in the local context, and thus at potential risk of being lost in the future. The conservation of naturalistic speech samples of children's language acquisition throughout a normal day could be precious to fuel future efforts of language revitalization \citep{Nee2021}. It would therefore be particularly damaging to lose such data prematurely, from a financial, a scientific, and a human standpoint.
+
+In addition, one advantage of daylong recordings over other observational methods such as parental reports is that they can be re-exploited at later times to observe behaviors that had not been foreseen at the time of data collection. This implies that their interest partly lies in long-term re-usability.
+
+Moreover, even state-of-the-art speech processing tools still perform poorly on daylong recordings, due to their intrinsically noisy nature \citep{casillas2019step}. As a result, taking full advantage of present data will necessitate new or improved computational models, which may take years to develop. For example, the DIHARD Challenge series has been running for three years in a row, and documents the difficulty of making headway with complex audio data \citep{ryant2018first,ryant2019second,ryant2020third}. For instance, the best submission for speaker diarization in their meeting subcorpus achieved about 35\% Diarization Error Rate in 2018 and 2019, with improvements seen only in 2020, when the best system scored 20\% Diarization Error Rate (Neville Ryant, personal communication, 2021-04-09). Other tasks are progressing much more slowly. For instance, the best performance of a classifier deciding whether adult speech was addressed to the child or to an adult scored about 70\% correct in 2017 \citep{schuller2017interspeech} -- but nobody has been able to beat this record since. Recordings should therefore remain available for long periods of time -- potentially decades -- thus increasing the risk for data loss to occur at some point in their lifespan. For these reasons, the reliability of the storage design is critical, and redundancy is most certainly required. Likewise, persistent URLs may be needed in order to ensure the long-term accessibility of the datasets.
+
+\subsubsection*{Findability}
+
+FAIR Principles include findability and accessibility. A crucial aspect of findability of datasets involves their being indexed in ways that potential re-users can discover them. As we will mention below, there is one archiving option that is specific to long-form recordings, which makes any corpora hosted there easily discoverable by other researchers working with that technique; and another specializing in child development, which can interest the developmental science community. However, the standard practice today is that data are archived in either one or another of these repositories, despite the fact that if an instance of the corpus were visible from each of these archives, the dataset would be overall more easily discovered. Additionally, we are uncertain of the extent to which these highly re-usable long-form recordings are visible to researchers more broadly interested in spoken corpora and/or naturalistic human behavior and/or other topics that could be studied in such data. In fact, one can conceive of a future in which the technique begins to be used with people of different ages, in which case a system that allows users to discover other datasets based on relevant metadata would be ideal: For some research purposes (e.g., the study of source separation) any recording may be useful, whereas for others (neurodegenerative disorders, early language acquisition) only recordings from some ages would be. In any case, options exist to allow accessibility once a dataset is archived in one of those archives.
+
+\subsubsection*{Reproducibility}
+
+Independent verification of results by a third party can be facilitated by improving the \emph{reproducibility} of the analyses, i.e. by providing third-parties with enough data and information to re-derive claimed results. This may itself be challenging for a number of reasons, including the variety of software requirements, unclear data dependencies, or insufficiently documented steps. Sharing datasets and analyses is more complex than delivering a collection of static files; all the information that is necessary in order to re-execute any intermediate step of the analysis should also be adequately conveyed.
+ 
+
+
+\subsubsection*{Current archiving options}
+
+The field of child-centered long-form recordings benefited from a purpose-built scientific archive from very early on. HomeBank \cite{vandam2016homebank} builds on the same architecture as CHILDES \cite{MacWhinney2000} and other TalkBank corpora. Although this architecture served the purposes of the language-oriented community well for short recordings, there are numerous issues when using it for long-form recordings. To begin with, curators do not directly control their datasets' contents and structures, and if a curator wants to make a modification, they need to ask the HomeBank management team to make it for them. Similarly, other collaborators who spot errors cannot correct them directly, but again must request that changes be made by the HomeBank management team. Only one type of annotation is natively supported, namely CHAT \cite{MacWhinney2000}, which is ideal for transcriptions of what was said. However, transcription is less central to studies of long-form audio.
+
+Other options have been used by researchers in the community, including OSF, Databrary, and the Language Archive. Detailing all their features is beyond the scope of the present paper, but some discussion can be found in \cite{casillas2019step}. For our purposes, the key issue to bear in mind is that none of these archives supports well the very large audio files found in long-form corpora. These limitations have brought us to envision a new strategy for sharing these datasets. 
+
+ \subsubsection*{Our proposal}
+ 
+We propose a storing-and-sharing method designed to address the challenges outlined above simultaneously. It can be noted that these problems are, in many respects, similar to those faced by researchers in neuroimaging, a field which has long been confronting the need for reproducible analyses on large datasets of potentially sensitive data \citep{Poldrack2014}.
+Their experience may, therefore, provide precious insight for linguists, psychologists, and developmental scientists engaging with the big-data approach of daylong recordings.
+For instance, in the context of neuroimaging, \citet{Gorgolewski2016} have argued in favor of ``machine-readable metadata'', standard file structures and metadata, as well as consistency tests. Similarly, \citet{Eglen2017} have recommended the use of formatting standards, version control, and continuous testing. In the following, we will demonstrate how all of these practices have been implemented in our proposed design.
+
+Although designed for child-centered daylong recordings, our solution could, we believe, be replicated across a wider range of datasets with constraints similar to those outlined above. Furthermore, our approach is flexible and leaves room for customization.
+
+This solution relies on four main components, each of which is conceptually separable from the others: i) a standardized data format, optimized for child-centered long-form recordings; ii) ChildProject, a Python package to perform basic operations on these datasets; iii) DataLad, ``a decentralized system for integrated discovery, management, and publication of digital objects of science'' \citep{hanke_defense_2021}; iv) GIN, a live archiving option for storage and distribution. Our choice for each one of these components can be revisited based on the needs of a project and/or as other options appear. Table \ref{table:components} summarizes which of these components help address each of the challenges listed in Section \ref{section:problemspace}.
+
+\begin{table*}[ht]
+\centering
+\begin{tabular}{@{}l|lll@{}}
+\toprule
+\textbf{Problem} &
+  \begin{tabular}[t]{@{}l@{}}\textbf{ChildProject}\\(Section \ref{section:childproject})\end{tabular} &
+  \begin{tabular}[t]{@{}l@{}}\textbf{DataLad}\\(Section \ref{section:datalad})\end{tabular} &
+  \begin{tabular}[t]{@{}l@{}}\textbf{GIN}\\(Section \ref{section:gin})\end{tabular} \\ \midrule
+The need for standards &
+  \begin{tabular}[t]{@{}l@{}}documented standards;\\tests;\\conversion routines\end{tabular} &
+   &
+   \\ \midrule
+\begin{tabular}[t]{@{}l@{}}Keeping up with updates\\ and contributions\end{tabular} &
+   &
+  \begin{tabular}[t]{@{}l@{}}version control\\(git)\end{tabular} &
+  git repository host
+   \\ \midrule
+\begin{tabular}[t]{@{}l@{}}Delivering large amounts\\ of data\end{tabular} &
+   parallelised processing &
+  git-annex &
+  \begin{tabular}[t]{@{}l@{}}git-annex compatible;\\ high storage capacity;\\ parallelised operations\end{tabular}
+   \\ \midrule
+Ensuring privacy &
+   &
+  \begin{tabular}[t]{@{}l@{}}private sub-datasets;\\ private remotes;\\
+  path-based\\or metadata-based\\
+  storage rules;\end{tabular} &
+  \begin{tabular}[t]{@{}l@{}}Access Control Lists;\\SSH authentication\end{tabular}
+   \\ \midrule
+Long-term storage & \begin{tabular}[t]{@{}l@{}}tests\\(ensure integrity;\\
+detect missing files)\end{tabular}
+   &
+  \begin{tabular}[t]{@{}l@{}}git; git-annex\\
+  (remote synchronization,\\
+  file availability and\\
+  integrity checks,\\
+  safe file deletion)\end{tabular} &
+  DOI registration
+  \\ \midrule
+  Findability &
+  \begin{tabular}[t]{@{}l@{}}rich and standardized\\ metadata\end{tabular} &
+  \begin{tabular}[t]{@{}l@{}}metadata aggregation\\
+  metadata search\end{tabular} &
+  \begin{tabular}[t]{@{}l@{}}DOI registration;\\
+  DataCite support\\
+  repository search\end{tabular}
+  \\ \midrule
+Reproducibility &
+   &
+  \begin{tabular}[t]{@{}l@{}}run/rerun/container-run\\ functions\end{tabular} &
+   \\ \bottomrule
+\end{tabular}
+\caption{\textbf{\label{table:components}Contributions of each component of our proposed design in resolving the difficulties caused by daylong recordings} and laid out in Section \ref{section:problemspace}. ChildProject is a python package designed to perform recurrent tasks on the datasets; DataLad is a python package for the management of large, version-controlled datasets; GIN is a hosting provider dedicated to scientific data.}
+\end{table*}
+
+\section{Proposed solution}
+
+
+\subsection{\label{sec:format}Dataset format}
+
+\begin{figure}[ht]
+    \centering
+    \inputTikZ{0.8}{Fig2.tex}
+    \caption{\textbf{Structure of a dataset}. Metadata, recordings and annotations each belong to their own folder. Raw recordings (i.e., the audio files as they have been collected, before post-processing) are separated from their post-processed counterparts (in this case: standardized and vetted recordings). Similarly, raw annotations -- in this case, LENA's its annotations -- are set apart from the corresponding CSV version.}
+    \label{fig:tree}
+\end{figure}
+
+To begin with, we propose a set of tested and proven standards which we use in our lab, and which build on previous experience in several collaborative projects, including ACLEW. It must be emphasized, however, that standards should be elaborated collaboratively by the community and that the following are merely a starting point.
+
+Data that are part of the same collection effort are bundled together within one folder\footnote{We believe a reasonable unit of bundling is the collection effort, for instance a single field trip, or a full bout of data collection for a cross-sectional sample, or a set of recordings done more or less at the same time in a longitudinal sample. Given the possibilities of versioning, some users may decide they want to keep all data from a longitudinal sample in the same dataset, adding to it progressively over months and years, to avoid having duplicate children.csv files. That said, given the system of subdatasets, one can always define different datasets, each of which contains the recordings collected in subsequent time periods.}, preferably a DataLad dataset (see Section \ref{section:datalad}). Datasets are packaged according to the structure given in fig. \ref{fig:tree}. The \path{metadata} folder contains at least three dataframes in CSV format: (i) \path{children.csv} contains information about the participants, such as their age or the language(s) they speak. (ii) \path{recordings.csv} contains the metadata for each recording, such as when the recording started, which device was used, or their relative path in the dataset. (iii) \path{annotations.csv} contains information concerning the annotations provided in the dataset, how they were produced, or which range they cover, etc. The dataframes are standardized according to guidelines which set conventional names for the columns and the range of allowed values. The guidelines are enforced through tests, implemented in the ChildProject package introduced below, which print all the errors and inconsistencies found in a dataset.
+
+The \path{recordings} folder contains two subfolders: \path{raw}, which stores the recordings as delivered by the experimenters, and \path{converted} which contains processed copies of the recordings. All the audio files in \path{recordings/raw} are indexed in the recordings dataframe. There is, thus, no need for naming conventions for the audio files themselves, and maintainers can decide how they want to organize them.
+
+The \path{annotations} folder contains all sets of annotations. Each set itself consists of a folder containing two subfolders: i) \path{raw}, which stores the output of the annotation pipelines, and ii) \path{converted}, which stores the annotations after being converted to a standardized CSV format and indexed into \path{metadata/annotations.csv}. A set of annotations can contain an unlimited number of subsets, with any depth of recursion. For instance, a set of human-produced annotations could include one subset per annotator. Recursion facilitates the inheritance of access permissions, as explained in Section \ref{section:datalad}.
+
+
+\subsection{ChildProject}\label{section:childproject}
+
+The ChildProject package is a Python 3.6+ package that performs common operations on a dataset of child-centered recordings. It can be used from the command-line or by importing the modules from within Python. Assuming the target datasets are packaged according to the standards summarized in section \ref{sec:format}, the package supports the functions listed below.
+
+\subsubsection*{Listing errors and inconsistencies in a dataset}
+
+We provide a validation script that returns a detailed report of all the errors found within a dataset, such as violations of the formatting guidelines or missing files. Tests help enforce the standards that allow the commensurability of the datasets while guaranteeing the integrity and coherence of the data.
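+
+As an illustration, a validation check could be run from Python along the lines of the following sketch (the \texttt{validate()} call and its return values are given here as an assumption about the package interface, and the dataset path is a placeholder):
+
+\begin{verbatim}
+from ChildProject.projects import ChildProject
+
+project = ChildProject('/path/to/dataset')
+# assumed interface: returns the lists of errors and warnings found
+errors, warnings = project.validate()
+
+print("{} error(s), {} warning(s)".format(len(errors), len(warnings)))
+for error in errors:
+    print(error)
+\end{verbatim}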
+
+\subsubsection*{Converting and indexing annotations}\label{section:annotations}
+
+The package converts input annotations to standardized, wide-table CSV dataframes. The columns in these wide-table formats have been determined based on previous work, and are largely specific to the goal of studying infants' language environment and production.
+
+Annotations are indexed into a unique CSV dataframe which stores their location in the dataset, the set of annotations they belong to, and the recording and time interval they cover. Thus, the index allows an easy retrieval of all the annotations that cover any given segment of audio, regardless of their original format and the naming conventions that were used. The system interfaces well with extant annotation standards. Currently, ChildProject supports: LENA annotations in .its \citep{xu2008lenatm}; ELAN annotations following the ACLEW DAS template \citep{Casillas2017,pympi-1.70}; the Voice Type Classifier (VTC) by \citet{lavechin2020opensource}; the Linguistic Unit Count Estimator (ALICE) by \citet{rasanen2020}; the VoCalisation Maturity Network (VCMNet) by \citet{AlFutaisi2019}. Users can also adapt routines for file types or conventions that vary; for instance, the ELAN import developed for the ACLEW DAS template can be adapted to other templates, and examples are also provided for Praat's .TextGrid files \citep{boersma2006praat}. The package also supports custom, user-defined conversion routines.
+
+Relying on the annotations index, the package can also calculate the intersection of the portions of audio covered by several annotators. This is useful when annotations from different annotators need to be combined (for instance, to retain the majority choice) or compared (e.g. for reliability evaluations).
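+
+The short sketch below illustrates this functionality, mirroring the scripts distributed with this paper (the set names \texttt{vtc} and \texttt{its} and the dataset path are examples):
+
+\begin{verbatim}
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+
+project = ChildProject('/path/to/dataset')
+am = AnnotationManager(project)
+am.read()
+
+# keep only the portions of audio covered by both sets of annotations
+intersection = AnnotationManager.intersection(am.annotations, ['vtc', 'its'])
+segments = am.get_collapsed_segments(intersection)
+\end{verbatim}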
+
+\subsubsection*{Choosing audio samples of the recordings to be annotated}\label{section:choosing}
+
+As noted in the Introduction, recordings are too extensive to be manually annotated in their entirety. We and colleagues have typically manually annotated clips of 0.5-5 minutes in length, and the way these clips are extracted and annotated is one of the ways in which standards diverge (as illustrated in Table \ref{table:datasets}).
+
+The package allows the use of predefined or custom sampling algorithms. Samples' timestamps are exported to CSV dataframes. In order to keep track of the sample generating process, input parameters are simultaneously saved into a YAML file. Predefined samplers include a periodic sampler, a sampler targeting specific speakers' vocalizations, a sampler targeting regions of high-volubility according to input annotations, and a more agnostic sampler targeting high-energy regions. In all cases, the user can define the number of regions and their duration, as well as the context that may be inspected by human annotators. These options cover all documented sampling strategies.
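+
+For illustration, the sketch below conveys the idea behind a periodic sampler; it is a simplified stand-in rather than the package's implementation, and the column names and durations are only illustrative:
+
+\begin{verbatim}
+import pandas as pd
+
+def periodic_samples(duration, clip_length, spacing):
+    """Return onsets/offsets (in ms) of clips drawn at a fixed period."""
+    onsets = range(0, duration - clip_length, clip_length + spacing)
+    return pd.DataFrame([
+        {'segment_onset': t, 'segment_offset': t + clip_length}
+        for t in onsets
+    ])
+
+# e.g. 5-minute clips every 30 minutes over a 16-hour recording
+samples = periodic_samples(16*3600*1000, 5*60*1000, 25*60*1000)
+samples.to_csv('samples.csv', index=False)
+\end{verbatim}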
+
+\subsubsection*{Generating ELAN files ready to be annotated}
+
+Although there was some variability in terms of the program used for human annotation, the field has now by and large settled on ELAN \citep{wittenburg2006elan}. ELAN employs XML files with a hierarchical structure, which are both customizable and flexible. The ChildProject package can be used to generate .eaf files which can be annotated with the ELAN software, based on samples of the recordings drawn using the package, as described in Section \ref{section:choosing}.
+
+\subsubsection*{Extracting and uploading audio samples to Zooniverse}
+
+The crowd-sourcing platform Zooniverse \citep{zooniverse} has been extensively employed in both the natural \citep{gravityspy} and the social sciences. More recently, researchers have been investigating its potential to classify samples of audio extracted from daylong recordings of children, and the results have been encouraging \citep{semenzin2020a,semenzin2020b}. We provide tools interfacing with Zooniverse's API for preparing and uploading audio samples to the platform and for retrieving the results, while protecting the privacy of the participants.
+
+\subsubsection*{Audio processing}
+
+ChildProject allows the batch conversion of the recordings to any target audio format (using ffmpeg; \citealt{ffmpeg}).
+
+The package also implements a ``vetting'' \citep{Cychosz2020} pipeline, which mutes segments of the recordings previously annotated by humans as confidential while preserving the duration of the audio files. After being processed, the recordings can safely be shared with other researchers or annotators.
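+
+The sketch below conveys the underlying idea; it is not the package's pipeline, the file paths and column names are illustrative, and the \texttt{soundfile} library is used here for reading and writing the audio:
+
+\begin{verbatim}
+import pandas as pd
+import soundfile as sf
+
+audio, rate = sf.read('recordings/raw/recording.wav')
+# onsets/offsets (in ms) of segments flagged as confidential by annotators
+private = pd.read_csv('private_segments.csv')
+
+for segment in private.itertuples():
+    onset = int(segment.segment_onset * rate / 1000)
+    offset = int(segment.segment_offset * rate / 1000)
+    audio[onset:offset] = 0  # mute the confidential span
+
+sf.write('recordings/vetted/recording.wav', audio, rate)
+\end{verbatim}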
+
+If necessary, users can easily design custom audio converters suiting more specific needs.
+
+\subsubsection*{Other functionalities}
+
+The package offers additional functions such as a pipeline that strips LENA's annotations of data that could be used to identify the participants, built upon previous code by \citet{eaf-anonymizer-original}.
+
+\subsubsection*{User empowerment}
+
+The present effort is led by a research lab, and thus relies on personnel and funding that are not permanent. We have therefore done our best to provide information to help the community adopt and maintain this code in the future. Extensive documentation is provided on \url{https://childproject.readthedocs.io}, including detailed tutorials. The code is accessible on GitHub.com.
+
+
+\subsection{DataLad}\label{section:datalad}
+
+\begin{figure}[htb]
+\centering
+\begin{minipage}{.5\linewidth}
+\centering
+\subfloat[]{\label{datalad:a}\resizebox{!}{0.70\linewidth}{\large\input{Fig1a.tex}\normalsize}}
+\end{minipage}%
+\begin{minipage}{.5\linewidth}
+\centering
+\subfloat[]{\label{datalad:b}\resizebox{!}{0.70\linewidth}{\large\input{Fig1b.tex}\normalsize}}
+\end{minipage}\par\medskip
+
+
+\caption{\label{fig:datalad}\textbf{DataLad development activity}. (a) Number of versions published over time. More than 50 versions have been released since 2015-01-01, at a steady pace. (b) Share of commits held by the top contributors in the last year (2020). At least three developers have contributed substantially, each of them being responsible for about 30\% of the commits.}
+
+\end{figure}
+
+The combination of standards and the ChildProject package allows us to solve some of the problems laid out in the Introduction, but they do not directly provide solutions to the problems of data sharing and collaborative work. DataLad, however, has been specifically designed to address such needs.
+
+DataLad \citep{datalad_handbook} was initially developed by researchers from the computational neuroscience community for the sharing of neuroimaging datasets. It has been under active development at a steady pace for at least six years (fig. \ref{datalad:a}). It is co-funded by the United States NSF and the German Federal Ministry of Education and Research and has several major code developers (fig. \ref{datalad:b}).% thereby lowering its bus-factor\footnote{\url{https://en.wikipedia.org/wiki/Bus_factor}} :D.
+
+DataLad relies on git-annex, software designed to manage large files with git. Over the years, git has rapidly overtaken competitors such as Subversion, and it has been popularized by platforms such as GitLab and GitHub. However, git does not natively handle large binary files, our recordings included. Git-annex circumvents this issue by versioning only pointers to the large files. The actual content of the files is stored in an ``annex''. Annexes can be stored remotely on a variety of supports, including Amazon Glacier, Amazon S3, Backblaze B2, Box.com, Dropbox, FTP/SFTP, Google Cloud Storage, Google Drive, Internet Archive via S3, Microsoft Azure Blob Storage, Microsoft OneDrive, OpenDrive, OwnCloud, SkyDrive, Usenet, and Yandex Disk.
+
+A DataLad dataset is, essentially, a git repository with an annex. As such, it naturally allows version control, easy collaboration with many contributors, and continuous testing. Furthermore, its use is intuitive to git users.
+
+In using git-annex, DataLad enables users to download only the files that they need, without needing to fetch the whole dataset.
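+
+For instance, a user interested in a single set of annotations could proceed as in the following sketch, using DataLad's Python API (the dataset URL and the paths are placeholders):
+
+\begin{verbatim}
+import datalad.api as dl
+
+# clone the dataset: only metadata and file pointers are retrieved
+dl.clone(source='<dataset-url>', path='dataset')
+
+# fetch the actual content of the files needed for the analysis
+dl.get('dataset/annotations/vtc/converted')
+\end{verbatim}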
+
+DataLad improves upon git-annex by adding a number of functionalities. One of them, dataset nesting, is built upon git submodules. A DataLad dataset can include sub-datasets, with as many levels of recursion as needed. This provides a natural solution to the question of how to document analyses, as an analysis repository can have the dataset on which it depends embedded as a subdataset. It also provides a good solution for the issue of different levels of data containing more or less identifying information, via the possibility of restricting permissions to different levels of the hierarchy.
+
+Like git, DataLad is a decentralized system, meaning that data can be stored and replicated across several ``remotes''. DataLad authors have argued in favor of decentralized research data management, as it simplifies infrastructure migrations and helps improve the scalability of the data storage and distribution design \cite{decentralization_hanke}. Additionally, decentralization is notably useful in that it helps achieve redundancy; files can be pushed simultaneously to several storage supports (e.g., an external hard-drive and a cloud provider). Moreover, when deleting large files from a local repository, DataLad automatically makes sure that at least a certain number of remotes (one, by default) still hold a copy of the data.
+
+Many of the \emph{remotes} supported by DataLad require user authentication, thus allowing for fine-grained access permissions management, such as Access-Control Lists (ACL). There are at least two ways to implement multiple levels of access within a dataset. One involves using sub-datasets with stricter access requirements. It is also possible to store data across several git-annex remotes with varying access permissions, depending on their sensitivity. Path-based pattern matching rules may be configured in order to automatically select which remote the files should be pushed to. More flexible selection rules can be implemented using git-annex metadata, which allows files to be labelled with \texttt{key=value} pairs. For instance, one could tag confidential files as \texttt{confidential=yes} and exclude these from certain remotes (blacklist). Alternatively, some files could be pushed to a certain remote provided they are labelled \texttt{public=yes} (whitelist).
+
+DataLad's metadata\footnote{\url{http://docs.datalad.org/en/stable/metadata.html}} system can extract and aggregate information describing the contents of a collection of datasets. A search function then allows the discovery of datasets based on these metadata. We have developed a DataLad extension to extract meaningful metadata from datasets into DataLad's metadata system \citep{datalad_extension}. This allows one, for instance, to search for datasets covering a given language. Moreover, DataLad can natively incorporate DataCite \citep{brase2009datacite} descriptions into its metadata.
+
+DataLad can record the data and software dependencies associated with a script as it is run. These scripts can later be re-executed by others, and the dependencies will automatically be downloaded. This way, DataLad can keep track of how each intermediate file was generated, thus simplifying the reproducibility of analyses. DataLad's handbook provides a tutorial to create a fully reproducible paper \citep[Chapter~22]{datalad_handbook}, and a template is available on GitHub \citep{reproducible_paper}.
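+
+A hedged sketch of this mechanism is shown below, assuming DataLad's Python API exposes \texttt{run} with \texttt{cmd}, \texttt{inputs} and \texttt{outputs} arguments mirroring the command-line tool; the command and paths are illustrative:
+
+\begin{verbatim}
+import datalad.api as dl
+
+# record the command, its inputs and its outputs in the dataset's history;
+# inputs are fetched automatically before execution
+dl.run(
+    cmd='python code/recall.py .',
+    inputs=['annotations/vtc/converted', 'annotations/its/converted'],
+    outputs=['scores.csv', 'Fig4.pdf'],
+)
+\end{verbatim}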
+
+DataLad is domain-agnostic, which makes it suitable for maturing techniques such as language acquisition studies based on long-form recordings. The open-access data of the WU-Minn Human Connectome Project \citep{pub.1022076283}, totalling 80 terabytes to date, have been made available through DataLad \footnote{\label{note:hcp}\url{https://github.com/datalad-datasets/human-connectome-project-openaccess}}.
+
+
+\subsection{Storage and distribution}\label{section:gin}
+
+DataLad does not provide, by itself, the infrastructure to share data. However, it allows maintainers to publish their content to a wide range of \href{https://git-annex.branchable.com/special_remotes/}{platforms}. One can therefore implement different strategies for the storage and distribution of the data using any combination of these providers, depending on the constraints.
+
+Table \ref{table:providers} sums up the most relevant characteristics of a few providers that are appropriate for our research, although many more could be considered. Datasets can only be cloned from providers that support git, and the large files can only be stored on those that support git-annex. Platforms that only support the former, such as GitHub, should therefore be used in tandem with providers that support the latter, like Amazon S3.
+
+Among criteria of special interest are: the provider's ability to handle complex permissions; how much data it can accept; its ability to assign permanent URLs and identifiers to the datasets; and of course, whether it complies with the legislation regarding privacy. For our purposes, Table \ref{table:providers} suggests GIN is the best option, handling large files well, with highly customizable permissions, and Git-based version control and access (see Appendix \ref{appendix:gin} for a practical use-case of GIN). That said, private projects are limited in space, although at the time of writing this limit can be raised by arrangement with the GIN administrators. The next best option may be S3, and some users may prefer S3 when they do not have access to a local cluster, since S3 allows both easy storage and processing.
+
+To render comparison of options easier, detailed examples of storage designs taken from real datasets are listed in Appendix \ref{appendix:examples}. Scripts to implement these strategies can be found on our GitHub and OSF \citep{datalad_procedures}. We also provide a tutorial based on a public corpus \citep{vandam-day} to convert existing data to our standards and then publish it with DataLad\footnote{\url{https://childproject.readthedocs.io/en/latest/vandam.html}}.
+We would like to emphasize that the flexibility of DataLad makes it very easy to migrate from one architecture to another. The underlying infrastructure may change, with little to no impact on the users, and little effort from the maintainers.
+
+In any case, we strongly recommend that users bear in mind that redundancy is important to make sure data are not lost, so a backup sibling may be hosted at an additional site (e.g., on a computer on campus in addition to the cloud-based version).
+
+For instance, \citet{Perkel_2019} suggests several practices regarding backups, including automated backups, privacy safe-guarding, regular tests, and offline backups. Table \ref{table:backups} may orient the reader towards the functionalities of DataLad (and git-annex) which can be used to achieve these goals.
+
+\begin{table*}[ht]
+
+\begin{minipage}{\columnwidth}%
+\centering
+\renewcommand\footnoterule{ \kern -1ex}
+\renewcommand{\thempfootnote}{\alph{mpfootnote}}
+
+\begin{tabular}{@{}lllllll@{}}
+\toprule
+\multicolumn{1}{c}{\textbf{Provider}} & \multicolumn{1}{c}{\textbf{Git}\footnote{The provider can store the git history and provide an URL from which the dataset can be installed.}} & \multicolumn{1}{c}{\textbf{Large files}\footnote{The provider handles git-annex large files.}} & \multicolumn{1}{c}{\textbf{Authentication}} & \multicolumn{1}{c}{\textbf{Permissions}} & \multicolumn{1}{c}{\textbf{Quota}} &
+\multicolumn{1}{c}{\textbf{\begin{tabular}[t]{@{}l@{}}DOI\\registration\end{tabular}}} \\ \midrule
+\midrule
+SSH server                            & Yes                              & Yes                                      & SSH                                         & Unix                                     & Self-hosted & No                                 \\
+GIN                                   & Yes                              & Yes                                      & HTTPS or SSH                                            & ACL                                      & \footnote{\label{contact}Contact the administrators} 
+ & Yes\footnoteref{contact}\\
+GitHub                                & Yes                              & No                                       & HTTPS or SSH                                & ACL                                      & --  & No                                \\
+GitLab                                & Yes                              & No                                       & HTTPS or SSH                                & ACL                                      & --  & No                                \\
+
+Nextcloud                             & No                               & Yes                                      &                                             & ACL                                      & Self-hosted & No                                 \\
+Amazon S3                             & No                               & Yes                                      & API key+secret                              & IAM                                      & Unlimited & No                                  \\
+OSF                                   & Yes\footnote{\label{osf}With limitations (see \url{http://docs.datalad.org/projects/osf/en/latest/intro.html})}                             & Yes\footnoteref{osf}                                     & Token                                       & ACL                                      & \footnote{5 GB for private projects, 50 GB for public projects}  & Yes                              \\
+\end{tabular}
+\end{minipage}
+\caption{\label{table:providers}\textbf{Overview of several providers that can be used with DataLad}. The Unix permission system allows only one user and one group to be granted specific access rights. Access Control Lists (ACL) give more control, by enabling access to several groups and users. Amazon's Identity Access Management (IAM) can imitate ACLs, while providing more functionalities (fully-programmable; time-limited permissions; etc.) }
+\end{table*}
+
+\begin{table*}[ht]
+\begin{minipage}{\columnwidth}%
+\centering
+\renewcommand\footnoterule{ \kern -1ex}
+\renewcommand{\thempfootnote}{\alph{mpfootnote}}
+\begin{tabular}{@{}lll@{}}
+\toprule
+Practice                                                         & \begin{tabular}[t]{@{}l@{}}Relevant\\ software\end{tabular}  & Functionality                                                      \\ \midrule\midrule
+\begin{tabular}[t]{@{}l@{}}offline\\ backups\end{tabular}       & \begin{tabular}[t]{@{}l@{}}DataLad\\ \ \\ git-annex\end{tabular}   & \begin{tabular}[t]{@{}l@{}}create-sibling, push\footnote{creates a local sibling to which the data can be pushed, e.g. an external hard-drive.};\\export-archive;\\copy;export\footnote{exports human-readable snapshots of a dataset}\end{tabular} \\ \midrule
+\begin{tabular}[t]{@{}l@{}}backup\\ automation\end{tabular}     & DataLad   & \begin{tabular}[t]{@{}l@{}}siblings\\ ``publish-depends''\footnote{``publish-depends'' specifies which other siblings should be pushed to every time some other sibling is updated. Maintainers can thus make sure that pushing to the main repository will trigger a push to the backup sibling.}\end{tabular} \\ \midrule
+\begin{tabular}[t]{@{}l@{}}privacy\\ safe-guarding\end{tabular} & git-annex & encryption                                                         \\ \midrule
+regular tests                                                   & git-annex & \texttt{fsck}\footnote{integrity check}                                     \\  \bottomrule
+\end{tabular}
+\caption{\label{table:backups}\textbf{Examples of recommended practices for data backups, together with the software that could be used for their implementation}.}
+\end{minipage}
+\end{table*}
+
+\section{Application: evaluating annotations' reliability}
+
+
+Assessing the reliability of the annotations is crucial to linguistic research, but it can prove tedious in the case of daylong recordings. On one hand, analysis of the massive amounts of annotations generated by automatic tools may be computationally intensive. On the other hand, human annotations are usually sparse and thus more difficult to match with each other. Moreover, as emphasized in Section \ref{section:problemspace}, the variety of file formats used to store the annotations makes it even harder to compare them.
+
+Making use of the consistent data structures that it provides, the ChildProject package implements functions for extracting and aligning annotations regardless of their provenance or nature (human vs algorithm, ELAN vs Praat, etc.). It also provides functions to compute most of the metrics commonly used in linguistics and speech processing, relying on existing efficient and field-tested implementations.
+
+Figure \ref{fig:Annotation} illustrates a recording annotated by three annotators (Alice, Bob and John). In this case, if one is interested in comparing the annotations of Bob and Alice, then the segments A, B and C should be compared. However, if the annotations common to all three annotators are to be compared simultaneously, only segment B should be considered.
+In real datasets with many recordings and several human and automatic annotators, the layout of annotation coverage may become unpredictable. Relying on the index of annotations described in Section \ref{section:annotations}, the ChildProject package can calculate the intersection of the portions of audio covered by several annotators and return all matching annotations. These annotations can be filtered (e.g. excluding certain audio files), grouped according to certain characteristics (e.g. by participant), or collapsed altogether for subsequent analyses.
+
+
+\begin{figure*}[htb]
+\centering
+\subfloat[]{%
+\centering
+  \includegraphics[trim=0 250 100 25, clip, width=0.8\textwidth]{Fig3a.jpg}
+  \label{Annotation:1}%
+}
+
+%\subfloat[]{%
+%\centering
+%  \includegraphics[trim=0 250 25 25, clip, width=0.98\textwidth]{Fig3b.jpg}
+%  \label{Annotation:2}%
+%}
+
+\caption{\label{fig:Annotation}\textbf{Example of time-intervals of a recording covered by three annotators}. Automated annotations usually cover whole recordings, while human annotators typically annotate periodic or targeted clips. }
+
+\end{figure*}
+
+
+In psychometrics, the reliability of annotators is usually evaluated using inter-coder agreement indicators. The Python package enables the calculation of some of these measures. Krippendorff's Alpha and Fleiss' Kappa \citep{kappa} have been implemented with NLTK \citep{nltk}. The gamma method \citep{gamma}, which aims to improve upon previous indicators by simultaneously evaluating the quality of both the segmentation and the categorization of speech, has been included using the implementation by \citet{pygamma_agreement}.
+It should be noted that these measures are most useful in the absence of ground truth, when reliability of the annotations can only be assessed by evaluating their overall agreement. Automatic annotators, however, are usually evaluated against a gold standard produced by human experts. In such cases, the package allows comparisons of pairs of annotators using metrics such as F-score, recall, and precision. Figure \ref{fig:precision} illustrates this functionality. Additionally, the package can compute confusion matrices between two annotators, allowing more informative comparisons, as demonstrated in Figure \ref{fig:confusion}. Finally, the Python package interfaces well with \texttt{pyannote.metrics} \citep{pyannote.metrics}, and all the metrics implemented by the latter can be effectively used on the annotations managed with ChildProject.
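+
+The following sketch, adapted from the code used to produce Figure \ref{fig:precision}, shows how precision and recall can be derived for one speaker type (the set and speaker labels are examples, and \texttt{segments} is a collapsed segments dataframe obtained as in Section \ref{section:annotations}):
+
+\begin{verbatim}
+from ChildProject.metrics import segments_to_annotation
+from pyannote.metrics.detection import DetectionPrecisionRecallFMeasure
+
+ref = segments_to_annotation(
+    segments[(segments['set'] == 'vtc') & (segments['speaker_type'] == 'CHI')],
+    'speaker_type')
+hyp = segments_to_annotation(
+    segments[(segments['set'] == 'its') & (segments['speaker_type'] == 'CHI')],
+    'speaker_type')
+
+metric = DetectionPrecisionRecallFMeasure()
+detail = metric.compute_components(ref, hyp)
+precision, recall, f = metric.compute_metrics(detail)
+\end{verbatim}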
+
+\begin{figure*}[htb]
+
+\centering
+\includegraphics[width=0.8\textwidth]{Fig4.pdf}
+
+\caption{\label{fig:precision}\textbf{Examples of diarization performance evaluation using recall, precision and F1 score}. Audio from the public VanDam corpus \citep{vandam-day} is annotated according to who-speaks-when, using both the LENA diarizer (its) and the Voice Type Classifier (VTC) by \citet{lavechin2020opensource}. Speech segments are classified among four speaker types: the key child (CHI), other children (OCH), male adults (MAL) and female adults (FEM). For illustration purposes, fake annotations are generated from those of the VTC. Two are obtained by randomly reassigning the speaker type of 50\% and 75\% of the VTC's speech segments (conf). Two are obtained by dropping 50\% and 75\% of the speech segments from the VTC (drop). Recall, precision and F1 score are calculated for each of these annotations, by comparing them to the VTC. The worst F-score for the LENA is reached for OCH segments. Dropping segments does not alter precision; however, as expected, it has a substantially negative impact on recall.
+}
+
+\end{figure*}
+
+
+\begin{figure*}[htb]
+
+\centering
+\includegraphics[width=\textwidth]{Fig5.pdf}
+
+\caption{\label{fig:confusion}\textbf{Example of diarization performance comparison using confusion matrices}.
+LENA's annotations (its) of the public VanDam corpus \citep{vandam-day} are compared to the VTC's. The first coefficient of the left side matrix should be read as: ``41\% of CHI segments from the VTC are labelled as CHI by the LENA''. The first coefficient of the right side matrix should be read as: ``71\% of the CHI segments of the LENA are labelled as CHI by the VTC''. It can also be seen that the LENA does not produce overlapping speech segments, i.e. it cannot disambiguate two overlapping speakers.
+}
+
+\end{figure*}
+
+
+%\subsubsection{Possible improvements}
+% adding pyanote 
+% pygamma
+% generalizing to other annotation types
+
+
+\section{Generalization}
+
+The kinds of problems that our proposed approach addresses are relevant to at least three other bodies of data, all of them based on large datasets collected with wearables. First, there is a line of research on interaction and its effects on well-being among neurotypical adults (e.g., \cite{ear1}). Second, audio data from wearables holds promise for individuals with medical and psychological conditions that have behavioral consequences which can evolve over time, including conditions that lead to coughing \citep{Wu2018} and/or neurodegenerative disorders \citep{riad2020vocal}. Third, some researchers hope to gather datasets on child development combining multiple information sources, such as parental reports and other sensors picking up motion and psychophysiological data, with the goal of potentially intervening when needed \citep{levin2021sensing}.
+
+Our proposed solution can be readily adapted to the first body of data: All that would need to be changed is renaming children.csv to participants.csv; renaming child\_id to participant\_id; and adapting which columns are mandatory and their format (e.g., it is cumbersome to express age in days for adults).
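+
+A minimal sketch of such an adaptation is given below, assuming a pandas workflow; the file and column names follow this paragraph but are otherwise hypothetical, and the metadata specification enforced by the package would also need to be updated accordingly.
+
+\begin{verbatim}
+# Minimal sketch of adapting the metadata to adult participants
+# (hypothetical columns and paths; not part of the package API)
+import pandas as pd
+
+children = pd.read_csv("metadata/children.csv")
+participants = children.rename(columns={"child_id": "participant_id"})
+# e.g. store age in years rather than in days for adult participants
+if "age_in_days" in participants.columns:
+    participants["age_in_years"] = participants["age_in_days"] / 365.25
+participants.to_csv("metadata/participants.csv", index=False)
+\end{verbatim}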
+
+Generalizing our solution to the second body of data requires more adaptation. For such use cases, it would be ideal for the equipment to be left in the patients' house, so that it can be used, for instance, one day per week or month. Additional work is needed to facilitate this, for instance by making the equipment easier to use and more robust, and by facilitating charging and secure data transfer from such off-site locations.
+
+The third use case requires further adaptation, in addition to those just mentioned (making the sensors easy to use and allowing data transfer from potentially insecure home settings). In particular, the data from multiple sensors need to be integrated and time-stamped. We have made some progress in this sense in the context of multiple audio tracks collected with different physical devices (example on XXX), but have not yet developed the structure and code to support the integration of pictures, videos, heart rate data, parental questionnaire data, etc.
+
+\section{Limitations}
+
+DataLad and git-annex are well-documented, and, on the user's end, little knowledge beyond that of git is needed. Maintainers and resource administrators, however, will need a certain level of understanding in order to take full advantage of these tools.
+Recently, \citet{Powell2021} has emphasized the shortcomings of decentralization and the inconveniences of a proliferation of databases with different access protocols. In the future, sharing data could be made even easier if off-the-shelf solutions compatible with DataLad were made readily available to linguists, psychologists, and developmental scientists. To this end, we especially call on our colleagues working on linguistic databases. We are pleased to have found a solution on GIN -- but it is possible that GIN administrators agreed to host our data because there is some potential connection with neuroimaging, whereas they may not be able to justify their use of resources for under-resourced languages and/or other projects that bear little connection to neuroimaging.
+
+We should stress again that the use of the ChildProject package does not require the datasets to be managed with DataLad. They do need, however, to follow certain standards. Standards, of course, do not come without their own issues, especially in the present case of a maturing technique. They may be challenged by ever-evolving software, hardware, and practices. However, we believe that the benefits of standardization outweigh its costs provided that it remains reasonably flexible. Such standards will further help to combine efforts from different teams across institutions. More procedures and scripts that solve recurrent tasks can be integrated into the ChildProject package, which might also speed up the development of future tools. 
+One could argue that new standards most often end up increasing the number of competing standards instead of bringing consensus. Nonetheless, if one standard were eventually to prevail, well-structured datasets would still be easier to adapt than disordered data representations. Meanwhile, we look forward to discussing standards collaboratively with other teams via the GitHub platform, where anyone can create issues to report improvements or bugs, submit pull requests to integrate an improvement they have made, and/or have relevant conversations in the forum.
+
+\section{Summary}
+
+% removed: assessing data reliability
+% Data managers should be interested in DataLad because it might benefit to many studies, beyond long-form recordings. We should convince them it is worth diving into it
+
+We provide a solution to the technical challenges related to the management, storage and sharing of datasets of child-centered day-long recordings. This solution relies on four components: i) a set of standards for the structuring of the datasets; ii) \emph{ChildProject}, a Python package to enforce these standards and perform useful operations on the datasets; iii) DataLad, a mature and actively developed version-control software for the management of scientific datasets; and iv) GIN, a storage provider compatible with DataLad. Building upon these standards, we also provide tools, shipped with the Python package, to simplify the extraction of information from the annotations and the evaluation of their reliability. The four components of our proposed design serve partially independent goals and can thus be decoupled, but we believe their combination would greatly benefit the technique of long-form recordings applied to language acquisition studies.
+
+\section*{Declarations}
+
+\subsubsection*{Funding} 
+This work has benefited from funding and/or institutional support from Agence Nationale de la Recherche (ANR-17-CE28-0007 LangAge,
+ANR-16-DATA-0004 ACLEW, ANR-14-CE30-0003 MechELex, ANR-17-EURE-0017);
+and the J. S. McDonnell Foundation Understanding Human Cognition Scholar Award. We also benefited from code developed in the Bridges system, which is
+supported by NSF award number ACI-1445606, at the Pittsburgh
+Supercomputing Center (PSC), using the Extreme Science and Engineering Discovery Environment
+(XSEDE), which is supported by National Science Foundation grant number OCI-1053575. Additionally, we benefited from processing in GENCI-IDRIS, France (Grant-A0071011046). Some capabilities of our software depend on  the Zooniverse.org platform, the development of which is funded by generous support, including a Global Impact Award from Google, and by a grant from the Alfred P. Sloan Foundation. The funders had no impact on this study.
+
+\subsubsection*{Conflicts of interest/Competing interests}
+
+The authors have no conflict of interests to disclose.
+
+\subsubsection*{Availability of data and material}
+This paper does not directly rely on specific data or material.
+
+\subsubsection*{Code availability}
+
+The ChildProject package is available on GitHub at \url{https://github.com/LAAC-LSCP/ChildProject}. We provide scripts and templates for DataLad managed datasets at \url{http://doi.org/10.17605/OSF.IO/6VCXK} \citep{datalad_procedures}. We also provide a DataLad extension to extract metadata from corpora of daylong recordings \citep{datalad_extension}.
+Examples of annotation evaluations using the package can be found at XXX.
+
+\appendix
+
+\section{Examples of storage strategies}\label{appendix:examples}
+
+\subsection{\label{sec:example1}Example 1 - sharing a dataset within the lab}
+
+In the first example, Alice is hosting large datasets of a few terabytes of recordings and annotations and she wants to share them with Bob -- a collaborator from her own institution -- in a secure manner. Alice and Bob are familiar with GitHub, and they like its user-friendly features such as issues and pull requests. However, GitHub cannot handle such amounts of data.
+
+Alice decides to store the git repository itself on GitHub -- or a GitLab instance, it would not matter -- thus allowing her to benefit from their convenient features while hosting the large files -- the recordings and annotations -- elsewhere. Alice's laboratory has its own cluster with a large storage capacity. Thus, she decides to host the files there for free rather than using a cloud provider.
+
+Since Bob has been given SSH access to the cluster and belongs to the right UNIX group, he can download recordings and annotations from their institution's cluster. Alice also configured the dataset so that every change published to GitHub is also published to the cluster, using DataLad's ``publish-depends'' option.
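+
+Such a configuration can be scripted through DataLad's Python API along the following lines. This is a sketch with placeholder hostnames, paths and repository names; the equivalent command-line calls would work just as well.
+
+\begin{verbatim}
+# Sketch: a GitHub sibling whose pushes depend on a cluster sibling
+# (hostnames, paths and repository names are placeholders)
+import datalad.api as dl
+
+ds = dl.Dataset("/data/my-corpus")
+# SSH sibling on the lab cluster, holding git history and annexed files
+dl.create_sibling("ssh://cluster.example.org/data/my-corpus",
+                  name="cluster", dataset=ds)
+# GitHub sibling for metadata and scripts; publishing to it
+# first pushes the annexed content to the cluster
+dl.create_sibling_github("my-corpus", dataset=ds, name="origin",
+                         publish_depends="cluster")
+ds.push(to="origin")
+\end{verbatim}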
+
+For backup purposes, a third sibling is hosted on Amazon S3 Glacier -- which is cheaper than S3 at the expense of higher retrieval costs and delays -- as a git-annex \href{https://git-annex.branchable.com/special_remotes/}{special remote}. Special remotes do not store the git history and they cannot be used to clone the dataset. However, they can be used as a storage support for the recordings and other large files. In order to increase the security of the data, Alice uses encryption. Git-annex implements several encryption schemes\footnote{\url{https://git-annex.branchable.com/encryption/}}. The hybrid scheme allows adding public GPG keys at any time without additional decryption/encryption steps. Each user can then later decrypt the data with their own private key. This way, as long as at least one private GPG key has not been lost, data are still recoverable. This is especially valuable in that it naturally ensures redundancy of the decryption keys, which is critical in the case of encrypted backups.
+
+By default, file names are hashed with an HMAC algorithm, and their content is encrypted with AES-128 -- GPG's default -- although another algorithm could be selected.
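+
+A sketch of setting up such an encrypted special remote is given below. The vault name, region and GPG key identifiers are placeholders, and the exact git-annex options depend on the chosen backend.
+
+\begin{verbatim}
+# Sketch: encrypted git-annex special remote for backups
+# (placeholder names and key ids; run from within the dataset)
+import subprocess
+
+subprocess.run([
+    "git", "annex", "initremote", "backup",
+    "type=glacier", "datacenter=eu-west-1", "vault=my-vault",
+    "encryption=hybrid",       # further GPG keys can be added later
+    "keyid=ALICE_GPG_KEY_ID",  # at least one public key to start with
+], check=True)
+
+# additional collaborators can be granted access later on, e.g.:
+# git annex enableremote backup keyid+=BOB_GPG_KEY_ID
+\end{verbatim}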
+
+This setup ensures redundancy of git files (hosted on both GitHub and the cluster) as well as of large files (stored on both the cluster and Amazon Deep Glacier). It also allows Bob to report and correct errors he finds, and/or to add annotations in a straightforward manner, benefiting Alice. By virtue of having siblings, they can make sure that their local datasets are organized in an identical manner, facilitating collaboration and reproducibility in their analyses.
+
+Table \ref{table:storage1} illustrates such a strategy. In this example, users install the dataset from a private GitHub repository. Continuous testing is configured with Travis CI\footnote{\url{https://travis-ci.com/}}, in order to ensure the integrity of the dataset at every step. GitHub Actions could also be used for that purpose\footnote{\url{https://docs.github.com/en/actions}}.
+
+We used this strategy -- minus the Glacier backups -- to maintain and deliver 4 datasets with 8700 hours of audio\footnote{\url{https://github.com/LAAC-LSCP/datasets}} for several months. The associated scripts can be found in \cite{datalad_procedures}. We have now transitioned to using GIN for our main site, with our cluster as the backup. The scripts associated with this setup can be found at the same location.
+
+
+\begin{table*}[!htbp]
+\centering
+\begin{tabular}{@{}lllllll@{}}
+\toprule
+\textbf{Sibling} & \textbf{Provider} & \textbf{Content} & \textbf{Access} & \textbf{Encryption} \\ \midrule
+origin   & GitHub     & metadata; scripts & Lab & No  \\
+cluster  & SSH server & everything & Lab & No  \\
+backup & Amazon Deep Glacier  &  recordings; annotations  & Lab            & AES-128 \\ \bottomrule
+\end{tabular}
+\caption{\label{table:storage1}Example 1 - Storage strategy example relying on GitHub and a cluster to deliver the data.}
+\end{table*}
+
+\subsection{Example 2 - sharing large datasets with outside collaborators (S3)}
+
+The previous strategy is not suitable when complex permissions are required, since SSH remotes only handle Unix-style permissions (user, group, all).
+
+Moreover, Alice may want to share the dataset with collaborators outside her lab, without giving them SSH access to her institution's cluster. Alternatively, she may not even own the infrastructure that would allow her to store and share such large amounts of data.
+
+Instead, she decides to use Amazon S3 together with GitHub. Authorized users are provided their own Amazon S3 API key and secret, which are managed with Amazon's Identity and Access Management (IAM). The GitHub repository is stripped of all confidential data, which are stored in the S3 annex only, so that access permissions can be managed entirely through IAM. This strategy is used by the Human Connectome Project\footnoteref{note:hcp}.
+
+Furthermore, Alice makes sure to encrypt GDPR-relevant data, using strong symmetric encryption (AES-128). This strategy is illustrated in Table \ref{table:storage2}.
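+
+The S3 sibling can be initialized along the following lines. This is a sketch with placeholder bucket and region names; per-user credentials are issued through IAM and read by git-annex from the environment.
+
+\begin{verbatim}
+# Sketch: S3 special remote with shared (symmetric) encryption
+# (placeholder bucket and region; credentials come from IAM users)
+import os, subprocess
+
+os.environ["AWS_ACCESS_KEY_ID"] = "..."      # per-user IAM credentials
+os.environ["AWS_SECRET_ACCESS_KEY"] = "..."
+subprocess.run([
+    "git", "annex", "initremote", "s3",
+    "type=S3", "datacenter=eu-west-1", "bucket=my-corpus-annex",
+    "encryption=shared",   # symmetric encryption of annexed content
+    "chunk=100MiB",
+], check=True)
+\end{verbatim}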
+
+\begin{table*}[!htbp]
+\centering
+\begin{tabular}{@{}lllllll@{}}
+\toprule
+\textbf{Sibling} & \textbf{Provider} & \textbf{Content} & \textbf{Access} & \textbf{Encryption} \\ \midrule
+origin   & GitHub     & metadata; scripts & Collaborators & No  \\
+s3 & Amazon S3  &  recordings; annotations  & Collaborators  & AES-128 \\ \bottomrule
+\end{tabular}
+\caption{\label{table:storage2}Example 2 - Storage strategy example relying on GitHub and Amazon S3.}
+\end{table*}
+
+Amazon is superior to most alternatives for a number of reasons, including that it is highly tested, developed by engineers with a high level of knowledge of the platform, and widely used. This means that the code is robust even before it is released, and it is widely tested once it is released. The fact that there are many users also entails that issues or questions can be looked up online. In addition, in the context of data durability, Amazon is a good choice because it is too big to fail, and thus probably available for the long term. Finally, in sheer terms of flexibility and coverage, Amazon provides a whole suite of tools (for data sharing, backups, and processing), which may be useful for researchers with little access to high-capacity infrastructures.
+
+\subsection{Example 3 - sharing large datasets with outside collaborators  and multi-tier access (GIN)}\label{appendix:gin}
+
+Due to legislation in some countries, some researchers may not be authorized to store their data on Amazon. If they also lack access to a local cluster (see Example 1), or if they have one but need finer control over access permissions, there are alternatives which can be used as a workaround.
+
+Finding herself in this setting, Alice decides to use the G-Node Infrastructure (GIN)\footnote{\url{https://gin.g-node.org/}}, which is dedicated to providing ``Modern Research Data Management for Neuroscience''. GIN is similar to GitLab and GitHub in many respects, except that it also supports git-annex and can thus directly host the large files that would require third-party providers on those platforms.
+
+Just like GitLab or GitHub, it can handle complex permissions at the user or group level, thus surpassing Unix-style permissions management.
+
+In this case, Alice needs three permission tiers: 1) read-only access to anonymized data, 2) read-only access to confidential data, and 3) read and write access to the whole dataset. In order to achieve this, she creates two GIN siblings per dataset: \texttt{origin} and \texttt{confidential}. The dataset is configured to publish all the files whose path contains \path{/confidential/} to the \texttt{confidential} repository, while the rest of the data is published to \texttt{origin}. Alice can then grant read-only access to \texttt{origin} to both Bob and Carol, while restricting access to \texttt{confidential} to Bob only.
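+
+One way to implement this routing is through git-annex preferred content expressions, as sketched below; the sibling names are those of Table \ref{table:storage3}.
+
+\begin{verbatim}
+# Sketch: route annexed files to siblings by path
+# (uses git-annex "preferred content" expressions)
+import subprocess
+
+# the confidential sibling only wants files under confidential/ folders
+subprocess.run(["git", "annex", "wanted", "confidential",
+                "include=*/confidential/*"], check=True)
+# the public-facing sibling wants everything else
+subprocess.run(["git", "annex", "wanted", "origin",
+                "exclude=*/confidential/*"], check=True)
+# subsequent pushes (e.g. `datalad push --to origin`) then only
+# transfer the annexed files matching each sibling's expression
+\end{verbatim}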
+
+Since Alice has not been allowed to use a cloud provider, and lacks local infrastructure, she needs an alternative solution for her backups. She may use external hard drives, as DataLad allows pushing data to local storage just as to any other kind of sibling.
+
+Table \ref{table:storage3} sums up this strategy, which is currently used to deliver the EL1000 dataset\footnote{\url{https://gin.g-node.org/EL1000/EL1000}} -- except for the backup, which is located on our cluster. The EL1000 is a composite dataset, created through the contribution of 18 different teams that collected data independently but with comparable methods.
+
+\begin{table*}[!htbp]
+\centering
+\begin{tabular}{@{}lllllll@{}}
+\toprule
+\textbf{Sibling} & \textbf{Provider} & \textbf{Content} & \textbf{Access} & \textbf{Encryption} \\ \midrule
+origin   & GIN     & files NOT matching \path{**/confidential/*} & \begin{tabular}[t]{@{}l@{}}Alice (read+write);\\Bob, Carol (read-only)\end{tabular} & No  \\
+confidential & GIN  & files matching \path{**/confidential/*}  & \begin{tabular}[t]{@{}l@{}}Alice (read+write);\\Bob (read-only)\end{tabular}  & No \\ 
+backup & \begin{tabular}[t]{@{}l@{}}external\\hard drive\end{tabular}  & everything  & Alice  & No \\\bottomrule
+\end{tabular}
+\caption{\label{table:storage3}Example 3 - Storage strategy example relying solely on GIN to deliver the data.}
+\end{table*}
+
+
+\subsection{Example 4 - Sharing smaller datasets (OSF)}
+
+
+The Open Science Framework (OSF) is especially interesting because it supports DOI registration, providing permanent URLs to access the datasets. Moreover, a DataLad extension has been developed specifically to work with OSF, which may host both the git repository and the large files (see Table \ref{table:providers}). In addition, Shibboleth credentials can be used with OSF.
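+
+With the \texttt{datalad-osf} extension installed, creating such a sibling can look as follows. This is a sketch; the project title, path and sibling name are placeholders, and the available modes depend on the extension version.
+
+\begin{verbatim}
+# Sketch: OSF sibling via the datalad-osf extension
+# (requires `pip install datalad-osf` and OSF credentials;
+#  title, path and sibling name are placeholders)
+import datalad.api as dl
+
+ds = dl.Dataset("/data/my-demo-dataset")
+# mode controls whether annexed file content is also stored on OSF
+dl.create_sibling_osf(title="My demo dataset", name="osf",
+                      mode="annex", dataset=ds)
+ds.push(to="osf")
+\end{verbatim}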
+
+Low quotas are an important downside with OSF. Public projects are limited to 50 GB, and private projects cannot exceed 5 GB, which is too low for most long-form datasets.
+However, OSF could be used to host only the git repository, effectively providing a permanent URL from which the dataset can be installed, as long as the content of the large files remains available from a third-party provider, e.g. Amazon S3. Table \ref{table:storage4} illustrates such a strategy.
+
+\begin{table*}[!htbp]
+\centering
+\begin{tabular}{@{}lllllll@{}}
+\toprule
+\textbf{Sibling} & \textbf{Provider} & \textbf{Content} & \textbf{Access} & \textbf{Encryption} \\ \midrule
+origin   & OSF     & metadata; scripts & Everyone & No  \\
+s3 & Amazon S3  & annotations; recordings & Alice, Bob and Carol  & No \\ \bottomrule
+\end{tabular}
+\caption{\label{table:storage4}Example 4 - Storage strategy example relying on OSF and Amazon S3 to deliver the data.}
+\end{table*}
+
+We use the reverse approach for our demo dataset\footnote{\url{https://github.com/LAAC-LSCP/vandam-daylong-demo}} based on \citet{vandam-day}, hosting the git repository on GitHub and the large files on OSF. This is possible only because of the small size of the dataset.
+
+\bibliographystyle{spbasic}
+\bibliography{references}
+
+\end{document}

+ 704 - 0
references.bib

@@ -0,0 +1,704 @@
+@article{ear1,
+  doi = {10.3758/bf03195410},
+  url = {https://doi.org/10.3758/bf03195410},
+  year = {2001},
+  month = nov,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {33},
+  number = {4},
+  pages = {517--523},
+  author = {Matthias R. Mehl and James W. Pennebaker and D. Michael Crow and James Dabbs and John H. Price},
+  title = {The Electronically Activated Recorder ({EAR}): A device for sampling naturalistic daily activities and conversations},
+  journal = {Behavior Research Methods,  Instruments,  {\&} Computers}
+}
+
+@inproceedings{schuller2017interspeech,
+  title={The Interspeech 2017 computational paralinguistics challenge: Addressee, cold \& snoring},
+  author={Schuller, Bj{\"o}rn and Steidl, Stefan and Batliner, Anton and Bergelson, Elika and Krajewski, Jarek and Janott, Christoph and Amatuni, Andrei and Casillas, Marisa and Seidl, Amanda and Soderstrom, Melanie and others},
+  booktitle={Interspeech},
+  year={2017}
+}
+
+@article{lenaeval1,
+  title={A thorough evaluation of the Language Environment Analysis (LENA) system},
+  author={Cristia, Alejandrina and Lavechin, Marvin and Scaff, Camila and Soderstrom, Melanie and Rowland, Caroline and R{\"a}s{\"a}nen, Okko and Bunce, John and Bergelson, Elika},
+  year={2019},
+  publisher={OSF Preprints},
+  journal={Behavior Research Methods}
+}
+@article{boersma2006praat,
+  title={Praat: doing phonetics by computer},
+  author={Boersma, Paul},
+  journal={http://www.praat.org/},
+  year={2006}
+}
+@inproceedings{krishnamachari2021developing,
+  title={Developing Neural Representations for Robust Child-Adult Diarization},
+  author={Krishnamachari, Suchitra and Kumar, Manoj and Kim, So Hyun and Lord, Catherine and Narayanan, Shrikanth},
+  booktitle={2021 IEEE Spoken Language Technology Workshop (SLT)},
+  pages={590--597},
+  year={2021},
+  organization={IEEE}
+}
+
+
+@article{warlaumont2014social,
+  title={A social feedback loop for speech development and its reduction in autism},
+  author={Warlaumont, Anne S and Richards, Jeffrey A and Gilkerson, Jill and Oller, D Kimbrough},
+  journal={Psychological science},
+  volume={25},
+  number={7},
+  pages={1314--1324},
+  year={2014},
+  publisher={Sage Publications Sage CA: Los Angeles, CA}
+}
+
+@article{riad2020vocal,
+  title={Vocal markers from sustained phonation in Huntington's Disease},
+  author={Riad, Rachid and Titeux, Hadrien and Lemoine, Laurie and Montillot, Justine and Bagnou, Jennifer Hamet and Cao, Xuan Nga and Dupoux, Emmanuel and Bachoud-L{\'e}vi, Anne-Catherine},
+  journal={Interspeech},
+  year={2020}
+}
+
+@article{ear2,
+  doi = {10.1037/0022-3514.84.4.857},
+  url = {https://doi.org/10.1037/0022-3514.84.4.857},
+  year = {2003},
+  publisher = {American Psychological Association ({APA})},
+  volume = {84},
+  number = {4},
+  pages = {857--870},
+  author = {Matthias R. Mehl and James W. Pennebaker},
+  title = {The sounds of social life: A psychometric analysis of students{\textquotesingle} daily social environments and natural conversations.},
+  journal = {Journal of Personality and Social Psychology}
+}
+@book{macwhinney2000childes,
+  title={The CHILDES project: The database},
+  author={MacWhinney, Brian},
+  volume={2},
+  year={2000},
+  publisher={Psychology Press}
+}
+
+
+@inproceedings{vandam2016homebank,
+  title={HomeBank: An online repository of daylong child-centered audio recordings},
+  author={VanDam, Mark and Warlaumont, Anne S and Bergelson, Elika and Cristia, Alejandrina and Soderstrom, Melanie and De Palma, Paul and MacWhinney, Brian},
+  booktitle={Seminars in Speech and Language},
+  volume={37},
+  pages={128},
+  year={2016},
+  organization={NIH Public Access}
+}
+
+@article{christakis2009audible,
+  title={Audible television and decreased adult words, infant vocalizations, and conversational turns: a population-based study},
+  author={Christakis, Dimitri A and Gilkerson, Jill and Richards, Jeffrey A and Zimmerman, Frederick J and Garrison, Michelle M and Xu, Dongxin and Gray, Sharmistha and Yapanel, Umit},
+  journal={Archives of pediatrics \& adolescent medicine},
+  volume={163},
+  number={6},
+  pages={554--558},
+  year={2009},
+  publisher={American Medical Association}
+}
+
+@misc{warl,
+  doi = {10.21415/T54S3C},
+  url = {http://homebank.talkbank.org/access/Password/Warlaumont.html},
+  author = {Warlaumont,  Anne},
+  title = {HomeBank Warlaumont Corpus},
+  publisher = {TalkBank},
+  year = {2016}
+}
+
+@article{Nee2021,
+  doi = {10.3765/plsa.v6i1.4967},
+  url = {https://doi.org/10.3765/plsa.v6i1.4967},
+  year = {2021},
+  month = mar,
+  publisher = {Linguistic Society of America},
+  volume = {6},
+  number = {1},
+  pages = {213},
+  author = {Julia Nee},
+  title = {Understanding the effects of language revitalization workshops using long-format speech environment recordings},
+  journal = {Proceedings of the Linguistic Society of America}
+}
+
+@article{casillas2019step,
+    author = {Casillas, Marisa and Cristia, Alejandrina and Zwaan, Rolf and Dingemanse, Mark},
+    title = "{A step-by-step guide to collecting and analyzing long-format speech environment (LFSE) recordings}",
+    journal = {Collabra: Psychology},
+    volume = {5},
+    number = {1},
+    year = {2019},
+    month = {05},
+    issn = {2474-7394},
+    doi = {10.1525/collabra.209},
+    url = {https://doi.org/10.1525/collabra.209},
+    note = {24},
+    eprint = {https://online.ucpress.edu/collabra/article-pdf/5/1/24/437539/209-3199-1-pb.pdf},
+}
+
+@article{lavechin2020opensource,
+title={An open-source voice type classifier for child-centered daylong recordings},
+author={Marvin Lavechin and Ruben Bousbib and Hervé Bredin and Emmanuel Dupoux and Alejandrina Cristia},
+year={2020},
+journal={Interspeech}
+}
+
+@article{rasanen2020,
+  title={ALICE: An open-source tool for automatic measurement of phoneme, syllable, and word counts from child-centered daylong recordings},
+  author={R{\"a}s{\"a}nen, Okko and Seshadri, Shreyas and Lavechin, Marvin and Cristia, Alejandrina and Casillas, Marisa},
+  journal={Behavior Research Methods},
+  pages={1--18},
+  year={2020},
+  publisher={Springer}
+}
+
+@inproceedings{AlFutaisi2019,
+  doi = {10.1145/3340555.3353751},
+  url = {https://doi.org/10.1145/3340555.3353751},
+  year = {2019},
+  month = oct,
+  publisher = {{ACM}},
+  author = {Najla Al Futaisi and Zixing Zhang and Alejandrina Cristia and Anne Warlaumont and Bjorn Schuller},
+  title = {{VCMNet}: Weakly Supervised Learning for Automatic Infant Vocalisation Maturity Analysis},
+  booktitle = {2019 International Conference on Multimodal Interaction}
+}
+
+@misc{pympi-1.70,
+	author={Lubbers, Mart and Torreira, Francisco},
+	title={pympi-ling: a {Python} module for processing {ELAN}s {EAF} and {Praat}s {TextGrid} annotation files.},
+	howpublished={\url{https://pypi.python.org/pypi/pympi-ling}},
+	year={2013-2021},
+	note={Version 1.70}
+}
+
+@article{Cychosz2020,
+  doi = {10.3758/s13428-020-01365-9},
+  url = {https://doi.org/10.3758/s13428-020-01365-9},
+  year = {2020},
+  month = feb,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {52},
+  number = {5},
+  pages = {1951--1969},
+  author = {Margaret Cychosz and Rachel Romeo and Melanie Soderstrom and Camila Scaff and Hillary Ganek and Alejandrina Cristia and Marisa Casillas and Kaya de Barbaro and Janet Y. Bang and Adriana Weisleder},
+  title = {Longform recordings of everyday life: Ethics for best practices},
+  journal = {Behavior Research Methods}
+}
+
+@misc{eaf-anonymizer-original,
+  title={HomeBank ITS file anonymizer},
+  url={https://github.com/HomeBankCode/ITS_annonymizer},
+  journal={GitHub},
+  author={Sarah MacEwan},
+  year = {2019}
+ }
+
+@inproceedings{Casillas2017,
+  author={Marisa Casillas and Elika Bergelson and Anne S. Warlaumont and Alejandrina Cristia and Melanie Soderstrom and Mark VanDam and Han Sloetjes},
+  title={A New Workflow for Semi-Automatized Annotations: Tests with Long-Form Naturalistic Recordings of Children's Language Environments},
+  year=2017,
+  booktitle={Proc. Interspeech 2017},
+  pages={2098--2102},
+  doi={10.21437/Interspeech.2017-1418},
+  url={http://dx.doi.org/10.21437/Interspeech.2017-1418}
+}
+
+@article{Poldrack2014,
+  doi = {10.1038/nn.3818},
+  url = {https://doi.org/10.1038/nn.3818},
+  year = {2014},
+  month = oct,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {17},
+  number = {11},
+  pages = {1510--1517},
+  author = {Russell A Poldrack and Krzysztof J Gorgolewski},
+  title = {Making big data open: data sharing in neuroimaging},
+  journal = {Nature Neuroscience}
+}
+
+@misc{starter,
+  doi = {10.17910/B7.390},
+  url = {http://databrary.org/volume/390},
+  author = {Bergelson, Elika and Warlaumont, Anne and Cristia, Alejandrina and Casillas, Marisa and Rosemberg, Celia and Soderstrom, Melanie and Rowland, Caroline and Durrant, Samantha and Bunce, John},
+  title = {Starter-ACLEW},
+  publisher = {Databrary},
+  year = {2017},
+  copyright = {Databrary Access Agreement}
+}
+
+
+@article{Perkel_2019,
+	doi = {10.1038/d41586-019-01040-w},
+	url = {https://doi.org/10.1038/d41586-019-01040-w},
+	year = 2019,
+	month = {apr},
+	publisher = {Springer Science and Business Media {LLC}},
+	volume = {568},
+	number = {7750},
+	pages = {131--132},
+	author = {Jeffrey M. Perkel},
+	title = {11 ways to avert a data-storage disaster},
+	journal = {Nature}
+}
+@misc{ffmpeg,
+  title={ffmpeg tool},
+  author={ffmpeg{ }Developers},
+  url={http://ffmpeg.org/},
+  year={2021}
+}
+@article{Brase2009datacite,
+  doi = {10.2139/ssrn.1639998},
+  url = {https://doi.org/10.2139/ssrn.1639998},
+  year = {2010},
+  publisher = {Elsevier {BV}},
+  author = {Jan Brase},
+  title = {Datacite - A Global Registration Agency for Research Data},
+  journal = {{SSRN} Electronic Journal}
+}
+@misc{robert2018pydub,
+  title={Pydub},
+  author={Robert, James and Webbie, Marc and others},
+  year={2018},
+  publisher={GitHub},
+  url={http://pydub.com/}
+}
+@article{ryant2019second,
+  title={The second dihard diarization challenge: Dataset, task, and baselines},
+  author={Ryant, Neville and Church, Kenneth and Cieri, Christopher and Cristia, Alejandrina and Du, Jun and Ganapathy, Sriram and Liberman, Mark},
+  journal={arXiv preprint arXiv:1906.07839},
+  year={2019}
+}
+
+@article{ryant2018first,
+  title={First DIHARD challenge evaluation plan},
+  author={Ryant, Neville and Church, Kenneth and Cieri, Christopher and Cristia, Alejandrina and Du, Jun and Ganapathy, Sriram and Liberman, Mark},
+  journal={2018, tech. Rep.},
+  year={2018}
+}
+
+@article{ryant2020third,
+  title={Third DIHARD Challenge Evaluation Plan},
+  author={Ryant, Neville and Church, Kenneth and Cieri, Christopher and Du, Jun and Ganapathy, Sriram and Liberman, Mark},
+  journal={arXiv preprint arXiv:2006.05815},
+  year={2020}
+}
+
+@article{Wilkinson2016,
+  doi = {10.1038/sdata.2016.18},
+  url = {https://doi.org/10.1038/sdata.2016.18},
+  year = {2016},
+  month = mar,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {3},
+  number = {1},
+  author = {Mark D. Wilkinson and Michel Dumontier and IJsbrand Jan Aalbersberg and Gabrielle Appleton and Myles Axton and Arie Baak and Niklas Blomberg and Jan-Willem Boiten and Luiz Bonino da Silva Santos and Philip E. Bourne and Jildau Bouwman and Anthony J. Brookes and Tim Clark and Merc{\`{e}} Crosas and Ingrid Dillo and Olivier Dumon and Scott Edmunds and Chris T. Evelo and Richard Finkers and Alejandra Gonzalez-Beltran and Alasdair J.G. Gray and Paul Groth and Carole Goble and Jeffrey S. Grethe and Jaap Heringa and Peter A.C 't Hoen and Rob Hooft and Tobias Kuhn and Ruben Kok and Joost Kok and Scott J. Lusher and Maryann E. Martone and Albert Mons and Abel L. Packer and Bengt Persson and Philippe Rocca-Serra and Marco Roos and Rene van Schaik and Susanna-Assunta Sansone and Erik Schultes and Thierry Sengstag and Ted Slater and George Strawn and Morris A. Swertz and Mark Thompson and Johan van der Lei and Erik van Mulligen and Jan Velterop and Andra Waagmeester and Peter Wittenburg and Katherine Wolstencroft and Jun Zhao and Barend Mons},
+  title = {The {FAIR} Guiding Principles for scientific data management and stewardship},
+  journal = {Scientific Data}
+}
+
+@misc{zenodo,
+  doi = {10.25495/7GXK-RD71},
+  url = {https://www.zenodo.org/},
+  author = {{European Organization For Nuclear Research} and {OpenAIRE}},
+  keywords = {FOS: Physical sciences, Publication, Dataset},
+  language = {en},
+  title = {Zenodo},
+  publisher = {CERN},
+  year = {2013}
+}
+
+@article {dataverse,
+	title = {An Introduction to the Dataverse Network as an Infrastructure for Data Sharing},
+	journal = {Sociological Methods and Research},
+	volume = {36},
+	year = {2007},
+	pages = {173{\textendash}199},
+	abstract = {We introduce a set of integrated developments in web application software, networking, data citation standards, and statistical methods designed to put some of the universe of data and data sharing practices on somewhat firmer ground. We have focused on social science data, but aspects of what we have developed may apply more widely. The idea is to facilitate the public distribution of persistent, authorized, and verifiable data, with powerful but easy-to-use technology, even when the data are confidential or proprietary. We intend to solve some of the sociological problems of data sharing via technological means, with the result intended to benefit both the scientific community and the sometimes apparently contradictory goals of individual researchers.},
+	author = {Gary King}
+}
+
+
+@Article{10.12688/f1000research.10783.1,
+AUTHOR = { Ghosh, SS and Poline, JB and Keator, DB and Halchenko, YO and Thomas, AG and Kessler, DA and Kennedy, DN},
+TITLE = {A very simple, re-executable neuroimaging publication [version 1; peer review: 2 approved with reservations]
+},
+JOURNAL = {F1000Research},
+VOLUME = {6},
+YEAR = {2017},
+NUMBER = {124},
+DOI = {10.12688/f1000research.10783.1}
+}
+
+@Article{Eglen2017,
+author={Eglen, Stephen J.
+and Marwick, Ben
+and Halchenko, Yaroslav O.
+and Hanke, Michael
+and Sufi, Shoaib
+and Gleeson, Padraig
+and Silver, R. Angus
+and Davison, Andrew P.
+and Lanyon, Linda
+and Abrams, Mathew
+and Wachtler, Thomas
+and Willshaw, David J.
+and Pouzat, Christophe
+and Poline, Jean-Baptiste},
+title={Toward standard practices for sharing computer code and programs in neuroscience},
+journal={Nature Neuroscience},
+year={2017},
+month={Jun},
+day={01},
+volume={20},
+number={6},
+pages={770-773},
+abstract={Computational techniques are central in many areas of neuroscience and are relatively easy to share. This paper describes why computer programs underlying scientific publications should be shared and lists simple steps for sharing. Together with ongoing efforts in data sharing, this should aid reproducibility of research.},
+issn={1546-1726},
+doi={10.1038/nn.4550},
+url={https://doi.org/10.1038/nn.4550}
+}
+@article{xu2008lenatm,
+  title={The {LENA}{\texttrademark} language environment analysis system: The interpretive time segments ({ITS}) file},
+  author={Xu, D and Yapanel, U and Gray, S and Baer, CT},
+  journal={LENA Research Foundation Technical Report LTR-04-2},
+  year={2008}
+}
+
+@article{levin2021sensing,
+  title={Sensing everyday activity: Parent perceptions and feasibility},
+  author={Levin, Hannah I and Egger, Dominique and Andres, Lara and Johnson, Mckensey and Bearman, Sarah Kate and de Barbaro, Kaya},
+  journal={Infant Behavior and Development},
+  volume={62},
+  pages={101511},
+  year={2021},
+  publisher={Elsevier}
+}
+
+@inproceedings{wittenburg2006elan,
+  title={ELAN: a professional framework for multimodality research},
+  author={Wittenburg, Peter and Brugman, Hennie and Russel, Albert and Klassmann, Alex and Sloetjes, Han},
+  booktitle={5th International Conference on Language Resources and Evaluation (LREC 2006)},
+  pages={1556--1559},
+  year={2006}
+}
+
+@article{MacWhinney2000,
+  doi = {10.1162/coli.2000.26.4.657},
+  url = {https://doi.org/10.1162/coli.2000.26.4.657},
+  year = {2000},
+  month = dec,
+  publisher = {{MIT} Press - Journals},
+  volume = {26},
+  number = {4},
+  pages = {657--657},
+  author = {Brian MacWhinney},
+  title = {The {CHILDES} Project: Tools for Analyzing Talk (third edition): Volume I: Transcription format and programs,  Volume {II}: The database},
+  journal = {Computational Linguistics}
+}
+
+@article{pub.1022076283,
+ author = {Van Essen, David C. and Smith, Stephen M. and Barch, Deanna M. and Behrens, Timothy E.J. and Yacoub, Essa and Ugurbil, Kamil and Consortium, for the WU-Minn HCP},
+ doi = {10.1016/j.neuroimage.2013.05.041},
+ journal = {NeuroImage},
+ keywords = {},
+ number = {},
+ pages = {62-79},
+ title = {The WU-Minn Human Connectome Project: An overview},
+ url = {http://europepmc.org/articles/pmc3724347?pdf=render},
+ volume = {80},
+ year = {2013}
+}
+
+
+
+@misc{vandam-day,
+  doi = {10.21415/T5388S},
+  url = {http://homebank.talkbank.org/access/Public/VanDam-5minute.html},
+  author = {VanDam,  Mark},
+  title = {HomeBank VanDam Public 5-minute Corpus},
+  publisher = {TalkBank},
+  year = {2015}
+}
+
+@INPROCEEDINGS{zooniverse,
+       author = {{Borne}, K.~D. and {Zooniverse Team}},
+        title = "{The Zooniverse: A Framework for Knowledge Discovery from Citizen Science Data}",
+     keywords = {0800 EDUCATION, 0815 EDUCATION / Informal education, 1914 INFORMATICS / Data mining, 1942 INFORMATICS / Machine learning},
+    booktitle = {AGU Fall Meeting Abstracts},
+         year = 2011,
+       volume = {2011},
+        month = dec,
+          eid = {ED23C-0650},
+        pages = {ED23C-0650},
+       adsurl = {https://ui.adsabs.harvard.edu/abs/2011AGUFMED23C0650B},
+      adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+}
+
+@article{semenzin2020a,
+  doi = {10.31219/osf.io/z6exv},
+  url = {https://doi.org/10.31219/osf.io/z6exv},
+  year = {2020},
+  month = nov,
+  publisher = {Center for Open Science},
+  author = {Semenzin, Chiara and Hamrick, Lisa  and Seidl, Amanda and Lynne Kelleher, Bridgette and Cristia, Alejandrina},
+  title = {Describing vocalizations in young children: A big data approach through citizen science annotation},
+  journal = {}
+}
+
+@article{semenzin2020b,
+  doi = {10.31219/osf.io/gpxf5},
+  url = {https://doi.org/10.31219/osf.io/gpxf5},
+  year = {2020},
+  month = nov,
+  publisher = {Center for Open Science},
+  author = {Semenzin, Chiara and Hamrick, Lisa  and Seidl, Amanda and Lynne Kelleher, Bridgette and Cristia, Alejandrina},
+  title = {Towards Large-Scale Data Annotation of Audio from Wearables: Validating Zooniverse Annotations of Infant Vocalization Types},
+  journal = {}
+}
+
+@book{datalad_handbook,
+  doi = {10.5281/ZENODO.3608612},
+  url = {https://zenodo.org/record/3608612},
+  author = {Wagner,  Adina S. and Waite,  Laura K. and Meyer,  Kyle and Heckner,  Marisa K. and Kadelka,  Tobias and Reuter,  Niels and Waite,  Alexander Q. and Poldrack,  Benjamin and Markiewicz,  Christopher J. and Halchenko,  Yaroslav O. and Vavra,  Peter and Chormai,  Pattarawat and Poline,  Jean-Baptiste and Paas,  Lya K. and Herholz,  Peer and Mochalski,  Lisa N. and Kraljevic,  Nevena and Wiersch,  Lisa and Hutton,  Alexandre and Hanke,  Michael},
+  keywords = {data management,  book,  datalad,  version control,  teaching resource},
+  title = {The DataLad Handbook},
+  publisher = {Zenodo},
+  year = {2020},
+  copyright = {Creative Commons Attribution Share Alike 4.0 International}
+}
+
+@article{decentralization_hanke,
+  doi = {10.1515/nf-2020-0037},
+  url = {https://doi.org/10.1515/nf-2020-0037},
+  year = {2021},
+  month = jan,
+  publisher = {Walter de Gruyter {GmbH}},
+  volume = {0},
+  number = {0},
+  author = {Michael Hanke and Franco Pestilli and Adina S. Wagner and Christopher J. Markiewicz and Jean-Baptiste Poline and Yaroslav O. Halchenko},
+  title = {In defense of decentralized research data management},
+  journal = {Neuroforum}
+}
+
+@article{decentralization_robinson,
+  doi = {10.1038/sdata.2018.221},
+  url = {https://doi.org/10.1038/sdata.2018.221},
+  year = {2018},
+  month = oct,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {5},
+  number = {1},
+  author = {Danielle C. Robinson and Joe A. Hand and Mathias Buus Madsen and Karissa R. McKelvey},
+  title = {The Dat Project,  an open and decentralized research data tool},
+  journal = {Scientific Data}
+}
+
+@misc{reproducible_paper,
+author = {Adina Wagner},
+title = {datalad-handbook/repro-paper-sketch: A template to create a reproducible paper with LaTeX, Makefiles, Python, and DataLad},
+howpublished = {\url{https://github.com/datalad-handbook/repro-paper-sketch/}},
+month = {},
+year = {2020},
+note = {(Accessed on 04/30/2021)},
+journal = {}
+}
+
+@book{alpha,
+ author = {Krippendorff, Klaus},
+ title = {Content analysis : an introduction to its methodology},
+ publisher = {SAGE},
+ year = {2013},
+ address = {Los Angeles London},
+ isbn = {1412983150},
+ chapter = {12}
+ }
+
+
+@article{kappa,
+  doi = {10.1037/h0031619},
+  url = {https://doi.org/10.1037/h0031619},
+  year = {1971},
+  publisher = {American Psychological Association ({APA})},
+  volume = {76},
+  number = {5},
+  pages = {378--382},
+  author = {Joseph L. Fleiss},
+  title = {Measuring nominal scale agreement among many raters.},
+  journal = {Psychological Bulletin}
+}
+
+@Book{AC1,
+ author = {Gwet, Kilem},
+ title = {Handbook of inter-rater reliability : the definitive guide to measuring the extent of agreement among raters},
+ publisher = {Advanced Analytics, LLC},
+ year = {2014},
+ address = {Gaithersburg, MD},
+ isbn = {0970806280}
+ }
+ 
+ @article{nltk,
+  added-at = {2020-01-10T00:00:00.000+0100},
+  author = {Loper, Edward and Bird, Steven},
+  biburl = {https://www.bibsonomy.org/bibtex/2eac35636d7e2bb4a0264313ed0791372/dblp},
+  ee = {https://arxiv.org/abs/cs/0205028},
+  interhash = {1af05e5f1cea0feeea8da5f68707a841},
+  intrahash = {eac35636d7e2bb4a0264313ed0791372},
+  journal = {CoRR},
+  keywords = {dblp},
+  timestamp = {2020-01-11T11:43:05.000+0100},
+  title = {NLTK: The Natural Language Toolkit},
+  url = {http://dblp.uni-trier.de/db/journals/corr/corr0205.html#cs-CL-0205028},
+  volume = {cs.CL/0205028},
+  year = 2002
+}
+
+@article{gamma,
+  doi = {10.1162/coli_a_00227},
+  url = {https://doi.org/10.1162/coli_a_00227},
+  year = {2015},
+  month = sep,
+  publisher = {{MIT} Press - Journals},
+  volume = {41},
+  number = {3},
+  pages = {437--479},
+  author = {Yann Mathet and Antoine Widl\"{o}cher and Jean-Philippe M{\'{e}}tivier},
+  title = {The Unified and Holistic Method Gamma ($\upgamma$) for Inter-Annotator Agreement Measure and Alignment},
+  journal = {Computational Linguistics}
+}
+
+@unpublished{pygamma_agreement,
+  TITLE = {{pygamma-agreement: Gamma $\gamma$ measure for inter/intra-annotator agreement in Python}},
+  AUTHOR = {Titeux, Hadrien and Riad, Rachid},
+  URL = {https://hal.archives-ouvertes.fr/hal-03144116},
+  NOTE = {working paper or preprint},
+  YEAR = {2021},
+  MONTH = Feb,
+  KEYWORDS = {Annotation de corpus},
+  PDF = {https://hal.archives-ouvertes.fr/hal-03144116/file/gamma-paper.pdf},
+  HAL_ID = {hal-03144116},
+  HAL_VERSION = {v1},
+}
+
+@inproceedings{pyannote.metrics,
+  author = {Herv\'e Bredin},
+  title = {{pyannote.metrics: a toolkit for reproducible evaluation, diagnostic, and error analysis of speaker diarization systems}},
+  booktitle = {{Interspeech 2017, 18th Annual Conference of the International Speech Communication Association}},
+  year = {2017},
+  month = {August},
+  address = {Stockholm, Sweden},
+  url = {http://pyannote.github.io/pyannote-metrics},
+}
+
+@article{Wu2018,
+  doi = {10.2196/10046},
+  url = {https://doi.org/10.2196/10046},
+  year = {2018},
+  month = jun,
+  publisher = {{JMIR} Publications Inc.},
+  volume = {6},
+  number = {6},
+  pages = {e10046},
+  author = {Robert Wu and Daniyal Liaqat and Eyal de Lara and Tatiana Son and Frank Rudzicz and Hisham Alshaer and Pegah Abed-Esfahani and Andrea S Gershon},
+  title = {Feasibility of Using a Smartwatch to Intensively Monitor Patients With Chronic Obstructive Pulmonary Disease: Prospective Cohort Study},
+  journal = {{JMIR} {mHealth} and {uHealth}}
+}
+
+@article{Gorgolewski2016,
+  doi = {10.1038/sdata.2016.44},
+  url = {https://doi.org/10.1038/sdata.2016.44},
+  year = {2016},
+  month = jun,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {3},
+  number = {1},
+  author = {Krzysztof J. Gorgolewski and Tibor Auer and Vince D. Calhoun and R. Cameron Craddock and Samir Das and Eugene P. Duff and Guillaume Flandin and Satrajit S. Ghosh and Tristan Glatard and Yaroslav O. Halchenko and Daniel A. Handwerker and Michael Hanke and David Keator and Xiangrui Li and Zachary Michael and Camille Maumet and B. Nolan Nichols and Thomas E. Nichols and John Pellman and Jean-Baptiste Poline and Ariel Rokem and Gunnar Schaefer and Vanessa Sochat and William Triplett and Jessica A. Turner and Gaël Varoquaux and Russell A. Poldrack},
+  title = {The brain imaging data structure,  a format for organizing and describing outputs of neuroimaging experiments},
+  journal = {Scientific Data}
+}
+
+@article{Horien2020,
+  doi = {10.1038/s41562-020-01005-4},
+  url = {https://doi.org/10.1038/s41562-020-01005-4},
+  year = {2020},
+  month = dec,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {5},
+  number = {2},
+  pages = {185--193},
+  author = {Corey Horien and Stephanie Noble and Abigail S. Greene and Kangjoo Lee and Daniel S. Barron and Siyuan Gao and David O'Connor and Mehraveh Salehi and Javid Dadashkarimi and Xilin Shen and Evelyn M. R. Lake and R. Todd Constable and Dustin Scheinost},
+  title = {A hitchhiker's guide to working with large,  open-source neuroimaging datasets},
+  journal = {Nature Human Behaviour}
+}
+
+@article{Powell2021,
+  doi = {10.1038/d41586-021-00331-5},
+  url = {https://doi.org/10.1038/d41586-021-00331-5},
+  year = {2021},
+  month = feb,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {590},
+  number = {7845},
+  pages = {198--201},
+  author = {Kendall Powell},
+  title = {The broken promise that undermines human genome research},
+  journal = {Nature}
+}
+
+
+@article{hanke_defense_2021,
+	title = {In defense of decentralized research data management},
+	volume = {0},
+	issn = {2363-7013, 0947-0875},
+	url = {https://www.degruyter.com/document/doi/10.1515/nf-2020-0037/html},
+	doi = {10.1515/nf-2020-0037},
+	language = {en},
+	number = {0},
+	urldate = {2021-03-22},
+	journal = {Neuroforum},
+	author = {Hanke, Michael and Pestilli, Franco and Wagner, Adina S. and Markiewicz, Christopher J. and Poline, Jean-Baptiste and Halchenko, Yaroslav O.},
+	month = jan,
+	year = {2021},
+	pages = {000010151520200037},
+}
+
+@article{gravityspy,
+  doi = {10.1088/1361-6382/aa5cea},
+  url = {https://doi.org/10.1088/1361-6382/aa5cea},
+  year = {2017},
+  month = feb,
+  publisher = {{IOP} Publishing},
+  volume = {34},
+  number = {6},
+  pages = {064003},
+  author = {M Zevin and S Coughlin and S Bahaadini and E Besler and N Rohani and S Allen and M Cabero and K Crowston and A K Katsaggelos and S L Larson and T K Lee and C Lintott and T B Littenberg and A Lundgren and C {\O}sterlund and J R Smith and L Trouille and V Kalogera},
+  title = {Gravity Spy: integrating advanced {LIGO} detector characterization,  machine learning,  and citizen science},
+  journal = {Classical and Quantum Gravity}
+}
+
+% code
+
+@article{datalad_procedures,
+  doi = {10.17605/OSF.IO/6VCXK},
+  url = {https://osf.io/6vcxk/},
+  author = {Gautheron,  Lucas},
+  title = {DataLad Procedures for the management of long-form recordings},
+  publisher = {Open Science Framework},
+  year = {2021},
+  journal = {}
+}
+
+@article{datalad_extension,
+  doi = {10.17605/OSF.IO/C2J5A},
+  url = {https://osf.io/c2j5a/},
+  author = {Gautheron,  Lucas},
+  title = {DataLad extension for child-centered in-situ recordings},
+  publisher = {Open Science Framework},
+  year = {2021},
+  copyright = {MIT License},
+  journal = {}
+}

+ 3 - 0
requirements.txt

@@ -0,0 +1,3 @@
+seaborn
+scikit-learn
+git+https://github.com/LAAC-LSCP/ChildProject.git#egg=ChildProject

File diff suppressed because it is too large
+ 1658 - 0
spbasic.bst


File diff suppressed because it is too large
+ 1512 - 0
spmpsci.bst


File diff suppressed because it is too large
+ 1442 - 0
spphys.bst


+ 113 - 0
svglov3.clo

@@ -0,0 +1,113 @@
+% SVJour3 DOCUMENT CLASS OPTION SVGLOV3 -- for standardised journals
+%
+% This is an enhancement for the LaTeX
+% SVJour3 document class for Springer journals
+%
+%%
+%%
+%% \CharacterTable
+%%  {Upper-case    \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
+%%   Lower-case    \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
+%%   Digits        \0\1\2\3\4\5\6\7\8\9
+%%   Exclamation   \!     Double quote  \"     Hash (number) \#
+%%   Dollar        \$     Percent       \%     Ampersand     \&
+%%   Acute accent  \'     Left paren    \(     Right paren   \)
+%%   Asterisk      \*     Plus          \+     Comma         \,
+%%   Minus         \-     Point         \.     Solidus       \/
+%%   Colon         \:     Semicolon     \;     Less than     \<
+%%   Equals        \=     Greater than  \>     Question mark \?
+%%   Commercial at \@     Left bracket  \[     Backslash     \\
+%%   Right bracket \]     Circumflex    \^     Underscore    \_
+%%   Grave accent  \`     Left brace    \{     Vertical bar  \|
+%%   Right brace   \}     Tilde         \~}
+\ProvidesFile{svglov3.clo}
+              [2009/12/18 v3.2
+      style option for standardised journals]
+\typeout{SVJour Class option: svglov3.clo for standardised journals}
+\def\validfor{svjour3}
+\global\let\if@runhead\iftrue
+\ExecuteOptions{final,10pt}
+% No size changing allowed, hence a "copy" of size10.clo is included
+\DeclareFontShape{OT1}{cmr}{m}{n}{
+        <-6>    cmr5
+        <6-7>   cmr6
+        <7-8>   cmr7
+        <8-9>   cmr8
+        <9-10>  cmr9
+        <10-12> cmr10
+        <12-17> cmr12
+        <17->   cmr17
+      }{}
+%
+\renewcommand\normalsize{%
+\if@twocolumn
+   \@setfontsize\normalsize\@xpt{12.5pt}%
+\else
+   \if@smallext
+      \@setfontsize\normalsize\@xpt\@xiipt
+   \else
+      \@setfontsize\normalsize{9.5pt}{11.5pt}%
+   \fi
+\fi
+   \abovedisplayskip=3 mm plus6pt minus 4pt
+   \belowdisplayskip=3 mm plus6pt minus 4pt
+   \abovedisplayshortskip=0.0 mm plus6pt
+   \belowdisplayshortskip=2 mm plus4pt minus 4pt
+   \let\@listi\@listI}
+\normalsize
+\newcommand\small{%
+\if@twocolumn
+   \@setfontsize\small{8.5pt}\@xpt
+\else
+   \if@smallext
+      \@setfontsize\small\@viiipt{9.5pt}%
+   \else
+      \@setfontsize\small\@viiipt{9.25pt}%
+   \fi
+\fi
+   \abovedisplayskip 8.5\p@ \@plus3\p@ \@minus4\p@
+   \abovedisplayshortskip \z@ \@plus2\p@
+   \belowdisplayshortskip 4\p@ \@plus2\p@ \@minus2\p@
+   \def\@listi{\leftmargin\leftmargini
+               \parsep 0\p@ \@plus1\p@ \@minus\p@
+               \topsep 4\p@ \@plus2\p@ \@minus4\p@
+               \itemsep0\p@}%
+   \belowdisplayskip \abovedisplayskip
+}
+\let\footnotesize\small
+\newcommand\scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt}
+\newcommand\tiny{\@setfontsize\tiny\@vpt\@vipt}
+\if@twocolumn
+   \newcommand\large{\@setfontsize\large\@xiipt\@xivpt}
+   \newcommand\LARGE{\@setfontsize\LARGE{16pt}{18pt}}
+\else
+   \newcommand\large{\@setfontsize\large\@xipt\@xiipt}
+   \newcommand\LARGE{\@setfontsize\LARGE{13pt}{15pt}}
+\fi
+\newcommand\Large{\@setfontsize\Large\@xivpt{16dd}}
+\newcommand\huge{\@setfontsize\huge\@xxpt{25}}
+\newcommand\Huge{\@setfontsize\Huge\@xxvpt{30}}
+%
+\def\runheadhook{\rlap{\smash{\lower6.5pt\hbox to\textwidth{\hrulefill}}}}
+\if@twocolumn
+\setlength{\textwidth}{17.4cm}
+\setlength{\textheight}{234mm}
+\AtEndOfClass{\setlength\columnsep{6mm}}
+\else
+   \if@smallext
+      \setlength{\textwidth}{11.9cm}
+      \setlength{\textheight}{19.4cm}
+   \else
+      \setlength{\textwidth}{12.2cm}
+      \setlength{\textheight}{19.8cm}
+   \fi
+\fi
+%
+\AtBeginDocument{%
+\@ifundefined{@journalname}
+ {\typeout{Unknown journal: specify \string\journalname\string{%
+<name of your journal>\string} in preambel^^J}}{}}
+%
+\endinput
+%%
+%% End of file `svglov3.clo'.

File diff suppressed because it is too large
+ 1431 - 0
svjour3.cls


+ 1 - 0
vandam-data

@@ -0,0 +1 @@
+Subproject commit 9c39475ffd287d7ebd40ba37b2bc43159d1a73e3