LAAC-LSCP
/
RELIVAL


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876
							---
title: Supplementary Materials to Establishing the reliability and validity of measures extracted from long-form recordings
output:
  html_document:
    toc: yes
    toc_depth: '3'
    df_print: paged
  pdf_document:
    toc: yes
    toc_depth: 3
---

```{r setup, include=FALSE, eval=TRUE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)

set.seed(726282)

library("lme4")
library("performance") # ICC
library("ggplot2")
library("ggthemes")
library("ggpubr")
library("kableExtra")
library("psych")
library("dplyr")
library("tidyr")
library("stringr")
library("car")
library("ggbeeswarm")

cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


# List of corpora we are interested in
corpora <- c('bergelson', 'cougar','fausey-trio','lucid', 'lyon', 'warlaumont','winnipeg', 'quechua') 

# Columns that should not be scaled or taken into account as metrics
no.scale.columns <- c('experiment', 'session_id', 'child_id','child_id_unique','age_s', 'nobs', 'nchi',
                      'date_iso', 'child_dob', 'missing_audio',"age_bin","duration","session_id",
                      "normative","age","duration_alice", "duration_vcm" ,  "duration_vtc","duration_its" ,"its_filename")

data_sets = c('aclew', 'lena')

set.seed(726282)


source("functions.R")


```

## About this document

The pdf version of this document was generated using an RMarkDown document, which includes all the computer code to reproduce key figures, tables, and analyses mentioned in the main manuscript, as well as additional texts and analyses. In the pdf version, you will see two types of text:

> Quoted text indicates sections that appear verbatim in the main manuscript.

In contrast, unmarked text are additional texts and analyses, as well as comments for the reader who is attempting to reproduce our analyses. Readers attempting to reproduce our analyses are asked to also read comments in chunks of code in the RMarkDown version of the document, which are invisible in the pdf version. This is indicated with "Please see code in the RMarkDown version of the document."


```{r recalc-all}

# If RECALC is set to TRUE, then the ICC tables will be re-generated and the simulation re-ran. Notice that you can only do this if you have access to underlying data. If you are not one of the paper's authors, please email us for access to reproduce this section. You do not need access to reproduce all the rest.

RECALC=FALSE

if(RECALC) source("regenerate_data.R") # This code cannot be reproduced without access to the underlying datasets

if(RECALC) source("create-sib-subdataset.R") #This code may not be reproducible in the future

if(RECALC) source("create-all-icc.R") 

if(RECALC) source("simulate-r-for-icc.R") # note, this takes several minutes to run

if(RECALC) source("create-all-rs.R")
```


## Read in all data

```{r readin}

df.icc.simu <- read.csv("../OUTPUT/df.icc.simu.csv") #365 rows because there are multiple rows per metric as a function of r

mydat_aclew <- read.csv(paste0('../data_output/', 'aclew','_metrics_scaled.csv')) 
mydat_aclew <- mydat_aclew[is.element(mydat_aclew$experiment, corpora),]

mydat_lena <- read.csv(paste0('../data_output/', 'lena','_metrics_scaled.csv'))
mydat_lena <- mydat_lena[is.element(mydat_lena$experiment, corpora),]

all_rs <- read.csv("../data_output/all_rs.csv")
dist_contig_lena <- read.csv("../data_output/dist_contig_lena.csv")

df.icc.mixed<-read.csv("../OUTPUT/df.icc.mixed.csv")
df.icc.mixed$Type<-get_type(df.icc.mixed)


mydat2 <- read.csv("../data_output/dat_sib_ana.csv")

df.icc.age<-read.csv("../OUTPUT/df.icc.age.csv")
age_levels=c("(0,6]" , "(6,12]",  "(12,18]" ,"(18,24]" ,"(24,30]", "(30,36]" )
#not present in data: , "(36,42]", "(42,48]", "(48,54]"
df.icc.age$age_bin<-factor(df.icc.age$age_bin,levels=age_levels)
df.icc.age$Type<-get_type(df.icc.age)

df.icc.corpus<-read.csv("../OUTPUT/df.icc.corpus.csv")
df.icc.corpus$Type <- get_type(df.icc.corpus)

```


## SM A: Simulation to better understand ICC

We were uncertain of how to interpret ICC's numeric values. It is described as "proportion of variance explained", but we do not know if it should be considered as a percentage (like R^2) or a correlation (like r). We therefore simulated data controlling the underlying r between paired datapoints to see how ICC recovered that underlying r.

We informed the simulation by the data we have as follows:

- we had the same number of corpora, and of children in each corpus
- we had the same metrics for each (i.e., CVC, AWC, etc) -- and these metrics had the same mean & SD for day 1 of recordings as in observed data

In the simulation, we departed from reality as follows:

- we did not consider the r across multiple days observed in the data, but instead generate data points to vary r from a small correlation (r=.1), a moderate one (r=.3), a larger one (r=.5). It is unlikely that test-retest correlations in infancy would be much greater than .5 (based on previous studies using test-retest in experimental tasks), but for completeness, we also use .7 and .9
- we did not consider child age, nor variable re-recording periods
- we had a single pair of recordings (rather than variable number of re-recordings)

We used simstudy, a package created for such simulations, following the vignette https://cran.r-project.org/web/packages/simstudy/vignettes/correlated.html to create correlated data providing a correlation matrix. Results are shown in the Figure below. Each point represents the ICC extracted from a mixed model applied to one metric, combining data from all corpora. It appears that ICC values reflect underlying r values, but underestimating r more the larger r is. 

```{r icc-sim-plot, fig.cap="Results from simulations aimed at helping understand the relationship between underlying r and estimated ICC."}

plot(df.icc.simu$icc_child_id~df.icc.simu$myr,xlab="r used to simulate the data",ylab="Estimated ICC from simulated data")


```

## SM B: More information for benchmarking our results against previously reported reliability studies

First, we looked for measures of language development used with observational data that can be employed with children aged 0-3 years, and which are available at least in English. To this end, we did searches in scholar.google.com and google.com with keywords like ("language development" AND "standardized test" AND "infancy") around February 2021, complementing these searches with our own knowledge. 

All of the instruments we found rely on reports from caregivers, including the Child Observation Record Advantage (Schweinhart, McNair, & Larner, 1993); the Desired Results Developmental Profile (Kriener-Althen, Newton, Draney, & Mangione, 2020); and the MacArthur-Bates Communicative Development Inventory (MB-CDI, Fenson et al., 1994). Readers are likely most familiar with the MB-CDI, so we focus our discussion on this instrument here, to illustrate the level of test-retest reliability such instruments can attain. 

In the work most commonly cited in relation to the MB-CDI's creation, Fenson et al. (1994) reported a correlation of r=.95 in their sample of North American, monolingual infants. Since it has been over 20 years, and this instrument has been used with other samples and languages, a more relevant comparison point for our own study would be one that takes into account more attempts at using this instrument. We did not find a systematic review or meta-analysis on test-retest estimates for the MB-CDI. However, Frank et al. (2021) analyzed data archived in an MB-CDI repository, concentrating on two corpora where longitudinal data was available, American English and Norwegian. They found that, for both datasets, correlations within 2-4 months were above r=.8, with correlations at 16 months of distance (i.e., a form filled in at 8 and 24 months) at their lowest, r=.5. These correlations are very high when considering that the MB-CDI tends to have ceiling and floor effects at these extreme ages. Alcock, Meints, and Rowland (n.d.) looked at correlations when British English-speaking parents completed two versions of the form in two media (e.g., short form in paper and long form online, or vice versa) within a month. Here, the correlation was r=.8 for comprehension and r=.6 for production. 

Thus, although evidence is scant, it is sufficient to see that the MB-CDI can lead to high correlations, that are nearly comparable to those expected in standardized testing among adults. It is worth bearing in mind that test-retest reliability in parental report measures does not depend only on the consistency in the individual infants' relative scores for a given behavior, but also on the consistency of the adult in reporting it. Moreover, they are based on cumulative experience, rather than a one-shot observation, as in the case of long-form recordings. Therefore, they do not constitute an appropriate comparison point, and by and large we can be quite certain that they will yield higher reliability than metrics based on the children themselves.  Thus, parental report may not be the most appropriate comparisons for our own study because they may overestimate reliability in the child's own behavior.

<!-- For example, a meta-analysis of infant laboratory tasks, including test-retest data for sound discrimination, word recognition, and prosodic processing, found that the meta-analytic weighted average was not different from zero , suggesting that performance in these short laboratory tasks may not be captured in a stable way across testing days.  -->

Second, we did a bibliographic search for systematic reviews of test-retest reliability of standardized instruments to measure language development up to age three years that are available at least in English using similar methods to those mentioned above, with the searches performed in June 2023. Although reliability in these tests may also partially reflect consistency in the adults' reports, they are at least based on a one-shot observation of how the child behaves, rather than the adult's cumulative experience with the child. Note that some promising tests are currently in development or have only recently been released, including the NIH Baby Toolbox (NIH, 2023) and GSED (Cavallera et al., 2023), and thus we cannot include them in the present summary.

The Ages and Stages Questionnaire (ASQ) is a screening tool to measure infants and children's development based on observation of age-specific behaviors: For instance, at 6 months, in the domain of communication, one of the items asks whether the child smiles. The ASQ's reliability has been the object of a systematic review of independent evidence (Velikonja et al., 2017). Across 10 articles with data from children in the USA, Canada, China, Brazil, India, Chile, the Netherlands, Korea, and Turkey, only three articles reported test-retest correlations, two in the USA (r=.84-1)  and one in Turkey (r=.67). However, the meta-analysis authors judged these three studies to be "poor" in quality. Moreover, the ASQ is a questionnaire that an observer fills in, but it can also be administered as a parental questionnaire, reducing its comparability with our purely observational method. 

For the other tests, reliability is mainly available from reports by the companies commercializing the test. The Goldman Fristoe Articulation Test – 3rd edition (GFTA-3) (Goldman & Fristoe, 2015) focuses on the ability to articulate certain sounds through picture-based elicitation and can be used from two to five years of age. Available in English and Spanish for a USA context, it has a reported reliability of r=.92 -- although we do not know whether it is this high for two-year-olds specifically.  The Preschool Language Scales – 5th edition (Zimmerman, Steiner, & Pond, 2011) can be used to measure both comprehension and production from birth to 7 years of age. According to one report (Leaders Project, 2013), its test-retest reliability is .86 to .95, depending on the age bracket (0;0-2;11, 3;0-4;11, and 5;0-7;11, 0-7 years). The test has also been adapted to other languages, with good reported test-retest reliability (r=.93; Turkish, Sahli & Belgin, 2017). One issue we see with both of these reports is that children tested varied greatly in age, and the correlation seems to have been calculated based on the raw (rather than the normed) score. As a result, children's relative scores may have been stable over test and retest mainly because they varied greatly in age, rather than capturing variance from more meaningful individual differences. The Expressive Vocabulary Test – 2nd Edition (Williams, 1997) is a picture-based elicitation method that can be used with children from 2.5 years of age. The company Springer reports a test-retest reliability of r=.95 by age (Gurley, 2011).

Many other standardized tests exist for children over three years of age. Given that most children included in the present study were under three, these other tests do not constitute an ideal comparison point. Nonetheless, in the spirit of informativeness, it is useful to consider that a systematic review and meta-analysis has been done looking at all psychometric properties (internal consistency, reliability, measurement error, content and structural validity, convergent and discriminant validity) for standardized assessments targeted at children aged 4-12 years (Denman et al., 2017). Out of 76 assessments found in the literature, only 15 could be evaluated for their psychometric properties, and 14 reported on reliability based on independent evidence (i.e., the researchers tested and retested children, rather than relying on the company's report of reliability). Among these, correlations for test-retest reliability averaged r=.67, with a range from .35 to .76. The authors concluded that psychometric quality was limited for all assessments, but based on the available evidence, PLS-5 (whose test-retest reliability was r=.69) was among those recommended for use.

Third, and perhaps most relevant, we looked for references that evaluated the psychometric properties of measures extracted from wearable data, also in June 2023. We found no previous work attempting to do so on the basis of completely ecological, unconstrained data like ours. The closest references we could find reported on reliability and/or validity of measurements from wearable data collected in constrained situations, such as having 4.5 year old children wear interior sensors and asking them to complete four tests of balance (e.g., standing with their eyes closed; Liu et al., 2022). It is likely that consistency and test-retest reliability are higher in such cases than in data like ours, making it hard to compare. Nonetheless, to give an idea, a recent meta-analysis of wearable inertial sensors in healthy adults (Kobsar et al., 2020) found correlations between these instruments and gold standards above r= .88 for one set of measures (based on means) but much lower for another (based on variability, max weighted mean effect r = .58). Regarding test-retest reliability, the meta-analysts report ICCs above .6 for all measures for which they could find multiple studies reporting them. However, those authors point out that the majority of the included studies were classified as low quality, according to a standardized quality assessment for that work. 


## SM C: Code to reproduce Table 2

Please see code in the RMarkDown version of the document.

```{r tab2}
chiXcor= aggregate(data = mydat_aclew, child_id ~ experiment, function(child_id) length(unique(child_id)))[,2]

recXcor = aggregate(data = mydat_aclew, session_id ~ experiment, function(session_id) length(unique(session_id)))[,2]

rec_per_child = setNames(aggregate(data = mydat_aclew, session_id ~ experiment*child_id, function(session_id) length(unique(session_id))), c('experiment', 'Chi', 'No_rec'))

min_rec_per_child = aggregate(data = rec_per_child, No_rec ~ experiment, min)[,2]
max_rec_per_child = aggregate(data = rec_per_child, No_rec ~ experiment, max)[,2]
recRXchi = paste(min_rec_per_child,max_rec_per_child,sep="-")

durXcor = aggregate(data = mydat_aclew, duration_vtc ~ experiment, function(duration_vtc) round(mean(duration_vtc)/3.6e+6,1))[,2]

ageXcor = aggregate(data = mydat_aclew, age ~ experiment, function(age) round(mean(age),1))[,2]

age_min_per_corpus = aggregate(data = mydat_aclew, age ~ experiment, function(age) min(age))[,2]

age_max_per_corpus = aggregate(data = mydat_aclew, age ~ experiment, function(age) max(age))[,2]

ageRXcor = paste(age_min_per_corpus,age_max_per_corpus,sep="-")

corpus=c("bergelson", "cougar", "fausey-trio", "lucid","lyon", "quechua",  "warlaumont", "winnipeg")
location=c("Northeast US", "Northwest US", "Western US", "Northwest England", "Central France", "Highlands Bolivia", "Western US", "Western Canada")

corpus_description=cbind(corpus,location,chiXcor, recRXchi, recXcor, durXcor, ageXcor,ageRXcor)

write.table(corpus_description, "../OUTPUT/corpus_description.csv", sep='\t')

kable(corpus_description,caption="Table 2 (reproduced).")

nkids=length(levels(mydat_aclew$child_id))
nrecs=length(levels(mydat_aclew$session_id))
```


## SM D: Code to reproduce Fig. 1

Please see code in the RMarkDown version of the document.

> A variable with one of the lowest Child ICC (LENA's peak hourly AWC, Child ICC = `r round(df.icc.mixed[df.icc.mixed$data_set=="lena" & df.icc.mixed$metric=="peak_wc_adu","icc_child_id"],2)`) has a flatter regression line and wider confidence intervals, with points spread widely around the regression line. In contrast, the variable with the highest Child ICC (ACLEW's other child vocalization duration, Child ICC = `r round(df.icc.mixed[df.icc.mixed$data_set=="aclew" & df.icc.mixed$metric=="voc_och_ph","icc_child_id"],2)`) has points that are less scattered and closer to the diagonal. 


```{r icc-examples-fig1,  fig.width=6, fig.height=4.5,fig.cap="Figure 2 (reproduced). Scatterplots for two selected variables. The left one has relatively low ICCs; the right one has relatively higher ICCs."}
# figure of bad ICC: lena     used to be: avg_voc_dur_chi, now is: peak_wc_adu_ph; good ICC: lena used to be: voc_och_ph, now is: voc_dur_och_ph

# remove missing data points altogether
mydat_lena_nomissing <- mydat_lena[!is.na(mydat_lena$peak_wc_adu) & !is.na(mydat_lena$voc_dur_och_ph),]

#sample down to get 2 recs per child
mysample = NULL
for(thischild in levels(as.factor(mydat_lena_nomissing$child_id))){
  onechidat <- mydat_lena_nomissing[mydat_lena_nomissing$child_id == thischild,c("child_id","experiment","age","child_id","peak_wc_adu","voc_dur_och_ph")]
  
  if(dim(onechidat)[1]>=2){
    selrows=sample(1:dim(onechidat)[1],2)
    mysample = rbind(mysample,
                     cbind(onechidat[selrows[1],],
                           onechidat[selrows[2],c("child_id","peak_wc_adu","voc_dur_och_ph")]
                     )
    )
  } 
}
colnames(mysample) <-
  c(
    "child_id",
    "corpus",
    "age",
    "child_id1",
    "peak_wc_adu1",
    "voc_dur_och_ph1",
    "child_id2",
    "peak_wc_adu2",
    "voc_dur_och_ph2"
  )

mysample$corpus = factor(mysample$corpus)

mylimits=range(mysample[,c("peak_wc_adu1","peak_wc_adu2")])

bad <-
  ggplot(mysample, aes(peak_wc_adu1, peak_wc_adu2)) + 
  geom_point(aes(colour = factor(corpus))) +
  geom_smooth(method = 'lm', formula = y ~ x)  +
  labs(color = "Corpus")  +  
  theme(text = element_text(size = 16)) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  scale_x_continuous(name="Peak hourly AWC rec 1",limits=mylimits) +
  scale_y_continuous(name="Peak hourly AWC rec 2",limits=mylimits) +
  geom_abline(intercept = 0, slope = 1)

mylimits=range(mysample[,c("voc_dur_och_ph1","voc_dur_och_ph2")])
good <-
  ggplot(mysample, aes(voc_dur_och_ph1, voc_dur_och_ph2)) + 
  geom_point(aes(colour = factor(corpus))) +
  geom_smooth(method = 'lm', formula = y ~ x)  + 
  labs(color = "Corpus")  +  
  theme(text = element_text(size = 16)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  scale_x_continuous(name="Other ch voc dur rec 1",limits=mylimits) +
  scale_y_continuous(name="Other ch voc dur rec 2",limits=mylimits) +
  geom_abline(intercept = 0, slope = 1)


fig2 = ggarrange(bad, good,
          ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=0,
          font.label = list(size = 20))  + labs(color= "Corpus")  +  theme(text = element_text(size = 20))
fig2 

ggsave("fig1.png", plot = fig2, width = 6, height = 4.5, units = "in")
```


## SM E: Code to reproduce text at the beginning of the "Setting the stage" section

Please see code in the RMarkDown version of the document.


```{r gen-table}

#this code is a bit ugly-looking: 1:20 indicates the columns that contain the r coefficients for the 20 samples we drew
rval_tab=cbind(apply(all_rs[,1:20],1,mean),apply(all_rs[,1:20],1,sd),all_rs[,c("data_set","metric")])
colnames(rval_tab) <-c("m","sd","data_set","metric") 
rval_tab[,1:2]=round(rval_tab[,1:2],2)

rval_tab$fin<-paste0(rval_tab$m," ","[",rval_tab$m-2*rval_tab$sd,",",rval_tab$m+2*rval_tab$sd,"]")

mytab=rbind(cbind(rval_tab[rval_tab$data_set=="aclew" & rval_tab$metric== "wc_adu_ph","fin"],rval_tab[rval_tab$data_set=="lena" & rval_tab$metric== "wc_adu_ph","fin"]),
            cbind(rval_tab[rval_tab$metric== "can_voc_chi_ph","fin"],rval_tab[rval_tab$metric== "lena_CVC_ph","fin"]),
            cbind(rval_tab[rval_tab$metric== "simple_CTC_ph","fin"],rval_tab[rval_tab$metric== "lena_CTC_ph","fin"]),
            cbind(rval_tab[rval_tab$data_set=="aclew" & rval_tab$metric== "voc_chi_ph","fin"],rval_tab[rval_tab$data_set=="lena" & rval_tab$metric== "voc_chi_ph","fin"])
       )
mytab=gsub("0.",".",mytab,fixed=T)
colnames(mytab)<-c("aclew","lena")
rownames(mytab)<-c("AWC","CVC","CTC","Chi vocs")

rval_tab$Type<-get_type(rval_tab)

```

> Out of  `r length(levels(factor(mydat_aclew$child_id)))` children in `r length(levels(factor(mydat_aclew$experiment)))` corpora, `r length(levels(factor(dist_contig_lena$child_id)))` children (belonging to `r length(levels(factor(gsub(" .*","",dist_contig_lena$child_id))))`  corpora) could be included in this analysis, as some children did not have recordings less than two months apart. In particular, no child from the Warlaumont corpus did. 

## SM F: Code to reproduce Table 3

Please see code in the RMarkDown version of the document.


```{r tab3}
kable(mytab,caption="Table 3 (reproduced). Mean and range for Pearson correlations across sampled pairs of recordings ")

```

## SM G: Code to reproduce text above Figure 4

Please see code in the RMarkDown version of the document.


```{r reg model cor}


lr_cor <- lm(m ~ Type * data_set, data=rval_tab) 
#plot(lr_cor)
#binomial could be used, but diagnostic plots look great

reg_sum_cor=summary(lr_cor)

reg_anova_cor=Anova(lr_cor)

r_msds=aggregate(rval_tab$m,by=list(rval_tab$data_set),get_msd)
rownames(r_msds)<-r_msds$Group.1


r_msds_type=aggregate(rval_tab$m,by=list(rval_tab$Type),get_msd)
rownames(r_msds_type)<-r_msds_type$Group.1

cor_t=t.test(rval_tab$m[rval_tab$Type %in% c("Other children","Adults")] ~ rval_tab$Type[rval_tab$Type %in% c("Other children","Adults")]) #marginal

cor_t=t.test(rval_tab$m[rval_tab$Type %in% c("Other children","Male")] ~ rval_tab$Type[rval_tab$Type %in% c("Other children","Male")]) #sig

cor_t=t.test(rval_tab$m[rval_tab$Type %in% c("Other children","Female")] ~ rval_tab$Type[rval_tab$Type %in% c("Other children","Female")]) #sig

cor_t=t.test(rval_tab$m[rval_tab$Type %in% c("Other children","Output")] ~ rval_tab$Type[rval_tab$Type %in% c("Other children","Output")]) #not sig

cor_t=t.test(rval_tab$m[rval_tab$Type %in% c("Male","Output")] ~ rval_tab$Type[rval_tab$Type %in% c("Male","Output")]) #sig

cor_t=t.test(rval_tab$m[rval_tab$Type %in% c("Female","Output")] ~ rval_tab$Type[rval_tab$Type %in% c("Female","Output")]) #sig

cor_t=t.test(rval_tab$m[rval_tab$Type %in% c("Adults","Output")] ~ rval_tab$Type[rval_tab$Type %in% c("Adults","Output")]) #not sig

```

> To see whether correlations in this analysis differed by talker types and pipelines, we fit a linear model with the formula $lm(cor ~ type * pipeline)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children; and pipeline LENA or ACLEW. The model was overall significant (F(`r round(reg_sum_cor$fstatistic["dendf"],2)`) = `r round(reg_sum_cor$fstatistic["value"],2)`, p < .001). We found an adjusted R-squared of `r round(reg_sum_cor$adj.r.squared*100)`%. A Type 3 ANOVA on this model revealed   a significant effect of type (F = `r round(reg_anova_cor["Type","F value"],2)`, p < .001), with higher correlations for other children metrics (`r r_msds_type["Other children","x"]`) and output metrics (`r r_msds_type["Output","x"]`) than female (`r r_msds_type["Female","x"]`), and male (`r r_msds_type["Male","x"]`) metrics. There was also a marginal effect of pipeline (F = `r round(reg_anova_cor["data_set","F value"],2)`, p = `r round(reg_anova_cor["data_set","Pr(>F)"],2)`), with numerically higher correlations  for ACLEW (`r r_msds["aclew","x"]`) than for LENA metrics (`r r_msds["lena","x"]`). 

See table below for results of the Type 3 ANOVA.

```{r print out anova results rec on cor}

kable(round(reg_anova_cor,2),caption="Type 3 ANOVA on model attempting to explain variation in Child ICC as a function of talker types and pipelines.")
```


## SM H: Code to reproduce Figure 4

Please see code in the RMarkDown version of the document.


```{r r-fig4, echo=F,fig.width=4, fig.height=3,fig.cap="Figure 4 (reproduced). Violin plot reflecting the distribution of correlations."}


fig4 <- ggplot(rval_tab, aes(y = m, x = toupper(data_set))) +
  geom_violin(alpha = 0.5) +
  geom_quasirandom(aes(colour = Type,shape = Type)) +  
  stat_summary(fun = mean, geom = "point", aes(x = as.numeric(factor(toupper(data_set))) - 0.5), shape = 23, size = 3, fill = "white", color = "black", position = position_dodge(width = 0.75)) +
  #stat_summary(fun = mean, geom = "hline", aes(yintercept = mean(m)), color = "red", linetype = "dashed", size = 1, position = position_dodge(width = 0.75)) +
  theme() +labs( y = "r",x="Pipeline") + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 

fig4


ggsave("fig4.png", plot = fig4, width = 4, height = 3, units = "in")

```


## SM I: Is lower Child ICC than correlations due to the fact that we are controlling for age? (Exploration)


```{r}
with_age_cvc <-lmer(can_voc_chi_ph ~ age_s + (1|experiment/child_id),data=mydat_aclew)
no_age_no_cor_cvc <-lmer(can_voc_chi_ph ~ 1 + (1|child_id),data=mydat_aclew)


```

May it be that correlation coefficients are higher than Child ICC because we control for age in the latter but not the former? To assess this, we take a metric that shows a large difference across the correlation and the Child ICC analysis, ACLEW's CVC per hour. We then refit our mixed model, but this time we do not declare age nor corpus, so that the situation is more similar to the simple correlations. When we do this, Child ICC goes up from `r round(df.icc.mixed[df.icc.mixed$metric=="can_voc_chi_ph","icc_child_id"],2)` to `r round(as.numeric(icc(no_age_no_cor_cvc)[1]),2)`.

This is still lower than the correlation observed for this same variable, `r rval_tab[rval_tab$metric=="can_voc_chi_ph","m"]`. We believe this is due to another difference between the two analyses, namely that here we are including all recordings, whereas the other analysis is focused on recordings less than 2 months away.

## SM J: Information that is incorporated in Figure 2 (model fitting)

> Out of the `r dim(df.icc.mixed)[1]` fitted models, `r table(df.icc.mixed$formula)["full"]` could be fit with the full model, yielding a measure of Corpus ICC.  For the `r table(df.icc.mixed$formula)["no_exp"]` for which the full model was singular, we fit the data with the No Corpus model, and none was singular then, allowing us to have Child ICC for all `r dim(df.icc.mixed)[1]` metrics. 


## SM J: Code to reproduce Table 4

Please see code in the RMarkDown version of the document.


```{r tab4, results="as.is"}
key_metrics = c("wc_adu_ph", "lena_CVC_ph", "lena_CTC_ph",  "voc_fem_ph",  "voc_chi_ph")
x <- merge(df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="lena",c("metric","icc_child_id")] ,
           df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="aclew",c("metric","icc_child_id")],
           by='metric', all=TRUE)
colnames(x) <- c("metric", "LENA ICC", "ACLEW ICC")
rownames(x)<-x$metric
x["lena_CTC_ph","ACLEW ICC"]<-df.icc.mixed[df.icc.mixed$metric=="simple_CTC_ph","icc_child_id"]
x["lena_CVC_ph","ACLEW ICC"]<-df.icc.mixed[df.icc.mixed$metric=="can_voc_chi_ph","icc_child_id"]
x[,2:3]=round(x[,2:3],2)
x=x[key_metrics,]

kable(x,row.names = F,digits=2,caption="Table 4 (reproduced). Most commonly used metrics.")
#x
```


## SM K: Code to reproduce text below Table 4 at the beginning of the "Overall reliability" section

Please see code in the RMarkDown version of the document.


```{r best_metric}

ismax = aggregate(df.icc.mixed$icc_child_id,by=list(df.icc.mixed$Type),max)
best_metric<-df.icc.mixed[df.icc.mixed$icc_child_id%in%ismax$x,c("metric","data_set","icc_child_id","Type")]
best_metric$icc_child_id=round(best_metric$icc_child_id,2)

```


> Figure 5 shows the distribution of Child ICC across all `r dim(df.icc.mixed)[1]` metrics, separately for each pipeline. The majority of measures had Child ICCs between .3 and .5. `r sum(df.icc.mixed$icc_child_id > .5)` measures had Child ICCs higher or equal to .5. Surprisingly, the top 6 metrics in terms of Child ICC corresponded to the "other child" category, known to have the worst accuracy according to previous analyses (Cristia et al., 2020). In an analysis fully reported in supplementary materials (SM K), we find some evidence that this may be due to the presence versus absence of siblings. The next metric with the highest Child ICC corresponded to an output measure, namely the total vocalization duration per hour extracted from ACLEW annotations (`r best_metric[best_metric$Type=="Output",c("metric","data_set")]`), with a Child ICC of `r best_metric[best_metric$Type=="Output","icc_child_id"]`. Among adult metrics, the average vocalization duration for female vocalizations for ACLEW (`r best_metric[best_metric$Type=="Female",c("metric","data_set")]`) and the ACLEW equivalent of CTC had the highest Child ICC (`r best_metric[best_metric$Type=="Female","icc_child_id"]` and `r best_metric[best_metric$Type=="Adults","icc_child_id"]`, respectively). 

## SM K: Are high Child ICCs for "other child" measures due to number or presence of siblings? (Exploration)

```{r explo-och-sibn}


model<-lmer(voc_dur_och_ph~ age_s + n_of_siblings  + (1|experiment/child_id),data=mydat2)

#is sing
model<-lmer(voc_dur_och_ph~ age_s + n_of_siblings  + (1|child_id),data=mydat2)
icc.result.split<- t(as.data.frame(icc(model, by_group=TRUE))$ICC)
#names(icc.result.split)=c("icc_child_id", "icc_corpus")
names(icc.result.split)=c("icc_child_id")


has_n_of_sib=table(mydat2$experiment,!is.na(mydat2$n_of_siblings))
corp_w_sib=levels(factor(mydat2$experiment[!is.na(mydat2$n_of_siblings)]))
corp_w_sib_clean=corp_w_sib[1]
for(i in 2:length(corp_w_sib)) corp_w_sib_clean=paste(corp_w_sib_clean,corp_w_sib[i],sep=", ")


```


We reasoned the high Child ICC for metrics related to other children may be because children in our corpora vary in terms of the number of siblings they have, that siblings' presence may be stable across recordings, and that a greater number of siblings would lead to more other child vocalizations. As a result, any measure based on other child vocalizations would result in stable relative scores of children due to the number of siblings present. To test this hypothesis, we selected the metric with the highest Child ICC, namely ACLEW's total vocalization duration by other children. We fit the full model again to predict this metric, but this time, in addition to controlling for age, we included sibling number as a fixed effect $lmer(metric~ age + sibling\_number + (1|corpus/child))$, so that individual variation that was actually due to sibling number was captured by that fixed effect instead of the random effect for child. We had sibling number data for `r sum(has_n_of_sib[,"TRUE"])` recordings from `r length(levels(factor(mydat2$child_id[!is.na(mydat2$n_of_siblings)])))` in `r length(levels(factor(mydat2$experiment[!is.na(mydat2$n_of_siblings)])))` corpora (`r corp_w_sib_clean`). The number of siblings varied from `r min(mydat2$n_of_siblings,na.rm=T)` to `r max(mydat2$n_of_siblings,na.rm=T)`, with a mean of `r round(mean(mydat2$n_of_siblings,na.rm=T),1)` and a median of `r round(median(mydat2$n_of_siblings,na.rm=T),1)`.  Results indicated the full model was singular, so we fitted a No Corpus model to be able to extract a Child ICC. As a sanity check, we verified that the number of siblings predicted the outcome, total vocalization duration by other children -- and found that it did: ß = `r round(summary(model)$coefficients["n_of_siblings","Estimate"],2)`, t = `r round(summary(model)$coefficients["n_of_siblings","t value"],2)`, p < .001. This effect is relatively small: It means that per additional sibling, there is a .2 standard deviation increase in this variable. Turning now to how much variance is allocated to the random factor of Child, there was no difference in Child ICC in our original analysis (`r round(df.icc.mixed[df.icc.mixed$metric=="voc_dur_och_ph" & df.icc.mixed$data_set=="aclew","icc_child_id"],2)`) versus this re-analysis including the number of siblings (`r round(icc.result.split["icc_child_id"],2)`).


```{r sib-presence}
model_sib_presence<-lmer(voc_dur_och_ph~ age_s + sib_presence  + (1|experiment/child_id),data=mydat2)

#is sing
model_sib_presence<-lmer(voc_dur_och_ph~ age_s +sib_presence  + (1|child_id),data=mydat2)
icc.result.split<- t(as.data.frame(icc(model_sib_presence, by_group=TRUE))$ICC)
#names(icc.result.split)=c("icc_child_id", "icc_corpus")
names(icc.result.split)=c("icc_child_id")


```


Perhaps it is not so much the sheer number of siblings that explains variance, but the sheer presence versus absence. After all, we can imagine that the effect of the number of siblings is not monotonic. We therefore repeated the analysis above but rather than adding the actual number of siblings, we had a binary variable that was "present" if the child had any siblings, and "absent" otherwise. 

As in the sibling number analysis, the full model was singular, so we fitted a No Corpus model to be able to extract a Child ICC. We again verified that sibling presence predicted the outcome, total vocalization duration by other children -- and found that it did: ß = `r round(summary(model_sib_presence)$coefficients["sib_presencepresent","Estimate"],2)`, t = `r round(summary(model_sib_presence)$coefficients["sib_presencepresent","t value"],2)`, p < .001. This effect is, as expected, sizable: It means that there is nearly one whole standard deviation increase in this variable when there are any siblings. In addition to being a better predictor, in this model, the amount of variance allocated to individual children as measured by Child ICC was considerably higher in our original analysis (`r round(df.icc.mixed[df.icc.mixed$metric=="voc_dur_och_ph" & df.icc.mixed$data_set=="aclew","icc_child_id"],2)`) than in this re-analysis including sibling presence (`r round(icc.result.split["icc_child_id"],2)`).


## SM L: Are "bad" output measures those coming from VCM? (Exploration)

Among ACLEW measures, a fair number of them come from VCM, a module that classifies child vocalizations in terms of vocal maturity types into cry, canonical, and non-canonical categories. In unpublished analyses, we have found that VCM labels are inaccurate when compared to human labels of the same vocalizations, relatively to other metrics. In this analysis, we checked whether VCM-derived measures had lower Child ICC than other ACLEW measures. As shown in the next Figure, this was not the case: Some output measures from the ACLEW pipeline have lower Child ICC than VCM ones.

```{r,fig.cap="Figure SM L. Violin plot reflecting the distribution of Child ICC for ACLEW VCM versus other ACLEW or LENA metrics."}
vcm_type<-rep("Other ACLEW",dim(df.icc.mixed)[1])
vcm_type[df.icc.mixed$data_set=="lena"]<-"LENA"
vcm_type[grep("lp",df.icc.mixed$metric)]<-"ACLEW VCM"
vcm_type[grep("cp",df.icc.mixed$metric)]<-"ACLEW VCM"
vcm_type[grep("can",df.icc.mixed$metric)]<-"ACLEW VCM"
vcm_type[grep("cry",df.icc.mixed$metric)]<-"ACLEW VCM"
#cbind(df.icc.mixed[,c("metric","data_set","Type")],vcm_type)

ggplot(df.icc.mixed, aes(y = icc_child_id, x = vcm_type)) +
  geom_violin(alpha = 0.5) +
  geom_quasirandom(aes(colour = Type,shape = Type)) +  
  labs( y = "Child ICC",x="Type") +  theme(text = element_text(size = 20)) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 

```

## SM M: Code to reproduce Figure 5

Please see code in the RMarkDown version of the document.


```{r icc-allexp-fig5, echo=F,fig.width=6, fig.height=3,fig.cap="Figure 5 (reproduced). Violin plot reflecting the distribution of Child ICC."}


fig5 <- ggplot(df.icc.mixed, aes(y = icc_child_id, x = toupper(data_set))) +
  geom_violin(alpha = 0.5) +
  stat_summary(fun = mean, geom = "point", aes(x = as.numeric(factor(toupper(data_set))) - 0.5), shape = 23, size = 3, fill = "white", color = "black", position = position_dodge(width = 0.75)) +
  geom_quasirandom(aes(colour = Type,shape = Type)) +  
  labs( y = "Child ICC",x="Pipeline") +  theme(text = element_text(size = 16)) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 

fig5

ggsave("fig5.png", plot = fig5, width = 4, height = 3, units = "in")

```


## SM N: Code to reproduce text below Figure 5

Please see code in the RMarkDown version of the document.


```{r reg model icc}

lr_icc_chi <- lm(icc_child_id ~ Type * data_set, data=df.icc.mixed) 
#binomial could be used, but diagnostic plots look pretty good (other than some outliers)
reg_sum=summary(lr_icc_chi)

reg_anova=Anova(lr_icc_chi)


msds = aggregate(df.icc.mixed$icc_child_id,by=list(df.icc.mixed$Type),get_msd)
msds$x=gsub("0.",".",msds$x,fixed=T)
rownames(msds)<-msds$Group.1

msds_p = aggregate(df.icc.mixed$icc_child_id,by=list(df.icc.mixed$data_set),get_msd)
msds_p$x=gsub("0.",".",msds_p$x,fixed=T)
rownames(msds_p)<-msds_p$Group.1

```


> Next, we explored how similar Child ICCs were across different talker types and pipelines. We fit a linear model with the formula $lm(icc\_child\_id ~ type * pipeline)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children; and pipeline LENA or ACLEW. The model was overall significant (F(`r round(reg_sum$fstatistic["dendf"],2)`) = `r round(reg_sum$fstatistic["value"],2)`, p < .001). We found an adjusted R-squared of `r round(reg_sum$adj.r.squared*100)`%, suggesting much of the variance across Child ICCs was explained by these factors. A Type 3 ANOVA on this model revealed type was a significant predictor (F(`r reg_anova["Type","Df"]`) = `r round(reg_anova["Type","F value"],1)`, p<.001), as was pipeline (F(`r reg_anova["data_set","Df"]`) = `r round(reg_anova["data_set","F value"],1)`, p = `r round(reg_anova["data_set","Pr(>F)"],3)`); the interaction between type and pipeline was not significant. The main effect of type emerged because output metrics tended to have higher Child ICC (`r msds["Output","x"]`)  than those associated to adults in general (`r msds["Adults","x"]`), females (`r msds["Female","x"]`), and males (`r msds["Male","x"]`); whereas those associated with other children had even higher Child ICCs (`r msds["Other children","x"]`). The main effect of pipeline arose because of slightly higher Child ICCs for the ACLEW metrics (`r msds_p["aclew","x"]`) than for LENA metrics (`r msds_p["lena","x"]`). 


## SM P: Code to reproduce text at the beginning of the "Reliability across age groups" section

Please see code in the RMarkDown version of the document.


> Out of `r dim(df.icc.age)[1]` fitted models (`r dim(df.icc.mixed)[1]` metrics times `r length(levels(factor(df.icc.age$age_bin)))` age bins), `r sum(df.icc.age$formula=="no_chi_effect")` were singular when including a random intercept per child, and therefore they could not be included in these analyses at all. In addition, `r sum(df.icc.age$formula=="no_exp")` were singular when including a random intercept per corpus. The remaining `r sum(df.icc.age$formula=="full")` could be analyzed with the full model.

## SM Q: Code to reproduce Figure 6

Please see code in the RMarkDown version of the document.


```{r relBYage-fig6,fig.width=6, fig.height=10,fig.cap="Figure 6 (reproduced). Child ICC by metric type and pipeline, when considering each age bin separately."}

#this complicated section is just to add N of participants in each facet, we first estimate it:
facet_labels_chi=facet_labels_cor=NULL
for(thisage in levels(df.icc.age$age_bin)){#thisage="(0,6]"
  facet_labels_cor = c(facet_labels_cor,paste0("N cor=",mean(df.icc.age$ncor[df.icc.age$age_bin==thisage],na.rm=T))) #checked: there is no variation across metrics in n of corpora included
    if(min(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T) !=max(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T)){
    facet_labels_chi = c(facet_labels_chi,paste0("N chi=",paste(range(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T),collapse="-"))) 
  } else {
    facet_labels_chi = c(facet_labels_chi,paste0("N chi=",min(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T))) 
  }
}


#and then we structure it so that it goes on the plot
f_labels<-data.frame(age_bin=levels(df.icc.age$age_bin),facet_labels_chi=facet_labels_chi,facet_labels_cor=facet_labels_cor)

f_labels$age_bin<-factor(f_labels$age_bin,levels=age_levels)

fig6 <- ggplot(df.icc.age, aes(y = icc_child_id, x = toupper(data_set))) +
  geom_violin(alpha = 0.5) +
    stat_summary(fun = mean, geom = "point", aes(x = as.numeric(factor(toupper(data_set))) - 0.5), shape = 23, size = 3, fill = "white", color = "black", position = position_dodge(width = 0.75)) +
  geom_quasirandom(aes(colour = Type,shape = Type)) +  
  theme(legend.position="none") +labs( y = "r",x="Pipeline") + facet_wrap(~age_bin, ncol = 3) +
  geom_text(x=1.5,y=max(df.icc.age$icc_child_id,na.rm=T),aes(label=facet_labels_chi),data=f_labels,size=3) +
  geom_text(x=1.5,y=max(df.icc.age$icc_child_id,na.rm=T)*.95,aes(label=facet_labels_cor),data=f_labels,size=3) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 

fig6

ggsave("fig6.png", plot = fig6, width = 6, height = 10, units = "in")


```

## SM R: Code to reproduce text below Figure 6

Please see code in the RMarkDown version of the document.


```{r reg model age}


age_icc <- lm(icc_child_id ~ Type * data_set * age_bin, data=df.icc.age) 
#plot(age_icc)
#binomial could be used,  diagnostic plots look good

reg_sum_age_icc=summary(age_icc)

reg_anova_age_icc=Anova(age_icc)

```

> To interrogate these results statistically, and assess whether Child ICCs tended to be higher or lower in certain age bins, we fit a linear model with the formula $lm(Child\_ICC ~ type * pipeline * age\_bin)$. The model was overall significant (F(`r round(reg_sum_age_icc$fstatistic["dendf"],2)`) = `r round(reg_sum_age_icc$fstatistic["value"],2)`, p < .001). We found an adjusted R-squared of `r round(reg_sum_age_icc$adj.r.squared*100)`%, suggesting this model explained more than a third of the variance in Child ICC.  A Type 3 ANOVA on this model revealed that the interactions between type and pipeline, pipeline and age, and the three-way interaction (type, pipeline, age) were not significant. However, both the type by age bin interaction (F(`r reg_anova_age_icc["Type:age_bin","Df"]`) = `r round(reg_anova_age_icc["Type:age_bin","F value"],1)`, p < .001) and the three main effects were significant (
type: F(`r reg_anova_age_icc["Type","Df"]`) = `r round(reg_anova_age_icc["Type","F value"],1)`, p < .001; 
age: F(`r reg_anova_age_icc["age_bin","Df"]`) = `r round(reg_anova_age_icc["age_bin","F value"],1)`, p < .001; 
pipeline: F(`r reg_anova_age_icc["data_set","Df"]`) = `r round(reg_anova_age_icc["age_bin","F value"],1)`, p = .01).

See table below for results of the Type 3 ANOVA.

```{r print out anova results rec on icc by age}
kable(round(reg_anova_age_icc,2),caption="Type 3 ANOVA on model attempting to explain variation in Child ICC as a function of talker types, pipelines, and age bins.")

```


## SM S: Code to reproduce Figure 7

Please see code in the RMarkDown version of the document.


```{r icc-bycor-fig7, echo=F,fig.width=4, fig.height=4,fig.cap="Figure 7 (reproduced). Correlations in Child ICC across age bins. Each point indicates the correlation in Child ICC for the age bin named in the x-axis with every other age bin."}

r_X_age = NULL

for(ageA in levels(factor(df.icc.age$age_bin))){#ageA="(0,6]"
  for(ageB in levels(factor(df.icc.age$age_bin))) if(ageA!=ageB){#ageB="(6,12]"
    #check:
    # print(sum(df.icc.age[df.icc.age$age_bin==ageA,c("metric","data_set")] == df.icc.age[df.icc.age$age_bin==ageB,c("metric","data_set")]))
    #dim(df.icc.age[df.icc.age$age_bin==ageA,c("metric","data_set")])
    #yes, the metric & data_set are matching across the two
    r_X_age = rbind(r_X_age, cbind(ageA,ageB,cor.test(df.icc.age$icc_child_id[df.icc.age$age_bin==ageA],df.icc.age$icc_child_id[df.icc.age$age_bin==ageB])$estimate))
  }
}
r_X_age=data.frame(r_X_age)
colnames(r_X_age)[3]<-c("cor")
r_X_age$cor=as.numeric(as.character(r_X_age$cor))
r_X_age$ageA=factor(r_X_age$ageA,levels=age_levels)
#r_X_age$ageB=factor(r_X_age$ageB,levels=age_levels)

#summary(r_X_age$cor) #mean correlation across corpora is zero!

fig7 <- ggplot(r_X_age, aes(y = cor, x = ageA)) +
  geom_violin(alpha = 0.5) +
#    stat_summary(fun = mean, geom = "point", aes(x = as.numeric(factor(ageA)) - 0.5), shape = 23, size = 3, fill = "white", color = "black", position = position_dodge(width = 0.75)) +
  geom_quasirandom() +
  theme() +labs( y = "Correlation coefficient r",x="Age") + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 

fig7

ggsave("fig7.png", plot = fig7, width = 4, height = 4, units = "in")
```


## SM T: Code to reproduce text in the "Reliability within corpus" section

Please see code in the RMarkDown version of the document.


> Out of `r dim(df.icc.corpus)[1]` fitted models (`r dim(df.icc.mixed)[1]` metrics times `r length(levels(factor(df.icc.corpus$corpus)))` corpora), `r sum(df.icc.corpus$formula=="no_chi_effect")` were singular when including a random intercept per child, and therefore they could not be included in these analyses at all. (Including a random intercept per corpus is not relevant here, since only data from one corpus is included in each model fit.)


## SM U: Code to reproduce Figure 8

Please see code in the RMarkDown version of the document.


```{r icc-bycor-fig8, echo=F,fig.width=8, fig.height=4,fig.cap="Figure 8 (reproduced). Child ICC by metric type and pipeline, when considering each corpus separately."}

facet_labels_chi = paste0("N chi=",chiXcor)

#and then we structure it so that it goes on the plot
f_labels<-data.frame(levels(factor(df.icc.corpus$corpus)),facet_labels_chi=facet_labels_chi)

colnames(f_labels)<-c("corpus","nchi")

fig8 <- ggplot(df.icc.corpus, aes(y = icc_child_id, x = toupper(data_set))) +
  geom_violin(alpha = 0.5) +
#    stat_summary(fun = mean, geom = "point", aes(x = as.numeric(factor(toupper(data_set))) - 0.5), shape = 23, size = 3, fill = "white", color = "black", position = position_dodge(width = 0.75)) +
  geom_quasirandom(aes(colour = Type,shape = Type)) +  
  theme(legend.position = "top", axis.title.y=element_blank() ,axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +labs( y = "Child ICC",x="Pipeline") +   
  facet_wrap(~corpus,nrow=1) +
  geom_text(x=1.5,y=max(df.icc.corpus$icc_child_id,na.rm=T),aes(label=facet_labels_chi),data=f_labels,size=3)+ 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 

fig8

ggsave("fig8.png", plot = fig8, width = 8, height = 4, units = "in")

```

## SM V: Code to reproduce text below Figure 8

Please see code in the RMarkDown version of the document.


```{r reg model corpus}
# note, there is a warning here because some of the corpus names contain a dash. This does not affect the results

cor_icc <- lm(icc_child_id ~ Type * data_set * corpus, data=df.icc.corpus) 
#plot(cor_icc)
#binomial could be used,  diagnostic plots look okay-ish, inequality of variance with high ICCs being less numerous &  less spread out

reg_sum_cor_icc=summary(cor_icc)

reg_anova_cor_icc=Anova(cor_icc)

```

> The fact that we cannot infer reliability from one corpus based on another one was confirmed statistically: We checked whether Child ICC differed by talker types and pipelines across corpora by fitting a linear model with the formula $lm(Child_ICC ~ type * pipeline * corpus)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children;  pipeline LENA or ACLEW; and corpus the corpus ID. The model was overall significant (F(`r round(reg_sum_cor_icc$fstatistic["dendf"],2)`) = `r round(reg_sum_cor_icc$fstatistic["value"],2)`, p < .001).  We found an adjusted R-squared of `r round(reg_sum_cor_icc$adj.r.squared*100)`%, suggesting this model explained nearly half of the variance in Child ICC. A Type 3 ANOVA on this model revealed several significant effects and interactions, including a three-way interaction of type, pipeline, and corpus  (F(`r reg_anova_cor_icc["Type:data_set:corpus","Df"]`) = `r round(reg_anova_cor_icc["Type:data_set:corpus","F value"],1)`, p<.001); a two-way interaction of type and corpus  (F(`r reg_anova_cor_icc["data_set:corpus","Df"]`) = `r round(reg_anova_cor_icc["data_set:corpus","F value"],1)`, p<.001); and a main effect of corpus (F(`r reg_anova_cor_icc["corpus","Df"]`) = `r round(reg_anova_cor_icc["corpus","F value"],1)`, p<.001). 

See Table below for results of the Type 3 ANOVA.


```{r print out anova results rec on icc by corpus}
kable(round(reg_anova_cor_icc,2),caption="Type 3 ANOVA on model attempting to explain variation in Child ICC as a function of talker types and pipelines across corpora.")

```

## SM W: Code to reproduce Figure 9

Please see code in the RMarkDown version of the document.


```{r icc-bycor-fig9, echo=F,fig.width=8, fig.height=4,fig.cap="Figure 9 (reproduced). Correlations in Child ICC across corpora."}


r_X_corpus = NULL

for(corpusA in levels(factor(df.icc.corpus$corpus))){
  for(corpusB in levels(factor(df.icc.corpus$corpus))) if(corpusA!=corpusB){
    #check:
    # print(sum(df.icc.corpus[df.icc.corpus$corpus==corpusA,c("metric","data_set")] == df.icc.corpus[df.icc.corpus$corpus==corpusB,c("metric","data_set")]))
    #yes, the metric & data_set are matching across the two
    r_X_corpus = rbind(r_X_corpus, cbind(corpusA,corpusB,cor.test(df.icc.corpus$icc_child_id[df.icc.corpus$corpus==corpusA],df.icc.corpus$icc_child_id[df.icc.corpus$corpus==corpusB])$estimate))
  }
}
r_X_corpus=data.frame(r_X_corpus)
colnames(r_X_corpus)[3]<-c("cor")
r_X_corpus$cor=as.numeric(as.character(r_X_corpus$cor))

#summary(r_X_corpus$cor) #mean correlation across corpora is zero!

fig9 <- ggplot(r_X_corpus, aes(y = cor, x = corpusA)) +
  geom_violin(alpha = 0.5) +
    stat_summary(fun = mean, geom = "point", aes(x = as.numeric(factor(corpusA)) - 0.5), shape = 23, size = 3, fill = "white", color = "black", position = position_dodge(width = 0.75)) +
  geom_quasirandom() +  
  theme() +labs( y = "Correlation coefficient r",x="Corpus") + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 

fig9

ggsave("fig9.png", plot = fig9, width = 8, height = 4, units = "in")
```

## SM X: Code to reproduce text in the Discussion section

Please see code in the RMarkDown version of the document.


```{r bias}

urban<-rep(T,length(location))
urban[grep("Bolivia",location)]<-F

english<-rep(T,length(location))
english[grep("Bolivia",location)]<-F
english[grep("France",location)]<-F

northam<-rep(T,length(location))
northam[grep("Bolivia",location)]<-F
northam[grep("France",location)]<-F
northam[grep("England",location)]<-F

bias_tab<-data.frame(cbind(chiXcor, recXcor))
bias_tab$chiXcor<-bias_tab$chiXcor/sum(bias_tab$chiXcor)
bias_tab$recXcor<-bias_tab$recXcor/sum(bias_tab$recXcor)


```

> Our data draws mainly from urban (`r round(sum(bias_tab$recXcor[urban])*100)`% of recordings, `r round(sum(bias_tab$chiXcor[urban])*100)`% of the children, `r round(sum(urban)/length(urban)*100)`% of the corpora), English-speaking settings (`r round(sum(bias_tab$recXcor[english])*100)`% of recordings, `r round(sum(bias_tab$chiXcor[english])*100)`% of the children, `r round(sum(english)/length(english)*100)`% of the corpora), largely from North America (`r round(sum(bias_tab$recXcor[northam])*100)`% of recordings, `r round(sum(bias_tab$chiXcor[northam])*100)`% of the children, `r round(sum(northam)/length(northam)*100)`% of the corpora). 

## SM Y: Variability as a function of hardware

Another potential negative contribution to reliability that is currently not discussed is variability in the experimental setup. In a corpus collected in the Solomon Islands, children were wearing two recorders simultaneously. These were USB devices, sourced from two different providers. In this dataset, the duration of the recordings could be very different within the same pair (a ~10% difference was not atypical), which means that what is actually recorded is somewhat random in itself. Even comparing identical audio ranges covered by both recordings of each pair, the corresponding ACLEW metrics differed slightly; they were strongly correlated (R^2 was close to 0.95) but not perfectly correlated. This suggests that randomness in the recorders properties and their placement may also contribute to a decrease reliability. This reliability is importantly not at all due to changing underlying conditions, as both recorders picked up on the exact same day, so it is not due to variability in underlying behaviors. It is also not due to algorithmic variation because ACLEW algorithms are deterministic. Thus, this variability is only due to hardware differences and potentially also differences in e.g. USB placement.

## SM Z: References


Alcock, K., Meints, K., & Rowland, C. (2020). The UK Communicative Development Inventories. London, UK: J&R Press. 

Cavallera, V., Lancaster, G., Gladstone, M., Black, M. M., McCray, G., Nizar, A., ... & Janus, M. (2023). Protocol for validation of the Global Scales for Early Development (GSED) for children under 3 years of age in seven countries. BMJ open, 13(1), e062562. http://dx.doi.org/10.1136/bmjopen-2022-062562

Denman, D., Speyer, R., Munro, N., Pearce, W. M., Chen, Y.-W., & Cordier, R. (2017). Psychometric Properties of Language Assessments for Children Aged 4–12 Years: A Systematic Review. Frontiers in Psychology, 8. https://doi.org/10.3389/fpsyg.2017.01515


Fenson, L., Dale, P. S., Reznick, J. S., Bates, E., Thal, D. J., & Pethick, S. J. (1994). Variability in early communicative development. Monographs of the Society for Research in Child Development, 59(5).

Frank, M. C., Braginsky, M., Yurovsky, D., & Marchman, V. A. (2021). Variability and Consistency in Early Language Learning: The Wordbank Project. Cambridge: MIT Press.

Goldman, R., & Fristoe, M. (2015). GFTA-3: Goldman Fristoe 3 Test of Articulation. PsychCorp.

Gurley, J. (2013). Expressive Vocabulary Test, Second Edition. In Goldstein, S. & Nagliery, J. A., Encyclopedia of Child Behavior and Development. Boston, MA: Springer US.

Kobsar, D., Charlton, J. M., Tse, C. T. F., Esculier, J.-F., Graffos, A., Krowchuk, N. M., Thatcher, D., & Hunt, M. A. (2020). Validity and reliability of wearable inertial sensors in healthy adult walking: A systematic review and meta-analysis. Journal of NeuroEngineering and Rehabilitation, 17(1), 62. https://doi.org/10.1186/s12984-020-00685-3

Kriener-Althen, K., Newton, E. K., Draney, K., & Mangione, P. L. (2020). Measuring readiness for kindergarten using the Desired Results Developmental Profile. Early Education and Development, 31(5), 739-763. https://doi.org/10.1080/10409289.2020.1743160

Leaders Project (Nov 25, 2013). Test Review: PSL-5 English. Retrieved December 8, 2023, from https://www.leadersproject.org/2013/11/25/test-review-pls-5-english/.

Liu, R., Zhang, R., Qu, Y., Jin, W., Dong, B., Liu, Y., & Mao, L. (2022). Reliability analysis of inertial sensors for testing static balance of 4-to-5-year-old preschoolers. Gait & Posture, 92, 176–180. https://doi.org/10.1016/j.gaitpost.2021.11.029

NIH (2023). NIH Infant and Toddler (Baby) Toolbox. Retrieved March 16, 2023, from https://neuroscienceblueprint.nih.gov/resources-tools/blueprint-resources-tools-library/nih-infant-and-toddler-baby-toolbox

Sahli, A. S., & Belgin, E. (2017). Adaptation, validity, and reliability of the Preschool Language Scale–Fifth Edition (PLS–5) in the Turkish context: The Turkish Preschool Language Scale–5 (TPLS–5). International Journal of Pediatric Otorhinolaryngology, 98, 143–149. https://doi.org/10.1016/j.ijporl.2017.05.003

Schweinhart, L. J., Mcnair, S., Barnes, H., & Larner, A. M. (1993). Observing Young Children in Action to Assess their Development: The High/Scope Child Observation Record Study. Educational and Psychological Measurement, 53(2), 445–455. https://doi.org/10.1177/0013164493053002014

Velikonja, T., Edbrooke-Childs, J., Calderon, A., Sleed, M., Brown, A., & Deighton, J. (2017). The psychometric properties of the Ages & Stages Questionnaires for ages 2-2.5: A systematic review. Child: Care, Health and Development, 43(1), 1–17. https://doi.org/10.1111/cch.12397

Williams, K. T. (1997). Expressive vocabulary test second edition (EVT™ 2). J. Am. Acad. Child Adolesc. Psychiatry, 42, 864-872.

Zimmerman, I. L. , Steiner, V. G. , & Pond, R. E. (2011). Preschool Language Scales–5th Edition (PLS-5). Pearson. https://doi.org/10.1037/t15141-000


```{r}
# Save information about packages used
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")
```