LAAC-LSCP
/
RELIVAL


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718
							---
title: Supplementary Materials to Establishing the reliability and validity of measures extracted from long-form recordings
output:
  pdf_document:
    toc: yes
    toc_depth: 3
  html_document:
    toc: yes
    toc_depth: '3'
    df_print: paged
---

```{r setup, include=FALSE, eval=TRUE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)

set.seed(726282)

library("lme4")
library("performance") # ICC
library("ggplot2")
library("ggthemes")
library("ggpubr")
library("kableExtra")
library("psych")
library("dplyr")
library("tidyr")
library("stringr")
library("car")
library("ggbeeswarm")

cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


# List of corpora we are interested in
corpora <- c('bergelson', 'cougar','fausey-trio','lucid', 'lyon', 'warlaumont','winnipeg', 'quechua') 

# Columns that should not be scaled or taken into account as metrics
no.scale.columns <- c('experiment', 'session_id', 'child_id','child_id_unique','age_s', 'nobs', 'nchi',
                      'date_iso', 'child_dob', 'missing_audio',"age_bin","duration","session_id",
                      "normative","age","duration_alice", "duration_vcm" ,  "duration_vtc","duration_its" ,"its_filename")

data_sets = c('aclew', 'lena')

set.seed(726282)


source("functions.R")


define_contiguous<-function(mydat){
  #this function creates a dataset with one line per paired recordings, with session_id & next_session being the paired recs
      paired <- mydat %>% 
      #the next line sorts the table by child id then age
 arrange(child_id, age) %>% 
      #the next line keeps only one line per combination of experiment and child_id
   group_by(child_id) %>% 
      #the next line, mutate, defines 3 new variables in the dataset, n_rec, age_dist_next_rec, and next_session
  mutate( #n_rec = n(), #this var isn't used later
         age_dist_next_rec = lead(age) - age, 
         #this gets the diff corresponding cell in the preceding row and a given row
         next_session = lead(session_id)) %>% 
      #the next line keeps only recs that are less than 2 months away
  filter(age_dist_next_rec<2) 
      
  return(paired)
    }

get_type<-function(mytab){
  temp_tab<-mytab
  temp_tab$Type="Output"
  temp_tab$Type[grep("fem", temp_tab$met)] <- "Female"
  temp_tab$Type[grep("mal", temp_tab$met)] <- "Male"
  temp_tab$Type[grep("och", temp_tab$met)] <- "Other children"
  temp_tab$Type[grep("adu", temp_tab$met)] <- "Adults"
  temp_tab$Type[grep("CTC", temp_tab$met)] <- "Adults"
  temp_tab$Type[grep("AWC", temp_tab$met)] <- "Adults"
  temp_tab$Type=factor(temp_tab$Type)
  temp_tab$Type
}
```

## Recalculate everything or not?

If RECALC is set to TRUE, then the ICC tables will be re-generated and the simulation re-ran. Notice that you can only do this if you have access to underlying data. If you are not one of the paper's authors, please email us for access to reproduce this section. You do not need access to reproduce all the rest.

```{r}

RECALC=FALSE

if(RECALC) source("regenerate_data.R") # This code cannot be reproduced without access to the underlying datasets

if(RECALC) source("create-all-icc.R") 

if(RECALC) source("simulate-r-for-icc.R") #when ran on Aug 2 2023, it gave an error chol(): decomposition failed -- perhaps one of our newly added datasets (lyon, quechua) is too small? -- happened again on Aug 15, need to double check how this affects simulation; I suspect this is bc overlap bet function names across this & previous code
#aug 18, now it runs for ever...
```


## Simulation to better understand ICC

We were uncertain of how to interpret ICC's numeric values. It is described as "proportion of variance explained", but we do not know if it should be considered as a percentage (like R^2) or a correlation (like r). We therefore simulated data controlling the underlying r between paired datapoints to see how ICC recovered that underlying r.

We will inform the simulation by the data we have as follows:

- we'll have the same N of corpora, and of children in each corpus
- we'll have the same metrics for each (i.e., CVC, AWC, etc) -- and these metrics will have the same mean & SD for day 1 of recordings as in observed data

We'll depart from reality as follows:

- we will not consider the r across multiple days observed in the data, but instead generate data points to vary r from a small correlation (r=.1), a moderate one (r=.3), a larger one (r=.5). It is unlikely that test-retest correlations in infancy would be much greater than .5 (based on previous studies using test-retest in experimental tasks), but for completeness, we also use .7 and .9
- we will not consider child age, nor variable re-recording periods
- we will have a single pair of recordings (rather than variable N of re-recordings)

We use simstudy, a package created for such simulations, following the vignette https://cran.r-project.org/web/packages/simstudy/vignettes/correlated.html to create correlated data providing a correlation matrix


In the following plot, each point represents the ICC extracted from a mixed model applied to one metric, combining data from all corpora. It appears that ICC values reflect underlying r values, but underestimating r more the larger r is. 

```{r}
read.csv("../output/df.icc.simu.csv")->df.icc.mixed

plot(df.icc.mixed$icc_child_id~df.icc.mixed$myr,xlab="r used to simulate the data",ylab="Estimated ICC from simulated data")

```

## More information for benchmarking our results against previously reported reliability studies

First, we looked for measures of language development used with observational data that can be employed with children aged 0-3 years, and which are available at least in English. All of the instruments we found rely on reports from caregivers, who are basing their judgments on their cumulative experience with the child (e.g., the Child Observation Record Advantage, Schweinhart, McNair, & Larner, 1993; the Desired Results Developmental Profile, REF; the MacArthur-Bates Communicative Development Inventory, REF). Readers are likely most familiar with the MB-CDI,  Fenson et al., 1994 report a correlation of r=.95 in their sample of North American, monolingual infants. We did not find a systematic review or meta-analysis providing more such estimates. However, @frank2021 analyzed data archived in a CDI repository, concentrating on American English and Norwegian, where longitudinal data was available. They found that for both datasets, correlations within 2-4 months were above r=.8, with correlations at 16 months of distance (i.e., a form filled in at 8 and 24 months) at their lowest, r=.5. These correlations are very high when considering that the CDI tends to have ceiling and floor effects at these extreme ages. Another report looked at correlations when parents completed two versions of the form in two media (e.g., short form in paper and long form online, or vice versa) within a month. Here, the correlation was r=.8 for comprehension and r=.6 for production. It is worth bearing in mind that test-retest reliability in parental report measures does not depend only on the consistency in the individual infants' ranking for a given behavior, but also on the consistency of the adult in reporting it. Moreover, they are based on cumulative experience, rather than a one-shot observation, as in the case of long-form recordings. Therefore, they do not constitute an appropriate comparison point, and by and large we can be quite certain that they will yield higher reliability than metrics based on the children themselves.  For example, a meta-analysis of infant laboratory tasks, including test-retest data for sound discrimination, word recognition, and prosodic processing, found that the meta-analytic weighted average was not different from zero, suggesting that performance in these short laboratory tasks may not be captured in a stable way across testing days. Thus, parental report (or short lab studies) may not be the most appropriate comparisons for our own study.

Second, we did a bibliographic search for systematic reviews of test-retest reliability of standardized instruments to measure language development up to age three years that are available at least in English. Although reliability in these ones may also partially reflect consistency in the adults' reports, they are at least based on a one-shot observation of how the child behaves, rather than the adult's cumulative experience with the child. Note that some promising tests are currently in development or have only recently been released, including the NIH Baby Toolbox (REF) and GSED (REF), and thus we cannot include them in the present summary.

The Ages and Stages Questionnaire (ASQ) is a screening tool to measure infants and children's development based on observation of age-specific behaviors: For instance, at 6 months, in the domain of communication, one of the items asks whether the child smiles. The ASQ's reliability has been the object of a systematic review of independent evidence (Velikonja et al., 2017). Across 10 articles with data from children in the USA, Canada, China, Brazil, India, Chile, the Netherlands, Korea, and Turkey, only three articles reported test-retest correlations, two in the USA (r=.84-1)  and one in Turkey (r=.67). However, the meta-analysis authors judged these three studies to be "poor" in quality. Moreover, the ASQ is a questionnaire that an observer fills in, but it can also be administered as a parental questionnaire, reducing its comparability with our purely observational method. 
For the other tests, reliability is mainly available from reports by the companies commercializing the test. The Goldman Fristoe Articulation Test – 3rd edition (GFTA-3) (REF) focuses on the ability to articulate certain sounds through picture-based elicitation and can be used from two to five years of age. Available in English and Spanish for a USA context, it has a reported reliability of r=.92 -- although we do not know whether it is this high for two-year-olds specifically.  The Preschool Language Scales – 5th edition can be used to measure both comprehension and production from birth to 7 years of age. According to Pearson's report, its test-retest reliability is .86 to .95, depending on the age bracket (0;0-2;11, 3;0-4;11, and 5;0-7;11, 0-7 years). The test has also been adapted to other languages, with good reported test-retest reliability (r=.93; Turkish, Sahli & Belgin, 2017). One issue we see with both of these reports is that children tested varied greatly in age, and the correlation seems to have been calculated based on the raw (rather than the normed) score. As a result, children's ranking may have been stable over test and retest mainly because they varied greatly in age. The Expressive Vocabulary Test – 2nd Edition (EVT-2) is a picture-based elicitation method that can be used with children from 2.5 years of age. The company Springer reports a test-retest reliability of r=.95 by age (REF).

Many other standardized tests exist for children over three years of age. Given that most children included in the present study were under three, these other tests do not constitute an ideal comparison point. Nonetheless, in the spirit of informativeness, it is useful to consider that a systematic review and meta-analysis has been done looking at all psychometric properties (internal consistency, reliability, measurement error, content and structural validity, convergent and discriminant validity) for standardized assessments targeted at children aged 4-12 years (REF). Out of 76 assessments found in the literature, only 15 could be evaluated for their psychometric properties, and 14 reported on reliability based on independent evidence (i.e., the researchers tested and retested children, rather than relying on the company's report of reliability). Among these, correlations for test-retest reliability averaged r=.67, with a range from .35 to .76. The authors concluded that psychometric quality was limited for all assessments, but based on the available evidence, PLS-5 (whose test-retest reliability was r=.69) was among those recommended for use.

Third, and perhaps most relevant, we looked for references that evaluated the psychometric properties of measures extracted from wearable data. We found no previous work attempting to do so on the basis of completely ecological, unconstrained data like ours. The closest references we could find reported on reliability and/or validity of measurements from wearable data collected in constrained situations, such as having 4.5 year old children wear interior sensors and asking them to complete four tests of balance (e.g., standing with their eyes closed; Liu et al., 2022). It is likely that consistency and test-retest reliability are higher in such cases than in data like ours, making it hard to compare. Nonetheless, to give an idea, a recent meta-analysis of wea?read.tablerable inertial sensors in healthy adults found correlations between these instruments and gold standards above r= .88 for one set of measures (based on means) but much lower for another (based on variability, max weighted mean effect r = .58). Regarding test-retest reliability, the meta-analysts report ICCs above .6 for all measures for which they could find multiple studies reporting them. However, those authors point out that the majority of the included studies were classified as low quality, according to a standardized quality assessment for that work. 


## Code to reproduce Table 2


```{r tab2}
mydat <- read.csv(paste0('../data_output/', 'aclew','_metrics.csv'))

child_per_corpus= aggregate(data = mydat, child_id ~ experiment, function(child_id) length(unique(child_id)))[,2]

rec_per_corpus = aggregate(data = mydat, session_id ~ experiment, function(session_id) length(unique(session_id)))[,2]

rec_per_child = setNames(aggregate(data = mydat, session_id ~ experiment*child_id, function(session_id) length(unique(session_id))), c('experiment', 'Chi', 'No_rec'))

min_rec_per_child = aggregate(data = rec_per_child, No_rec ~ experiment, min)[,2]
max_rec_per_child = aggregate(data = rec_per_child, No_rec ~ experiment, max)[,2]
rec_r_per_child = paste(min_rec_per_child,max_rec_per_child,sep="-")

dur_per_corpus = aggregate(data = mydat, duration_vtc ~ experiment, function(duration_vtc) round(mean(duration_vtc)/3.6e+6,1))[,2]

age_mean_per_corpus = aggregate(data = mydat, age ~ experiment, function(age) round(mean(age),1))[,2]

age_min_per_corpus = aggregate(data = mydat, age ~ experiment, function(age) min(age))[,2]

age_max_per_corpus = aggregate(data = mydat, age ~ experiment, function(age) max(age))[,2]

age_r_per_corpus = paste(age_min_per_corpus,age_max_per_corpus,sep="-")


experiments=c("bergelson", "cougar", "fausey-trio", "lucid","lyon", "quechua",  "warlaumont", "winnipeg")
locations=c("Northeast US", "Northwest US", "Western US", "Northwest England", "Central France", "Highlands Bolivia", "Western US", "Western Canada")

  corpus_description=cbind(experiments,child_per_corpus, rec_r_per_child, rec_per_corpus, dur_per_corpus, age_mean_per_corpus,age_r_per_corpus)


write.table(corpus_description, "../output/corpus_description.csv", sep='\t')


nkids=length(levels(factor(paste(mydat$experiment,mydat$child_id))))
nrecs=length(levels(factor(paste(mydat$experiment,mydat$session_id))))
```

## Code to reproduce Fig. 2

```{r icc-examples-fig2, echo=F,fig.width=4, fig.height=3,fig.cap="(A) scatterplot for one variable with relatively low ICCs versus (B) one with relatively higher ICCs (see Tables 1-2 for details)"}
# figure of bad ICC: lena     used to be: avg_voc_dur_chi, now is: peak_wc_adu_ph; good ICC: lena used to be: voc_och_ph, now is: voc_dur_och_ph
data_set="lena"
#mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv'))
mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
mydat <- mydat[is.element(mydat$experiment, corpora),]


# remove missing data points altogether
mydat <- mydat[!is.na(mydat$peak_wc_adu_ph) & !is.na(mydat$voc_dur_och_ph),]

#sample down to get 2 recs per child
mysample = NULL
for(thischild in levels(as.factor(mydat$child_id))){
  onechidat <- mydat[mydat$child_id == thischild,c("child_id","experiment","age","child_id","peak_wc_adu_ph","voc_dur_och_ph")]
  
  if(dim(onechidat)[1]>=2){
    selrows=sample(1:dim(onechidat)[1],2)
    mysample = rbind(mysample,
                     cbind(onechidat[selrows[1],],
                           onechidat[selrows[2],c("child_id","peak_wc_adu_ph","voc_dur_och_ph")]
                     )
    )
  } 
}
colnames(mysample) <-
  c(
    "child_id",
    "corpus",
    "age",
    "child_id1",
    "peak_wc_adu_ph1",
    "voc_dur_och_ph1",
    "child_id2",
    "peak_wc_adu_ph2",
    "voc_dur_och_ph2"
  )

mysample$corpus = factor(mysample$corpus)

mylimits=range(mysample[,c("peak_wc_adu_ph1","peak_wc_adu_ph2")])

bad <-
  ggplot(mysample, aes(peak_wc_adu_ph1, peak_wc_adu_ph2)) + 
  geom_point(aes(colour = factor(corpus))) +
  geom_smooth(method = 'lm', formula = y ~ x)  +
  labs(color = "Corpus")  +  
  theme(text = element_text(size = 16)) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  scale_x_continuous(name="Peak hourly AWC rec 1",limits=mylimits) +
  scale_y_continuous(name="Peak hourly AWC rec 2",limits=mylimits) +
  geom_abline(intercept = 0, slope = 1)

mylimits=range(mysample[,c("voc_dur_och_ph1","voc_dur_och_ph2")])
good <-
  ggplot(mysample, aes(voc_dur_och_ph1, voc_dur_och_ph2)) + 
  geom_point(aes(colour = factor(corpus))) +
  geom_smooth(method = 'lm', formula = y ~ x)  + 
  labs(color = "Corpus")  +  
  theme(text = element_text(size = 16)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  scale_x_continuous(name="Other ch voc dur rec 1",limits=mylimits) +
  scale_y_continuous(name="Other ch voc dur rec 2",limits=mylimits) +
  geom_abline(intercept = 0, slope = 1)


ggarrange(bad, good,
          ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=0,
          font.label = list(size = 20))  + labs(color= "Corpus")  +  theme(text = element_text(size = 20))


```


## Code to reproduce other text in the section Results, subsection Overall reliability

```{r readin}
df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")

#df.icc.mixed[df.icc.mixed$formula=="no_exp",c("data_set","metric")]


```

Out of the `r dim(df.icc.mixed)[1]` fitted models, `r table(df.icc.mixed$formula)["full"]` could be fit with the full model, yielding a measure of Corpus ICC. Of these, `r round(sum(df.icc.mixed$icc_corpus<.2,na.rm=T)/sum(!is.na(df.icc.mixed$icc_corpus))*100)`% had Corpus ICCs smaller than .2, consistent with the idea that LENA and ACLEW metrics are robust to corpus differences. For the `r table(df.icc.mixed$formula)["no_exp"]` for which the full model was singular, we fit the data with the No Corpus model, and none was singular then, allowing us to have Child ICC for all `r dim(df.icc.mixed)[1]` metrics. 


## Code to reproduce Fig. 3

```{r icc-allexp-fig3, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when combining data from all corpora."}

df.icc.mixed$Type<-get_type(df.icc.mixed)


ggplot(df.icc.mixed, aes(y = icc_child_id, x = toupper(data_set))) +
  geom_violin(alpha = 0.5) +
  geom_quasirandom(aes(colour = Type,shape = Type)) +  
  labs( y = "ICC child ID",x="Pipeline") +  theme(text = element_text(size = 20)) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 

```


## Code to reproduce text under Figure 3


The majority of measures had ICCs between .3 and .5. `r sum(df.icc.mixed$icc_child_id > .5)` measures had higher ICCs, and surprisingly, `r sum(df.icc.mixed$icc_child_id[grep("och",df.icc.mixed$metric)] > .5)` of them corresponded to the "other child" category, known to have the worst accuracy according to previous analyses (Cristia et al., 2020). 

### Checking whether high ICC for other child measures are due to presence of other siblings

```{r explo-och-sibn}

#df.icc.mixed[df.icc.mixed$icc_child_id>.5,c("data_set","metric")]
#df.icc.mixed[df.icc.mixed$icc_child_id>.5,c("data_set","metric","icc_child_id")]

myaclewdat <- read.csv(paste0('../data_output/', "aclew",'_metrics_scaled.csv'))
myaclewdat <- myaclewdat[is.element(myaclewdat$experiment, corpora),]

read.csv("../input/aclew_md.csv")->x
x$labname[x$labname=="ROW"]<-"luc"
x$labname[x$labname=="SOD"]<-"win"
x$ch_id=paste(tolower(x$labname),as.character(x$child_level_id))
x$n_of_siblings<-x$number_older_sibs

x$ch_id[x$labname %in% c("BER")] = paste(tolower(x$labname[x$labname %in% c("BER")]),as.numeric(as.character(x$child_level_id[x$labname %in% c("BER")])))
x=x[!duplicated(x$ch_id),]

myaclewdat$lab=substr(myaclewdat$experiment,1,3)
myaclewdat$ch_id=paste(myaclewdat$lab,gsub(".* ","",myaclewdat$child_id))
myaclewdat$ch_id[myaclewdat$experiment=="warlaumont"]=gsub(" 0"," ",myaclewdat$ch_id[myaclewdat$experiment=="warlaumont"])
myaclewdat$ch_id[myaclewdat$experiment=="winnipeg"]=gsub(" C"," CW",myaclewdat$ch_id[myaclewdat$experiment=="winnipeg"])

#sort(factor(myaclewdat$ch_id[myaclewdat$experiment=="winnipeg"]))
#sort(x$ch_id[x$lab=="win"])
#sum(myaclewdat$ch_id %in% x$ch_id)
#sum(x$ch_id %in% myaclewdat$ch_id)

metadata=x[,c("ch_id","n_of_siblings")]
read.csv("../input/quechua_md.csv")->x 
x$ch_id=paste("que",x$child_id)
metadata=rbind(metadata,x[,c("ch_id","n_of_siblings")])
mydat2=merge(myaclewdat,metadata,all.x=T,by="ch_id")
#table(mydat2$n_of_siblings,mydat2$experiment)
has_n_of_sib=table(mydat2$experiment,!is.na(mydat2$n_of_siblings))
corp_w_sib=levels(factor(mydat2$experiment[!is.na(mydat2$n_of_siblings)]))
corp_w_sib_clean=corp_w_sib[1]
for(i in 2:length(corp_w_sib)) corp_w_sib_clean=paste(corp_w_sib_clean,corp_w_sib[i],sep=", ")
  
model<-lmer(voc_dur_och_ph~ age_s +n_of_siblings  + (1|experiment/child_id),data=mydat2)

#is sing
model<-lmer(voc_dur_och_ph~ age_s +n_of_siblings  + (1|child_id),data=mydat2)
icc.result.split<- t(as.data.frame(icc(model, by_group=TRUE))$ICC)
#names(icc.result.split)=c("icc_child_id", "icc_corpus")
names(icc.result.split)=c("icc_child_id")

```


We reasoned this may be because children in our corpora vary in terms of the number of siblings they have, and that siblings' presence may be stable across recordings. To address this possibility, we fit the full model again to predict number of vocalizations from other children, but this time including sibling number as a fixed effect $lmer(metric~ age + sibling_number + (1|corpus/child))$, so that individual variation that was actually due to sibling number was captured by that fixed effect instead of the random effect for child. We had sibling number data for `r sum(has_n_of_sib[,"TRUE"])` recordings from `r length(levels(factor(mydat2$child_id[!is.na(mydat2$n_of_siblings)])))` in `r length(levels(factor(mydat2$experiment[!is.na(mydat2$n_of_siblings)])))` corpora (`r corp_w_sib_clean`). We fit this model for the metric with the highest Child ICC, ACLEW's total vocalization duration by other children. Results indicated the full model was singular, so we fitted a No Corpus model to be able to extract a Child ICC. In fact, there was no difference in Child ICC in our original analysis (`r round(df.icc.mixed[df.icc.mixed$metric=="voc_dur_och_ph" & df.icc.mixed$data_set=="aclew","icc_child_id"],2)`) versus this re-analysis including the number of siblings (`r round(icc.result.split["icc_child_id"],2)`).


### Code to reproduce text before Table 3

```{r reg model icc}
#I moved this chunk here -- check that nothing is broken by it

lr_icc_chi <- lm(icc_child_id ~ Type * data_set, data=df.icc.mixed) 
#binomial could be used, but diagnostic plots look pretty good (other than some outliers)
reg_sum=summary(lr_icc_chi)

reg_anova=Anova(lr_icc_chi)

```


Going back to our overarching analyses, we explored how similar Child ICCs were across different talker types and pipelines. We fit a linear model with the formula $lm(icc_child_id ~ type * pipeline)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children; and pipeline LENA or ACLEW. We found an adjusted R-squared of `r round(reg_sum$adj.r.squared*100)`%, suggesting much of the variance across Child ICCs was explained by this model. A Type 3 ANOVA on this model revealed only type was a signficant (F(`r reg_anova["Type","Df"]`)=`r round(reg_anova["Type","F value"],1)`, p<.001), whereas neither pipeline nor the interaction between type and pipeline were significant.


## Code to reproduce Table 3


```{r tab3, results="as.is"}
key_metrics = c("wc_adu_ph", "lena_CVC", "lena_CTC",  "voc_fem_ph",  "voc_chi_ph")
x <- merge(df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="lena",c("metric","icc_child_id")] ,
           df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="aclew",c("metric","icc_child_id")],
           by='metric', all=TRUE)
colnames(x) <- c("metric", "LENA ICC", "ACLEW ICC")
rownames(x)<-x$metric
x["lena_CTC","ACLEW ICC"]<-df.icc.mixed[df.icc.mixed$metric=="simple_CTC","icc_child_id"]
x["lena_CVC","ACLEW ICC"]<-df.icc.mixed[df.icc.mixed$metric=="can_voc_chi_ph","icc_child_id"]
x[,2:3]=round(x[,2:3],2)
x=x[key_metrics,]

kable(x,row.names = F,digits=2,caption="Most commonly used metrics.")
#x
```

## Code to reproduce "Exploratory analyses: Correlations based on paired analyses of recordings done within two months"


```{r correlation analysis for close recs, warning = F, echo=FALSE}
#to compare with Gilkerson, we just need to focus on AWC, CVC, CTC (in LENA, but for comparison purposes, we also include aclew)


mydat_lena <- read.csv(paste0('../data_output/', "lena",'_metrics_scaled.csv'))
#table(mydat_lena$session_id)[table(mydat_lena$session_id)>1]
mydat_lena=mydat_lena[order(mydat_lena$experiment,mydat_lena$child_id,mydat_lena$age),]
# dim(mydat_lena) #1253 obs 
key=mydat_lena[,c("experiment","child_id","age")]

dist_contig_lena <- define_contiguous(mydat_lena)
# dim(dist_contig_lena) #684
# table(dist_contig_lena$session_id)[table(dist_contig_lena$session_id)>1] #0=no repeats
mydat_lena = merge(mydat_lena,dist_contig_lena[,c("session_id","next_session")],by="session_id", all.x=T) #note that we need to do all.x=T, bc we need to keep others that are next session


#table(dist_contig_lena$experiment) #this is the number of eligible recordings per corpus

#table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)])#this is the number of eligible children per corpus
#sum(table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)])) #and overall
# maximally, we'll have 148 rows in the samples below

#given those two numbers, with 5 draws we'd cover many combinations in winni, lucid, & trio; but we'll do 10 because there are a lot of recs in cougar & bergelson. Later increased to 20 bc there was a lot of variability still in the average r

mydat_aclew <- read.csv(paste0('../data_output/', "aclew",'_metrics_scaled.csv')) #1254
#mydat_aclew=mydat_aclew[order(mydat_aclew$experiment,mydat_aclew$child_id,mydat_aclew$age),]
#dim(mydat_aclew)
# dim(dist_contig_aclew) #686 -- for some reason, we have 2 more eligible recs here... not sure why
#length(dist_contig_aclew$session_id[!(dist_contig_aclew$session_id %in% dist_contig_lena$session_id)])  # in fact, we have lots of sessions not in common!
#length(dist_contig_lena$session_id[!(dist_contig_lena$session_id %in% dist_contig_aclew$session_id)])
# they are present in aclew but not in lena
# NOTE: I have "winnipeg C175 C175_20151201" "winnipeg C175 C175_20160301" for lena but not aclew; and i have "fausey-trio T066 T066/T066_000700"   "quechua 1096 20190630_190025_009107" "quechua 1096 20190702_193551_008712" 

#one thing that drove me crazy was that, probably because of the small differences in inclusion (2 recs in aclew & lena respectively), I was ending up with different lists of pairings across aclew & lena. So to simplify, I'll impose the same pairing across both, which involves losing a couple of additional recs in lena
xxx=mydat_aclew[mydat_aclew$session_id %in% mydat_lena$session_id,]
rownames(xxx)<-xxx$session_id
xxx=xxx[mydat_lena$session_id,]

dist_contig_aclew <- define_contiguous(xxx) 
mydat_aclew = merge(mydat_aclew,dist_contig_aclew[,c("session_id","next_session")],by="session_id", all.x=T)

# dist_contig_lena=dist_contig_lena[((dist_contig_lena$session_id %in% dist_contig_aclew$session_id) & (dist_contig_lena$next_session %in% dist_contig_aclew$next_session)),]
# dist_contig_lena=dist_contig_lena[order(dist_contig_lena$session_id),]
# 
# dist_contig_aclew=dist_contig_aclew[((dist_contig_aclew$session_id %in% dist_contig_lena$session_id) & (dist_contig_aclew$next_session %in% dist_contig_aclew$next_session)),]
# dist_contig_aclew=dist_contig_aclew[order(dist_contig_aclew$session_id),]


nsamples=20

all_rs=data.frame(matrix(NA,nrow=dim(df.icc.mixed)[1],ncol=(nsamples)))
colnames(all_rs[,1:nsamples])<-paste0("sample",1:nsamples)
all_rs$data_set<-df.icc.mixed$data_set
all_rs$metric<-df.icc.mixed$metric

for(i in 1:nsamples){#i=1
  
    #for each child, sample 2 contiguous recordings that are less than 2 months away
    #step 1: sample one session per child among the list of sessions that are close by
      #we use the lena one because there is no difference across lena & aclew in terms of which kids have which sessions, since in this paper all recs have both lena and aclew, so it's just a question of using one of them
    close_sessions <- dist_contig_lena  %>%
    group_by(child_id)%>%
    slice_sample(n = 1) 
  #table(close_sessions$experiment) #these have to be similar
    #dim(close_sessions)
  
  for(j in 1:dim(df.icc.mixed)[1]){# j=1
    data_set=df.icc.mixed[j,"data_set"]
    metric=df.icc.mixed[j,"metric"]
    
    if(data_set=="aclew") dat_for_cor<-mydat_aclew else  dat_for_cor<-mydat_lena

    #step 2: get data from those sampled sessions as rec1
    rec1 = subset(dat_for_cor,session_id %in% close_sessions$session_id)  
    #step 3: get the next session
    rec2 = subset(dat_for_cor,session_id %in% close_sessions$next_session)   

     all_rs[all_rs$data_set==data_set & all_rs$metric==metric,i]<-
      cor.test(rec1[,metric],rec2[,metric])$estimate
    
  }
}


#summary(all_rs)
rval_tab=cbind(apply(all_rs[,1:20],1,mean),apply(all_rs[,1:20],1,sd),all_rs[,21:22])
colnames(rval_tab) <-c("m","sd","p","met") 
rval_tab[,1:2]=round(rval_tab[,1:2],2)

rval_tab$fin<-paste0(rval_tab$m," ","[",rval_tab$m-2*rval_tab$sd,",",rval_tab$m+2*rval_tab$sd,"]")

mytab=rbind(cbind(rval_tab[rval_tab$p=="aclew" & rval_tab$met== "wc_adu_ph","fin"],rval_tab[rval_tab$p=="lena" & rval_tab$met== "wc_adu_ph","fin"]),
            cbind(rval_tab[rval_tab$met== "can_voc_chi_ph","fin"],rval_tab[rval_tab$met== "lena_CVC","fin"]),
            cbind(rval_tab[rval_tab$met== "simple_CTC","fin"],rval_tab[rval_tab$met== "lena_CTC","fin"]),
            cbind(rval_tab[rval_tab$p=="aclew" & rval_tab$met== "voc_chi_ph","fin"],rval_tab[rval_tab$p=="lena" & rval_tab$met== "voc_chi_ph","fin"])
       )
mytab=gsub("0.",".",mytab,fixed=T)
colnames(mytab)<-c("aclew","lena")
rownames(mytab)<-c("AWC","CVC","CTC","Chi vocs")
print(mytab)

```
Out of our `r length(levels(factor(mydat_aclew$experiment)))` corpora and `r length(levels(factor(mydat_aclew$child_id)))` children, `r length(levels(factor(dist_contig_lena$child_id)))` children (belonging to `r length(levels(factor(gsub(" .*","",dist_contig_lena$child_id))))`  corpora) could be included in this analysis, as some children did not have recordings less than two months apart (in particular, no child from the Warlaumont corpus did). 

## Code to reproduce Fig. 4

```{r r-fig4, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of correlation coefficients."}
#bug here probably inherited from above

rval_tab$Type<-get_type(rval_tab)


ggplot(rval_tab, aes(y = m, x = toupper(p))) +
  geom_violin(alpha = 0.5) +
  geom_quasirandom(aes(colour = Type,shape = Type)) +  
  theme() +labs( y = "r",x="Pipeline")


```

## Code to reproduce results of regression on correlation values

```{r reg model cor}


lr_cor <- lm(m ~ Type * p, data=rval_tab) 
#plot(lr_cor)
#binomial could be used, but diagnostic plots look great

reg_sum_cor=summary(lr_cor)

reg_anova_cor=Anova(lr_icc_chi)

cor_t=t.test(rval_tab$m ~ rval_tab$p)

```

To see whether correlations in this analysis differed by talker types and pipelines, we fit a linear model with the formula $lm(cor ~ type * pipeline)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children; and pipeline LENA or ACLEW. We found an adjusted R-squared of `r round(reg_sum_cor$adj.r.squared*100)`%, suggesting this model did not explain a great deal of variance in correlation coefficients. A Type 3 ANOVA on this model revealed a significant effect of pipeline (F = `r round(reg_anova_cor["data_set","F value"],2)`, p = `r round(reg_anova_cor["data_set","Pr(>F)"],2)`), due to higher correlations for ACLEW (m = `r round(cor_t$estimate["mean in group aclew"],2)`) than for LENA metrics (m = `r round(cor_t$estimate["mean in group lena"],2)`). See below for fuller results.

```{r print out anova results rec on cor}

kable(round(reg_anova_cor,2))
```


## Code to reproduce text and figures in "Exploratory analyses: Reliability within corpus"

```{r read in icc by corpus}
df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
df.icc.corpus$Type <- get_type(df.icc.corpus)

```

Figure 5 addresses this question, showing the distribution of ICC across our `r dim(df.icc.mixed)[1]` metrics in each of the `r length(levels(factor(df.icc.corpus$corpus)))` included corpora.  Out of `r dim(df.icc.corpus)[1]` fitted models (`r dim(df.icc.mixed)[1]` metrics times `r length(levels(factor(df.icc.corpus$corpus)))` corpora), `r sum(df.icc.corpus$formula=="no_chi_effect")` were singular when including a random intercept per child, and therefore they could not be included in these analyses at all. (Including a random intercept per corpus is not relevant here, since only data from one corpus is included in each model fit.)


```{r icc-bycor-fig5, echo=F,fig.width=4, fig.height=10,fig.cap="Child ICC by metric type and pipeline, when considering each corpus separately."}

ggplot(df.icc.corpus, aes(y = icc_child_id, x = toupper(data_set))) +
  geom_violin(alpha = 0.5) +
  geom_quasirandom(aes(colour = Type,shape = Type)) +  
  theme(legend.position = "top", axis.title.y=element_blank() ,axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +labs( y = "ICC child ID",x="Pipeline") +   
  facet_grid(cols=vars(corpus)) 

```


```{r icc-bycor-fig6, echo=F,fig.width=4, fig.height=10,fig.cap="Correlations in Child ICC across corpora. Each point indicates the correlation in Child ICC for the corpus named in the x-axis with every other corpus."}


r_X_corpus = NULL

for(corpusA in levels(factor(df.icc.corpus$corpus))){
  for(corpusB in levels(factor(df.icc.corpus$corpus))) if(corpusA!=corpusB){
    #check:
    # print(sum(df.icc.corpus[df.icc.corpus$corpus==corpusA,c("metric","data_set")] == df.icc.corpus[df.icc.corpus$corpus==corpusB,c("metric","data_set")]))
    #yes, the metric & data_set are matching across the two
    r_X_corpus = rbind(r_X_corpus, cbind(corpusA,corpusB,cor.test(df.icc.corpus$icc_child_id[df.icc.corpus$corpus==corpusA],df.icc.corpus$icc_child_id[df.icc.corpus$corpus==corpusB])$estimate))
  }
}
r_X_corpus=data.frame(r_X_corpus)
colnames(r_X_corpus)[3]<-c("cor")
r_X_corpus$cor=as.numeric(as.character(r_X_corpus$cor))

#summary(r_X_corpus$cor) #mean correlation across corpora is zero!

ggplot(r_X_corpus, aes(y = cor, x = corpusA)) +
  geom_violin(alpha = 0.5) +
  geom_quasirandom() +  
  theme() +labs( y = "r",x="Corpus")
```


```{r reg model corpusm}


cor_icc <- lm(icc_child_id ~ Type * data_set * corpus, data=df.icc.corpus) 
#plot(cor_icc)
#binomial could be used,  diagnostic plots look okay-ish, inequality of variance with high ICCs being less numerous &  less spread out

reg_sum_cor_icc=summary(cor_icc)

reg_anova_cor_icc=Anova(cor_icc)

```

The fact that we cannot infer reliability from one corpus based on another one was confirmed statistically: We checked whether Child ICC differed by talker types and pipelines across corpora by fitting a linear model with the formula $lm(Child_ICC ~ type * pipeline * corpus)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children;  pipeline LENA or ACLEW; and corpus the corpus ID. We found an adjusted R-squared of `r round(reg_sum_cor_icc$adj.r.squared*100)`%, suggesting this model explained nearly half of the variance in Child ICC. A Type 3 ANOVA on this model revealed several significant effects and interactions, including a three-way interaction of type, pipeline, and corpus; a two-way interaction of type and corpus; and a main effect of corpus. See below for more information.

```{r print out anova results rec on icc by corpus}
kable(round(reg_anova_cor_icc,2))

```


## Code to reproduce text and figures in "Exploratory analyses: Reliability across age groups"


```{r prepAge}
df.icc.age<-read.csv("../output/df.icc.age.csv")
age_levels=c("(0,6]" , "(6,12]",  "(12,18]" ,"(18,24]" ,"(24,30]", "(30,36]" )
#not present in data: , "(36,42]", "(42,48]", "(48,54]"

df.icc.age$age_bin<-factor(df.icc.age$age_bin,levels=age_levels)

df.icc.age$Type<-get_type(df.icc.age)
```

Out of `r dim(df.icc.age)[1]` fitted models (`r dim(df.icc.mixed)[1]` metrics times `r length(levels(factor(df.icc.age$age_bin)))` age bins), `r sum(df.icc.age$formula=="no_chi_effect")` were singular when including a random intercept per child, and therefore they could not be included in these analyses at all. In addition, `r sum(df.icc.age$formula=="no_exp")` were singular when including a random intercept per corpus. The remaining `r sum(df.icc.age$formula=="full")` could be analyzed with the full model.


```{r relBYage-fig7, echo=F,fig.width=6, fig.height=10,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when binning children's age."}

#this complicated section is just to add N of participants in each facet, we first estimate it:
facet_labels_chi=facet_labels_cor=NULL
for(thisage in levels(df.icc.age$age_bin)){#thisage="(0,6]"
  facet_labels_cor = c(facet_labels_cor,paste0("N cor=",min(df.icc.age$ncor[df.icc.age$age_bin==thisage],na.rm=T))) #checked: there is no variation across metrics in n of corpora included
    if(min(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T) !=max(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T)){
    facet_labels_chi = c(facet_labels_chi,paste0("N chi=",paste(range(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T),collapse="-"))) 
  } else {
    facet_labels_chi = c(facet_labels_chi,paste0("N chi=",min(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T))) 
  }
}

#and then we structure it so that it goes on the plot
f_labels<-data.frame(age_bin=levels(df.icc.age$age_bin),facet_labels_chi=facet_labels_chi,facet_labels_cor=facet_labels_cor)

f_labels$age_bin<-factor(f_labels$age_bin,levels=age_levels)

ggplot(df.icc.age, aes(y = icc_child_id, x = toupper(data_set))) +
  geom_violin(alpha = 0.5) +
  geom_quasirandom(aes(colour = Type,shape = Type)) +  
  theme(legend.position="none") +labs( y = "r",x="Pipeline") + facet_wrap(~age_bin, ncol = 3) +
  geom_text(x=1.5,y=max(df.icc.age$icc_child_id,na.rm=T),aes(label=facet_labels_chi),data=f_labels,size=3) +
  geom_text(x=1.5,y=max(df.icc.age$icc_child_id,na.rm=T)*.95,aes(label=facet_labels_cor),data=f_labels,size=3)

```


```{r icc-bycor-fig8, echo=F,fig.width=4, fig.height=4,fig.cap="Correlations in Child ICC across corpora. Each point indicates the correlation in Child ICC for the corpus named in the x-axis with every other corpus."}

r_X_age = NULL

for(ageA in levels(factor(df.icc.age$age_bin))){#ageA="(0,6]"
  for(ageB in levels(factor(df.icc.age$age_bin))) if(ageA!=ageB){#ageB="(6,12]"
    #check:
    # print(sum(df.icc.age[df.icc.age$age_bin==ageA,c("metric","data_set")] == df.icc.age[df.icc.age$age_bin==ageB,c("metric","data_set")]))
    #dim(df.icc.age[df.icc.age$age_bin==ageA,c("metric","data_set")])
    #yes, the metric & data_set are matching across the two
    r_X_age = rbind(r_X_age, cbind(ageA,ageB,cor.test(df.icc.age$icc_child_id[df.icc.age$age_bin==ageA],df.icc.age$icc_child_id[df.icc.age$age_bin==ageB])$estimate))
  }
}
r_X_age=data.frame(r_X_age)
colnames(r_X_age)[3]<-c("cor")
r_X_age$cor=as.numeric(as.character(r_X_age$cor))
r_X_age$ageA=factor(r_X_age$ageA,levels=age_levels)
#r_X_age$ageB=factor(r_X_age$ageB,levels=age_levels)

#summary(r_X_age$cor) #mean correlation across corpora is zero!

ggplot(r_X_age, aes(y = cor, x = ageA)) +
  geom_violin(alpha = 0.5) +
  geom_quasirandom() +  
  theme() +labs( y = "r",x="Age")
```


```{r reg model age}


age_icc <- lm(icc_child_id ~ Type * data_set * age_bin, data=df.icc.age) 
#plot(age_icc)
#binomial could be used,  diagnostic plots look good

reg_sum_age_icc=summary(age_icc)

reg_anova_age_icc=Anova(age_icc)

```

As we did in the previous section for corpus, we checked whether Child ICC differed by talker types and pipelines across age bins by fitting a linear model with the formula $lm(Child_ICC ~ type * pipeline * age_bin)$. We found an adjusted R-squared of `r round(reg_sum_age_icc$adj.r.squared*100)`%, suggesting this model explained about a third of the variance in Child ICC. However, a Type 3 ANOVA on this model revealed only an interaction of type and age bin, as well as a main effect of age bin, suggesting less complex effects than in the case of corpus. See below for more information.


```{r print out anova results rec on icc by age}
kable(round(reg_anova_age_icc,2))

```


## Save information about packages used

```{r}
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")
```