LAAC-LSCP
/
RELIVAL


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697
							---
title: Establishing the reliability and validity of measures extracted from long-form
  recordings
output:
  pdf_document:
    toc: yes
    toc_depth: 3
  html_document:
    toc: yes
    toc_depth: '3'
    df_print: paged
---

```{r setup, include=FALSE, eval=TRUE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)

set.seed(185729481)

library("lme4")
library("performance") # ICC
library("ggplot2")
library("ggthemes")
library("ggpubr")
library("kableExtra")
library("psych")
library("dplyr")
library("tidyr")
library("stringr")

cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


# List of corpora we are interested in
corpora <- c('bergelson', 'lucid', 'winnipeg', 'warlaumont','cougar','fausey-trio') 

# Columns that should not be scaled or taken into account as metrics
no.scale.columns <- c('experiment', 'session_id', 'child_id','child_id_unique','age_s', 'nobs', 'nchi',
                      'date_iso', 'child_dob', 'missing_audio',"age_bin","duration","usession_id",
                      "normative","age","duration_alice", "duration_vcm" ,  "duration_vtc","duration_its" )

data_sets = c('aclew', 'lena')

check_small_var<-function(x,y,i) round(x[i],3)==round(y[i],3) & round(y[i],3) == 0

fit_child_model<-function(dataframe, metric){
    # Fit formula where experiment is removed
    formula <- as.formula(paste0(metric, "~ age_s + (1|child_id)"))
    model <- lmer(formula, data=dataframe)
    
    return (model)
}

extract_chi_variables<-function(model){
    icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
    icc.result.split <- c(as.data.frame(icc(model, by_group=TRUE))$ICC, NA)
    
    r2.nakagawa <- r2_nakagawa(model)
    
    ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
    ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
    # chi, NA, residual
    ranefs_vars <-c(ranefs_vars[1],NA,ranefs_vars[2])
    ranefs_stdv <-c(ranefs_stdv[1],NA,ranefs_stdv[2])
    # chi, NA
    ns<-c(unlist(summary(model)$n),NA)
    return (c(coefficients(summary(model))["age_s",],
                icc.result.mixed,
                icc.result.split,
                ranefs_vars,
                ranefs_stdv,
                r2.nakagawa,
                nobs(model),ns))
}

extract_full_variables<-function(model){
    icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
    icc.result.split <- t(as.data.frame(icc(model, by_group=TRUE))$ICC)
    r2.nakagawa <- r2_nakagawa(model)
    
    ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
    ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
      
    ns<-t(data.frame(summary(model)$n))
    
    return (c(  coefficients(summary(model))["age_s",],
                icc.result.mixed,
                icc.result.split,
                ranefs_vars,
                ranefs_stdv,
                r2.nakagawa,
                nobs(model),ns))
}

new_fit_models<-function(dataframe, data_set, metric, age, fit_full = TRUE){
  #dataframe=mydat ; age = NA ; full = TRUE
  iqr = quantile(dataframe[,metric],.75,na.rm=T)-quantile(dataframe[,metric],.25,na.rm=T)
  
  if(fit_full){
    # Fit full model
    formula <- as.formula(paste0(metric, "~ age_s + (1|experiment/child_id)"))
    model <- lmer(formula, data=dataframe)
  }
  
  if(fit_full && !isSingular(model)) # Fitted full model short circuit and
  {
    form="full"
    
    sw=shapiro.test(resid(model))$p
    mod_variables <- extract_full_variables(model)
    
    # Build line
  } else {
    model <- fit_child_model(dataframe, metric)
    if(!isSingular(model)){
      
      form = "no_exp"
      sw=shapiro.test(resid(model))$p
      mod_variables <- extract_chi_variables(model)
      
    } else {
      form='no_chi_effect'
      sw = NA
      mod_variables = c(
        NA,NA,NA, # c(coefficients(summary(model))["age_s",]
        NA,NA,    # icc.result.mixed
        NA,NA,    # icc.result.split
        NA,NA,NA, # ranefs_vars
        NA,NA,NA, # raners_std
        NA,NA,    # r2
        NA,NA,NA) # nobs (child, corpus), nobs
    }
  }
  
  icc.row = c(data_set, age, metric, iqr, mod_variables, form, sw)
  return (icc.row)
}
```

## Recalculate everything or not?

If RECALC is set to TRUE, then the ICC tables will be re-generated.

```{r}

RECALC=FALSE

```


## Generate ICC tables


```{r create-all-icc, eval=RECALC}
df.icc.mixed.cols = c("data_set","age_bin", "metric", "iqr",
                      "age_b","age_se","age_t", # beta, standard error, T
                      "icc_adjusted", "icc_conditional", 
                      "icc_child_id", "icc_corpus",
                      "child_id_var","corpus_var","residual_var",
                      "child_id_sd","corpus_sd","residual_sd","r2_cond", "r2_marg",
                      "nobs","nchi", "ncor",
                      "formula","sw")

df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)),
                          stringsAsFactors = FALSE)

for (data_set in data_sets){   # data_set = "aclew"
  # Load data
  mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
  metrics <- colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
  for(metric in metrics) 
  {  # metric = "voc_chi_ph"
    icc.row <- new_fit_models(mydat, data_set, metric, NA, TRUE)
    df.icc.mixed[nrow(df.icc.mixed) + 1,] <- icc.row
  } 
}

write.csv(df.icc.mixed,"../output/df.icc.mixed.csv",row.names=F)


# repeat for the version within each corpus
df.icc.corpus.cols = c("data_set","corpus","metric", "iqr",
                       "age_b","age_se","age_t",
                       "icc_adjusted", "icc_conditional",
                       "icc_child_id", "icc_corpus",
                        "child_id_var","corpus_var","residual_var",
                        "child_id_sd","corpus_sd","residual_sd","r2_cond", "r2_marg",
                        "nobs","nchi", "ncor",
                        "formula","sw")
df.icc.corpus = data.frame(matrix(ncol=length(df.icc.corpus.cols),nrow=0, dimnames=list(NULL, df.icc.corpus.cols)),
                           stringsAsFactors=FALSE)


for (data_set in data_sets){   # data_set = "aclew"
    # Load data 
    mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
    for(corpus in corpora){
      mycordat <- mydat[mydat$experiment == corpus, ]
      metrics <- colnames(mycordat)[!is.element(colnames(mycordat), no.scale.columns)]
      for(metric in metrics) 
      {  # metric = "avg_voc_dur_mal"
        icc.row <- new_fit_models(mycordat, data_set, metric, corpus, FALSE)
        df.icc.corpus[nrow(df.icc.corpus) + 1,] <- icc.row
      } 
  }
}


write.csv(df.icc.corpus,"../output/df.icc.corpus.csv",row.names=F)
```

```{r}
mydat <- read.csv(paste0('../data_output/', 'aclew','_metrics.csv'))

child_per_corpus = setNames(aggregate(data = mydat, child_id ~ experiment, function(child_id) length(unique(child_id))), c('experiment', 'No_Children'))
rec_per_corpus = setNames(aggregate(data = mydat, session_id ~ experiment, function(session_id) length(unique(session_id))), c('experiment', 'No_Rec'))
dur_per_corpus = setNames(aggregate(data = mydat, duration_vtc ~ experiment, function(duration_vtc) sum(duration_vtc)/3.6e+6), c('experiment', 'Duration_h'))
age_mean_per_corpus = setNames(aggregate(data = mydat, age ~ experiment, function(age) mean(age)), c('experiment', 'Mean_Age'))
age_min_per_corpus = setNames(aggregate(data = mydat, age ~ experiment, function(age) min(age)), c('experiment', 'Min_Age'))
age_max_per_corpus = setNames(aggregate(data = mydat, age ~ experiment, function(age) max(age)), c('experiment', 'Max_Age'))

corp_code = data.frame(
  experiment=c("bergelson", "cougar", "fausey-trio", "lucid", "warlaumont", "winnipeg"),
  code=c("BER", "COU", "TRI", "L05", "WAR", "MCD"),
  location=c("Northeast US", "Northwest US", "Western US", "Northwest England", "Western US", "Western Canada")
)

corp_desc_list = list(corp_code, child_per_corpus, rec_per_corpus, dur_per_corpus, age_mean_per_corpus, age_min_per_corpus, age_max_per_corpus)

corpus_description <- Reduce(function(x, y) merge(x, y, all=TRUE), corp_desc_list)
corpus_description <- transform(corpus_description, Age_Range=paste(Min_Age, Max_Age, sep="-"))
corpus_description <- subset(corpus_description, select = -c(Min_Age, Max_Age))
write.csv(corpus_description, "../output/corpus_description.csv", sep='\t')
nkids=length(levels(factor(paste(mydat$experiment,mydat$child_id))))
```


```{r icc-age, eval=RECALC}

#We do this one separately because we want to standardize metrics within each age group

# repeat within age group bins
df.icc.age.cols = c("data_set","age_bin","metric", "iqr",
                    "age_b","age_se","age_t", # beta, standard error, T
                    "icc_adjusted", "icc_conditional", 
                    "icc_child_id", "icc_corpus",
                    "child_id_var","corpus_var","residual_var",
                    "child_id_sd","corpus_sd","residual_sd",
                    "nobs", "nchi","ncor",
                    "formula","sw")


df.icc.age = data.frame(matrix(ncol=length(df.icc.age.cols),nrow=0, dimnames=list(NULL, df.icc.age.cols)),
                        stringsAsFactors=FALSE)


for (data_set in data_sets){   # data_set = "aclew"
  # Load data and calculate age cuts
  mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv')) # /!\ Do not use scaled version -> we'll scale by age later
  mydat$age_bin <- cut(mydat$age,c(0:6*6))
  
  metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
  for(thisage in levels(mydat$age_bin))
  {   #thisage= "(0,6]"
    thiscordata <- mydat[mydat$age_bin == thisage,]
    for(metric in metrics) 
    {  # metric = "avg_voc_dur_mal"
      
      pre_scaled_metric <- (thiscordata[, metric] - mean(thiscordata[, metric], na.rm=T))/sd(thiscordata[, metric], na.rm=T)
      thiscordata[abs(pre_scaled_metric)>2.5 & !is.na(pre_scaled_metric), metric] <- NA
      thiscordata[, metric] <- (thiscordata[, metric] - mean(thiscordata[, metric], na.rm=T))/sd(thiscordata[, metric], na.rm=T)
      if(dim(thiscordata)[1] > 30 & length(levels(factor(thiscordata$experiment))) > 1)
      {
        icc.row <- new_fit_models(thiscordata, data_set, metric, thisage, TRUE)
        
      }else{
        icc.row <- c(data_set,thisage,metric,iqr,
                     NA,NA,NA,
                     NA,NA,
                     NA,NA,
                     NA,NA,NA,
                     NA,NA,NA,
                     NA,NA,NA,
                     "not_enough_data",NA)
      }
      df.icc.age[nrow(df.icc.age) + 1,] <- icc.row
    } 
  }
}
write.csv(df.icc.age,"../output/df.icc.age.csv",row.names=F)
```


## Describe datasets

```{r}
df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")

```

We are looking here at `r length(corpora)` corpora, `r nkids` children, `r max(df.icc.mixed$nobs[df.icc.mixed$data_set=="lena"])` recordings, `r length(levels(factor(df.icc.mixed$metric)))` many metrics.
<!-- The number of children comes from nkids, in the first chunk that is not evaluated. -->


## Reliability analyses combining all corpora

```{r icc-allexp, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when combining data from all corpora."}
df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")

icc_exp <- ggplot(df.icc.mixed, aes(x = icc_corpus, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
  geom_jitter( aes(x = icc_corpus,y=0,colour=toupper(data_set)))+ scale_fill_colorblind() +scale_colour_manual(values=cbPalette)  +  theme(text = element_text(size = 20))  + ylim(-0.5,11.25) + xlim(0,1) + labs(fill='Pipeline', color="Pipeline") 

icc_chi <- ggplot(df.icc.mixed, aes(x = icc_child_id, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC child ID")+
  geom_jitter(aes(x = icc_child_id,y=0,colour=toupper(data_set)))+ scale_fill_colorblind()+scale_colour_manual(values=cbPalette) +  theme(text = element_text(size = 20))   + ylim(-0.5,11.25) + xlim(0,1) + labs(fill='Pipeline', color="Pipeline") 


ggarrange(icc_exp, icc_chi,
          labels = c("A", "B"),
          ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=-0.1,
          font.label = list(size = 20))  +  theme(text = element_text(size = 20)) 


```


```{r worst, results="as.is"}

x <- head(df.icc.mixed[order(df.icc.mixed$icc_child_id),c("data_set","metric","icc_child_id","icc_corpus")]) #worst

kable(x,row.names = F,digits=2,caption="Measures with the lowest ICC attributed to children.")

```

```{r best, results="as.is"}

x <- tail(df.icc.mixed[order(df.icc.mixed$icc_child_id),c("data_set","metric","icc_child_id","icc_corpus")]) #best

kable(x,row.names = F,digits=2,caption="Measures with the highest ICC attributed to children.")


```

```{r sel, results="as.is"}
key_metrics = c("lena_CVC", "lena_CTC", "voc_dur_chi_ph", "voc_chi_ph", "voc_fem_ph", "voc_mal_ph", "wc_adu_ph")
x <- merge(df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="lena",c("metric","icc_child_id")] ,
           df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="aclew",c("metric","icc_child_id")],
           by='metric', all=TRUE)
colnames(x) <- c("metric", "LENA ICC", "ACLEW ICC")
kable(x,row.names = F,digits=2,caption="Most commonly used metrics.")


```

```{r icc-examples-fig2, echo=F,fig.width=4, fig.height=3,fig.cap="(A) scatterplot for one variable with relatively low ICCs versus (B) one with relatively higher ICCs (see Tables 1-2 for details)"}
# figure of bad ICC: lena     avg_voc_dur_chi; good ICC: lena voc_och_ph
data_set="lena"
mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv'))
mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
mydat <- mydat[is.element(mydat$experiment, corpora),]


# remove those data points altogether
mydat <- mydat[!is.na(mydat$avg_voc_dur_chi) & !is.na(mydat$voc_och_ph),]

#make sure cols work as we want them to
mydat$child_id <- paste(mydat$experiment,mydat$child_id)
mydat$unique <- paste(mydat$experiment,mydat$child_id,mydat$session_id)

#sample down to get 2 recs per child
mysample = NULL
for(thischild in levels(as.factor(mydat$child_id))){
  onechidat <- mydat[mydat$child_id == thischild,c("child_id","experiment","age","unique","avg_voc_dur_chi","voc_och_ph")]
  
  if(dim(onechidat)[1]>=2){
    selrows=sample(1:dim(onechidat)[1],2)
    mysample = rbind(mysample,
                     cbind(onechidat[selrows[1],],
                           onechidat[selrows[2],c("unique","avg_voc_dur_chi","voc_och_ph")]
                     )
    )
  } 
}
colnames(mysample) <-
  c(
    "child_id",
    "corpus",
    "age",
    "unique1",
    "avg_voc_dur_chi1",
    "voc_och_ph1",
    "unique2",
    "avg_voc_dur_chi2",
    "voc_och_ph2"
  )

mysample$corpus = factor(mysample$corpus)

mylimits=range(mysample[,c("avg_voc_dur_chi1","avg_voc_dur_chi2")])

bad <-
  ggplot(mysample, aes(avg_voc_dur_chi1, avg_voc_dur_chi2)) + 
  geom_point(aes(colour = factor(corpus))) +
  geom_smooth(method = 'lm', formula = y ~ x)  +
  labs(color = "Corpus")  +  
  theme(text = element_text(size = 20)) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  scale_x_continuous(name="Avg chi voc dur rec 1",limits=mylimits) +
  scale_y_continuous(name="Avg chi voc dur rec 2",limits=mylimits) +
  geom_abline(intercept = 0, slope = 1)

mylimits=range(mysample[,c("voc_och_ph1","voc_och_ph2")])
good <-
  ggplot(mysample, aes(voc_och_ph1, voc_och_ph2)) + 
  geom_point(aes(colour = factor(corpus))) +
  geom_smooth(method = 'lm', formula = y ~ x)  + 
  labs(color = "Corpus")  +  
  theme(text = element_text(size = 20)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  scale_x_continuous(name="N other ch voc rec 1",limits=mylimits) +
  scale_y_continuous(name="N other ch voc rec 2",limits=mylimits) +
  geom_abline(intercept = 0, slope = 1)


ggarrange(bad, good,
          ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=0,
          font.label = list(size = 20))  + labs(color= "Corpus")  +  theme(text = element_text(size = 20))


```

```{r reg model}

read.csv("../output/df.icc.mixed.csv")->df.icc.mixed

df.icc.mixed$subject[grepl("chi", df.icc.mixed$metric, fixed = TRUE)] <- "chi"
df.icc.mixed$subject[df.icc.mixed$metric %in% c("lp_dur","lp_n","lena_CVC","cp_dur","cp_n")] <- "chi"
df.icc.mixed$subject[grepl("och", df.icc.mixed$metric, fixed = TRUE)] <- "och"
df.icc.mixed$subject[grepl("mal", df.icc.mixed$metric, fixed = TRUE)] <- "mal"
df.icc.mixed$subject[grepl("fem", df.icc.mixed$metric, fixed = TRUE)] <- "fem"
df.icc.mixed$subject[grepl("adu", df.icc.mixed$metric, fixed = TRUE)] <- "adu"
df.icc.mixed$subject= factor(df.icc.mixed$subject,levels=c("chi","och","fem","mal","adu"))


lr_icc_chi <- lm(icc_child_id ~ subject + data_set, data=df.icc.mixed) 
#binomial could be used, but diagnostic plots look pretty good (other than some outliers)
summary(lr_icc_chi)

# Get common ACLEW/LENA metrics to see if statistically different
common_metrics = intersect(df.icc.mixed[df.icc.mixed$data_set=='aclew', "metric"],
                           df.icc.mixed[df.icc.mixed$data_set=='lena', "metric"])

lr_icc_chi_common <- lm(icc_child_id ~ subject + data_set, data=df.icc.mixed,subset=c(metric %in% common_metrics))
summary(lr_icc_chi_common)
```


## Paired analysis with minimum distance between recs


```{r correlation analysis for close recs, warning = F, echo=FALSE}
#to compare with Gilkerson, we just need to focus on AWC, CVC, CTC (in LENA, but for comparison purposes, we also include aclew)

nsamples=10

data_set="lena"
mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
mydat$uchild_id <- paste(mydat$experiment, mydat$child_id)
mydat$usession_id <- paste(mydat$uchild_id, mydat$session_id)
mydat=mydat[order(mydat$experiment,mydat$child_id,mydat$age),]

dist_contig <- mydat %>% 
  arrange(experiment, child_id, age) %>% #this sorts the table by corpus then id then age
  group_by(experiment, child_id) %>% 
  mutate(n_rec = n(),
         age_dist_next_rec = lead(age) - age, 
         #this gets the diff corresponding cell in the preceding row and a given row
         next_session = lead(usession_id)) %>% 
  filter(age_dist_next_rec<2)

summary(dist_contig$n_rec) #distribution of n of recs per child

table(dist_contig$experiment) #this is the number of eligible recordings per corpus
sum(table(dist_contig$experiment)) #and overall

table(dist_contig$experiment[!duplicated(dist_contig$uchild_id)])#this is the number of eligible children per corpus
sum(table(dist_contig$experiment[!duplicated(dist_contig$uchild_id)])) #and overall

#given those two numbers, with 5 draws we'd cover many combinations in winni, lucid, & trio; but we'll do 10 because there are a lot of recs in cougar & bergelson..

data_set="aclew"
mydat_aclew <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
mydat_aclew$uchild_id <- paste(mydat_aclew$experiment, mydat_aclew$child_id)
mydat_aclew$usession_id <- paste(mydat_aclew$uchild_id, mydat_aclew$session_id)

all_rs=matrix(NA,nrow=nsamples,ncol=5)
colnames(all_rs)<-c("lena_CVC","lena_CTC","lena_wc_adu_ph","aclew_voc_chi_ph","aclew_wc_adu_ph")

for(i in 1:nsamples){
  #for each child, sample 2 contiguous recordings that are less than 2 months away
  #step 1: sample one session per child among the list of sessions that are close by
  close_sessions <- dist_contig  %>%
    group_by(uchild_id)%>%
    slice_sample(n = 1) 
  
  #step 2: get data from those sampled sessions as rec1
  rec1 = subset(dist_contig,usession_id %in% close_sessions$usession_id)  
  #step 3: get the next session
  rec2_sessions=levels(factor(rec1$next_session))
  rec2 = subset(mydat,usession_id %in% rec2_sessions)   
  
  all_rs[i,"lena_CVC"]<-cor.test(rec1$lena_CVC,rec2$lena_CVC)$estimate
  all_rs[i,"lena_CTC"]<-cor.test(rec1$lena_CTC,rec2$lena_CTC)$estimate
  all_rs[i,"lena_wc_adu_ph"]<-cor.test(rec1$wc_adu_ph,rec2$wc_adu_ph)$estimate
  
  #step 2: get data from those sampled sessions as rec1
  rec1 = subset(mydat_aclew,usession_id %in% close_sessions$usession_id)  
  #step 3: get the next session
  rec2 = subset(mydat_aclew,usession_id %in% rec2_sessions)   
  
  all_rs[i,"aclew_voc_chi_ph"]<-cor.test(rec1$voc_chi_ph,rec2$voc_chi_ph)$estimate
  all_rs[i,"aclew_wc_adu_ph"]<-cor.test(rec1$wc_adu_ph,rec2$wc_adu_ph)$estimate

}

summary(all_rs)
mean_rvalue <- as.data.frame(sapply(as.data.frame(all_rs), mean))
mean_rvalue <- setNames(cbind(rownames(mean_rvalue), mean_rvalue, row.names = NULL), 
         c("Metric", "Mean"))
sd_rvalue <- as.data.frame(sapply(as.data.frame(all_rs), sd))
sd_rvalue <- setNames(cbind(rownames(sd_rvalue), sd_rvalue, row.names = NULL), 
         c("Metric", "SD"))
mean_sd_rvalue <- merge(mean_rvalue, sd_rvalue, by='Metric')
# 
kable(mean_sd_rvalue)
print(mean_sd_rvalue)
```


## Reliability analyses per corpus

```{r icc-bycor, echo=F,fig.width=4, fig.height=10,fig.cap="Distribution of ICC attributed to children in each separate corpus."}
df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")

ggplot(df.icc.corpus, aes(x = icc_adjusted, fill = toupper(data_set))) + 
  geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
  labs( x = "ICC child ID")+
  geom_jitter(aes(x = icc_adjusted,y=0,colour=toupper(data_set))) +   
  facet_grid(rows=vars(corpus)) +
  scale_fill_colorblind()+scale_colour_manual(values=cbPalette) +
  ylim(-0.5,6) + xlim(0, 1) + labs(fill='Pipeline', color="Pipeline")  +  theme(text = element_text(size = 20)) 

```

## Reliability by child age

```{r relBYage, echo=F,fig.width=6, fig.height=10,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when binning children's age."}
df.icc.age<-read.csv("../output/df.icc.age.csv")

df.icc.age$age_bin<-factor(df.icc.age$age_bin,levels=c("(0,6]" , "(6,12]",  "(12,18]" ,"(18,24]" ,"(24,30]", "(30,36]", "(36,42]", "(42,48]", "(48,54]" ))

icc_exp <- ggplot(df.icc.age, aes(x = icc_corpus, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
  geom_jitter(aes(x = icc_corpus,y=0,colour=toupper(data_set)))+ 
  facet_grid(rows=vars(age_bin)) +
  scale_fill_colorblind() +scale_colour_manual(values=cbPalette)+ ylim(-0.5,16) + xlim(0, 1) + labs(fill='Pipeline', color="Pipeline")  +  theme(text = element_text(size = 15)) 

icc_chi <- ggplot(df.icc.age, aes(x = icc_child_id, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC child ID")+
  geom_jitter(aes(x = icc_child_id,y=0,colour=toupper(data_set)))+
  facet_grid(rows=vars(age_bin)) +
  scale_fill_colorblind()+scale_colour_manual(values=cbPalette)+ ylim(-0.5,15) + xlim(0, 1) + labs(fill='Pipeline', color="Pipeline")  +  theme(text = element_text(size = 15)) 


ggarrange(icc_exp, icc_chi,
          labels = c("A", "B"),
          ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.05, hjust=0.45,
          font.label = list(size = 15)) 


```

```{r sel3, results="as.is"}

# x <- cbind(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="lena",c("metric","age_bin","icc_child_id")] ,
#            df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="aclew",c("metric","age_bin","icc_child_id")]
# )

kable(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="lena",c("metric","age_bin","icc_child_id")],row.names = F,digits=2,caption="Most commonly used metrics by age bin (LENA).")

kable(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="aclew",c("metric","age_bin","icc_child_id")],row.names = F,digits=2,caption="Most commonly used metrics by age bin (aclew).")

write.csv(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph"),c("data_set","metric","age_bin","icc_child_id")],"tabXage.csv")
```


```{r reg mod in age}
common_metrics = intersect(df.icc.age[df.icc.age$data_set=='aclew', "metric"],
                           df.icc.age[df.icc.age$data_set=='lena', "metric"])


df.icc.age$subject[grepl("chi", df.icc.age$metric, fixed = TRUE)] <- "chi"
df.icc.age$subject[df.icc.age$metric %in% c("lp_dur","lp_n","lena_CVC","cp_dur","cp_n")] <- "chi"
df.icc.age$subject[grepl("och", df.icc.age$metric, fixed = TRUE)] <- "och"
df.icc.age$subject[grepl("mal", df.icc.age$metric, fixed = TRUE)] <- "mal"
df.icc.age$subject[grepl("fem", df.icc.age$metric, fixed = TRUE)] <- "fem"
df.icc.age$subject[grepl("adu", df.icc.age$metric, fixed = TRUE)] <- "adu"
df.icc.age$subject= factor(df.icc.age$subject,levels=c("chi","och","fem","mal","adu"))


lr_icc_chi <- lm(icc_child_id ~ subject + data_set + age_bin, data=df.icc.age) 
#binomial could be used, but diagnostic plots look pretty good (other than some outliers)
plot(lr_icc_chi)
summary(lr_icc_chi)


lr_icc_chi_common <- lm(icc_child_id ~ subject + data_set + age_bin, data=df.icc.age,subset=c(metric %in% common_metrics)) 
plot(lr_icc_chi_common)
summary(lr_icc_chi_common)

```


## Validity against age

```{r ageFX, echo=F,fig.width=7, fig.height=10,fig.cap="Distribution of t for age when all corpora are analyzed together (a) or for each corpus separately (b)."}
allcor <- ggplot(df.icc.mixed, aes(x = age_t, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "Age") +
  geom_jitter(aes(x = age_t,y=0,colour=data_set)) + 
  scale_fill_colorblind() +scale_colour_manual(values=cbPalette)

bycor <- ggplot(df.icc.corpus, aes(x = age_t, fill = data_set)) +
  geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
  labs( x = "Age")+
  geom_jitter(aes(x = age_t,y=0,colour=data_set)) +
  facet_grid(rows=vars(corpus)) +
  scale_fill_colorblind()+scale_colour_manual(values=cbPalette)

ggarrange(allcor, bycor,
          labels = c("A", "B"),
          ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5)
```


```{r}
df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
df.icc.age<-read.csv("../output/df.icc.age.csv")
df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
```

```{r}
target_metrics = c("lena_CVC", "lena_CTC", "voc_dur_chi_ph", "voc_chi_ph", "voc_fem_ph", "voc_mal_ph", "wc_adu_ph")
ggplot(
  df.icc.age[df.icc.age$metric %in% target_metrics, ], 
  aes(x=factor(age_bin, level=str_sort(unique(df.icc.age$age_bin), numeric=TRUE)), 
      y=icc_child_id, group=interaction(metric, data_set), color=metric, linetype=grepl('lena', metric, fixed = TRUE))) +
  scale_linetype_manual(values = c("TRUE" = "dashed", "FALSE" = "solid")) +
  guides(linetype = "none")+
  geom_line() +
  facet_grid(. ~ toupper(data_set)) + 
  ylim(0, 1) +
  xlab('Age bins') + 
  ylab('ICC Child') +
  labs(color="Metric") +
  theme(legend.position = "top")
ggsave(paste0("plots/metrics/age_metrics.png"))
```

## Formula stats
Mixed models formula summary:
`r kable(table(df.icc.mixed$formula))`

Corpus models formula summary:
`r kable(table(df.icc.corpus$formula))`

Age models formula summary:
`r kable(table(df.icc.age$formula))`


## Save information about packages used

```{r}
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")
```