123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697 |
- ---
- title: Establishing the reliability and validity of measures extracted from long-form
- recordings
- output:
- pdf_document:
- toc: yes
- toc_depth: 3
- html_document:
- toc: yes
- toc_depth: '3'
- df_print: paged
- ---
- ```{r setup, include=FALSE, eval=TRUE}
- knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
- set.seed(185729481)
- library("lme4")
- library("performance") # ICC
- library("ggplot2")
- library("ggthemes")
- library("ggpubr")
- library("kableExtra")
- library("psych")
- library("dplyr")
- library("tidyr")
- library("stringr")
- cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
- # List of corpora we are interested in
- corpora <- c('bergelson', 'lucid', 'winnipeg', 'warlaumont','cougar','fausey-trio')
- # Columns that should not be scaled or taken into account as metrics
- no.scale.columns <- c('experiment', 'session_id', 'child_id','child_id_unique','age_s', 'nobs', 'nchi',
- 'date_iso', 'child_dob', 'missing_audio',"age_bin","duration","usession_id",
- "normative","age","duration_alice", "duration_vcm" , "duration_vtc","duration_its" )
- data_sets = c('aclew', 'lena')
- check_small_var<-function(x,y,i) round(x[i],3)==round(y[i],3) & round(y[i],3) == 0
- fit_child_model<-function(dataframe, metric){
- # Fit formula where experiment is removed
- formula <- as.formula(paste0(metric, "~ age_s + (1|child_id)"))
- model <- lmer(formula, data=dataframe)
-
- return (model)
- }
- extract_chi_variables<-function(model){
- icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
- icc.result.split <- c(as.data.frame(icc(model, by_group=TRUE))$ICC, NA)
-
- r2.nakagawa <- r2_nakagawa(model)
-
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
- # chi, NA, residual
- ranefs_vars <-c(ranefs_vars[1],NA,ranefs_vars[2])
- ranefs_stdv <-c(ranefs_stdv[1],NA,ranefs_stdv[2])
- # chi, NA
- ns<-c(unlist(summary(model)$n),NA)
- return (c(coefficients(summary(model))["age_s",],
- icc.result.mixed,
- icc.result.split,
- ranefs_vars,
- ranefs_stdv,
- r2.nakagawa,
- nobs(model),ns))
- }
- extract_full_variables<-function(model){
- icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
- icc.result.split <- t(as.data.frame(icc(model, by_group=TRUE))$ICC)
- r2.nakagawa <- r2_nakagawa(model)
-
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
-
- ns<-t(data.frame(summary(model)$n))
-
- return (c( coefficients(summary(model))["age_s",],
- icc.result.mixed,
- icc.result.split,
- ranefs_vars,
- ranefs_stdv,
- r2.nakagawa,
- nobs(model),ns))
- }
- new_fit_models<-function(dataframe, data_set, metric, age, fit_full = TRUE){
- #dataframe=mydat ; age = NA ; full = TRUE
- iqr = quantile(dataframe[,metric],.75,na.rm=T)-quantile(dataframe[,metric],.25,na.rm=T)
-
- if(fit_full){
- # Fit full model
- formula <- as.formula(paste0(metric, "~ age_s + (1|experiment/child_id)"))
- model <- lmer(formula, data=dataframe)
- }
-
- if(fit_full && !isSingular(model)) # Fitted full model short circuit and
- {
- form="full"
-
- sw=shapiro.test(resid(model))$p
- mod_variables <- extract_full_variables(model)
-
- # Build line
- } else {
- model <- fit_child_model(dataframe, metric)
- if(!isSingular(model)){
-
- form = "no_exp"
- sw=shapiro.test(resid(model))$p
- mod_variables <- extract_chi_variables(model)
-
- } else {
- form='no_chi_effect'
- sw = NA
- mod_variables = c(
- NA,NA,NA, # c(coefficients(summary(model))["age_s",]
- NA,NA, # icc.result.mixed
- NA,NA, # icc.result.split
- NA,NA,NA, # ranefs_vars
- NA,NA,NA, # raners_std
- NA,NA, # r2
- NA,NA,NA) # nobs (child, corpus), nobs
- }
- }
-
- icc.row = c(data_set, age, metric, iqr, mod_variables, form, sw)
- return (icc.row)
- }
- ```
- ## Recalculate everything or not?
- If RECALC is set to TRUE, then the ICC tables will be re-generated.
- ```{r}
- RECALC=FALSE
- ```
- ## Generate ICC tables
- ```{r create-all-icc, eval=RECALC}
- df.icc.mixed.cols = c("data_set","age_bin", "metric", "iqr",
- "age_b","age_se","age_t", # beta, standard error, T
- "icc_adjusted", "icc_conditional",
- "icc_child_id", "icc_corpus",
- "child_id_var","corpus_var","residual_var",
- "child_id_sd","corpus_sd","residual_sd","r2_cond", "r2_marg",
- "nobs","nchi", "ncor",
- "formula","sw")
- df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)),
- stringsAsFactors = FALSE)
- for (data_set in data_sets){ # data_set = "aclew"
- # Load data
- mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
- metrics <- colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
- for(metric in metrics)
- { # metric = "voc_chi_ph"
- icc.row <- new_fit_models(mydat, data_set, metric, NA, TRUE)
- df.icc.mixed[nrow(df.icc.mixed) + 1,] <- icc.row
- }
- }
- write.csv(df.icc.mixed,"../output/df.icc.mixed.csv",row.names=F)
- # repeat for the version within each corpus
- df.icc.corpus.cols = c("data_set","corpus","metric", "iqr",
- "age_b","age_se","age_t",
- "icc_adjusted", "icc_conditional",
- "icc_child_id", "icc_corpus",
- "child_id_var","corpus_var","residual_var",
- "child_id_sd","corpus_sd","residual_sd","r2_cond", "r2_marg",
- "nobs","nchi", "ncor",
- "formula","sw")
- df.icc.corpus = data.frame(matrix(ncol=length(df.icc.corpus.cols),nrow=0, dimnames=list(NULL, df.icc.corpus.cols)),
- stringsAsFactors=FALSE)
- for (data_set in data_sets){ # data_set = "aclew"
- # Load data
- mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
- for(corpus in corpora){
- mycordat <- mydat[mydat$experiment == corpus, ]
- metrics <- colnames(mycordat)[!is.element(colnames(mycordat), no.scale.columns)]
- for(metric in metrics)
- { # metric = "avg_voc_dur_mal"
- icc.row <- new_fit_models(mycordat, data_set, metric, corpus, FALSE)
- df.icc.corpus[nrow(df.icc.corpus) + 1,] <- icc.row
- }
- }
- }
- write.csv(df.icc.corpus,"../output/df.icc.corpus.csv",row.names=F)
- ```
- ```{r}
- mydat <- read.csv(paste0('../data_output/', 'aclew','_metrics.csv'))
- child_per_corpus = setNames(aggregate(data = mydat, child_id ~ experiment, function(child_id) length(unique(child_id))), c('experiment', 'No_Children'))
- rec_per_corpus = setNames(aggregate(data = mydat, session_id ~ experiment, function(session_id) length(unique(session_id))), c('experiment', 'No_Rec'))
- dur_per_corpus = setNames(aggregate(data = mydat, duration_vtc ~ experiment, function(duration_vtc) sum(duration_vtc)/3.6e+6), c('experiment', 'Duration_h'))
- age_mean_per_corpus = setNames(aggregate(data = mydat, age ~ experiment, function(age) mean(age)), c('experiment', 'Mean_Age'))
- age_min_per_corpus = setNames(aggregate(data = mydat, age ~ experiment, function(age) min(age)), c('experiment', 'Min_Age'))
- age_max_per_corpus = setNames(aggregate(data = mydat, age ~ experiment, function(age) max(age)), c('experiment', 'Max_Age'))
- corp_code = data.frame(
- experiment=c("bergelson", "cougar", "fausey-trio", "lucid", "warlaumont", "winnipeg"),
- code=c("BER", "COU", "TRI", "L05", "WAR", "MCD"),
- location=c("Northeast US", "Northwest US", "Western US", "Northwest England", "Western US", "Western Canada")
- )
- corp_desc_list = list(corp_code, child_per_corpus, rec_per_corpus, dur_per_corpus, age_mean_per_corpus, age_min_per_corpus, age_max_per_corpus)
- corpus_description <- Reduce(function(x, y) merge(x, y, all=TRUE), corp_desc_list)
- corpus_description <- transform(corpus_description, Age_Range=paste(Min_Age, Max_Age, sep="-"))
- corpus_description <- subset(corpus_description, select = -c(Min_Age, Max_Age))
- write.csv(corpus_description, "../output/corpus_description.csv", sep='\t')
- nkids=length(levels(factor(paste(mydat$experiment,mydat$child_id))))
- ```
- ```{r icc-age, eval=RECALC}
- #We do this one separately because we want to standardize metrics within each age group
- # repeat within age group bins
- df.icc.age.cols = c("data_set","age_bin","metric", "iqr",
- "age_b","age_se","age_t", # beta, standard error, T
- "icc_adjusted", "icc_conditional",
- "icc_child_id", "icc_corpus",
- "child_id_var","corpus_var","residual_var",
- "child_id_sd","corpus_sd","residual_sd",
- "nobs", "nchi","ncor",
- "formula","sw")
- df.icc.age = data.frame(matrix(ncol=length(df.icc.age.cols),nrow=0, dimnames=list(NULL, df.icc.age.cols)),
- stringsAsFactors=FALSE)
- for (data_set in data_sets){ # data_set = "aclew"
- # Load data and calculate age cuts
- mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv')) # /!\ Do not use scaled version -> we'll scale by age later
- mydat$age_bin <- cut(mydat$age,c(0:6*6))
-
- metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
- for(thisage in levels(mydat$age_bin))
- { #thisage= "(0,6]"
- thiscordata <- mydat[mydat$age_bin == thisage,]
- for(metric in metrics)
- { # metric = "avg_voc_dur_mal"
-
- pre_scaled_metric <- (thiscordata[, metric] - mean(thiscordata[, metric], na.rm=T))/sd(thiscordata[, metric], na.rm=T)
- thiscordata[abs(pre_scaled_metric)>2.5 & !is.na(pre_scaled_metric), metric] <- NA
- thiscordata[, metric] <- (thiscordata[, metric] - mean(thiscordata[, metric], na.rm=T))/sd(thiscordata[, metric], na.rm=T)
- if(dim(thiscordata)[1] > 30 & length(levels(factor(thiscordata$experiment))) > 1)
- {
- icc.row <- new_fit_models(thiscordata, data_set, metric, thisage, TRUE)
-
- }else{
- icc.row <- c(data_set,thisage,metric,iqr,
- NA,NA,NA,
- NA,NA,
- NA,NA,
- NA,NA,NA,
- NA,NA,NA,
- NA,NA,NA,
- "not_enough_data",NA)
- }
- df.icc.age[nrow(df.icc.age) + 1,] <- icc.row
- }
- }
- }
- write.csv(df.icc.age,"../output/df.icc.age.csv",row.names=F)
- ```
- ## Describe datasets
- ```{r}
- df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
- ```
- We are looking here at `r length(corpora)` corpora, `r nkids` children, `r max(df.icc.mixed$nobs[df.icc.mixed$data_set=="lena"])` recordings, `r length(levels(factor(df.icc.mixed$metric)))` many metrics.
- <!-- The number of children comes from nkids, in the first chunk that is not evaluated. -->
- ## Reliability analyses combining all corpora
- ```{r icc-allexp, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when combining data from all corpora."}
- df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
- icc_exp <- ggplot(df.icc.mixed, aes(x = icc_corpus, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
- geom_jitter( aes(x = icc_corpus,y=0,colour=toupper(data_set)))+ scale_fill_colorblind() +scale_colour_manual(values=cbPalette) + theme(text = element_text(size = 20)) + ylim(-0.5,11.25) + xlim(0,1) + labs(fill='Pipeline', color="Pipeline")
- icc_chi <- ggplot(df.icc.mixed, aes(x = icc_child_id, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC child ID")+
- geom_jitter(aes(x = icc_child_id,y=0,colour=toupper(data_set)))+ scale_fill_colorblind()+scale_colour_manual(values=cbPalette) + theme(text = element_text(size = 20)) + ylim(-0.5,11.25) + xlim(0,1) + labs(fill='Pipeline', color="Pipeline")
- ggarrange(icc_exp, icc_chi,
- labels = c("A", "B"),
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=-0.1,
- font.label = list(size = 20)) + theme(text = element_text(size = 20))
- ```
- ```{r worst, results="as.is"}
- x <- head(df.icc.mixed[order(df.icc.mixed$icc_child_id),c("data_set","metric","icc_child_id","icc_corpus")]) #worst
- kable(x,row.names = F,digits=2,caption="Measures with the lowest ICC attributed to children.")
- ```
- ```{r best, results="as.is"}
- x <- tail(df.icc.mixed[order(df.icc.mixed$icc_child_id),c("data_set","metric","icc_child_id","icc_corpus")]) #best
- kable(x,row.names = F,digits=2,caption="Measures with the highest ICC attributed to children.")
- ```
- ```{r sel, results="as.is"}
- key_metrics = c("lena_CVC", "lena_CTC", "voc_dur_chi_ph", "voc_chi_ph", "voc_fem_ph", "voc_mal_ph", "wc_adu_ph")
- x <- merge(df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="lena",c("metric","icc_child_id")] ,
- df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="aclew",c("metric","icc_child_id")],
- by='metric', all=TRUE)
- colnames(x) <- c("metric", "LENA ICC", "ACLEW ICC")
- kable(x,row.names = F,digits=2,caption="Most commonly used metrics.")
- ```
- ```{r icc-examples-fig2, echo=F,fig.width=4, fig.height=3,fig.cap="(A) scatterplot for one variable with relatively low ICCs versus (B) one with relatively higher ICCs (see Tables 1-2 for details)"}
- # figure of bad ICC: lena avg_voc_dur_chi; good ICC: lena voc_och_ph
- data_set="lena"
- mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv'))
- mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
- mydat <- mydat[is.element(mydat$experiment, corpora),]
- # remove those data points altogether
- mydat <- mydat[!is.na(mydat$avg_voc_dur_chi) & !is.na(mydat$voc_och_ph),]
- #make sure cols work as we want them to
- mydat$child_id <- paste(mydat$experiment,mydat$child_id)
- mydat$unique <- paste(mydat$experiment,mydat$child_id,mydat$session_id)
- #sample down to get 2 recs per child
- mysample = NULL
- for(thischild in levels(as.factor(mydat$child_id))){
- onechidat <- mydat[mydat$child_id == thischild,c("child_id","experiment","age","unique","avg_voc_dur_chi","voc_och_ph")]
-
- if(dim(onechidat)[1]>=2){
- selrows=sample(1:dim(onechidat)[1],2)
- mysample = rbind(mysample,
- cbind(onechidat[selrows[1],],
- onechidat[selrows[2],c("unique","avg_voc_dur_chi","voc_och_ph")]
- )
- )
- }
- }
- colnames(mysample) <-
- c(
- "child_id",
- "corpus",
- "age",
- "unique1",
- "avg_voc_dur_chi1",
- "voc_och_ph1",
- "unique2",
- "avg_voc_dur_chi2",
- "voc_och_ph2"
- )
- mysample$corpus = factor(mysample$corpus)
- mylimits=range(mysample[,c("avg_voc_dur_chi1","avg_voc_dur_chi2")])
- bad <-
- ggplot(mysample, aes(avg_voc_dur_chi1, avg_voc_dur_chi2)) +
- geom_point(aes(colour = factor(corpus))) +
- geom_smooth(method = 'lm', formula = y ~ x) +
- labs(color = "Corpus") +
- theme(text = element_text(size = 20)) +
- theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
- panel.background = element_blank(), axis.line = element_line(colour = "black")) +
- scale_x_continuous(name="Avg chi voc dur rec 1",limits=mylimits) +
- scale_y_continuous(name="Avg chi voc dur rec 2",limits=mylimits) +
- geom_abline(intercept = 0, slope = 1)
- mylimits=range(mysample[,c("voc_och_ph1","voc_och_ph2")])
- good <-
- ggplot(mysample, aes(voc_och_ph1, voc_och_ph2)) +
- geom_point(aes(colour = factor(corpus))) +
- geom_smooth(method = 'lm', formula = y ~ x) +
- labs(color = "Corpus") +
- theme(text = element_text(size = 20)) +
- theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
- panel.background = element_blank(), axis.line = element_line(colour = "black")) +
- scale_x_continuous(name="N other ch voc rec 1",limits=mylimits) +
- scale_y_continuous(name="N other ch voc rec 2",limits=mylimits) +
- geom_abline(intercept = 0, slope = 1)
- ggarrange(bad, good,
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=0,
- font.label = list(size = 20)) + labs(color= "Corpus") + theme(text = element_text(size = 20))
- ```
- ```{r reg model}
- read.csv("../output/df.icc.mixed.csv")->df.icc.mixed
- df.icc.mixed$subject[grepl("chi", df.icc.mixed$metric, fixed = TRUE)] <- "chi"
- df.icc.mixed$subject[df.icc.mixed$metric %in% c("lp_dur","lp_n","lena_CVC","cp_dur","cp_n")] <- "chi"
- df.icc.mixed$subject[grepl("och", df.icc.mixed$metric, fixed = TRUE)] <- "och"
- df.icc.mixed$subject[grepl("mal", df.icc.mixed$metric, fixed = TRUE)] <- "mal"
- df.icc.mixed$subject[grepl("fem", df.icc.mixed$metric, fixed = TRUE)] <- "fem"
- df.icc.mixed$subject[grepl("adu", df.icc.mixed$metric, fixed = TRUE)] <- "adu"
- df.icc.mixed$subject= factor(df.icc.mixed$subject,levels=c("chi","och","fem","mal","adu"))
- lr_icc_chi <- lm(icc_child_id ~ subject + data_set, data=df.icc.mixed)
- #binomial could be used, but diagnostic plots look pretty good (other than some outliers)
- summary(lr_icc_chi)
- # Get common ACLEW/LENA metrics to see if statistically different
- common_metrics = intersect(df.icc.mixed[df.icc.mixed$data_set=='aclew', "metric"],
- df.icc.mixed[df.icc.mixed$data_set=='lena', "metric"])
- lr_icc_chi_common <- lm(icc_child_id ~ subject + data_set, data=df.icc.mixed,subset=c(metric %in% common_metrics))
- summary(lr_icc_chi_common)
- ```
- ## Paired analysis with minimum distance between recs
- ```{r correlation analysis for close recs, warning = F, echo=FALSE}
- #to compare with Gilkerson, we just need to focus on AWC, CVC, CTC (in LENA, but for comparison purposes, we also include aclew)
- nsamples=10
- data_set="lena"
- mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
- mydat$uchild_id <- paste(mydat$experiment, mydat$child_id)
- mydat$usession_id <- paste(mydat$uchild_id, mydat$session_id)
- mydat=mydat[order(mydat$experiment,mydat$child_id,mydat$age),]
- dist_contig <- mydat %>%
- arrange(experiment, child_id, age) %>% #this sorts the table by corpus then id then age
- group_by(experiment, child_id) %>%
- mutate(n_rec = n(),
- age_dist_next_rec = lead(age) - age,
- #this gets the diff corresponding cell in the preceding row and a given row
- next_session = lead(usession_id)) %>%
- filter(age_dist_next_rec<2)
- summary(dist_contig$n_rec) #distribution of n of recs per child
- table(dist_contig$experiment) #this is the number of eligible recordings per corpus
- sum(table(dist_contig$experiment)) #and overall
- table(dist_contig$experiment[!duplicated(dist_contig$uchild_id)])#this is the number of eligible children per corpus
- sum(table(dist_contig$experiment[!duplicated(dist_contig$uchild_id)])) #and overall
- #given those two numbers, with 5 draws we'd cover many combinations in winni, lucid, & trio; but we'll do 10 because there are a lot of recs in cougar & bergelson..
- data_set="aclew"
- mydat_aclew <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
- mydat_aclew$uchild_id <- paste(mydat_aclew$experiment, mydat_aclew$child_id)
- mydat_aclew$usession_id <- paste(mydat_aclew$uchild_id, mydat_aclew$session_id)
- all_rs=matrix(NA,nrow=nsamples,ncol=5)
- colnames(all_rs)<-c("lena_CVC","lena_CTC","lena_wc_adu_ph","aclew_voc_chi_ph","aclew_wc_adu_ph")
- for(i in 1:nsamples){
- #for each child, sample 2 contiguous recordings that are less than 2 months away
- #step 1: sample one session per child among the list of sessions that are close by
- close_sessions <- dist_contig %>%
- group_by(uchild_id)%>%
- slice_sample(n = 1)
-
- #step 2: get data from those sampled sessions as rec1
- rec1 = subset(dist_contig,usession_id %in% close_sessions$usession_id)
- #step 3: get the next session
- rec2_sessions=levels(factor(rec1$next_session))
- rec2 = subset(mydat,usession_id %in% rec2_sessions)
-
- all_rs[i,"lena_CVC"]<-cor.test(rec1$lena_CVC,rec2$lena_CVC)$estimate
- all_rs[i,"lena_CTC"]<-cor.test(rec1$lena_CTC,rec2$lena_CTC)$estimate
- all_rs[i,"lena_wc_adu_ph"]<-cor.test(rec1$wc_adu_ph,rec2$wc_adu_ph)$estimate
-
- #step 2: get data from those sampled sessions as rec1
- rec1 = subset(mydat_aclew,usession_id %in% close_sessions$usession_id)
- #step 3: get the next session
- rec2 = subset(mydat_aclew,usession_id %in% rec2_sessions)
-
- all_rs[i,"aclew_voc_chi_ph"]<-cor.test(rec1$voc_chi_ph,rec2$voc_chi_ph)$estimate
- all_rs[i,"aclew_wc_adu_ph"]<-cor.test(rec1$wc_adu_ph,rec2$wc_adu_ph)$estimate
- }
- summary(all_rs)
- mean_rvalue <- as.data.frame(sapply(as.data.frame(all_rs), mean))
- mean_rvalue <- setNames(cbind(rownames(mean_rvalue), mean_rvalue, row.names = NULL),
- c("Metric", "Mean"))
- sd_rvalue <- as.data.frame(sapply(as.data.frame(all_rs), sd))
- sd_rvalue <- setNames(cbind(rownames(sd_rvalue), sd_rvalue, row.names = NULL),
- c("Metric", "SD"))
- mean_sd_rvalue <- merge(mean_rvalue, sd_rvalue, by='Metric')
- #
- kable(mean_sd_rvalue)
- print(mean_sd_rvalue)
- ```
- ## Reliability analyses per corpus
- ```{r icc-bycor, echo=F,fig.width=4, fig.height=10,fig.cap="Distribution of ICC attributed to children in each separate corpus."}
- df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
- ggplot(df.icc.corpus, aes(x = icc_adjusted, fill = toupper(data_set))) +
- geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
- labs( x = "ICC child ID")+
- geom_jitter(aes(x = icc_adjusted,y=0,colour=toupper(data_set))) +
- facet_grid(rows=vars(corpus)) +
- scale_fill_colorblind()+scale_colour_manual(values=cbPalette) +
- ylim(-0.5,6) + xlim(0, 1) + labs(fill='Pipeline', color="Pipeline") + theme(text = element_text(size = 20))
- ```
- ## Reliability by child age
- ```{r relBYage, echo=F,fig.width=6, fig.height=10,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when binning children's age."}
- df.icc.age<-read.csv("../output/df.icc.age.csv")
- df.icc.age$age_bin<-factor(df.icc.age$age_bin,levels=c("(0,6]" , "(6,12]", "(12,18]" ,"(18,24]" ,"(24,30]", "(30,36]", "(36,42]", "(42,48]", "(48,54]" ))
- icc_exp <- ggplot(df.icc.age, aes(x = icc_corpus, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
- geom_jitter(aes(x = icc_corpus,y=0,colour=toupper(data_set)))+
- facet_grid(rows=vars(age_bin)) +
- scale_fill_colorblind() +scale_colour_manual(values=cbPalette)+ ylim(-0.5,16) + xlim(0, 1) + labs(fill='Pipeline', color="Pipeline") + theme(text = element_text(size = 15))
- icc_chi <- ggplot(df.icc.age, aes(x = icc_child_id, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC child ID")+
- geom_jitter(aes(x = icc_child_id,y=0,colour=toupper(data_set)))+
- facet_grid(rows=vars(age_bin)) +
- scale_fill_colorblind()+scale_colour_manual(values=cbPalette)+ ylim(-0.5,15) + xlim(0, 1) + labs(fill='Pipeline', color="Pipeline") + theme(text = element_text(size = 15))
- ggarrange(icc_exp, icc_chi,
- labels = c("A", "B"),
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.05, hjust=0.45,
- font.label = list(size = 15))
- ```
- ```{r sel3, results="as.is"}
- # x <- cbind(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="lena",c("metric","age_bin","icc_child_id")] ,
- # df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="aclew",c("metric","age_bin","icc_child_id")]
- # )
- kable(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="lena",c("metric","age_bin","icc_child_id")],row.names = F,digits=2,caption="Most commonly used metrics by age bin (LENA).")
- kable(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="aclew",c("metric","age_bin","icc_child_id")],row.names = F,digits=2,caption="Most commonly used metrics by age bin (aclew).")
- write.csv(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph"),c("data_set","metric","age_bin","icc_child_id")],"tabXage.csv")
- ```
- ```{r reg mod in age}
- common_metrics = intersect(df.icc.age[df.icc.age$data_set=='aclew', "metric"],
- df.icc.age[df.icc.age$data_set=='lena', "metric"])
- df.icc.age$subject[grepl("chi", df.icc.age$metric, fixed = TRUE)] <- "chi"
- df.icc.age$subject[df.icc.age$metric %in% c("lp_dur","lp_n","lena_CVC","cp_dur","cp_n")] <- "chi"
- df.icc.age$subject[grepl("och", df.icc.age$metric, fixed = TRUE)] <- "och"
- df.icc.age$subject[grepl("mal", df.icc.age$metric, fixed = TRUE)] <- "mal"
- df.icc.age$subject[grepl("fem", df.icc.age$metric, fixed = TRUE)] <- "fem"
- df.icc.age$subject[grepl("adu", df.icc.age$metric, fixed = TRUE)] <- "adu"
- df.icc.age$subject= factor(df.icc.age$subject,levels=c("chi","och","fem","mal","adu"))
- lr_icc_chi <- lm(icc_child_id ~ subject + data_set + age_bin, data=df.icc.age)
- #binomial could be used, but diagnostic plots look pretty good (other than some outliers)
- plot(lr_icc_chi)
- summary(lr_icc_chi)
- lr_icc_chi_common <- lm(icc_child_id ~ subject + data_set + age_bin, data=df.icc.age,subset=c(metric %in% common_metrics))
- plot(lr_icc_chi_common)
- summary(lr_icc_chi_common)
- ```
- ## Validity against age
- ```{r ageFX, echo=F,fig.width=7, fig.height=10,fig.cap="Distribution of t for age when all corpora are analyzed together (a) or for each corpus separately (b)."}
- allcor <- ggplot(df.icc.mixed, aes(x = age_t, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "Age") +
- geom_jitter(aes(x = age_t,y=0,colour=data_set)) +
- scale_fill_colorblind() +scale_colour_manual(values=cbPalette)
- bycor <- ggplot(df.icc.corpus, aes(x = age_t, fill = data_set)) +
- geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
- labs( x = "Age")+
- geom_jitter(aes(x = age_t,y=0,colour=data_set)) +
- facet_grid(rows=vars(corpus)) +
- scale_fill_colorblind()+scale_colour_manual(values=cbPalette)
- ggarrange(allcor, bycor,
- labels = c("A", "B"),
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5)
- ```
- ```{r}
- df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
- df.icc.age<-read.csv("../output/df.icc.age.csv")
- df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
- ```
- ```{r}
- target_metrics = c("lena_CVC", "lena_CTC", "voc_dur_chi_ph", "voc_chi_ph", "voc_fem_ph", "voc_mal_ph", "wc_adu_ph")
- ggplot(
- df.icc.age[df.icc.age$metric %in% target_metrics, ],
- aes(x=factor(age_bin, level=str_sort(unique(df.icc.age$age_bin), numeric=TRUE)),
- y=icc_child_id, group=interaction(metric, data_set), color=metric, linetype=grepl('lena', metric, fixed = TRUE))) +
- scale_linetype_manual(values = c("TRUE" = "dashed", "FALSE" = "solid")) +
- guides(linetype = "none")+
- geom_line() +
- facet_grid(. ~ toupper(data_set)) +
- ylim(0, 1) +
- xlab('Age bins') +
- ylab('ICC Child') +
- labs(color="Metric") +
- theme(legend.position = "top")
- ggsave(paste0("plots/metrics/age_metrics.png"))
- ```
- ## Formula stats
- Mixed models formula summary:
- `r kable(table(df.icc.mixed$formula))`
- Corpus models formula summary:
- `r kable(table(df.icc.corpus$formula))`
- Age models formula summary:
- `r kable(table(df.icc.age$formula))`
- ## Save information about packages used
- ```{r}
- writeLines(capture.output(sessionInfo()), "sessionInfo.txt")
- ```
|