123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947 |
- ---
- title: Establishing the reliability and validity of measures extracted from long-form
- recordings
- output:
- pdf_document:
- toc: yes
- toc_depth: 3
- html_document:
- toc: yes
- toc_depth: '3'
- df_print: paged
- ---
- ```{r setup, include=FALSE}
- knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
- library("lme4")
- library("performance") # ICC
- library(ggplot2)
- library(ggthemes)
- library(ggpubr)
- library(kableExtra)
- library(psych)
- library(dplyr)
- library(tidyr)
- cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
- # List of corpora we are interested in
- corpora <- c('bergelson', 'lucid', 'winnipeg', 'warlaumont','cougar','fausey')
- # Columns that should not be scaled or taken into account as metrics
- no.scale.columns <- c('experiment', 'session_id', 'child_id','child_id_unique',
- 'date_iso', 'child_dob', 'missing_audio',"age_bin","duration","usession_id", "normative")
- data_sets = c('aclew', 'lena')
- ```
- ## TODO later
- - split cougar into normative and not (Done, to be checked by @alex though)
- - consistency within files by comparing even versus odd hours of the day
- - extract SD, median & IQR per hour and perhaps something that gives an idea of peak activity -- eg peak N & dur per hour
- ## TO think
- - factor analyses: first attempt done, but am I confused about how to do it?
- - code for estimating CI for ICC -- looks like the only way is via bootstrapping, in which case,
- - leave for the end since it'll be a LONG time calculating [link](https://stats.stackexchange.com/questions/184767/how-do-i-calculate-the-confidence-interval-of-an-icc/564139#564139)
- ## Generate data
- By default, these chunks are not re-evaluated.
- ```{r icc-allcorporatogether, eval = FALSE}
- # Declare df to store ICCs
- df.icc.mixed.cols = c("data_set","metric", "iqr",
- "age_b","age_se","age_t",
- "icc_adjusted", "icc_conditional", "icc_child_id_corpus", "icc_corpus",
- "child_id_var","corpus_var","residual_var",
- "child_id_sd","corpus_sd","residual_sd",
- "nobs",
- "formula","sw")
- df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)),
- stringsAsFactors = FALSE)
- for (data_set in data_sets){ #data_set="aclew"
-
- # Load data and remove unwanted corpora
- mydat <- read.csv(paste0('DATA/', data_set,'_metrics.csv'))
- mydat <- mydat[is.element(mydat$experiment, corpora),]
- all_cols = names(mydat)
- metrics = all_cols[!is.element(all_cols, no.scale.columns)]
-
- #remove outliers
- # WILLIAM: Why not use IQR to remove outliers?
- # .V1 ?
- for(metric in metrics) mydat[, metric] <- ifelse(scale(mydat[, metric])>2.5 | scale(mydat[, metric]) < -2.5,NA,mydat[, metric])
- colnames(mydat)<-gsub(".V1","", colnames(mydat),fixed=T)
-
- # Scale mydat
- mydat[, metrics] <- scale(mydat[, metrics])
- metrics = metrics[!(metrics=="age")] #remove age, since a regression with age twice doesn't make sense
-
- #print(summary(mydat)) #to look at distribution of mydat
- # Fit LMER
- for (metric in metrics){#metric="lp_n"
- # Formula = metric ~ age + (1|experiment/child_id)
- # = metric ~ age + (1|experiment) + (1|experiment:child_id)
- # age -> fixed effect
- # experiment -> random effect
- # child:experiment (child nested in experiment) -> random effect
- # AC double checked: this absolutely ensures no confusion bc child_id repeated across corpora (ie using / enforces nesting)
- formula <- as.formula(paste0(metric, "~ age + (1|experiment/child_id)"))
- model <- lmer(formula, data=mydat)
- form="full"
-
- # get IQR of metric
- iqr = quantile(mydat[,metric],.75,na.rm=T)-quantile(mydat[,metric],.25,na.rm=T)
- # Ran Ef
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
-
- if(isSingular(model)){
- #check if this is due to exp causing overfit
- if(round(ranefs_vars[2],3)==round(ranefs_stdv[2],3) & round(ranefs_vars[2],3)==0){
- formula <- as.formula(paste0(metric, "~ age + (1|child_id)"))
- model <- lmer(formula, data=mydat)
- form = "no_exp"
- }else{#otherwise, the problem is probably the child
- if(round(ranefs_vars[1],3)==round(ranefs_stdv[1],3) & round(ranefs_vars[1],3)==0){
- formula <- as.formula(paste0(metric, "~ age + (1|experiment)"))
- model <- lmer(formula, data=mydat)
- form = "no_chi"
- }
- }
- # re-extract Ran Ef when model changed
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
- }
- # print(paste(data_set,metric))
- # print(VarCorr(model))
-
-
- # Mixed ICC
- icc.result.mixed <- as.data.frame(icc(model))[-3]
-
- # By group ICC
- icc.result.split <- t(as.data.frame(icc(model, by_group=TRUE))$ICC)
- # print(paste(data_set,metric))
- # print(icc(model, by_group=TRUE))
- if(form =="no_exp"){
- icc.result.split = cbind(icc.result.split,NA)
- ranefs_vars = cbind(ranefs_vars[1],NA,ranefs_vars[2])
- ranefs_stdv = cbind(ranefs_stdv[1],NA,ranefs_stdv[2])
- }
- if(form =="no_chi"){
- icc.result.split = cbind(NA,icc.result.split)
- ranefs_vars = cbind(NA,ranefs_vars)
- ranefs_stdv = cbind(NA,ranefs_stdv)
- }
-
- sw=shapiro.test(resid(model))$p
-
-
-
- icc.row.mixed = cbind(data_set,
- metric,iqr,
- t(coefficients(summary(model))["age",]),
- icc.result.mixed,
- icc.result.split,
- ranefs_vars,
- ranefs_stdv,
- nobs(model),
- form,sw,
- stringsAsFactors=FALSE)
-
-
- df.icc.mixed[nrow(df.icc.mixed) + 1,] <- icc.row.mixed
-
- #print('-------------------')
- # insight get variance
- }
- }
- #df.icc.mixed
- write.csv(df.icc.mixed,"OUTPUT/df.icc.mixed.csv",row.names=F)
- ```
- ```{r icc-per-corpus, eval=FALSE}
- # repeat within each corpus
- df.icc.corpus.cols = c("data_set","corpus","metric", "iqr",
- "age_b","age_se","age_t",
- "icc_adjusted", "icc_conditional",
- "child_id_var","residual_var",
- "child_id_sd","residual_sd",
- "nobs",
- "formula","is.sing","sw")
- df.icc.corpus = data.frame(matrix(ncol=length(df.icc.corpus.cols),nrow=0, dimnames=list(NULL, df.icc.corpus.cols)),
- stringsAsFactors=FALSE)
- for (data_set in data_sets){ #data_set="aclew"
-
- # Load data and remove unwanted experiments
- mydat <- read.csv(paste0('DATA/', data_set,'_metrics.csv'))
- for(corpus in corpora){#corpus="lucid"
- thiscordata <- mydat[mydat$experiment == corpus,]
-
- all_cols = names(thiscordata)
- metrics = all_cols[!is.element(all_cols, no.scale.columns)]
-
- #remove outliers
- for(metric in metrics) thiscordata[, metric] <- ifelse(scale(thiscordata[, metric])>2.5,NA,thiscordata[, metric])
- colnames(thiscordata)<-gsub(".V1","", colnames(thiscordata),fixed=T)
-
- # Scale thiscordata
- thiscordata[, metrics] <- scale(thiscordata[, metrics]) # TODO: check why this line fails for me (William)
- metrics = metrics[!(metrics=="age")] #remove age, since a regression with age twice doesn't make sense
-
- #print(summary(thiscordata)) #to look at distribution of thiscordata
- # Fit LMER
- for (metric in metrics) if(sum(!is.na(thiscordata[,metric]>30))){#metric="lp_n"
- formula <- as.formula(paste0(metric, "~ age + (1|child_id)"))
- model <- lmer(formula, data=thiscordata)
- form="full"
-
- # get IQR of metric
- iqr = quantile(thiscordata[,metric],.75,na.rm=T)-quantile(thiscordata[,metric],.25,na.rm=T)
-
- # Ran Ef
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
-
- # Mixed ICC
- if(!is.na(icc(model))) icc.result.mixed <- as.data.frame(icc(model))[-3] else icc.result.mixed <- cbind(NA,NA)
-
-
- sw=shapiro.test(resid(model))$p
-
-
- icc.row = cbind(data_set,corpus,
- metric,iqr,
- t(coefficients(summary(model))["age",]),
- icc.result.mixed,
- ranefs_vars,
- ranefs_stdv,
- nobs(model),
- form,isSingular(model),sw,
- stringsAsFactors=FALSE)
-
-
- df.icc.corpus[nrow(df.icc.corpus) + 1,] <- icc.row
-
-
- #print('-------------------')
- # insight get variance
- }
- }
-
- }
- write.csv(df.icc.corpus,"OUTPUT/df.icc.corpus.csv",row.names=F)
- ```
- ```{r icc-age, eval=FALSE}
- # repeat within age group bins
- df.icc.age.cols = c("data_set","age_bin","metric", "iqr",
- "age_b","age_se","age_t",
- "icc_adjusted", "icc_conditional",
- "icc_child_id_corpus", "icc_corpus",
- "child_id_var","corpus_var","residual_var",
- "child_id_sd","corpus_sd","residual_sd",
- "nobs", "nchi","ncor",
- "formula","is.sing","sw")
- df.icc.age = data.frame(matrix(ncol=length(df.icc.age.cols),nrow=0, dimnames=list(NULL, df.icc.age.cols)),
- stringsAsFactors=FALSE)
- for (data_set in data_sets){ #data_set="lena"
-
- # Load data and remove unwanted corpuss
- mydat <- read.csv(paste0('DATA/', data_set,'_metrics.csv'))
- mydat <- mydat[is.element(mydat$experiment, corpora),]
- mydat$age_bin <- cut(mydat$age,c(0:20*6))
-
- for(thisage in levels(mydat$age_bin)){#age= "(0,6]"
- thiscordata <- mydat[mydat$age_bin == thisage,]
-
- if(dim(thiscordata)[1]>30){
- thiscordata$child_id<-factor(thiscordata$child_id)
-
- all_cols = names(thiscordata)
- metrics = all_cols[!is.element(all_cols, no.scale.columns)]
-
- #remove outliers
- for(metric in metrics) thiscordata[, metric] <- ifelse(scale(thiscordata[, metric])>2.5,NA,thiscordata[, metric])
- colnames(thiscordata)<-gsub(".V1","", colnames(thiscordata),fixed=T)
-
- # Scale thiscordata
- thiscordata[, metrics] <- scale(thiscordata[, metrics])
- metrics = metrics[!(metrics=="age")] #remove age, since a regression with age twice doesn't make sense
-
- #print(summary(thiscordata)) #to look at distribution of thiscordata
- # Fit LMER
- for (metric in metrics){#metric="lp_n"
- thiscordata_metric=thiscordata[!is.na(thiscordata[,metric]),]
- if(sum(!is.na(thiscordata_metric[,metric]))>30 & length(levels(factor(thiscordata_metric$experiment))) > 1 ){
- formula <- as.formula(paste0(metric, "~ age + (1|experiment/child_id)"))
- model <- lmer(formula, data=thiscordata_metric)
- form="full"
-
- # get IQR of metric
- iqr = quantile(thiscordata_metric[,metric],.75,na.rm=T)-quantile(thiscordata_metric[,metric],.25,na.rm=T)
- # Ran Ef
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
-
- if(isSingular(model)){
- #check if this is due to exp causing overfit
- if(ranefs_vars[2]==ranefs_stdv[2] & ranefs_vars[2]==0){
- formula <- as.formula(paste0(metric, "~ age + (1|child_id)"))
- model <- lmer(formula, data=mydat)
- form = "no_exp"
- }else{#otherwise, the problem is the child
- if(ranefs_vars[1]==ranefs_stdv[1] & ranefs_vars[1]==0){
- formula <- as.formula(paste0(metric, "~ age + (1|experiment)"))
- model <- lmer(formula, data=mydat)
- form = "no_chi"
- }
- }
- # re-extract Ran Ef when model changed
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
- }
- # print(paste(data_set,metric))
- # print(VarCorr(model))
-
-
- # Mixed ICC
- if(!isSingular(model)) icc.result.mixed <- as.data.frame(icc(model))[-3] else icc.result.mixed <- cbind(NA,NA)
-
- # By group ICC
- if(!isSingular(model)) icc.result.split <- t(as.data.frame(icc(model, by_group=TRUE))$ICC) else icc.result.split <- cbind(NA,NA)
- # print(paste(data_set,metric))
- # print(icc(model, by_group=TRUE))
- if(form =="no_exp"){
- icc.result.split = cbind(icc.result.split,NA)
- ranefs_vars = cbind(ranefs_vars[1],NA,ranefs_vars[2])
- ranefs_stdv = cbind(ranefs_stdv[1],NA,ranefs_stdv[2])
- }
- if(form =="no_chi"){
- icc.result.split = cbind(NA,icc.result.split)
- ranefs_vars = cbind(NA,ranefs_vars)
- ranefs_stdv = cbind(NA,ranefs_stdv)
- }
-
- sw=shapiro.test(resid(model))$p
-
-
- icc.row = cbind(data_set,thisage,
- metric,iqr,
- t(coefficients(summary(model))["age",]),
- icc.result.mixed,
- icc.result.split,
- ranefs_vars,
- ranefs_stdv,
- nobs(model),t(summary(model)$n),
- form,isSingular(model),sw,
- stringsAsFactors=FALSE)
-
-
- df.icc.age[nrow(df.icc.age) + 1,] <- icc.row
-
-
- #print('-------------------')
- # insight get variance
- }
- }
-
- }
- # else icc.row = cbind(data_set,thisage, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
- # dim(thiscordata)[1],
- # NA,NA,NA)
-
-
- }
-
- }
- write.csv(df.icc.age,"OUTPUT/df.icc.age.csv",row.names=F)
- ```
- ```{r icc-normativity, eval=FALSE}
- corpus = c('cougar')
- # repeat within each corpus
- df.icc.normativity.cols = c("data_set","normative","metric", "iqr",
- "age_b","age_se","age_t",
- "icc_adjusted", "icc_conditional",
- "child_id_var","residual_var",
- "child_id_sd","residual_sd",
- "nobs",
- "formula","is.sing","sw")
- df.icc.normativity = data.frame(matrix(ncol=length(df.icc.normativity.cols),nrow=0, dimnames=list(NULL, df.icc.normativity.cols)),
- stringsAsFactors=FALSE)
- for (data_set in data_sets){ #data_set="aclew"
- print(data_set)
- # Load data and remove unwanted experiments
- mydat <- read.csv(paste0('DATA/', data_set,'_metrics.csv'))
- corpus_data <- mydat[mydat$experiment == corpus,]
- children <- read.csv(paste0('DATA/', 'children.csv'))
- children <- children[, c('child_id', 'normative', 'experiment')]
- # Merge data and children normativity
- corpus_data <- merge(corpus_data, children, on=c('child_id', 'experiment'))
- normativity_items <- unique(corpus_data[, c('normative')]) # Should be Y/N
- all_cols = names(corpus_data)
- no.scale.columns.norm <- c(no.scale.columns, 'normative')
- metrics = all_cols[!is.element(all_cols, no.scale.columns.norm)]
- #remove outliers
- # for(metric in metrics) corpus_data[, metric] <- ifelse(scale(corpus_data[, metric])>2.5,NA,corpus_data[, metric])
- # colnames(corpus_data)<-gsub(".V1","", colnames(corpus_data),fixed=T)
- # Scale thiscordata
- # print(corpus_data)
- corpus_data[, metrics] <- scale(corpus_data[, metrics])
- metrics = metrics[!(metrics=="age")] #remove age, since a regression with age twice doesn't make sense
- #print(summary(thiscordata)) #to look at distribution of thiscordata
- # Fit LMER
- for (normativity in normativity_items)
- {
- thiscordata <- corpus_data[corpus_data$normative == normativity,]
- # Not sure why we get NAs for LENA data
- # TODO: check why we have NAs
- for (metric in metrics)if(sum(!is.na(thiscordata[,metric]>0))){#metric="lp_n"
- print(metric)
- formula <- as.formula(paste0(metric, "~ age + (1|child_id)"))
- model <- lmer(formula, data=thiscordata)
- form="full"
- # get IQR of metric
- iqr = quantile(thiscordata[,metric],.75,na.rm=T)-quantile(thiscordata[,metric],.25,na.rm=T)
- # Ran Ef
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
- # Mixed ICC
- if(!is.na(icc(model))) icc.result.mixed <- as.data.frame(icc(model))[-3] else icc.result.mixed <- cbind(NA,NA)
- sw=shapiro.test(resid(model))$p
- icc.row = cbind(data_set,normativity,
- metric,iqr,
- t(coefficients(summary(model))["age",]),
- icc.result.mixed,
- ranefs_vars,
- ranefs_stdv,
- nobs(model),
- form,isSingular(model),sw,
- stringsAsFactors=FALSE)
- df.icc.normativity[nrow(df.icc.normativity) + 1,] <- icc.row
- #print('-------------------')
- # insight get variance
- }
- }
- }
- write.csv(df.icc.normativity,"OUTPUT/df.icc.normativity.csv",row.names=F)
- ```
- ## Reliability analyses combining all corpora
- ```{r icc-allexp, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when combining data from all corpora."}
- df.icc.mixed<-read.csv("OUTPUT/df.icc.mixed.csv")
- icc_exp <- ggplot(df.icc.mixed, aes(x = icc_corpus, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
- geom_jitter( aes(x = icc_corpus,y=0,colour=data_set))+ scale_fill_colorblind() +scale_colour_manual(values=cbPalette) + xlim(0,1)
- icc_chi <- ggplot(df.icc.mixed, aes(x = icc_child_id_corpus, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC child ID")+
- geom_jitter(aes(x = icc_child_id_corpus,y=0,colour=data_set))+ scale_fill_colorblind()+scale_colour_manual(values=cbPalette) + xlim(0,1)
- ggarrange(icc_exp, icc_chi,
- labels = c("A", "B"),
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5)
- ```
- ```{r worst, results="as.is"}
- x <- head(df.icc.mixed[order(df.icc.mixed$icc_child_id_corpus),c("data_set","metric","icc_child_id_corpus","icc_corpus")]) #worst
- kable(x,row.names = F,digits=2,caption="Measures with the lowest ICC attributed to children.")
- ```
- ```{r best, results="as.is"}
- x <- tail(df.icc.mixed[order(df.icc.mixed$icc_child_id_corpus),c("data_set","metric","icc_child_id_corpus","icc_corpus")]) #best
- kable(x,row.names = F,digits=2,caption="Measures with the highest ICC attributed to children.")
- ```
- ```{r sel, results="as.is"}
- x <- cbind(df.icc.mixed[df.icc.mixed$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.mixed$data_set=="lena",c("metric","icc_child_id_corpus")] ,
- df.icc.mixed[df.icc.mixed$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.mixed$data_set=="aclew",c("icc_child_id_corpus")]
- )
- kable(x,row.names = F,digits=2,caption="Most commonly used metrics.")
- ```
- ```{r icc-examples, echo=F,fig.width=4, fig.height=3,fig.cap="(A) scatterplot for one variable with relatively low ICCs versus (B) one with relatively higher ICCs (see Tables 1-2 for details)"}
- # figure of bad ICC: lena avg_voc_dur_chi; good ICC: lena voc_och_ph
- data_set="lena"
- mydat <- read.csv(paste0('DATA/', data_set,'_metrics.csv'))
- mydat <- mydat[is.element(mydat$experiment, corpora),]
- #remove outliers
- for(metric in c("avg_voc_dur_chi","voc_och_ph")) mydat[, metric] <- ifelse(scale(mydat[, metric])>2.5 | scale(mydat[, metric]) < -2.5,NA,mydat[, metric])
- # remove those data points altogether
- mydat <- mydat[!is.na(mydat$avg_voc_dur_chi) & !is.na(mydat$voc_och_ph),]
- #make sure cols work as we want them to
- mydat$child_id <- paste(mydat$experiment,mydat$child_id)
- mydat$unique <- paste(mydat$experiment,mydat$child_id,mydat$session_id)
- #sample down to get 2 recs per child
- mysample = NULL
- for(thischild in levels(as.factor(mydat$child_id))){
- onechidat <- mydat[mydat$child_id == thischild,c("child_id","experiment","age","unique","avg_voc_dur_chi","voc_och_ph")]
-
- if(dim(onechidat)[1]>=2){
- selrows=sample(1:dim(onechidat)[1],2)
- mysample = rbind(mysample,
- cbind(onechidat[selrows[1],],
- onechidat[selrows[2],c("unique","avg_voc_dur_chi","voc_och_ph")]
- )
- )
- }
- }
- colnames(mysample)<-c("child_id","corpus","age","unique1","avg_voc_dur_chi1","voc_och_ph1","unique2","avg_voc_dur_chi2","voc_och_ph2")
- mysample$corpus=factor(mysample$corpus)
- bad <- ggplot(mysample, aes(avg_voc_dur_chi1,avg_voc_dur_chi2)) + geom_point(aes(colour = factor(corpus))) +
- geom_smooth(method='lm', formula= y~x)
- good <- ggplot(mysample, aes(voc_och_ph1,voc_och_ph2)) + geom_point(aes(colour = factor(corpus))) +
- geom_smooth(method='lm', formula= y~x)
- ggarrange(bad, good,
- labels = c("A", "B"),
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5)
- ```
- ## ICC normativity
- ```{r icc-normativity, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of ICC attributed to ACLEW (a) and LENA (b) in the Cougar corpus split by normativity."}
- df.icc.normativity<-read.csv("OUTPUT/df.icc.normativity.csv")
- icc_normative <- ggplot(df.icc.normativity[df.icc.normativity$data_set == 'aclew',], aes(x = icc_adjusted, fill = normative)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC ACLEW")+
- geom_jitter( aes(x = icc_adjusted,y=0,colour=normative))+ scale_fill_colorblind() +scale_colour_manual(values=cbPalette) + xlim(0,1)
- icc_non_normative <- ggplot(df.icc.normativity[df.icc.normativity$data_set == 'lena',], aes(x = icc_adjusted, fill = normative)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC LENA")+
- geom_jitter(aes(x = icc_adjusted,y=0,colour=normative))+ scale_fill_colorblind()+scale_colour_manual(values=cbPalette) + xlim(0,1)
- ggarrange(icc_normative, icc_non_normative,
- labels = c("A", "B"),
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5)
- ```
- ## Distance between recs
- ```{r count nb of recordings per child, warning=F, echo=FALSE}
- data_set="lena"
- mydat <- read.csv(paste0('DATA/', data_set,'_metrics.csv'))
- mydat <- mydat[is.element(mydat$experiment, corpora),]
- mydat$usession_id <- paste(mydat$child_id,mydat$session_id)
- n_rec<-mydat %>%
- group_by(experiment, child_id)%>%
- summarise(n_rec=n())
- #hist(n_rec$n_rec)
- #sort(table(mydat$child_id))
- #summary(n_rec$n_rec)
- # n_rec %>%
- # group_by(experiment) %>%
- # summarise(temp = list(c(summary(object = n_rec)))) %>%
- # unnest_wider(temp)
- ggplot(n_rec, aes(n_rec))+
- geom_histogram(binwidth=1)+
- facet_wrap(~experiment, scales = "free_y")+
- ggtitle("distribution of # recordings per child, per corpus")
- ```
- ```{r max-dist}
- distance<-mydat %>%
- group_by(experiment, child_id)%>%
- summarise(distance = max(age)-min(age))
-
- # distance %>%
- # group_by(experiment) %>%
- # summarise(temp = list(c(summary(object = distance)))) %>%
- # unnest_wider(temp)
- ggplot(distance, aes(distance))+
- geom_histogram(binwidth=2)+
- facet_wrap(~experiment, scales = "free_y")+
- ggtitle("distance(mo) b/w first & last recording per corpus")
- ```
- ```{r count distance nearest recording, warning = F, echo=FALSE}
- dist_contig <- mydat %>%
- arrange(experiment, child_id, age) %>% #this sorts the table by corpus then id then age
- group_by(child_id) %>%
- mutate(n_rec = n(),
- age_dist_next_rec = lead(age) - age,
- #this gets the diff corresponding cell in the preceding row and a given row
- dist_max_min = max(age)-min(age)) %>% #this gets the max difference
- dplyr::select(experiment, child_id, age, usession_id,
- age_dist_next_rec, dist_max_min, n_rec) %>%
- filter(!is.na(age_dist_next_rec) & n_rec>0)
- # not all rec's have a next rec for same id, and not all have >1 rec per kid.
- # dist_contig %>%
- # group_by(experiment) %>%
- # summarise(temp = list(c(summary(object = age_dist_next_rec)))) %>%
- # unnest_wider(temp)
- ggplot(dist_contig, aes(age_dist_next_rec))+
- geom_histogram(binwidth=2)+
- facet_wrap(~experiment, scales = "free_y")+
- ggtitle("distance (mo) b/w contiguous recordings per corpus")
- dist_contig_pairs <- dist_contig %>%
- arrange(experiment, child_id, age,age_dist_next_rec) %>% #this sorts the table by corpus then id then age then dist
- group_by(child_id) %>%
- slice(c(1,2)) # take only one pair per child, minimizing distance -- this will tend to select the younger kids... need to think about a fix for that
-
- mydist <- dist_contig_pairs %>% slice(1)
- mydist <- subset(mydist, age_dist_next_rec<=2)
- ggplot(mydist, aes(age_dist_next_rec))+
- geom_histogram(binwidth=1)+
- facet_wrap(~experiment, scales = "free_y")+
- ggtitle("distribution of distance in the minimum distance paired recs, per corpus")
- ```
- ```{r icc-allcorporatogether_mindat, eval = FALSE}
- # Declare df to store ICCs
- df.icc.mixed.cols_mindist = c("data_set","metric", "iqr",
- "age_b","age_se","age_t",
- "icc_adjusted", "icc_conditional", "icc_child_id_corpus", "icc_corpus",
- "child_id_var","corpus_var","residual_var",
- "child_id_sd","corpus_sd","residual_sd",
- "nobs",
- "formula","sw")
- df.icc.mixed_mindist = data.frame(matrix(ncol=length(df.icc.mixed.cols_mindist),nrow=0, dimnames=list(NULL, df.icc.mixed.cols_mindist)))
- for (data_set in data_sets){ #data_set="aclew"
-
- # Load data and remove unwanted corpora
- mydat <- read.csv(paste0('DATA/', data_set,'_metrics.csv'))
- mydat$usession_id <- paste(mydat$experiment,mydat$child_id,mydat$session_id)
- mydat <- mydat[is.element(mydat$experiment, corpora) & mydat$usession_id %in% levels(factor(dist_contig_pairs$usession_id)),] #this is the key condition: we only choose session id's from close recs
- all_cols = names(mydat)
- metrics = all_cols[!is.element(all_cols, no.scale.columns)]
-
- #remove outliers
- for(metric in metrics) mydat[, metric] <- ifelse(scale(mydat[, metric])>2.5 | scale(mydat[, metric]) < -2.5,NA,mydat[, metric])
- colnames(mydat)<-gsub(".V1","", colnames(mydat),fixed=T)
-
- # Scale mydat
- mydat[, metrics] <- scale(mydat[, metrics])
- metrics = metrics[!(metrics=="age")] #remove age, since a regression with age twice doesn't make sense
-
- #print(summary(mydat)) #to look at distribution of mydat
- # Fit LMER
- for (metric in metrics){#metric="lp_n"
- formula <- as.formula(paste0(metric, "~ age + (1|experiment/child_id)"))
- model <- lmer(formula, data=mydat)
- form="full"
-
- # get IQR of metric
- iqr = quantile(mydat[,metric],.75,na.rm=T)-quantile(mydat[,metric],.25,na.rm=T)
- # Ran Ef
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
-
- if(isSingular(model)){
- #check if this is due to exp causing overfit
- if(round(ranefs_vars[2],3)==round(ranefs_stdv[2],3) & round(ranefs_vars[2],3)==0){
- formula <- as.formula(paste0(metric, "~ age + (1|child_id)"))
- model <- lmer(formula, data=mydat)
- form = "no_exp"
- }else{#otherwise, the problem is probably the child
- if(round(ranefs_vars[1],3)==round(ranefs_stdv[1],3) & round(ranefs_vars[1],3)==0){
- formula <- as.formula(paste0(metric, "~ age + (1|experiment)"))
- model <- lmer(formula, data=mydat)
- form = "no_chi"
- }
- }
- # re-extract Ran Ef when model changed
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
- }
-
-
- # Mixed ICC
- icc.result.mixed <- as.data.frame(icc(model))[-3]
-
- # By group ICC
- icc.result.split <- t(as.data.frame(icc(model, by_group=TRUE))$ICC)
- if(form =="no_exp"){
- icc.result.split = cbind(icc.result.split,NA)
- ranefs_vars = cbind(ranefs_vars[1],NA,ranefs_vars[2])
- ranefs_stdv = cbind(ranefs_stdv[1],NA,ranefs_stdv[2])
- }
- if(form =="no_chi"){
- icc.result.split = cbind(NA,icc.result.split)
- ranefs_vars = cbind(NA,ranefs_vars)
- ranefs_stdv = cbind(NA,ranefs_stdv)
- }
-
- sw=shapiro.test(resid(model))$p
-
-
-
- icc.row.mixed = cbind(data_set,
- metric,iqr,
- t(coefficients(summary(model))["age",]),
- icc.result.mixed,
- icc.result.split,
- ranefs_vars,
- ranefs_stdv,
- nobs(model),
- form,sw)
-
-
- df.icc.mixed_mindist[nrow(df.icc.mixed_mindist) + 1,] <- icc.row.mixed
-
- #print('-------------------')
- # insight get variance
- }
- }
- #df.icc.mixed
- write.csv(df.icc.mixed_mindist,"OUTPUT/df.icc.mixed_mindist.csv",row.names=F)
- ```
- ```{r icc-allexp-mindist, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when combining data from all corpora, but subsetting to two recs per child that have the minimum distance between them. See Tables 4-5 for best and worst metrics in these contiguous recordings."}
- df.icc.mixed<-read.csv("OUTPUT/df.icc.mixed_mindist.csv")
- icc_exp <- ggplot(df.icc.mixed, aes(x = icc_corpus, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
- geom_jitter(aes(x = icc_corpus,y=0,colour=data_set))+ scale_fill_colorblind() +scale_colour_manual(values=cbPalette)
- icc_chi <- ggplot(df.icc.mixed, aes(x = icc_child_id_corpus, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC child ID")+
- geom_jitter( aes(x = icc_child_id_corpus,y=0,colour=data_set))+ scale_fill_colorblind()+scale_colour_manual(values=cbPalette)
- ggarrange(icc_exp, icc_chi,
- labels = c("A", "B"),
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5)
- ```
- ```{r worst2, results="as.is"}
- x <- head(df.icc.mixed[order(df.icc.mixed$icc_child_id_corpus),c("data_set","metric","icc_child_id_corpus","icc_corpus")]) #worst
- kable(x,row.names = F,digits=2,caption="Measures with the lowest ICC attributed to children when subsetting to two recs per child that have the minimum distance between them.")
- ```
- ```{r best2, results="as.is"}
- x <- tail(df.icc.mixed[order(df.icc.mixed$icc_child_id_corpus),c("data_set","metric","icc_child_id_corpus","icc_corpus")]) #best
- kable(x,row.names = F,digits=2,caption="Measures with the highest ICC attributed to children when subsetting to two recs per child that have the minimum distance between them.")
- ```
- ```{r sel2, results="as.is"}
- x <- cbind(df.icc.mixed[df.icc.mixed$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.mixed$data_set=="lena",c("metric","icc_child_id_corpus")] ,
- df.icc.mixed[df.icc.mixed$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.mixed$data_set=="aclew",c("icc_child_id_corpus")]
- )
- kable(x,row.names = F,digits=2,caption="Most commonly used metrics when subsetting to two recs per child that have the minimum distance between them.")
- ```
- ## Reliability analyses per corpus
- ```{r icc-bycor, echo=F,fig.width=4, fig.height=10,fig.cap="Distribution of ICC attributed to children in each separate corpus."}
- df.icc.corpus<-read.csv("OUTPUT/df.icc.corpus.csv")
- ggplot(df.icc.corpus, aes(x = icc_adjusted, fill = data_set)) +
- geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
- labs( x = "ICC child ID")+
- geom_jitter(aes(x = icc_adjusted,y=0,colour=data_set)) +
- facet_grid(rows=vars(corpus)) +
- scale_fill_colorblind()+scale_colour_manual(values=cbPalette)
- ```
- ## Reliability by child age
- ```{r relBYage, echo=F,fig.width=6, fig.height=10,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when binning children's age."}
- df.icc.age<-read.csv("OUTPUT/df.icc.age.csv")
- df.icc.age$age_bin<-factor(df.icc.age$age_bin,levels=c("(0,6]" , "(6,12]", "(12,18]" ,"(18,24]" ,"(24,30]", "(30,36]", "(36,42]", "(42,48]", "(48,54]" ))
- icc_exp <- ggplot(df.icc.age, aes(x = icc_corpus, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
- geom_jitter(aes(x = icc_corpus,y=0,colour=data_set))+
- facet_grid(rows=vars(age_bin)) +
- scale_fill_colorblind() +scale_colour_manual(values=cbPalette)
- icc_chi <- ggplot(df.icc.age, aes(x = icc_child_id_corpus, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC child ID")+
- geom_jitter(aes(x = icc_child_id_corpus,y=0,colour=data_set))+
- facet_grid(rows=vars(age_bin)) +
- scale_fill_colorblind()+scale_colour_manual(values=cbPalette)
- ggarrange(icc_exp, icc_chi,
- labels = c("A", "B"),
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5)
- ```
- ```{r sel3, results="as.is"}
- # x <- cbind(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="lena",c("metric","age_bin","icc_child_id_corpus")] ,
- # df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="aclew",c("metric","age_bin","icc_child_id_corpus")]
- # )
- kable(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="lena",c("metric","age_bin","icc_child_id_corpus")],row.names = F,digits=2,caption="Most commonly used metrics by age bin (LENA).")
- kable(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="aclew",c("metric","age_bin","icc_child_id_corpus")],row.names = F,digits=2,caption="Most commonly used metrics by age bin (aclew).")
- write.csv(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph"),c("data_set","metric","age_bin","icc_child_id_corpus")],"tabXage.csv")
- ```
- ## Exploratory factor analysis
- ```{r}
- data_set="aclew" #focusing on ACLEW bc better results above
- mydat <- read.csv(paste0('DATA/', data_set,'_metrics.csv'))
- mydat <- mydat[is.element(mydat$experiment, corpora),]
-
- all_cols = names(mydat)
- metrics = all_cols[!is.element(all_cols, no.scale.columns)]
-
- #remove outliers
- for(metric in metrics) mydat[, metric] <- ifelse(scale(mydat[, metric])>2.5 | scale(mydat[, metric]) < -2.5,NA,mydat[, metric])
- colnames(mydat)<-gsub(".V1","", colnames(mydat),fixed=T)
-
- # Scale mydat
- mydat[, metrics] <- scale(mydat[, metrics])
- metrics = metrics[!(metrics=="age")] #remove age, since a regression with age twice doesn't make sense
-
- f1 <- psych::fa(mydat[, metrics])
- f1
- ```
- ## Validity against age
- ```{r ageFX, echo=F,fig.width=7, fig.height=10,fig.cap="Distribution of t for age when all corpora are analyzed together (a) or for each corpus separately (b)."}
- allcor <- ggplot(df.icc.mixed, aes(x = age_t, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "Age") +
- geom_jitter(aes(x = age_t,y=0,colour=data_set)) +
- scale_fill_colorblind() +scale_colour_manual(values=cbPalette)
- bycor <- ggplot(df.icc.corpus, aes(x = age_t, fill = data_set)) +
- geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
- labs( x = "Age")+
- geom_jitter(aes(x = age_t,y=0,colour=data_set)) +
- facet_grid(rows=vars(corpus)) +
- scale_fill_colorblind()+scale_colour_manual(values=cbPalette)
- ggarrange(allcor, bycor,
- labels = c("A", "B"),
- ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5)
- ```
|