all-analyses.Rmd 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697
  1. ---
  2. title: Establishing the reliability and validity of measures extracted from long-form
  3. recordings
  4. output:
  5. pdf_document:
  6. toc: yes
  7. toc_depth: 3
  8. html_document:
  9. toc: yes
  10. toc_depth: '3'
  11. df_print: paged
  12. ---
  13. ```{r setup, include=FALSE, eval=TRUE}
  14. knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
  15. set.seed(185729481)
  16. library("lme4")
  17. library("performance") # ICC
  18. library("ggplot2")
  19. library("ggthemes")
  20. library("ggpubr")
  21. library("kableExtra")
  22. library("psych")
  23. library("dplyr")
  24. library("tidyr")
  25. library("stringr")
  26. cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
  27. # List of corpora we are interested in
  28. corpora <- c('bergelson', 'lucid', 'winnipeg', 'warlaumont','cougar','fausey-trio')
  29. # Columns that should not be scaled or taken into account as metrics
  30. no.scale.columns <- c('experiment', 'session_id', 'child_id','child_id_unique','age_s', 'nobs', 'nchi',
  31. 'date_iso', 'child_dob', 'missing_audio',"age_bin","duration","usession_id",
  32. "normative","age","duration_alice", "duration_vcm" , "duration_vtc","duration_its" )
  33. data_sets = c('aclew', 'lena')
  34. check_small_var<-function(x,y,i) round(x[i],3)==round(y[i],3) & round(y[i],3) == 0
  35. fit_child_model<-function(dataframe, metric){
  36. # Fit formula where experiment is removed
  37. formula <- as.formula(paste0(metric, "~ age_s + (1|child_id)"))
  38. model <- lmer(formula, data=dataframe)
  39. return (model)
  40. }
  41. extract_chi_variables<-function(model){
  42. icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
  43. icc.result.split <- c(as.data.frame(icc(model, by_group=TRUE))$ICC, NA)
  44. r2.nakagawa <- r2_nakagawa(model)
  45. ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
  46. ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
  47. # chi, NA, residual
  48. ranefs_vars <-c(ranefs_vars[1],NA,ranefs_vars[2])
  49. ranefs_stdv <-c(ranefs_stdv[1],NA,ranefs_stdv[2])
  50. # chi, NA
  51. ns<-c(unlist(summary(model)$n),NA)
  52. return (c(coefficients(summary(model))["age_s",],
  53. icc.result.mixed,
  54. icc.result.split,
  55. ranefs_vars,
  56. ranefs_stdv,
  57. r2.nakagawa,
  58. nobs(model),ns))
  59. }
  60. extract_full_variables<-function(model){
  61. icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
  62. icc.result.split <- t(as.data.frame(icc(model, by_group=TRUE))$ICC)
  63. r2.nakagawa <- r2_nakagawa(model)
  64. ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
  65. ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
  66. ns<-t(data.frame(summary(model)$n))
  67. return (c( coefficients(summary(model))["age_s",],
  68. icc.result.mixed,
  69. icc.result.split,
  70. ranefs_vars,
  71. ranefs_stdv,
  72. r2.nakagawa,
  73. nobs(model),ns))
  74. }
  75. new_fit_models<-function(dataframe, data_set, metric, age, fit_full = TRUE){
  76. #dataframe=mydat ; age = NA ; full = TRUE
  77. iqr = quantile(dataframe[,metric],.75,na.rm=T)-quantile(dataframe[,metric],.25,na.rm=T)
  78. if(fit_full){
  79. # Fit full model
  80. formula <- as.formula(paste0(metric, "~ age_s + (1|experiment/child_id)"))
  81. model <- lmer(formula, data=dataframe)
  82. }
  83. if(fit_full && !isSingular(model)) # Fitted full model short circuit and
  84. {
  85. form="full"
  86. sw=shapiro.test(resid(model))$p
  87. mod_variables <- extract_full_variables(model)
  88. # Build line
  89. } else {
  90. model <- fit_child_model(dataframe, metric)
  91. if(!isSingular(model)){
  92. form = "no_exp"
  93. sw=shapiro.test(resid(model))$p
  94. mod_variables <- extract_chi_variables(model)
  95. } else {
  96. form='no_chi_effect'
  97. sw = NA
  98. mod_variables = c(
  99. NA,NA,NA, # c(coefficients(summary(model))["age_s",]
  100. NA,NA, # icc.result.mixed
  101. NA,NA, # icc.result.split
  102. NA,NA,NA, # ranefs_vars
  103. NA,NA,NA, # raners_std
  104. NA,NA, # r2
  105. NA,NA,NA) # nobs (child, corpus), nobs
  106. }
  107. }
  108. icc.row = c(data_set, age, metric, iqr, mod_variables, form, sw)
  109. return (icc.row)
  110. }
  111. ```
  112. ## Recalculate everything or not?
  113. If RECALC is set to TRUE, then the ICC tables will be re-generated.
  114. ```{r}
  115. RECALC=FALSE
  116. ```
  117. ## Generate ICC tables
  118. ```{r create-all-icc, eval=RECALC}
  119. df.icc.mixed.cols = c("data_set","age_bin", "metric", "iqr",
  120. "age_b","age_se","age_t", # beta, standard error, T
  121. "icc_adjusted", "icc_conditional",
  122. "icc_child_id", "icc_corpus",
  123. "child_id_var","corpus_var","residual_var",
  124. "child_id_sd","corpus_sd","residual_sd","r2_cond", "r2_marg",
  125. "nobs","nchi", "ncor",
  126. "formula","sw")
  127. df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)),
  128. stringsAsFactors = FALSE)
  129. for (data_set in data_sets){ # data_set = "aclew"
  130. # Load data
  131. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
  132. metrics <- colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
  133. for(metric in metrics)
  134. { # metric = "voc_chi_ph"
  135. icc.row <- new_fit_models(mydat, data_set, metric, NA, TRUE)
  136. df.icc.mixed[nrow(df.icc.mixed) + 1,] <- icc.row
  137. }
  138. }
  139. write.csv(df.icc.mixed,"../output/df.icc.mixed.csv",row.names=F)
  140. # repeat for the version within each corpus
  141. df.icc.corpus.cols = c("data_set","corpus","metric", "iqr",
  142. "age_b","age_se","age_t",
  143. "icc_adjusted", "icc_conditional",
  144. "icc_child_id", "icc_corpus",
  145. "child_id_var","corpus_var","residual_var",
  146. "child_id_sd","corpus_sd","residual_sd","r2_cond", "r2_marg",
  147. "nobs","nchi", "ncor",
  148. "formula","sw")
  149. df.icc.corpus = data.frame(matrix(ncol=length(df.icc.corpus.cols),nrow=0, dimnames=list(NULL, df.icc.corpus.cols)),
  150. stringsAsFactors=FALSE)
  151. for (data_set in data_sets){ # data_set = "aclew"
  152. # Load data
  153. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
  154. for(corpus in corpora){
  155. mycordat <- mydat[mydat$experiment == corpus, ]
  156. metrics <- colnames(mycordat)[!is.element(colnames(mycordat), no.scale.columns)]
  157. for(metric in metrics)
  158. { # metric = "avg_voc_dur_mal"
  159. icc.row <- new_fit_models(mycordat, data_set, metric, corpus, FALSE)
  160. df.icc.corpus[nrow(df.icc.corpus) + 1,] <- icc.row
  161. }
  162. }
  163. }
  164. write.csv(df.icc.corpus,"../output/df.icc.corpus.csv",row.names=F)
  165. ```
  166. ```{r}
  167. mydat <- read.csv(paste0('../data_output/', 'aclew','_metrics.csv'))
  168. child_per_corpus = setNames(aggregate(data = mydat, child_id ~ experiment, function(child_id) length(unique(child_id))), c('experiment', 'No_Children'))
  169. rec_per_corpus = setNames(aggregate(data = mydat, session_id ~ experiment, function(session_id) length(unique(session_id))), c('experiment', 'No_Rec'))
  170. dur_per_corpus = setNames(aggregate(data = mydat, duration_vtc ~ experiment, function(duration_vtc) sum(duration_vtc)/3.6e+6), c('experiment', 'Duration_h'))
  171. age_mean_per_corpus = setNames(aggregate(data = mydat, age ~ experiment, function(age) mean(age)), c('experiment', 'Mean_Age'))
  172. age_min_per_corpus = setNames(aggregate(data = mydat, age ~ experiment, function(age) min(age)), c('experiment', 'Min_Age'))
  173. age_max_per_corpus = setNames(aggregate(data = mydat, age ~ experiment, function(age) max(age)), c('experiment', 'Max_Age'))
  174. corp_code = data.frame(
  175. experiment=c("bergelson", "cougar", "fausey-trio", "lucid", "warlaumont", "winnipeg"),
  176. code=c("BER", "COU", "TRI", "L05", "WAR", "MCD"),
  177. location=c("Northeast US", "Northwest US", "Western US", "Northwest England", "Western US", "Western Canada")
  178. )
  179. corp_desc_list = list(corp_code, child_per_corpus, rec_per_corpus, dur_per_corpus, age_mean_per_corpus, age_min_per_corpus, age_max_per_corpus)
  180. corpus_description <- Reduce(function(x, y) merge(x, y, all=TRUE), corp_desc_list)
  181. corpus_description <- transform(corpus_description, Age_Range=paste(Min_Age, Max_Age, sep="-"))
  182. corpus_description <- subset(corpus_description, select = -c(Min_Age, Max_Age))
  183. write.csv(corpus_description, "../output/corpus_description.csv", sep='\t')
  184. nkids=length(levels(factor(paste(mydat$experiment,mydat$child_id))))
  185. ```
  186. ```{r icc-age, eval=RECALC}
  187. #We do this one separately because we want to standardize metrics within each age group
  188. # repeat within age group bins
  189. df.icc.age.cols = c("data_set","age_bin","metric", "iqr",
  190. "age_b","age_se","age_t", # beta, standard error, T
  191. "icc_adjusted", "icc_conditional",
  192. "icc_child_id", "icc_corpus",
  193. "child_id_var","corpus_var","residual_var",
  194. "child_id_sd","corpus_sd","residual_sd",
  195. "nobs", "nchi","ncor",
  196. "formula","sw")
  197. df.icc.age = data.frame(matrix(ncol=length(df.icc.age.cols),nrow=0, dimnames=list(NULL, df.icc.age.cols)),
  198. stringsAsFactors=FALSE)
  199. for (data_set in data_sets){ # data_set = "aclew"
  200. # Load data and calculate age cuts
  201. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv')) # /!\ Do not use scaled version -> we'll scale by age later
  202. mydat$age_bin <- cut(mydat$age,c(0:6*6))
  203. metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
  204. for(thisage in levels(mydat$age_bin))
  205. { #thisage= "(0,6]"
  206. thiscordata <- mydat[mydat$age_bin == thisage,]
  207. for(metric in metrics)
  208. { # metric = "avg_voc_dur_mal"
  209. pre_scaled_metric <- (thiscordata[, metric] - mean(thiscordata[, metric], na.rm=T))/sd(thiscordata[, metric], na.rm=T)
  210. thiscordata[abs(pre_scaled_metric)>2.5 & !is.na(pre_scaled_metric), metric] <- NA
  211. thiscordata[, metric] <- (thiscordata[, metric] - mean(thiscordata[, metric], na.rm=T))/sd(thiscordata[, metric], na.rm=T)
  212. if(dim(thiscordata)[1] > 30 & length(levels(factor(thiscordata$experiment))) > 1)
  213. {
  214. icc.row <- new_fit_models(thiscordata, data_set, metric, thisage, TRUE)
  215. }else{
  216. icc.row <- c(data_set,thisage,metric,iqr,
  217. NA,NA,NA,
  218. NA,NA,
  219. NA,NA,
  220. NA,NA,NA,
  221. NA,NA,NA,
  222. NA,NA,NA,
  223. "not_enough_data",NA)
  224. }
  225. df.icc.age[nrow(df.icc.age) + 1,] <- icc.row
  226. }
  227. }
  228. }
  229. write.csv(df.icc.age,"../output/df.icc.age.csv",row.names=F)
  230. ```
  231. ## Describe datasets
  232. ```{r}
  233. df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
  234. ```
  235. We are looking here at `r length(corpora)` corpora, `r nkids` children, `r max(df.icc.mixed$nobs[df.icc.mixed$data_set=="lena"])` recordings, `r length(levels(factor(df.icc.mixed$metric)))` many metrics.
  236. <!-- The number of children comes from nkids, in the first chunk that is not evaluated. -->
  237. ## Reliability analyses combining all corpora
  238. ```{r icc-allexp, echo=F,fig.width=4, fig.height=3,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when combining data from all corpora."}
  239. df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
  240. icc_exp <- ggplot(df.icc.mixed, aes(x = icc_corpus, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
  241. geom_jitter( aes(x = icc_corpus,y=0,colour=toupper(data_set)))+ scale_fill_colorblind() +scale_colour_manual(values=cbPalette) + theme(text = element_text(size = 20)) + ylim(-0.5,11.25) + xlim(0,1) + labs(fill='Pipeline', color="Pipeline")
  242. icc_chi <- ggplot(df.icc.mixed, aes(x = icc_child_id, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC child ID")+
  243. geom_jitter(aes(x = icc_child_id,y=0,colour=toupper(data_set)))+ scale_fill_colorblind()+scale_colour_manual(values=cbPalette) + theme(text = element_text(size = 20)) + ylim(-0.5,11.25) + xlim(0,1) + labs(fill='Pipeline', color="Pipeline")
  244. ggarrange(icc_exp, icc_chi,
  245. labels = c("A", "B"),
  246. ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=-0.1,
  247. font.label = list(size = 20)) + theme(text = element_text(size = 20))
  248. ```
  249. ```{r worst, results="as.is"}
  250. x <- head(df.icc.mixed[order(df.icc.mixed$icc_child_id),c("data_set","metric","icc_child_id","icc_corpus")]) #worst
  251. kable(x,row.names = F,digits=2,caption="Measures with the lowest ICC attributed to children.")
  252. ```
  253. ```{r best, results="as.is"}
  254. x <- tail(df.icc.mixed[order(df.icc.mixed$icc_child_id),c("data_set","metric","icc_child_id","icc_corpus")]) #best
  255. kable(x,row.names = F,digits=2,caption="Measures with the highest ICC attributed to children.")
  256. ```
  257. ```{r sel, results="as.is"}
  258. key_metrics = c("lena_CVC", "lena_CTC", "voc_dur_chi_ph", "voc_chi_ph", "voc_fem_ph", "voc_mal_ph", "wc_adu_ph")
  259. x <- merge(df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="lena",c("metric","icc_child_id")] ,
  260. df.icc.mixed[df.icc.mixed$metric %in% key_metrics & df.icc.mixed$data_set=="aclew",c("metric","icc_child_id")],
  261. by='metric', all=TRUE)
  262. colnames(x) <- c("metric", "LENA ICC", "ACLEW ICC")
  263. kable(x,row.names = F,digits=2,caption="Most commonly used metrics.")
  264. ```
  265. ```{r icc-examples-fig2, echo=F,fig.width=4, fig.height=3,fig.cap="(A) scatterplot for one variable with relatively low ICCs versus (B) one with relatively higher ICCs (see Tables 1-2 for details)"}
  266. # figure of bad ICC: lena avg_voc_dur_chi; good ICC: lena voc_och_ph
  267. data_set="lena"
  268. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv'))
  269. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
  270. mydat <- mydat[is.element(mydat$experiment, corpora),]
  271. # remove those data points altogether
  272. mydat <- mydat[!is.na(mydat$avg_voc_dur_chi) & !is.na(mydat$voc_och_ph),]
  273. #make sure cols work as we want them to
  274. mydat$child_id <- paste(mydat$experiment,mydat$child_id)
  275. mydat$unique <- paste(mydat$experiment,mydat$child_id,mydat$session_id)
  276. #sample down to get 2 recs per child
  277. mysample = NULL
  278. for(thischild in levels(as.factor(mydat$child_id))){
  279. onechidat <- mydat[mydat$child_id == thischild,c("child_id","experiment","age","unique","avg_voc_dur_chi","voc_och_ph")]
  280. if(dim(onechidat)[1]>=2){
  281. selrows=sample(1:dim(onechidat)[1],2)
  282. mysample = rbind(mysample,
  283. cbind(onechidat[selrows[1],],
  284. onechidat[selrows[2],c("unique","avg_voc_dur_chi","voc_och_ph")]
  285. )
  286. )
  287. }
  288. }
  289. colnames(mysample) <-
  290. c(
  291. "child_id",
  292. "corpus",
  293. "age",
  294. "unique1",
  295. "avg_voc_dur_chi1",
  296. "voc_och_ph1",
  297. "unique2",
  298. "avg_voc_dur_chi2",
  299. "voc_och_ph2"
  300. )
  301. mysample$corpus = factor(mysample$corpus)
  302. mylimits=range(mysample[,c("avg_voc_dur_chi1","avg_voc_dur_chi2")])
  303. bad <-
  304. ggplot(mysample, aes(avg_voc_dur_chi1, avg_voc_dur_chi2)) +
  305. geom_point(aes(colour = factor(corpus))) +
  306. geom_smooth(method = 'lm', formula = y ~ x) +
  307. labs(color = "Corpus") +
  308. theme(text = element_text(size = 20)) +
  309. theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
  310. panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  311. scale_x_continuous(name="Avg chi voc dur rec 1",limits=mylimits) +
  312. scale_y_continuous(name="Avg chi voc dur rec 2",limits=mylimits) +
  313. geom_abline(intercept = 0, slope = 1)
  314. mylimits=range(mysample[,c("voc_och_ph1","voc_och_ph2")])
  315. good <-
  316. ggplot(mysample, aes(voc_och_ph1, voc_och_ph2)) +
  317. geom_point(aes(colour = factor(corpus))) +
  318. geom_smooth(method = 'lm', formula = y ~ x) +
  319. labs(color = "Corpus") +
  320. theme(text = element_text(size = 20)) +
  321. theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
  322. panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  323. scale_x_continuous(name="N other ch voc rec 1",limits=mylimits) +
  324. scale_y_continuous(name="N other ch voc rec 2",limits=mylimits) +
  325. geom_abline(intercept = 0, slope = 1)
  326. ggarrange(bad, good,
  327. ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=0,
  328. font.label = list(size = 20)) + labs(color= "Corpus") + theme(text = element_text(size = 20))
  329. ```
  330. ```{r reg model}
  331. read.csv("../output/df.icc.mixed.csv")->df.icc.mixed
  332. df.icc.mixed$subject[grepl("chi", df.icc.mixed$metric, fixed = TRUE)] <- "chi"
  333. df.icc.mixed$subject[df.icc.mixed$metric %in% c("lp_dur","lp_n","lena_CVC","cp_dur","cp_n")] <- "chi"
  334. df.icc.mixed$subject[grepl("och", df.icc.mixed$metric, fixed = TRUE)] <- "och"
  335. df.icc.mixed$subject[grepl("mal", df.icc.mixed$metric, fixed = TRUE)] <- "mal"
  336. df.icc.mixed$subject[grepl("fem", df.icc.mixed$metric, fixed = TRUE)] <- "fem"
  337. df.icc.mixed$subject[grepl("adu", df.icc.mixed$metric, fixed = TRUE)] <- "adu"
  338. df.icc.mixed$subject= factor(df.icc.mixed$subject,levels=c("chi","och","fem","mal","adu"))
  339. lr_icc_chi <- lm(icc_child_id ~ subject + data_set, data=df.icc.mixed)
  340. #binomial could be used, but diagnostic plots look pretty good (other than some outliers)
  341. summary(lr_icc_chi)
  342. # Get common ACLEW/LENA metrics to see if statistically different
  343. common_metrics = intersect(df.icc.mixed[df.icc.mixed$data_set=='aclew', "metric"],
  344. df.icc.mixed[df.icc.mixed$data_set=='lena', "metric"])
  345. lr_icc_chi_common <- lm(icc_child_id ~ subject + data_set, data=df.icc.mixed,subset=c(metric %in% common_metrics))
  346. summary(lr_icc_chi_common)
  347. ```
  348. ## Paired analysis with minimum distance between recs
  349. ```{r correlation analysis for close recs, warning = F, echo=FALSE}
  350. #to compare with Gilkerson, we just need to focus on AWC, CVC, CTC (in LENA, but for comparison purposes, we also include aclew)
  351. nsamples=10
  352. data_set="lena"
  353. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
  354. mydat$uchild_id <- paste(mydat$experiment, mydat$child_id)
  355. mydat$usession_id <- paste(mydat$uchild_id, mydat$session_id)
  356. mydat=mydat[order(mydat$experiment,mydat$child_id,mydat$age),]
  357. dist_contig <- mydat %>%
  358. arrange(experiment, child_id, age) %>% #this sorts the table by corpus then id then age
  359. group_by(experiment, child_id) %>%
  360. mutate(n_rec = n(),
  361. age_dist_next_rec = lead(age) - age,
  362. #this gets the diff corresponding cell in the preceding row and a given row
  363. next_session = lead(usession_id)) %>%
  364. filter(age_dist_next_rec<2)
  365. summary(dist_contig$n_rec) #distribution of n of recs per child
  366. table(dist_contig$experiment) #this is the number of eligible recordings per corpus
  367. sum(table(dist_contig$experiment)) #and overall
  368. table(dist_contig$experiment[!duplicated(dist_contig$uchild_id)])#this is the number of eligible children per corpus
  369. sum(table(dist_contig$experiment[!duplicated(dist_contig$uchild_id)])) #and overall
  370. #given those two numbers, with 5 draws we'd cover many combinations in winni, lucid, & trio; but we'll do 10 because there are a lot of recs in cougar & bergelson..
  371. data_set="aclew"
  372. mydat_aclew <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
  373. mydat_aclew$uchild_id <- paste(mydat_aclew$experiment, mydat_aclew$child_id)
  374. mydat_aclew$usession_id <- paste(mydat_aclew$uchild_id, mydat_aclew$session_id)
  375. all_rs=matrix(NA,nrow=nsamples,ncol=5)
  376. colnames(all_rs)<-c("lena_CVC","lena_CTC","lena_wc_adu_ph","aclew_voc_chi_ph","aclew_wc_adu_ph")
  377. for(i in 1:nsamples){
  378. #for each child, sample 2 contiguous recordings that are less than 2 months away
  379. #step 1: sample one session per child among the list of sessions that are close by
  380. close_sessions <- dist_contig %>%
  381. group_by(uchild_id)%>%
  382. slice_sample(n = 1)
  383. #step 2: get data from those sampled sessions as rec1
  384. rec1 = subset(dist_contig,usession_id %in% close_sessions$usession_id)
  385. #step 3: get the next session
  386. rec2_sessions=levels(factor(rec1$next_session))
  387. rec2 = subset(mydat,usession_id %in% rec2_sessions)
  388. all_rs[i,"lena_CVC"]<-cor.test(rec1$lena_CVC,rec2$lena_CVC)$estimate
  389. all_rs[i,"lena_CTC"]<-cor.test(rec1$lena_CTC,rec2$lena_CTC)$estimate
  390. all_rs[i,"lena_wc_adu_ph"]<-cor.test(rec1$wc_adu_ph,rec2$wc_adu_ph)$estimate
  391. #step 2: get data from those sampled sessions as rec1
  392. rec1 = subset(mydat_aclew,usession_id %in% close_sessions$usession_id)
  393. #step 3: get the next session
  394. rec2 = subset(mydat_aclew,usession_id %in% rec2_sessions)
  395. all_rs[i,"aclew_voc_chi_ph"]<-cor.test(rec1$voc_chi_ph,rec2$voc_chi_ph)$estimate
  396. all_rs[i,"aclew_wc_adu_ph"]<-cor.test(rec1$wc_adu_ph,rec2$wc_adu_ph)$estimate
  397. }
  398. summary(all_rs)
  399. mean_rvalue <- as.data.frame(sapply(as.data.frame(all_rs), mean))
  400. mean_rvalue <- setNames(cbind(rownames(mean_rvalue), mean_rvalue, row.names = NULL),
  401. c("Metric", "Mean"))
  402. sd_rvalue <- as.data.frame(sapply(as.data.frame(all_rs), sd))
  403. sd_rvalue <- setNames(cbind(rownames(sd_rvalue), sd_rvalue, row.names = NULL),
  404. c("Metric", "SD"))
  405. mean_sd_rvalue <- merge(mean_rvalue, sd_rvalue, by='Metric')
  406. #
  407. kable(mean_sd_rvalue)
  408. print(mean_sd_rvalue)
  409. ```
  410. ## Reliability analyses per corpus
  411. ```{r icc-bycor, echo=F,fig.width=4, fig.height=10,fig.cap="Distribution of ICC attributed to children in each separate corpus."}
  412. df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
  413. ggplot(df.icc.corpus, aes(x = icc_adjusted, fill = toupper(data_set))) +
  414. geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
  415. labs( x = "ICC child ID")+
  416. geom_jitter(aes(x = icc_adjusted,y=0,colour=toupper(data_set))) +
  417. facet_grid(rows=vars(corpus)) +
  418. scale_fill_colorblind()+scale_colour_manual(values=cbPalette) +
  419. ylim(-0.5,6) + xlim(0, 1) + labs(fill='Pipeline', color="Pipeline") + theme(text = element_text(size = 20))
  420. ```
  421. ## Reliability by child age
  422. ```{r relBYage, echo=F,fig.width=6, fig.height=10,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when binning children's age."}
  423. df.icc.age<-read.csv("../output/df.icc.age.csv")
  424. df.icc.age$age_bin<-factor(df.icc.age$age_bin,levels=c("(0,6]" , "(6,12]", "(12,18]" ,"(18,24]" ,"(24,30]", "(30,36]", "(36,42]", "(42,48]", "(48,54]" ))
  425. icc_exp <- ggplot(df.icc.age, aes(x = icc_corpus, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC corpus")+
  426. geom_jitter(aes(x = icc_corpus,y=0,colour=toupper(data_set)))+
  427. facet_grid(rows=vars(age_bin)) +
  428. scale_fill_colorblind() +scale_colour_manual(values=cbPalette)+ ylim(-0.5,16) + xlim(0, 1) + labs(fill='Pipeline', color="Pipeline") + theme(text = element_text(size = 15))
  429. icc_chi <- ggplot(df.icc.age, aes(x = icc_child_id, fill = toupper(data_set))) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "ICC child ID")+
  430. geom_jitter(aes(x = icc_child_id,y=0,colour=toupper(data_set)))+
  431. facet_grid(rows=vars(age_bin)) +
  432. scale_fill_colorblind()+scale_colour_manual(values=cbPalette)+ ylim(-0.5,15) + xlim(0, 1) + labs(fill='Pipeline', color="Pipeline") + theme(text = element_text(size = 15))
  433. ggarrange(icc_exp, icc_chi,
  434. labels = c("A", "B"),
  435. ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.05, hjust=0.45,
  436. font.label = list(size = 15))
  437. ```
  438. ```{r sel3, results="as.is"}
  439. # x <- cbind(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="lena",c("metric","age_bin","icc_child_id")] ,
  440. # df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="aclew",c("metric","age_bin","icc_child_id")]
  441. # )
  442. kable(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="lena",c("metric","age_bin","icc_child_id")],row.names = F,digits=2,caption="Most commonly used metrics by age bin (LENA).")
  443. kable(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph") & df.icc.age$data_set=="aclew",c("metric","age_bin","icc_child_id")],row.names = F,digits=2,caption="Most commonly used metrics by age bin (aclew).")
  444. write.csv(df.icc.age[df.icc.age$metric %in% c("voc_chi_ph","wc_adu_ph"),c("data_set","metric","age_bin","icc_child_id")],"tabXage.csv")
  445. ```
  446. ```{r reg mod in age}
  447. common_metrics = intersect(df.icc.age[df.icc.age$data_set=='aclew', "metric"],
  448. df.icc.age[df.icc.age$data_set=='lena', "metric"])
  449. df.icc.age$subject[grepl("chi", df.icc.age$metric, fixed = TRUE)] <- "chi"
  450. df.icc.age$subject[df.icc.age$metric %in% c("lp_dur","lp_n","lena_CVC","cp_dur","cp_n")] <- "chi"
  451. df.icc.age$subject[grepl("och", df.icc.age$metric, fixed = TRUE)] <- "och"
  452. df.icc.age$subject[grepl("mal", df.icc.age$metric, fixed = TRUE)] <- "mal"
  453. df.icc.age$subject[grepl("fem", df.icc.age$metric, fixed = TRUE)] <- "fem"
  454. df.icc.age$subject[grepl("adu", df.icc.age$metric, fixed = TRUE)] <- "adu"
  455. df.icc.age$subject= factor(df.icc.age$subject,levels=c("chi","och","fem","mal","adu"))
  456. lr_icc_chi <- lm(icc_child_id ~ subject + data_set + age_bin, data=df.icc.age)
  457. #binomial could be used, but diagnostic plots look pretty good (other than some outliers)
  458. plot(lr_icc_chi)
  459. summary(lr_icc_chi)
  460. lr_icc_chi_common <- lm(icc_child_id ~ subject + data_set + age_bin, data=df.icc.age,subset=c(metric %in% common_metrics))
  461. plot(lr_icc_chi_common)
  462. summary(lr_icc_chi_common)
  463. ```
  464. ## Validity against age
  465. ```{r ageFX, echo=F,fig.width=7, fig.height=10,fig.cap="Distribution of t for age when all corpora are analyzed together (a) or for each corpus separately (b)."}
  466. allcor <- ggplot(df.icc.mixed, aes(x = age_t, fill = data_set)) + geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +labs( x = "Age") +
  467. geom_jitter(aes(x = age_t,y=0,colour=data_set)) +
  468. scale_fill_colorblind() +scale_colour_manual(values=cbPalette)
  469. bycor <- ggplot(df.icc.corpus, aes(x = age_t, fill = data_set)) +
  470. geom_density(alpha = 0.5) + theme(legend.position = "top", axis.title.y=element_blank() ) +
  471. labs( x = "Age")+
  472. geom_jitter(aes(x = age_t,y=0,colour=data_set)) +
  473. facet_grid(rows=vars(corpus)) +
  474. scale_fill_colorblind()+scale_colour_manual(values=cbPalette)
  475. ggarrange(allcor, bycor,
  476. labels = c("A", "B"),
  477. ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5)
  478. ```
  479. ```{r}
  480. df.icc.mixed<-read.csv("../output/df.icc.mixed.csv")
  481. df.icc.age<-read.csv("../output/df.icc.age.csv")
  482. df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
  483. ```
  484. ```{r}
  485. target_metrics = c("lena_CVC", "lena_CTC", "voc_dur_chi_ph", "voc_chi_ph", "voc_fem_ph", "voc_mal_ph", "wc_adu_ph")
  486. ggplot(
  487. df.icc.age[df.icc.age$metric %in% target_metrics, ],
  488. aes(x=factor(age_bin, level=str_sort(unique(df.icc.age$age_bin), numeric=TRUE)),
  489. y=icc_child_id, group=interaction(metric, data_set), color=metric, linetype=grepl('lena', metric, fixed = TRUE))) +
  490. scale_linetype_manual(values = c("TRUE" = "dashed", "FALSE" = "solid")) +
  491. guides(linetype = "none")+
  492. geom_line() +
  493. facet_grid(. ~ toupper(data_set)) +
  494. ylim(0, 1) +
  495. xlab('Age bins') +
  496. ylab('ICC Child') +
  497. labs(color="Metric") +
  498. theme(legend.position = "top")
  499. ggsave(paste0("plots/metrics/age_metrics.png"))
  500. ```
  501. ## Formula stats
  502. Mixed models formula summary:
  503. `r kable(table(df.icc.mixed$formula))`
  504. Corpus models formula summary:
  505. `r kable(table(df.icc.corpus$formula))`
  506. Age models formula summary:
  507. `r kable(table(df.icc.age$formula))`
  508. ## Save information about packages used
  509. ```{r}
  510. writeLines(capture.output(sessionInfo()), "sessionInfo.txt")
  511. ```