1 year ago · 3dcf0e953f
--- a/CODE/SM.Rmd
+++ b/CODE/SM.Rmd
@@ -489,7 +489,7 @@ kable(reg_anova_cor)
 
				 
			
 
				 
			
 
				 
			
 
				-## Code to reproduce "Exploratory analyses: Reliability within corpus"
			
 
				+## Code to reproduce text and figures in "Exploratory analyses: Reliability within corpus"
			
 
				 
			
 
				 ```{r read in icc by corpus}
			
 
				 df.icc.corpus<-read.csv("../output/df.icc.corpus.csv")
			
@@ -497,6 +497,8 @@ df.icc.corpus$Type <- get_type(df.icc.corpus)
 
				 
			
 
				 ```
			
 
				 
			
 
				+Figure 5A addresses this question, showing the distribution of ICC across our 53 metrics in each of the `r length(levels(factor(df.icc.corpus$corpus)))` included corpora.  Out of `r dim(df.icc.corpus)[1]` fitted models (53 metrics times `r length(levels(factor(df.icc.corpus$corpus)))` corpora), `r sum(df.icc.corpus$formula=="no_chi_effect")` were singular when including a random intercept per child, and therefore they could not be included in these analyses at all, and the remaining `r sum(df.icc.corpus$formula=="no_exp")` were singular when including a random intercept per corpus.
			
 
				+
			
 
				 
			
 
				 ```{r icc-bycor-fig5A, echo=F,fig.width=4, fig.height=10,fig.cap="Child ICC by metric type and pipeline, when considering each corpus separately."}
			
 
				 
			
@@ -535,7 +537,7 @@ ggplot(r_X_corpus, aes(y = cor, x = corpusA)) +
 
				 
			
 
				 
			
 
				 
			
 
				-```{r reg model cor}
			
 
				+```{r reg model corpusm}
			
 
				 
			
 
				 
			
 
				 cor_icc <- lm(icc_child_id ~ Type * data_set * corpus, data=df.icc.corpus) 
			
@@ -548,15 +550,17 @@ reg_anova_cor_icc=Anova(cor_icc)
 
				 
			
 
				 ```
			
 
				 
			
 
				-We checked whether Child ICC differed by talker types and pipelines across corpora by fitting a linear model with the formula $lm(Child_ICC ~ type * pipeline * corpus)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children;  pipeline LENA or ACLEW; and corpus the corpus ID. We found an adjusted R-squared of `r round(reg_sum_cor_icc$adj.r.squared*100)`%, suggesting this model explained over half of the variance in Child ICC. A Type 3 ANOVA on this model revealed several significant effects and interactions, including a three-way interaction of type, pipeline, and corpus.
			
 
				+The fact that we cannot infer reliability from one corpus based on another one was confirmed statistically: We checked whether Child ICC differed by talker types and pipelines across corpora by fitting a linear model with the formula `lm(Child_ICC ~ type * pipeline * corpus)`, where type indicates whether the measure pertained to the key child, (female/male) adults, or other children; pipeline is LENA or ACLEW; and corpus is the corpus ID. We found an adjusted R-squared of `r round(reg_sum_cor_icc$adj.r.squared*100)`%, suggesting this model explained over half of the variance in Child ICC. A Type II ANOVA (the default of `car::Anova`) on this model revealed several significant effects and interactions, including a three-way interaction of type, pipeline, and corpus; a two-way interaction of type and corpus; and a main effect of corpus. See the Supplementary Materials for more information.
			
 
				 
			
 
				-```{r print out anova results rec on cor}
			
 
				-kable(reg_anova_cor)
			
 
				+```{r print out anova results rec on icc by corpus}
			
 
				+kable(reg_anova_cor_icc)
			
 
				 ```
			
 
				 
			
 
				 
			
 
				 
			
 
				-## Code to reproduce "Exploratory analyses: Reliability across age groups"
			
 
				+## Code to reproduce text and figures in "Exploratory analyses: Reliability across age groups"
			
 
				+
			
 
				+
			
 
				 
			
 
				 ```{r prepAge}
			
 
				 df.icc.age<-read.csv("../output/df.icc.age.csv")
			
@@ -568,20 +572,24 @@ df.icc.age$age_bin<-factor(df.icc.age$age_bin,levels=age_levels)
 
				 df.icc.age$Type<-get_type(df.icc.age)
			
 
				 ```
			
 
				 
			
 
				+Out of `r dim(df.icc.age)[1]` fitted models (53 metrics times `r length(levels(factor(df.icc.age$age_bin)))` age bins), `r sum(df.icc.age$formula=="no_chi_effect")` were singular when including a random intercept per child, and therefore they could not be included in these analyses at all. In addition, `r sum(df.icc.age$formula=="no_exp")` were singular when including a random intercept per corpus. The remaining `r sum(df.icc.age$formula=="full")` could be analyzed with the full model.
			
 
				 
			
 
				 
			
 
				 ```{r relBYage-fig6A, echo=F,fig.width=6, fig.height=10,fig.cap="Distribution of ICC attributed to corpus (a) and children (b), when binning children's age."}
			
 
				 
			
 
				 #this complicated section is just to add N of participants in each facet, we first estimate it:
			
 
				-facet_labels=NULL
			
 
				+facet_labels_chi=facet_labels_cor=NULL
			
 
				 for(thisage in levels(df.icc.age$age_bin)){#thisage="(0,6]"
			
 
				-  if(min(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T) !=max(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T)){
			
 
				-    facet_labels = c(facet_labels,paste0("N chi=",paste(range(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T),collapse="-"))) 
			
 
				-  } else facet_labels = c(facet_labels,paste0("N chi=",min(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T))) 
			
 
				+  facet_labels_cor = c(facet_labels_cor,paste0("N cor=",min(df.icc.age$ncor[df.icc.age$age_bin==thisage],na.rm=T))) #checked: there is no variation across metrics in n of corpora included
			
 
				+    if(min(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T) !=max(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T)){
			
 
				+    facet_labels_chi = c(facet_labels_chi,paste0("N chi=",paste(range(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T),collapse="-"))) 
			
 
				+  } else {
			
 
				+    facet_labels_chi = c(facet_labels_chi,paste0("N chi=",min(df.icc.age$nchi[df.icc.age$age_bin==thisage],na.rm=T))) 
			
 
				+  }
			
 
				 }
			
 
				 
			
 
				-#and then we structure it so that it goes ont he plot
			
 
				-f_labels<-data.frame(age_bin=levels(df.icc.age$age_bin),label=facet_labels)
			
 
				+#and then we structure it so that it goes on the plot
			
 
				+f_labels<-data.frame(age_bin=levels(df.icc.age$age_bin),facet_labels_chi=facet_labels_chi,facet_labels_cor=facet_labels_cor)
			
 
				 
			
 
				 f_labels$age_bin<-factor(f_labels$age_bin,levels=age_levels)
			
 
				 
			
@@ -589,12 +597,28 @@ ggplot(df.icc.age, aes(y = icc_child_id, x = toupper(data_set))) +
 
				   geom_violin(alpha = 0.5) +
			
 
				   geom_quasirandom(aes(colour = Type,shape = Type)) +  
			
 
				   theme(legend.position="none") +labs( y = "r",x="Pipeline") + facet_wrap(~age_bin, ncol = 3) +
			
 
				-  geom_text(x=1.5,y=max(df.icc.age$icc_child_id,na.rm=T),aes(label=label),data=f_labels,size=2)
			
 
				+  geom_text(x=1.5,y=max(df.icc.age$icc_child_id,na.rm=T),aes(label=facet_labels_chi),data=f_labels,size=2) +
			
 
				+  geom_text(x=1.5,y=max(df.icc.age$icc_child_id,na.rm=T)*.95,aes(label=facet_labels_cor),data=f_labels,size=2)
			
 
				 
			
 
				 ```
			
 
				 
			
 
				 
			
 
				 
			
 
				+```{r reg model age}
			
 
				+
			
 
				+
			
 
				+age_icc <- lm(icc_child_id ~ Type * data_set * age_bin, data=df.icc.age) 
			
 
				+#plot(age_icc)
			
 
				+#binomial could be used,  diagnostic plots look good
			
 
				+
			
 
				+reg_sum_age_icc=summary(age_icc)
			
 
				+
			
 
				+reg_anova_age_icc=Anova(age_icc)
			
 
				+
			
 
				+```
			
 
				+
			
 
				+As we did in the previous section for corpus, we checked whether Child ICC differed by talker types and pipelines across age bins by fitting a linear model with the formula `lm(Child_ICC ~ type * pipeline * age_bin)`. We found an adjusted R-squared of `r round(reg_sum_age_icc$adj.r.squared*100)`%, suggesting this model explained over half of the variance in Child ICC. However, a Type II ANOVA (the default of `car::Anova`) on this model revealed only an interaction of type and age bin, as well as a main effect of age bin, suggesting less complex effects than in the case of corpus. See the Supplementary Materials for more information.
			
 
				+
			
 
				 
			
 
				 ```{r icc-bycor-fig6B, echo=F,fig.width=4, fig.height=4,fig.cap="Correlations in Child ICC across corpora. Each point indicates the correlation in Child ICC for the corpus named in the x-axis with every other corpus."}
			
 
				 
			
--- a/CODE/SM.log
+++ b/CODE/SM.log
--- a/CODE/SM.pdf
+++ b/CODE/SM.pdf
--- a/CODE/sessionInfo.txt
+++ b/CODE/sessionInfo.txt
@@ -1,6 +1,6 @@
 
				-R version 4.2.2 (2022-10-31)
			
 
				+R version 4.2.0 (2022-04-22)
			
 
				 Platform: x86_64-apple-darwin17.0 (64-bit)
			
 
				-Running under: macOS Big Sur ... 10.16
			
 
				+Running under: macOS Big Sur/Monterey 10.16
			
 
				 
			
 
				 Matrix products: default
			
 
				 BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
			
@@ -13,25 +13,26 @@ attached base packages:
 
				 [1] stats     graphics  grDevices utils     datasets  methods   base     
			
 
				 
			
 
				 other attached packages:
			
 
				- [1] stringr_1.4.0      tidyr_1.2.0        dplyr_1.0.9        psych_2.2.5       
			
 
				- [5] kableExtra_1.3.4   ggpubr_0.4.0       ggthemes_4.2.4     ggplot2_3.3.6     
			
 
				- [9] performance_0.10.2 lme4_1.1-29        Matrix_1.5-1      
			
 
				+ [1] ggbeeswarm_0.7.1  car_3.1-1         carData_3.0-5     stringr_1.5.0    
			
 
				+ [5] tidyr_1.2.1       dplyr_1.0.10      psych_2.2.9       kableExtra_1.3.4 
			
 
				+ [9] ggpubr_0.5.0      ggthemes_4.2.4    ggplot2_3.4.0     performance_0.9.0
			
 
				+[13] lme4_1.1-31       Matrix_1.5-3     
			
 
				 
			
 
				 loaded via a namespace (and not attached):
			
 
				- [1] Rcpp_1.0.8.3      svglite_2.1.0     lattice_0.20-45   assertthat_0.2.1 
			
 
				- [5] digest_0.6.29     utf8_1.2.2        R6_2.5.1          backports_1.4.1  
			
 
				- [9] evaluate_0.15     highr_0.9         httr_1.4.3        pillar_1.7.0     
			
 
				-[13] rlang_1.0.2       rstudioapi_0.13   minqa_1.2.4       car_3.0-13       
			
 
				-[17] nloptr_2.0.3      rmarkdown_2.14    labeling_0.4.2    splines_4.2.2    
			
 
				-[21] webshot_0.5.3     munsell_0.5.0     broom_0.8.0       compiler_4.2.2   
			
 
				-[25] xfun_0.31         pkgconfig_2.0.3   systemfonts_1.0.4 mnormt_2.1.0     
			
 
				-[29] mgcv_1.8-41       htmltools_0.5.2   insight_0.18.8    tidyselect_1.1.2 
			
 
				-[33] gridExtra_2.3     tibble_3.1.7      fansi_1.0.3       viridisLite_0.4.0
			
 
				-[37] crayon_1.5.1      withr_2.5.0       MASS_7.3-58.1     grid_4.2.2       
			
 
				-[41] nlme_3.1-160      gtable_0.3.0      lifecycle_1.0.1   DBI_1.1.2        
			
 
				-[45] magrittr_2.0.3    scales_1.2.0      cli_3.3.0         stringi_1.7.6    
			
 
				-[49] carData_3.0-5     farver_2.1.0      ggsignif_0.6.3    xml2_1.3.3       
			
 
				-[53] ellipsis_0.3.2    generics_0.1.2    vctrs_0.4.1       cowplot_1.1.1    
			
 
				-[57] boot_1.3-28       tools_4.2.2       glue_1.6.2        purrr_0.3.4      
			
 
				-[61] parallel_4.2.2    abind_1.4-5       fastmap_1.1.0     yaml_2.3.5       
			
 
				-[65] colorspace_2.0-3  rstatix_0.7.0     rvest_1.0.2       knitr_1.39       
			
 
				+ [1] Rcpp_1.0.9        svglite_2.1.0     lattice_0.20-45   assertthat_0.2.1 
			
 
				+ [5] digest_0.6.31     utf8_1.2.2        R6_2.5.1          backports_1.4.1  
			
 
				+ [9] evaluate_0.19     highr_0.10        httr_1.4.4        pillar_1.8.1     
			
 
				+[13] rlang_1.0.6       rstudioapi_0.14   minqa_1.2.5       nloptr_2.0.3     
			
 
				+[17] rmarkdown_2.19    labeling_0.4.2    splines_4.2.0     webshot_0.5.4    
			
 
				+[21] munsell_0.5.0     broom_1.0.2       vipor_0.4.5       compiler_4.2.0   
			
 
				+[25] xfun_0.36         pkgconfig_2.0.3   systemfonts_1.0.4 mnormt_2.1.1     
			
 
				+[29] mgcv_1.8-40       htmltools_0.5.4   insight_0.17.1    tidyselect_1.2.0 
			
 
				+[33] gridExtra_2.3     tibble_3.1.8      fansi_1.0.3       viridisLite_0.4.1
			
 
				+[37] withr_2.5.0       MASS_7.3-56       grid_4.2.0        nlme_3.1-157     
			
 
				+[41] gtable_0.3.1      lifecycle_1.0.3   DBI_1.1.2         magrittr_2.0.3   
			
 
				+[45] scales_1.2.1      cli_3.5.0         stringi_1.7.8     farver_2.1.1     
			
 
				+[49] ggsignif_0.6.4    xml2_1.3.3        generics_0.1.3    vctrs_0.5.1      
			
 
				+[53] cowplot_1.1.1     boot_1.3-28       tools_4.2.0       beeswarm_0.4.0   
			
 
				+[57] glue_1.6.2        purrr_1.0.0       abind_1.4-5       parallel_4.2.0   
			
 
				+[61] fastmap_1.1.0     yaml_2.3.6        colorspace_2.0-3  rstatix_0.7.1    
			
 
				+[65] rvest_1.0.3       knitr_1.41