4 months ago · 4ce629128c
--- a/CODE/Rplot.pdf
+++ b/CODE/Rplot.pdf
--- a/CODE/SM.Rmd
+++ b/CODE/SM.Rmd
@@ -1,13 +1,13 @@
 
				 ---
			
 
				 title: Supplementary Materials to Establishing the reliability and validity of measures extracted from long-form recordings
			
 
				 output:
			
 
				+  pdf_document:
			
 
				+    toc: yes
			
 
				+    toc_depth: 3
			
 
				   html_document:
			
 
				     toc: yes
			
 
				     toc_depth: '3'
			
 
				     df_print: paged
			
 
				-  pdf_document:
			
 
				-    toc: yes
			
 
				-    toc_depth: 3
			
 
				 ---
			
 
				 
			
 
				 ```{r setup, include=FALSE, eval=TRUE}
			
@@ -157,30 +157,30 @@ Third, and perhaps most relevant, we looked for references that evaluated the ps
 
				 
			
 
				 
			
 
				 ```{r tab2}
			
 
				-chi_per_corpus= aggregate(data = mydat_aclew, child_id ~ experiment, function(child_id) length(unique(child_id)))[,2]
			
 
				+chiXcor= aggregate(data = mydat_aclew, child_id ~ experiment, function(child_id) length(unique(child_id)))[,2]
			
 
				 
			
 
				-rec_per_corpus = aggregate(data = mydat_aclew, session_id ~ experiment, function(session_id) length(unique(session_id)))[,2]
			
 
				+recXcor = aggregate(data = mydat_aclew, session_id ~ experiment, function(session_id) length(unique(session_id)))[,2]
			
 
				 
			
 
				 rec_per_child = setNames(aggregate(data = mydat_aclew, session_id ~ experiment*child_id, function(session_id) length(unique(session_id))), c('experiment', 'Chi', 'No_rec'))
			
 
				 
			
 
				 min_rec_per_child = aggregate(data = rec_per_child, No_rec ~ experiment, min)[,2]
			
 
				 max_rec_per_child = aggregate(data = rec_per_child, No_rec ~ experiment, max)[,2]
			
 
				-rec_r_per_child = paste(min_rec_per_child,max_rec_per_child,sep="-")
			
 
				+recRXchi = paste(min_rec_per_child,max_rec_per_child,sep="-")
			
 
				 
			
 
				-dur_per_corpus = aggregate(data = mydat_aclew, duration_vtc ~ experiment, function(duration_vtc) round(mean(duration_vtc)/3.6e+6,1))[,2]
			
 
				+durXcor = aggregate(data = mydat_aclew, duration_vtc ~ experiment, function(duration_vtc) round(mean(duration_vtc)/3.6e+6,1))[,2]
			
 
				 
			
 
				-age_mean_per_corpus = aggregate(data = mydat_aclew, age ~ experiment, function(age) round(mean(age),1))[,2]
			
 
				+ageXcor = aggregate(data = mydat_aclew, age ~ experiment, function(age) round(mean(age),1))[,2]
			
 
				 
			
 
				 age_min_per_corpus = aggregate(data = mydat_aclew, age ~ experiment, function(age) min(age))[,2]
			
 
				 
			
 
				 age_max_per_corpus = aggregate(data = mydat_aclew, age ~ experiment, function(age) max(age))[,2]
			
 
				 
			
 
				-age_r_per_corpus = paste(age_min_per_corpus,age_max_per_corpus,sep="-")
			
 
				+ageRXcor = paste(age_min_per_corpus,age_max_per_corpus,sep="-")
			
 
				 
			
 
				 corpus=c("bergelson", "cougar", "fausey-trio", "lucid","lyon", "quechua",  "warlaumont", "winnipeg")
			
 
				 location=c("Northeast US", "Northwest US", "Western US", "Northwest England", "Central France", "Highlands Bolivia", "Western US", "Western Canada")
			
 
				 
			
 
				-corpus_description=cbind(corpus,location,chi_per_corpus, rec_r_per_child, rec_per_corpus, dur_per_corpus, age_mean_per_corpus,age_r_per_corpus)
			
 
				+corpus_description=cbind(corpus,location,chiXcor, recRXchi, recXcor, durXcor, ageXcor,ageRXcor)
			
 
				 
			
 
				 write.table(corpus_description, "../output/corpus_description.csv", sep='\t')
			
 
				 
			
@@ -195,7 +195,7 @@ nrecs=length(levels(mydat_aclew$session_id))
 
				 
			
 
				 ## SM D: Code to reproduce Fig. 2
			
 
				 
			
 
				-```{r icc-examples-fig2, fig.width=4, fig.height=3,fig.cap="Figure 2 (reproduced). Scatterplots for two selected variables. The left one has relatively low ICCs; the right one has relatively higher ICCs."}
			
 
				+```{r icc-examples-fig2,  fig.width=6, fig.height=4.5,fig.cap="Figure 2 (reproduced). Scatterplots for two selected variables. The left one has relatively low ICCs; the right one has relatively higher ICCs."}
			
 
				 # figure of bad ICC: lena     used to be: avg_voc_dur_chi, now is: peak_wc_adu_ph; good ICC: lena used to be: voc_och_ph, now is: voc_dur_och_ph
			
 
				 
			
 
				 # remove missing data points altogether
			
@@ -258,11 +258,12 @@ panel.background = element_blank(), axis.line = element_line(colour = "black"))
 
				   geom_abline(intercept = 0, slope = 1)
			
 
				 
			
 
				 
			
 
				-ggarrange(bad, good,
			
 
				+fig2 = ggarrange(bad, good,
			
 
				           ncol = 2, nrow = 1, common.legend = TRUE, vjust = 1.5, hjust=0,
			
 
				           font.label = list(size = 20))  + labs(color= "Corpus")  +  theme(text = element_text(size = 20))
			
 
				+fig2 
			
 
				 
			
 
				-
			
 
				+ggsave("fig2.png", plot = fig2, width = 6, height = 4.5, units = "in")
			
 
				 ```
			
 
				 
			
 
				 ## SM E: Code to reproduce text at the beginning of the "Setting the stage" section
			
@@ -318,7 +319,7 @@ cor_t=t.test(rval_tab$m ~ rval_tab$data_set)
 
				 
			
 
				 ```
			
 
				 
			
 
				-> To see whether correlations in this analysis differed by talker types and pipelines, we fit a linear model with the formula $lm(cor ~ type * pipeline)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children; and pipeline LENA or ACLEW. The model was overall significant (F(`round(reg_sum_cor$fstatistic["dendf"],2)`) = `round(reg_sum_cor$fstatistic["value"],2)`, p < .001). We found an adjusted R-squared of `r round(reg_sum_cor$adj.r.squared*100)`%, suggesting this model did not explain a great deal of variance in correlation coefficients. A Type 3 ANOVA on this model revealed a significant effect of pipeline (F = `r round(reg_anova_cor["data_set","F value"],2)`, p = `r round(reg_anova_cor["data_set","Pr(>F)"],2)`), due to higher correlations for ACLEW (`r r_msds["aclew","x"]`) than for LENA metrics (m = `r r_msds["lena","x"]`). 
			
 
				+> To see whether correlations in this analysis differed by talker types and pipelines, we fit a linear model with the formula $lm(cor \sim type * pipeline)$, where type indicates whether the measure pertained to the key child, (female/male) adults, or other children; and pipeline LENA or ACLEW. The model was overall significant (F(`r round(reg_sum_cor$fstatistic["dendf"],2)`) = `r round(reg_sum_cor$fstatistic["value"],2)`, p < .001). We found an adjusted R-squared of `r round(reg_sum_cor$adj.r.squared*100)`%, suggesting this model did not explain a great deal of variance in correlation coefficients. A Type 3 ANOVA on this model revealed a significant effect of pipeline (F = `r round(reg_anova_cor["data_set","F value"],2)`, p = `r round(reg_anova_cor["data_set","Pr(>F)"],2)`), due to higher correlations for ACLEW (m = `r r_msds["aclew","x"]`) than for LENA metrics (m = `r r_msds["lena","x"]`). 
			
 
				 
			
 
				 See table below for results of the Type 3 ANOVA.
			
 
				 
			
@@ -333,11 +334,16 @@ kable(round(reg_anova_cor,2),caption="Type 3 ANOVA on model attempting to explai
 
				 ```{r r-fig4, echo=F,fig.width=4, fig.height=3,fig.cap="Figure 4 (reproduced). Violin plot reflecting the distribution of correlations."}
			
 
				 
			
 
				 
			
 
				-ggplot(rval_tab, aes(y = m, x = toupper(data_set))) +
			
 
				+fig4 <- ggplot(rval_tab, aes(y = m, x = toupper(data_set))) +
			
 
				   geom_violin(alpha = 0.5) +
			
 
				   geom_quasirandom(aes(colour = Type,shape = Type)) +  
			
 
				-  theme() +labs( y = "r",x="Pipeline")
			
 
				+  theme() +labs( y = "r",x="Pipeline") + 
			
 
				+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
			
 
				+panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 
			
 
				 
			
 
				+fig4
			
 
				+
			
 
				+ggsave("fig4.png", plot = fig4, width = 4, height = 3, units = "in")
			
 
				 
			
 
				 ```
			
 
				 
			
@@ -446,13 +452,17 @@ panel.background = element_blank(), legend.key=element_blank(), axis.line = elem
 
				 ```{r icc-allexp-fig5, echo=F,fig.width=4, fig.height=3,fig.cap="Figure 5 (reproduced). Violin plot reflecting the distribution of Child ICC."}
			
 
				 
			
 
				 
			
 
				-ggplot(df.icc.mixed, aes(y = icc_child_id, x = toupper(data_set))) +
			
 
				+fig5 <- ggplot(df.icc.mixed, aes(y = icc_child_id, x = toupper(data_set))) +
			
 
				   geom_violin(alpha = 0.5) +
			
 
				   geom_quasirandom(aes(colour = Type,shape = Type)) +  
			
 
				   labs( y = "Child ICC",x="Pipeline") +  theme(text = element_text(size = 20)) + 
			
 
				   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
			
 
				 panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 
			
 
				 
			
 
				+fig5
			
 
				+
			
 
				+ggsave("fig5.png", plot = fig5, width = 4, height = 3, units = "in")
			
 
				+
			
 
				 ```
			
 
				 
			
 
				 
			
@@ -478,7 +488,7 @@ rownames(msds_p)<-msds_p$Group.1
 
				 ```
			
 
				 
			
 
				 
			
 
				-> Next, we explored how similar Child ICCs were across different talker types and pipelines. We fit a linear model with the formula $lm(icc\_child\_id ~ type * pipeline)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children; and pipeline LENA or ACLEW. The model was overall significant (F(`round(reg_sum$fstatistic["dendf"],2)`) = `round(reg_sum$fstatistic["value"],2)`, p < .001). We found an adjusted R-squared of `r round(reg_sum$adj.r.squared*100)`%, suggesting much of the variance across Child ICCs was explained by these factors. A Type 3 ANOVA on this model revealed type was a signficant predictor (F(`r reg_anova["Type","Df"]`) = `r round(reg_anova["Type","F value"],1)`, p<.001), as was pipeline (F(`r reg_anova["data_set","Df"]`) = `r round(reg_anova["data_set","F value"],1)`, p = `r round(reg_anova["data_set","Pr(>F)"],3)`); the interaction between type and pipeline was not significant. The main effect of type emerged because output metrics tended to have higher Child ICC (`r msds["Output","x"]`)  than those associated to adults in general (`r msds["Adults","x"]`), females (`r msds["Female","x"]`), and males (`r msds["Male","x"]`); whereas those associated with other children had even higher Child ICCs (`r msds["Other children","x"]`). The main effect of pipeline arose because of slightly higher Child ICCs for the ACLEW metrics (`r msds_p["aclew","x"]`) than for LENA metrics (`r msds_p["lena","x"]`). 
			
 
				+> Next, we explored how similar Child ICCs were across different talker types and pipelines. We fit a linear model with the formula $lm(icc\_child\_id \sim type * pipeline)$, where type indicates whether the measure pertained to the key child, (female/male) adults, or other children; and pipeline LENA or ACLEW. The model was overall significant (F(`r round(reg_sum$fstatistic["dendf"],2)`) = `r round(reg_sum$fstatistic["value"],2)`, p < .001). We found an adjusted R-squared of `r round(reg_sum$adj.r.squared*100)`%, suggesting much of the variance across Child ICCs was explained by these factors. A Type 3 ANOVA on this model revealed type was a significant predictor (F(`r reg_anova["Type","Df"]`) = `r round(reg_anova["Type","F value"],1)`, p<.001), as was pipeline (F(`r reg_anova["data_set","Df"]`) = `r round(reg_anova["data_set","F value"],1)`, p = `r round(reg_anova["data_set","Pr(>F)"],3)`); the interaction between type and pipeline was not significant. The main effect of type emerged because output metrics tended to have higher Child ICC (`r msds["Output","x"]`)  than those associated with adults in general (`r msds["Adults","x"]`), females (`r msds["Female","x"]`), and males (`r msds["Male","x"]`); whereas those associated with other children had even higher Child ICCs (`r msds["Other children","x"]`). The main effect of pipeline arose because of slightly higher Child ICCs for the ACLEW metrics (`r msds_p["aclew","x"]`) than for LENA metrics (`r msds_p["lena","x"]`). 
			
 
				 
			
 
				 
			
 
				 ## SM O: Code to reproduce Table 4
			
@@ -528,7 +538,7 @@ f_labels<-data.frame(age_bin=levels(df.icc.age$age_bin),facet_labels_chi=facet_l
 
				 
			
 
				 f_labels$age_bin<-factor(f_labels$age_bin,levels=age_levels)
			
 
				 
			
 
				-ggplot(df.icc.age, aes(y = icc_child_id, x = toupper(data_set))) +
			
 
				+fig6 <- ggplot(df.icc.age, aes(y = icc_child_id, x = toupper(data_set))) +
			
 
				   geom_violin(alpha = 0.5) +
			
 
				   geom_quasirandom(aes(colour = Type,shape = Type)) +  
			
 
				   theme(legend.position="none") +labs( y = "r",x="Pipeline") + facet_wrap(~age_bin, ncol = 3) +
			
@@ -537,6 +547,9 @@ ggplot(df.icc.age, aes(y = icc_child_id, x = toupper(data_set))) +
 
				   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
			
 
				 panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 
			
 
				 
			
 
				+fig6
			
 
				+
			
 
				+ggsave("fig6.png", plot = fig6, width = 6, height = 10, units = "in")
			
 
				 
			
 
				 
			
 
				 ```
			
@@ -556,7 +569,7 @@ reg_anova_age_icc=Anova(age_icc)
 
				 
			
 
				 ```
			
 
				 
			
 
				-> To interrogate these results statistically, and assess whether Child ICCs tended to be higher or lower in certain age bins, we fit a linear model with the formula $lm(Child_ICC ~ type * pipeline * age_bin)$. The model was overall significant (F(`round(reg_sum_age_icc$fstatistic["dendf"],2)`) = `round(reg_sum_age_icc$fstatistic["value"],2)`, p < .001). We found an adjusted R-squared of `r round(reg_sum_age_icc$adj.r.squared*100)`%, suggesting this model explained about a third of the variance in Child ICC.  A Type 3 ANOVA on this model revealed type was a signficant predictor (F(`r reg_anova["Type","Df"]`) = `r round(reg_anova["Type","F value"],1)`, p<.001), whereas as was pipeline (F(`r reg_anova["data_set","Df"]`) = `r round(reg_anova["data_set","F value"],1)`, p = `r round(reg_anova["data_set","Pr(>F)"],3)`); the interaction between type and pipeline was not significant. 
			
 
				+> To interrogate these results statistically, and assess whether Child ICCs tended to be higher or lower in certain age bins, we fit a linear model with the formula $lm(Child\_ICC \sim type * pipeline * age\_bin)$. The model was overall significant (F(`r round(reg_sum_age_icc$fstatistic["dendf"],2)`) = `r round(reg_sum_age_icc$fstatistic["value"],2)`, p < .001). We found an adjusted R-squared of `r round(reg_sum_age_icc$adj.r.squared*100)`%, suggesting this model explained about a third of the variance in Child ICC.  A Type 3 ANOVA on this model revealed type was a significant predictor (F(`r reg_anova["Type","Df"]`) = `r round(reg_anova["Type","F value"],1)`, p<.001), as was pipeline (F(`r reg_anova["data_set","Df"]`) = `r round(reg_anova["data_set","F value"],1)`, p = `r round(reg_anova["data_set","Pr(>F)"],3)`); the interaction between type and pipeline was not significant. 
			
 
				 
			
 
				 See table below for results of the Type 3 ANOVA.
			
 
				 
			
@@ -591,12 +604,16 @@ r_X_age$ageA=factor(r_X_age$ageA,levels=age_levels)
 
				 
			
 
				 #summary(r_X_age$cor) #mean correlation across corpora is zero!
			
 
				 
			
 
				-ggplot(r_X_age, aes(y = cor, x = ageA)) +
			
 
				+fig7 <- ggplot(r_X_age, aes(y = cor, x = ageA)) +
			
 
				   geom_violin(alpha = 0.5) +
			
 
				   geom_quasirandom() +
			
 
				   theme() +labs( y = "Correlation coefficient r",x="Age") + 
			
 
				   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
			
 
				 panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 
			
 
				+
			
 
				+fig7
			
 
				+
			
 
				+ggsave("fig7.png", plot = fig7, width = 4, height = 4, units = "in")
			
 
				 ```
			
 
				 
			
 
				 
			
@@ -610,16 +627,16 @@ panel.background = element_blank(), legend.key=element_blank(), axis.line = elem
 
				 ## SM U: Code to reproduce Figure 8
			
 
				 
			
 
				 
			
 
				-```{r icc-bycor-fig8, echo=F,fig.width=4, fig.height=10,fig.cap="Figure 8 (reproduced). Child ICC by metric type and pipeline, when considering each corpus separately."}
			
 
				+```{r icc-bycor-fig8, echo=F,fig.width=4, fig.height=4,fig.cap="Figure 8 (reproduced). Child ICC by metric type and pipeline, when considering each corpus separately."}
			
 
				 
			
 
				-facet_labels_chi = paste0("N chi=",chi_per_corpus)
			
 
				+facet_labels_chi = paste0("N chi=",chiXcor)
			
 
				 
			
 
				 #and then we structure it so that it goes on the plot
			
 
				 f_labels<-data.frame(levels(factor(df.icc.corpus$corpus)),facet_labels_chi=facet_labels_chi)
			
 
				 
			
 
				 colnames(f_labels)<-c("corpus","nchi")
			
 
				 
			
 
				-ggplot(df.icc.corpus, aes(y = icc_child_id, x = toupper(data_set))) +
			
 
				+fig8 <- ggplot(df.icc.corpus, aes(y = icc_child_id, x = toupper(data_set))) +
			
 
				   geom_violin(alpha = 0.5) +
			
 
				   geom_quasirandom(aes(colour = Type,shape = Type)) +  
			
 
				   theme(legend.position = "top", axis.title.y=element_blank() ,axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +labs( y = "Child ICC",x="Pipeline") +   
			
@@ -628,6 +645,9 @@ ggplot(df.icc.corpus, aes(y = icc_child_id, x = toupper(data_set))) +
 
				   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
			
 
				 panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 
			
 
				 
			
 
				+fig8
			
 
				+
			
 
				+ggsave("fig8.png", plot = fig8, width = 4, height = 4, units = "in")
			
 
				 
			
 
				 ```
			
 
				 
			
@@ -646,7 +666,7 @@ reg_anova_cor_icc=Anova(cor_icc)
 
				 
			
 
				 ```
			
 
				 
			
 
				-> The fact that we cannot infer reliability from one corpus based on another one was confirmed statistically: We checked whether Child ICC differed by talker types and pipelines across corpora by fitting a linear model with the formula $lm(Child_ICC ~ type * pipeline * corpus)$, where type indicates whether the measure pertained to the key child, (female/male) adults, other children;  pipeline LENA or ACLEW; and corpus the corpus ID. The model was overall significant (F(`round(reg_sum_cor_icc$fstatistic["dendf"],2)`) = `round(reg_sum_cor_icc$fstatistic["value"],2)`, p < .001).  We found an adjusted R-squared of `r round(reg_sum_cor_icc$adj.r.squared*100)`%, suggesting this model explained nearly half of the variance in Child ICC. A Type 3 ANOVA on this model revealed several significant effects and interactions, including a three-way interaction of type, pipeline, and corpus  (F(`r reg_anova_cor_icc["Type:data_set:corpus","Df"]`) = `r round(reg_anova_cor_icc["Type:data_set:corpus","F value"],1)`, p<.001); a two-way interaction of type and corpus  (F(`r reg_anova_cor_icc["data_set:corpus","Df"]`) = `r round(reg_anova_cor_icc["data_set:corpus","F value"],1)`, p<.001); and a main effect of corpus (F(`r reg_anova_cor_icc["corpus","Df"]`) = `r round(reg_anova_cor_icc["corpus","F value"],1)`, p<.001). 
			
 
				+> The fact that we cannot infer reliability from one corpus based on another one was confirmed statistically: We checked whether Child ICC differed by talker types and pipelines across corpora by fitting a linear model with the formula $lm(Child\_ICC \sim type * pipeline * corpus)$, where type indicates whether the measure pertained to the key child, (female/male) adults, or other children;  pipeline LENA or ACLEW; and corpus the corpus ID. The model was overall significant (F(`r round(reg_sum_cor_icc$fstatistic["dendf"],2)`) = `r round(reg_sum_cor_icc$fstatistic["value"],2)`, p < .001).  We found an adjusted R-squared of `r round(reg_sum_cor_icc$adj.r.squared*100)`%, suggesting this model explained nearly half of the variance in Child ICC. A Type 3 ANOVA on this model revealed several significant effects and interactions, including a three-way interaction of type, pipeline, and corpus  (F(`r reg_anova_cor_icc["Type:data_set:corpus","Df"]`) = `r round(reg_anova_cor_icc["Type:data_set:corpus","F value"],1)`, p<.001); a two-way interaction of pipeline and corpus  (F(`r reg_anova_cor_icc["data_set:corpus","Df"]`) = `r round(reg_anova_cor_icc["data_set:corpus","F value"],1)`, p<.001); and a main effect of corpus (F(`r reg_anova_cor_icc["corpus","Df"]`) = `r round(reg_anova_cor_icc["corpus","F value"],1)`, p<.001). 
			
 
				 
			
 
				 See Table below for results of the Type 3 ANOVA.
			
 
				 
			
@@ -658,7 +678,7 @@ kable(round(reg_anova_cor_icc,2),caption="Type 3 ANOVA on model attempting to ex
 
				 
			
 
				 ## SM W: Code to reproduce Figure 9
			
 
				 
			
 
				-```{r icc-bycor-fig9, echo=F,fig.width=4, fig.height=10,fig.cap="Figure 9 (reproduced). Correlations in Child ICC across corpora."}
			
 
				+```{r icc-bycor-fig9, echo=F,fig.width=4, fig.height=4,fig.cap="Figure 9 (reproduced). Correlations in Child ICC across corpora."}
			
 
				 
			
 
				 
			
 
				 
			
@@ -678,12 +698,16 @@ r_X_corpus$cor=as.numeric(as.character(r_X_corpus$cor))
 
				 
			
 
				 #summary(r_X_corpus$cor) #mean correlation across corpora is zero!
			
 
				 
			
 
				-ggplot(r_X_corpus, aes(y = cor, x = corpusA)) +
			
 
				+fig9 <- ggplot(r_X_corpus, aes(y = cor, x = corpusA)) +
			
 
				   geom_violin(alpha = 0.5) +
			
 
				   geom_quasirandom() +  
			
 
				   theme() +labs( y = "Correlation coefficient r",x="Corpus") + 
			
 
				   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
			
 
				 panel.background = element_blank(), legend.key=element_blank(), axis.line = element_line(colour = "black")) 
			
 
				+
			
 
				+fig9
			
 
				+
			
 
				+ggsave("fig9.png", plot = fig9, width = 4, height = 4, units = "in")
			
 
				 ```
			
 
				 
			
 
				 ## SM X: Code to reproduce text in the Discussion section
			
@@ -703,14 +727,14 @@ northam[grep("Bolivia",location)]<-F
 
				 northam[grep("France",location)]<-F
			
 
				 northam[grep("England",location)]<-F
			
 
				 
			
 
				-bias_tab<-data.frame(cbind(chi_per_corpus, rec_per_corpus))
			
 
				-bias_tab$chi_per_corpus<-bias_tab$chi_per_corpus/sum(bias_tab$chi_per_corpus)
			
 
				-bias_tab$rec_per_corpus<-bias_tab$rec_per_corpus/sum(bias_tab$rec_per_corpus)
			
 
				+bias_tab<-data.frame(cbind(chiXcor, recXcor))
			
 
				+bias_tab$chiXcor<-bias_tab$chiXcor/sum(bias_tab$chiXcor)
			
 
				+bias_tab$recXcor<-bias_tab$recXcor/sum(bias_tab$recXcor)
			
 
				 
			
 
				 
			
 
				 ```
			
 
				 
			
 
				-> Our data draws mainly from urban (`r round(sum(bias_tab$rec_per_corpus[urban])*100)`% of recordings, `r round(sum(bias_tab$chi_per_corpus[urban])*100)`% of the children, `r round(sum(urban)/length(urban)*100)`% of the corpora), English-speaking settings (`r round(sum(bias_tab$rec_per_corpus[english])*100)`% of recordings, `r round(sum(bias_tab$chi_per_corpus[english])*100)`% of the children, `r round(sum(english)/length(english)*100)`% of the corpora), and almost exclusively from North America (`r round(sum(bias_tab$rec_per_corpus[northam])*100)`% of recordings, `r round(sum(bias_tab$chi_per_corpus[northam])*100)`% of the children, `r round(sum(northam)/length(northam)*100)`% of the corpora). 
			
 
				+> Our data draws mainly from urban (`r round(sum(bias_tab$recXcor[urban])*100)`% of recordings, `r round(sum(bias_tab$chiXcor[urban])*100)`% of the children, `r round(sum(urban)/length(urban)*100)`% of the corpora), English-speaking settings (`r round(sum(bias_tab$recXcor[english])*100)`% of recordings, `r round(sum(bias_tab$chiXcor[english])*100)`% of the children, `r round(sum(english)/length(english)*100)`% of the corpora), and almost exclusively from North America (`r round(sum(bias_tab$recXcor[northam])*100)`% of recordings, `r round(sum(bias_tab$chiXcor[northam])*100)`% of the children, `r round(sum(northam)/length(northam)*100)`% of the corpora). 
			
 
				 
			
 
				 ## SM Y: Variability as a function of hardware
			
 
				 
			
--- a/CODE/SM.log
+++ b/CODE/SM.log
--- a/CODE/SM.pdf
+++ b/CODE/SM.pdf
--- a/CODE/fig2.png
+++ b/CODE/fig2.png
--- a/CODE/fig4.png
+++ b/CODE/fig4.png
--- a/CODE/fig5.png
+++ b/CODE/fig5.png
--- a/CODE/fig6.png
+++ b/CODE/fig6.png
--- a/CODE/fig7.png
+++ b/CODE/fig7.png
--- a/CODE/fig8.png
+++ b/CODE/fig8.png
--- a/CODE/fig9.png
+++ b/CODE/fig9.png
--- a/CODE/sessionInfo.txt
+++ b/CODE/sessionInfo.txt
@@ -23,21 +23,20 @@ other attached packages:
 
				 [13] lme4_1.1-33           Matrix_1.5-4.1       
			
 
				 
			
 
				 loaded via a namespace (and not attached):
			
 
				- [1] beeswarm_0.4.0    gtable_0.3.3      xfun_0.39         bslib_0.5.0      
			
 
				- [5] insight_0.19.2    rstatix_0.7.2     lattice_0.21-8    vctrs_0.6.3      
			
 
				- [9] tools_4.3.0       generics_0.1.3    parallel_4.3.0    tibble_3.2.1     
			
 
				-[13] fansi_1.0.4       highr_0.10        pkgconfig_2.0.3   webshot_0.5.5    
			
 
				-[17] lifecycle_1.0.3   farver_2.1.1      compiler_4.3.0    mnormt_2.1.1     
			
 
				-[21] munsell_0.5.0     vipor_0.4.5       htmltools_0.5.5   sass_0.4.7       
			
 
				-[25] yaml_2.3.7        pillar_1.9.0      nloptr_2.0.3      jquerylib_0.1.4  
			
 
				-[29] MASS_7.3-60       cachem_1.0.8      boot_1.3-28.1     abind_1.4-5      
			
 
				-[33] nlme_3.1-162      tidyselect_1.2.0  rvest_1.0.3       digest_0.6.33    
			
 
				-[37] stringi_1.7.12    purrr_1.0.1       labeling_0.4.2    splines_4.3.0    
			
 
				-[41] cowplot_1.1.1     fastmap_1.1.1     grid_4.3.0        colorspace_2.1-0 
			
 
				-[45] cli_3.6.1         magrittr_2.0.3    utf8_1.2.3        broom_1.0.5      
			
 
				-[49] withr_2.5.0       scales_1.2.1      backports_1.4.1   rmarkdown_2.23   
			
 
				-[53] httr_1.4.6        gridExtra_2.3     ggsignif_0.6.4    evaluate_0.21    
			
 
				-[57] knitr_1.43        viridisLite_0.4.2 mgcv_1.8-42       rlang_1.1.1      
			
 
				-[61] Rcpp_1.0.10       glue_1.6.2        xml2_1.3.5        svglite_2.1.1    
			
 
				-[65] rstudioapi_0.15.0 minqa_1.2.5       jsonlite_1.8.7    R6_2.5.1         
			
 
				-[69] systemfonts_1.0.4
			
 
				+ [1] beeswarm_0.4.0    gtable_0.3.3      xfun_0.39         insight_0.19.2   
			
 
				+ [5] rstatix_0.7.2     lattice_0.21-8    vctrs_0.6.3       tools_4.3.0      
			
 
				+ [9] generics_0.1.3    parallel_4.3.0    tibble_3.2.1      fansi_1.0.4      
			
 
				+[13] highr_0.10        pkgconfig_2.0.3   webshot_0.5.5     lifecycle_1.0.3  
			
 
				+[17] farver_2.1.1      compiler_4.3.0    textshaping_0.3.6 munsell_0.5.0    
			
 
				+[21] mnormt_2.1.1      vipor_0.4.5       htmltools_0.5.5   yaml_2.3.7       
			
 
				+[25] pillar_1.9.0      nloptr_2.0.3      MASS_7.3-60       boot_1.3-28.1    
			
 
				+[29] abind_1.4-5       nlme_3.1-162      tidyselect_1.2.0  rvest_1.0.3      
			
 
				+[33] digest_0.6.33     stringi_1.7.12    purrr_1.0.1       labeling_0.4.2   
			
 
				+[37] splines_4.3.0     cowplot_1.1.1     fastmap_1.1.1     grid_4.3.0       
			
 
				+[41] colorspace_2.1-0  cli_3.6.1         magrittr_2.0.3    utf8_1.2.3       
			
 
				+[45] broom_1.0.5       withr_2.5.0       scales_1.2.1      backports_1.4.1  
			
 
				+[49] rmarkdown_2.23    httr_1.4.6        gridExtra_2.3     ggsignif_0.6.4   
			
 
				+[53] ragg_1.2.5        evaluate_0.21     knitr_1.43        viridisLite_0.4.2
			
 
				+[57] mgcv_1.8-42       rlang_1.1.1       Rcpp_1.0.10       glue_1.6.2       
			
 
				+[61] xml2_1.3.5        svglite_2.1.1     rstudioapi_0.15.0 minqa_1.2.5      
			
 
				+[65] R6_2.5.1          systemfonts_1.0.4
			
--- a/OUTPUT/corpus_description.csv
+++ b/OUTPUT/corpus_description.csv
@@ -1,4 +1,4 @@
 
				-"corpus"	"location"	"chi_per_corpus"	"rec_r_per_child"	"rec_per_corpus"	"dur_per_corpus"	"age_mean_per_corpus"	"age_r_per_corpus"
			
 
				+"corpus"	"location"	"chiXcor"	"recRXchi"	"recXcor"	"durXcor"	"ageXcor"	"ageRXcor"
			
 
				 "1"	"bergelson"	"Northeast US"	"44"	"10-12"	"522"	"14"	"11.2"	"6-17"
			
 
				 "2"	"cougar"	"Northwest US"	"26"	"3-45"	"239"	"11.1"	"26.6"	"0-59"
			
 
				 "3"	"fausey-trio"	"Western US"	"28"	"3-3"	"84"	"13.7"	"8.9"	"6-12"