create-all-icc.R 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. # This code can be reproduced without access to the underlying datasets, unlike regenerate_data.R
  2. # It relies on packages, functions, & variables that are called in SM.Rmd
  3. # You may need to "get" the data -- so if you get
  4. #Error in file(file, "rt") : cannot open the connection
  5. # then do datalad get FILE on terminal to get the relevant files
  6. # Create main ICC including all corpora & all measurements
  7. df.icc.mixed.cols = c("data_set","age_bin", "metric", "iqr",
  8. "age_b","age_se","age_t", # beta, standard error, T
  9. "icc_adjusted", "icc_conditional",
  10. "icc_child_id", "icc_corpus",
  11. "child_id_var","corpus_var","residual_var",
  12. "child_id_sd","corpus_sd","residual_sd",
  13. "r2_cond", "r2_marg",
  14. "nobs","nchi", "ncor",
  15. "formula","sw")
  16. df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)),
  17. stringsAsFactors = FALSE)
  18. for (data_set in data_sets){ # data_set = "aclew"
  19. # Load data
  20. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
  21. #remove peak_voc_chi, it is redundant with peak_voc_chi_ph
  22. mydat <- within(mydat,rm("peak_voc_chi"))
  23. metrics <- colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
  24. for(metric in metrics)
  25. { # metric = "voc_chi_ph"
  26. icc.row <- new_fit_models(mydat, data_set, metric, NA, TRUE)
  27. df.icc.mixed[nrow(df.icc.mixed) + 1,] <- icc.row
  28. }
  29. }
  30. write.csv(df.icc.mixed,"../output/df.icc.mixed.csv",row.names=F)
  31. print("done with overall ana, tackling within corpus version")
  32. # repeat for the version within each corpus
  33. df.icc.corpus.cols = c("data_set","corpus","metric", "iqr",
  34. "age_b","age_se","age_t",
  35. "icc_adjusted", "icc_conditional",
  36. "icc_child_id", "icc_corpus",
  37. "child_id_var","corpus_var","residual_var",
  38. "child_id_sd","corpus_sd","residual_sd","r2_cond", "r2_marg",
  39. "nobs","nchi", "ncor",
  40. "formula","sw")
  41. df.icc.corpus = data.frame(matrix(ncol=length(df.icc.corpus.cols),nrow=0, dimnames=list(NULL, df.icc.corpus.cols)),
  42. stringsAsFactors=FALSE)
  43. for (data_set in data_sets){ # data_set = "aclew"
  44. # Load data
  45. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
  46. for(corpus in corpora){
  47. mycordat <- mydat[mydat$experiment == corpus, ]
  48. metrics <- colnames(mycordat)[!is.element(colnames(mycordat), no.scale.columns)] #this should be defined here bc it varies by data_set
  49. for(metric in metrics) { # metric = "avg_voc_dur_mal"
  50. icc.row <- new_fit_models(mycordat, data_set, metric, corpus, FALSE)
  51. df.icc.corpus[nrow(df.icc.corpus) + 1,] <- icc.row
  52. }
  53. }
  54. }
  55. write.csv(df.icc.corpus,"../output/df.icc.corpus.csv",row.names=F)
  56. print("done with within corpus ana, tackling within age group version")
  57. #We do this one separately because we want to standardize metrics within each age group
  58. # repeat within age group bins
  59. df.icc.age.cols = c("data_set","age_bin","metric", "iqr",
  60. "age_b","age_se","age_t",
  61. "icc_adjusted", "icc_conditional",
  62. "icc_child_id", "icc_corpus",
  63. "child_id_var","corpus_var","residual_var",
  64. "child_id_sd","corpus_sd","residual_sd","r2_cond", "r2_marg",
  65. "nobs","nchi", "ncor",
  66. "formula","sw")
  67. df.icc.age = data.frame(matrix(ncol=length(df.icc.age.cols),nrow=0, dimnames=list(NULL, df.icc.age.cols)),
  68. stringsAsFactors=FALSE)
  69. for (data_set in data_sets){ # data_set = "aclew"
  70. # Load data and calculate age cuts
  71. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics.csv')) # /!\ Do not use scaled version -> we'll scale by age later
  72. mydat$age_bin <- cut(mydat$age,c(0:6*6))
  73. metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
  74. for(thisage in levels(mydat$age_bin))
  75. { #thisage= "(0,6]"
  76. thiscordata <- mydat[mydat$age_bin == thisage,]
  77. for(metric in metrics)
  78. { # metric = "avg_voc_dur_mal"
  79. pre_scaled_metric <- (thiscordata[, metric] - mean(thiscordata[, metric], na.rm=T))/sd(thiscordata[, metric], na.rm=T)
  80. thiscordata[abs(pre_scaled_metric)>2.5 & !is.na(pre_scaled_metric), metric] <- NA
  81. thiscordata[, metric] <- (thiscordata[, metric] - mean(thiscordata[, metric], na.rm=T))/sd(thiscordata[, metric], na.rm=T)
  82. if(dim(thiscordata)[1] > 30 & length(levels(factor(thiscordata$experiment))) > 1)
  83. {
  84. icc.row <- new_fit_models(thiscordata, data_set, metric, thisage, TRUE)
  85. }else{
  86. icc.row <- c(data_set,thisage,metric,iqr,
  87. NA,NA,NA,
  88. NA,NA,
  89. NA,NA,
  90. NA,NA,NA,
  91. NA,NA,NA,
  92. NA,NA,NA,
  93. "not_enough_data",NA)
  94. }
  95. df.icc.age[nrow(df.icc.age) + 1,] <- icc.row
  96. }
  97. }
  98. }
  99. write.csv(df.icc.age,"../output/df.icc.age.csv",row.names=F)