regenerate_data.R 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  # Build the per-pipeline base data sets (one CSV per entry of `data_sets`).
  #
  # This code cannot be reproduced without access to the underlying datasets.
  # It also relies on packages, functions, & variables that are set up in SM.Rmd
  # (this section reads `data_sets` and `corpora` without defining them).
  # It contains a workaround for simple CTC & LENA CTC/CVC not controlling for
  # recording length: per-hour versions are derived and the raw counts dropped.

  # When TRUE, recordings whose duration column divided by 3.6e6 is below 4
  # (i.e. under 4 hours, assuming durations are in milliseconds) are dropped
  # first, then children left with a single recording are dropped.
  remove_single_rec_kids <- TRUE

  # Read one metrics table and add the per-hour rate columns.
  #
  # metrics_dir: directory holding the metrics files, including the trailing
  #   slash (e.g. "../input/el1000-metrics/output/").
  # data_set: "aclew" reads "<data_set>_metrics.csv" and adds `simple_CTC_ph`;
  #   any other value reads "<data_set>_metrics_avass.csv", drops the "X"
  #   column and adds `lena_CTC_ph` and `lena_CVC_ph`.
  # Returns the resulting data frame.
  read_metrics_with_rates <- function(metrics_dir, data_set) {
    # Divisor turning a duration into hours (presumably from milliseconds).
    ms_per_hour <- 60 * 60 * 1000
    if (data_set == "aclew") {
      dat <- read.csv(paste0(metrics_dir, data_set, "_metrics.csv"))
      # workaround: simple CTC does not control for recording length
      dat$simple_CTC_ph <- dat$simple_CTC / (dat$duration_vtc / ms_per_hour)
    } else {
      dat <- read.csv(paste0(metrics_dir, data_set, "_metrics_avass.csv"))
      # for some reason, this version has row names - remove that column to
      # avoid having issues downstream
      dat <- dat[, colnames(dat) != "X"]
      # workaround: LENA CTC & CVC do not control for recording length
      dat$lena_CTC_ph <- dat$lena_CTC / (dat$duration_its / ms_per_hour)
      dat$lena_CVC_ph <- dat$lena_CVC / (dat$duration_its / ms_per_hour)
    }
    dat
  }

  # Metadata used for filtering does not depend on `data_set`, so it is read
  # once, before the loop.

  # Cougar children flagged as normative (used to remove Cougar non-normatives).
  # If needed, in the terminal, do
  # `datalad get input/el1000-metrics/EL1000/cougar/metadata/children.csv`
  cougar <- read.csv("../input/el1000-metrics/EL1000/cougar/metadata/children.csv")
  cougar_normative <- cougar[cougar$normative == "Y", ]

  # Fausey recordings belonging to the "Trio_Full" subset (used to remove
  # FauseyElse). If needed, in the terminal, do
  # `datalad get input/laac-metrics/datasets/fausey-trio/metadata/recordings.csv`
  fausey_trio <- read.csv("../input/laac-metrics/datasets/fausey-trio/metadata/recordings.csv")
  fausey_trio_full <- fausey_trio[fausey_trio$Trio_Subset == "Trio_Full", ]
  # Build the key that is matched against `session_id` in the metrics table.
  fausey_trio_full$session_id <- paste0(
    fausey_trio_full$HomeBank_ID, "/", fausey_trio_full$fileName
  )

  for (data_set in data_sets) { # data_set="aclew";data_set="lena"
    # el1000 metrics: keep everything except Cougar children not in the
    # normative list.
    mydat <- read_metrics_with_rates("../input/el1000-metrics/output/", data_set)
    mydat <- mydat[
      mydat$experiment != "cougar" |
        mydat$child_id %in% cougar_normative$child_id,
    ]

    # laac metrics: keep only quechua and the Fausey "Trio_Full" sessions.
    # Note that since we are only taking quechua & fausey-trio, the fact that
    # png2019 & tsimane2017 are repeated across laac & el1000 is not a problem.
    mydat2 <- read_metrics_with_rates("../input/laac-metrics/output/", data_set)
    mydat2 <- mydat2[
      mydat2$experiment == "quechua" |
        mydat2$session_id %in% fausey_trio_full$session_id,
    ]

    # Columns are not in the same order across el1000-metrics & laac-metrics,
    # so align laac to el1000 before stacking.
    mydat2 <- mydat2[, colnames(mydat)]
    mydat <- rbind(mydat, mydat2)

    # Remove columns that we do not want to include (raw counts replaced by the
    # per-hour versions, plus voc_chi).
    mydat <- mydat[
      , !(colnames(mydat) %in% c("lena_CTC", "simple_CTC", "lena_CVC", "voc_chi"))
    ]
    # Remove no-overlap metrics (column names containing "noov" or "no_overlap").
    mydat <- mydat[, !grepl("noov|no_overlap", colnames(mydat))]
    # Remove corpora that are not included.
    mydat <- mydat[is.element(mydat$experiment, corpora), ]

    # Make sure child_id & session_id are unique across corpora by prefixing
    # them with the corpus name (and, for sessions, the new child_id).
    mydat$child_id <- paste(mydat$experiment, mydat$child_id)
    mydat$session_id <- paste(mydat$child_id, mydat$session_id)
    # table(mydat$session_id)[table(mydat$session_id)>1] #no repeated sessions

    if (remove_single_rec_kids) {
      # Remove short recordings, using the duration column of this pipeline.
      dur_col <- if (data_set == "aclew") "duration_vtc" else "duration_its"
      mydat <- mydat[mydat[, dur_col] / 3.6e+6 >= 4, ]
      # Remove kids with only one remaining recording.
      rec_per_child <- setNames(
        aggregate(
          session_id ~ child_id,
          data = mydat,
          FUN = function(session_id) length(unique(session_id))
        ),
        c("child_id", "n")
      )
      mydat <- mydat[
        mydat$child_id %in% rec_per_child$child_id[rec_per_child$n > 1],
      ]
    }

    # Save data
    out_path <- paste0("../data_output/", data_set, "_base_data_set.csv")
    write.csv(mydat, out_path, row.names = FALSE)
    print(paste0("Save to ", out_path))
  }
  # For each data set: reload the base table, add standardized age, blank out
  # outlying metric values, save the unscaled metrics, then z-score every metric
  # and save the scaled version.
  # Relies on `data_sets` and `no.scale.columns`, which are defined elsewhere.

  # Z-score of `x`, using the mean and SD of its non-missing values.
  # Missing values in `x` stay missing in the result.
  z_score <- function(x) {
    (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
  }

  for (data_set in data_sets) {
    mydat <- read.csv(paste0("../data_output/", data_set, "_base_data_set.csv"))

    # Standardized age, stored as a plain numeric vector (scale() would store a
    # one-column matrix instead).
    mydat$age_s <- z_score(mydat$age)

    # Every column not listed in `no.scale.columns` is treated as a metric.
    metrics <- colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]

    # Remove outliers: set to NA the values that are beyond 2.5 SD from the
    # column mean. which() skips missing z-scores, so NA entries are untouched.
    for (metric in metrics) {
      mydat[which(abs(z_score(mydat[, metric])) > 2.5), metric] <- NA
    }

    # Metric columns are outlier-filtered but not scaled here; age_s is the
    # standardized age.
    unscaled_path <- paste0("../data_output/", data_set, "_metrics.csv")
    write.csv(mydat, unscaled_path, row.names = FALSE)
    print(paste0("Save to ", unscaled_path))

    # Scale each metric, using the mean and SD computed after outlier removal.
    for (metric in metrics) { # metric="pc_mal_ph"
      mydat[, metric] <- z_score(mydat[, metric])
    }

    # Save data
    scaled_path <- paste0("../data_output/", data_set, "_metrics_scaled.csv")
    write.csv(mydat, scaled_path, row.names = FALSE)
    print(paste0("Save to ", scaled_path))
  }