# create-all-rs.R
  # ---- LENA metrics: load, order, and define contiguous recording pairs ----
  # Reads the scaled LENA metrics, orders them by corpus / child / age, and
  # asks define_contiguous() (defined outside this script) for the table of
  # eligible sessions together with their `next_session`.
  mydat_lena <- read.csv(
    paste0("../data_output/", "lena", "_metrics_scaled.csv")
  )
  # table(mydat_lena$session_id)[table(mydat_lena$session_id)>1]
  mydat_lena <- mydat_lena[
    order(mydat_lena$experiment, mydat_lena$child_id, mydat_lena$age),
  ]
  # dim(mydat_lena) #1253 obs

  # NOTE(review): `key` is not referenced again in this script; it is kept in
  # case code that runs after this script relies on it.
  key <- mydat_lena[, c("experiment", "child_id", "age")]

  dist_contig_lena <- define_contiguous(mydat_lena)
  # dim(dist_contig_lena) #684
  # table(dist_contig_lena$session_id)[table(dist_contig_lena$session_id)>1] #0=no repeats

  # all.x = TRUE is required: sessions that only ever appear as somebody's
  # "next session" have no row in dist_contig_lena but must be kept.
  mydat_lena <- merge(
    mydat_lena,
    dist_contig_lena[, c("session_id", "next_session")],
    by = "session_id",
    all.x = TRUE
  )

  # Number of eligible recordings per corpus:
  # table(dist_contig_lena$experiment)
  # Number of eligible children per corpus:
  # table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)])
  # ... and overall:
  # sum(table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)]))
  #
  # Maximally, we will have 148 rows in the samples drawn below.
  # Given those two numbers, 5 draws would cover many combinations in winni,
  # lucid, & trio; but we do more because there are a lot of recordings in
  # cougar & bergelson. Later increased to 20 because there was still a lot
  # of variability in the average r.

  # ---- ACLEW metrics: load and impose the LENA session order ----
  mydat_aclew <- read.csv(
    paste0("../data_output/", "aclew", "_metrics_scaled.csv")
  ) # 1254
  # mydat_aclew=mydat_aclew[order(mydat_aclew$experiment,mydat_aclew$child_id,mydat_aclew$age),]
  # dim(mydat_aclew)
  # dim(dist_contig_aclew) #686 -- we have 2 more eligible recs here:
  # length(dist_contig_aclew$session_id[!(dist_contig_aclew$session_id %in% dist_contig_lena$session_id)]) # in fact, we have lots of sessions not in common!
  # length(dist_contig_lena$session_id[!(dist_contig_lena$session_id %in% dist_contig_aclew$session_id)])
  # They are present in aclew but not in lena.
  # NOTE: I have "winnipeg C175 C175_20151201" and
  # "winnipeg C175 C175_20160301" for lena but not aclew; and I have
  # "fausey-trio T066 T066/T066_000700",
  # "quechua 1096 20190630_190025_009107" and
  # "quechua 1096 20190702_193551_008712"
  # (the original note stops here; presumably these are in aclew but not in
  # lena -- TODO confirm).
  #
  # One thing that drove me crazy was that, probably because of the small
  # differences in inclusion (2 recs in aclew & lena respectively), I was
  # ending up with different lists of pairings across aclew & lena. So, to
  # simplify, I impose the same pairing across both, which involves losing a
  # couple of additional recs in lena.

  # Keep only aclew sessions that also exist in lena, then reorder the rows to
  # follow mydat_lena by indexing on session_id row names.
  # NOTE(review): a session_id present in mydat_lena but absent from xxx
  # produces an all-NA row here, and duplicated session_ids would make the
  # row-name assignment fail -- confirm that define_contiguous() copes with
  # NA rows, or that neither case occurs in the data.
  xxx <- mydat_aclew[mydat_aclew$session_id %in% mydat_lena$session_id, ]
  rownames(xxx) <- xxx$session_id
  xxx <- xxx[mydat_lena$session_id, ]

  dist_contig_aclew <- define_contiguous(xxx)
  mydat_aclew <- merge(
    mydat_aclew,
    dist_contig_aclew[, c("session_id", "next_session")],
    by = "session_id",
    all.x = TRUE
  )

  # Earlier attempt at reconciling the two pairing tables, kept for reference.
  # NOTE(review): the second condition of the aclew filter below compares
  # dist_contig_aclew$next_session with itself, so it is always TRUE.
  # dist_contig_lena=dist_contig_lena[((dist_contig_lena$session_id %in% dist_contig_aclew$session_id) & (dist_contig_lena$next_session %in% dist_contig_aclew$next_session)),]
  # dist_contig_lena=dist_contig_lena[order(dist_contig_lena$session_id),]
  #
  # dist_contig_aclew=dist_contig_aclew[((dist_contig_aclew$session_id %in% dist_contig_lena$session_id) & (dist_contig_aclew$next_session %in% dist_contig_aclew$next_session)),]
  # dist_contig_aclew=dist_contig_aclew[order(dist_contig_aclew$session_id),]
  # ---- Result tables: one row per (data_set, metric), one column per draw ----
  # Number of random draws of contiguous recording pairs.
  nsamples <- 20

  # all_rs holds the correlation obtained in each draw. It should have as many
  # rows as there are ICCs in df.icc.mixed (defined outside this script),
  # namely the number of metrics = 71.
  all_rs <- data.frame(matrix(NA, nrow = nrow(df.icc.mixed), ncol = nsamples))
  # Name the draw columns on the data frame itself. The previous form,
  # `colnames(all_rs[, 1:nsamples]) <- ...`, renamed a temporary subset and
  # assigned it back into existing columns, which leaves the default X1..Xn
  # names untouched.
  # NOTE(review): all_rs is written to ../data_output/all_rs.csv at the end of
  # this script, so its header changes from X1..X20 to sample1..sample20;
  # check any reader that refers to the old names.
  colnames(all_rs)[seq_len(nsamples)] <- paste0("sample", seq_len(nsamples))
  all_rs$data_set <- df.icc.mixed$data_set
  all_rs$metric <- df.icc.mixed$metric
  # dim(all_rs)

  # all_iccs has the same layout as all_rs. It is not filled in by the code
  # visible in this script.
  all_iccs <- data.frame(matrix(NA, nrow = nrow(df.icc.mixed), ncol = nsamples))
  colnames(all_iccs)[seq_len(nsamples)] <- paste0("sample", seq_len(nsamples))
  all_iccs$data_set <- df.icc.mixed$data_set
  all_iccs$metric <- df.icc.mixed$metric
  # ---- Resampling: correlation between paired contiguous recordings ----
  # For each of the nsamples draws, pick one eligible session per child, pair
  # it with its next session, and store the correlation between the two
  # recordings for every (data_set, metric) row of df.icc.mixed in column i of
  # all_rs.
  # NOTE(review): no set.seed() call is visible in this script, so the draws
  # are reproducible only if the caller sets a seed beforehand.
  for (i in seq_len(nsamples)) { # i=1
    # For each child, sample 2 contiguous recordings that are less than
    # 2 months away.
    # Step 1: sample one session per child among the list of sessions that are
    # close by. We use the lena table because there is no difference across
    # lena & aclew in terms of which kids have which sessions: in this paper
    # all recs have both lena and aclew, so it is just a question of using one
    # of them.
    close_sessions <- dist_contig_lena %>%
      group_by(child_id) %>%
      slice_sample(n = 1) %>%
      ungroup()
    # table(close_sessions$experiment) #these have to be similar
    # dim(close_sessions)

    for (j in seq_len(nrow(df.icc.mixed))) { # j=1 # loops over all the metrics
      # Read both values as character scalars so that `metric` selects a
      # column by name even if df.icc.mixed stores it as a factor.
      data_set <- as.character(df.icc.mixed[["data_set"]][j])
      metric <- as.character(df.icc.mixed[["metric"]][j])
      dat_for_cor <- if (data_set == "aclew") mydat_aclew else mydat_lena

      # Steps 2 and 3: fetch the sampled session (rec1) and its next session
      # (rec2) in the row order of close_sessions, so that row k of rec1 and
      # row k of rec2 always belong to the same pair. Filtering with %in%
      # instead returns rows in dat_for_cor's own order, which does not
      # guarantee that alignment. A session missing from dat_for_cor yields an
      # NA row, and cor.test() drops incomplete pairs.
      rec1 <- dat_for_cor[
        match(close_sessions$session_id, dat_for_cor$session_id),
      ]
      rec2 <- dat_for_cor[
        match(close_sessions$next_session, dat_for_cor$session_id),
      ]

      all_rs[all_rs$data_set == data_set & all_rs$metric == metric, i] <-
        cor.test(rec1[, metric], rec2[, metric])$estimate
    }
  }
  # ---- Save outputs ----
  # Write the per-draw correlations and the table of eligible contiguous LENA
  # sessions next to the other derived data, without row names.
  write.csv(all_rs, "../data_output/all_rs.csv", row.names = FALSE)
  write.csv(
    dist_contig_lena,
    "../data_output/dist_contig_lena.csv",
    row.names = FALSE
  )