create-all-rs.R 4.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  # ---- LENA: load metrics and find contiguous recording pairs ----
  # Reads the scaled LENA metrics, sorts them by corpus / child / age, asks
  # define_contiguous() (defined elsewhere) for the eligible sessions and their
  # `next_session`, and attaches `next_session` to the full table.
  mydat_lena <- read.csv(paste0("../data_output/", "lena", "_metrics_scaled.csv"))
  # table(mydat_lena$session_id)[table(mydat_lena$session_id) > 1]
  mydat_lena <- mydat_lena[
    order(mydat_lena$experiment, mydat_lena$child_id, mydat_lena$age),
  ]
  # dim(mydat_lena) # 1253 obs
  key <- mydat_lena[, c("experiment", "child_id", "age")]
  dist_contig_lena <- define_contiguous(mydat_lena)
  # dim(dist_contig_lena) # 684
  # table(dist_contig_lena$session_id)[table(dist_contig_lena$session_id) > 1] # 0 = no repeats

  # all.x = TRUE is required: sessions that are only ever a "next session"
  # have no row in dist_contig_lena but must stay in the table.
  # Note that merge() sorts its result by `session_id` by default, so the
  # experiment / child_id / age ordering above no longer holds after this line.
  mydat_lena <- merge(
    mydat_lena,
    dist_contig_lena[, c("session_id", "next_session")],
    by = "session_id",
    all.x = TRUE
  )
  # table(dist_contig_lena$experiment) # number of eligible recordings per corpus
  # table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)]) # number of eligible children per corpus
  # sum(table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)])) # and overall
  # Maximally, we'll have 148 rows in the samples below.
  # Given those two numbers, with 5 draws we'd cover many combinations in
  # winni, lucid, & trio; but we'll do 10 because there are a lot of recs in
  # cougar & bergelson. Later increased to 20 because there was still a lot of
  # variability in the average r.

  # ---- ACLEW: load metrics and impose the LENA-compatible pairing ----
  mydat_aclew <- read.csv(paste0("../data_output/", "aclew", "_metrics_scaled.csv")) # 1254
  # mydat_aclew <- mydat_aclew[order(mydat_aclew$experiment, mydat_aclew$child_id, mydat_aclew$age), ]
  # dim(mydat_aclew)
  # dim(dist_contig_aclew) # 686 -- for some reason, we have 2 more eligible recs here... not sure why
  # length(dist_contig_aclew$session_id[!(dist_contig_aclew$session_id %in% dist_contig_lena$session_id)]) # in fact, we have lots of sessions not in common!
  # length(dist_contig_lena$session_id[!(dist_contig_lena$session_id %in% dist_contig_aclew$session_id)])
  # They are present in aclew but not in lena.
  # NOTE: I have "winnipeg C175 C175_20151201" "winnipeg C175 C175_20160301"
  # for lena but not aclew; and I have "fausey-trio T066 T066/T066_000700"
  # "quechua 1096 20190630_190025_009107" "quechua 1096 20190702_193551_008712".
  # One thing that drove me crazy was that, probably because of the small
  # differences in inclusion (2 recs in aclew & lena respectively), I was
  # ending up with different lists of pairings across aclew & lena. So to
  # simplify, I'll impose the same pairing across both, which involves losing
  # a couple of additional recs in lena.

  # Keep only ACLEW sessions that also exist in LENA, then put them in the same
  # row order as mydat_lena before computing contiguity.
  xxx <- mydat_aclew[mydat_aclew$session_id %in% mydat_lena$session_id, ]
  rownames(xxx) <- xxx$session_id
  # Exact lookup via match(): character row indexing on a data.frame partially
  # matches row names, so a LENA id missing from ACLEW could silently pick up a
  # different session whose id merely starts with it. LENA sessions absent from
  # ACLEW become all-NA rows, as they did with row-name indexing.
  # NOTE(review): at this point mydat_lena is in session_id order (see merge()
  # above), not experiment / child_id / age order -- confirm define_contiguous()
  # does not rely on the row order of its input.
  xxx <- xxx[match(mydat_lena$session_id, xxx$session_id), ]
  dist_contig_aclew <- define_contiguous(xxx)
  mydat_aclew <- merge(
    mydat_aclew,
    dist_contig_aclew[, c("session_id", "next_session")],
    by = "session_id",
    all.x = TRUE
  )

  # Abandoned alternative (kept for reference): restrict each contiguity table
  # to the pairs present in the other one.
  # dist_contig_lena=dist_contig_lena[((dist_contig_lena$session_id %in% dist_contig_aclew$session_id) & (dist_contig_lena$next_session %in% dist_contig_aclew$next_session)),]
  # dist_contig_lena=dist_contig_lena[order(dist_contig_lena$session_id),]
  #
  # dist_contig_aclew=dist_contig_aclew[((dist_contig_aclew$session_id %in% dist_contig_lena$session_id) & (dist_contig_aclew$next_session %in% dist_contig_aclew$next_session)),]
  # dist_contig_aclew=dist_contig_aclew[order(dist_contig_aclew$session_id),]
  # Number of random draws of one contiguous recording pair per child.
  nsamples <- 20

  # Build an empty results table: one row per row of `index_df`, `n` columns
  # named sample1..sample<n> (initialised to NA), plus the `data_set` and
  # `metric` columns copied from `index_df` so rows can be looked up by them.
  make_sample_frame <- function(index_df, n) {
    out <- data.frame(matrix(NA, nrow = nrow(index_df), ncol = n))
    # The names are set on the frame itself. The previous form,
    # colnames(df[, 1:n]) <- ..., renamed a temporary copy only and left the
    # columns as X1..Xn. Anything that reads the written CSV by the old
    # X1..Xn names needs to be updated accordingly.
    colnames(out) <- paste0("sample", seq_len(n))
    out$data_set <- index_df$data_set
    out$metric <- index_df$metric
    out
  }

  # One row per (data_set, metric) combination listed in df.icc.mixed
  # (defined elsewhere).
  all_rs <- make_sample_frame(df.icc.mixed, nsamples)
  # Same layout for ICCs; it is not filled in by the code visible in this file.
  all_iccs <- make_sample_frame(df.icc.mixed, nsamples)
  # For each of `nsamples` draws: pick, for every child, one eligible session
  # and its next session, then store the correlation between the two
  # recordings for every (data_set, metric) row of df.icc.mixed in column i of
  # all_rs. The finished table is written to ../data_output/all_rs.csv.
  # NOTE(review): no set.seed() call is visible in this script, so the draws
  # (and the CSV) will differ between runs unless a seed is set elsewhere.
  for (i in seq_len(nsamples)) { # i=1
    # For each child, sample 2 contiguous recordings that are less than
    # 2 months away.
    # Step 1: sample one session per child among the list of sessions that are
    # close by. We use the lena one because there is no difference across lena
    # & aclew in terms of which kids have which sessions, since in this paper
    # all recs have both lena and aclew, so it's just a question of using one
    # of them.
    close_sessions <- dist_contig_lena %>%
      group_by(child_id) %>%
      slice_sample(n = 1)
    # table(close_sessions$experiment) # these have to be similar
    # dim(close_sessions)

    for (j in seq_len(nrow(df.icc.mixed))) { # j=1
      # Extract as plain character scalars so that indexing by `metric` below
      # uses the column name even if df.icc.mixed stores factors.
      data_set <- as.character(df.icc.mixed[["data_set"]][j])
      metric <- as.character(df.icc.mixed[["metric"]][j])
      dat_for_cor <- if (data_set == "aclew") mydat_aclew else mydat_lena

      # Steps 2 and 3: fetch the sampled session (rec1) and its next session
      # (rec2), both in the row order of close_sessions, so that row k of rec1
      # and row k of rec2 always belong to the same child. Filtering with
      # %in% instead returns rows in dat_for_cor's own order (session_id order
      # after merge()), which does not guarantee that pairing and yields
      # vectors of different length when a session is missing from one side.
      # With match(), a missing session gives an NA row, and cor.test() drops
      # incomplete pairs. If session_id is duplicated in dat_for_cor, the first
      # occurrence is used.
      rec1 <- dat_for_cor[
        match(close_sessions$session_id, dat_for_cor$session_id),
      ]
      rec2 <- dat_for_cor[
        match(close_sessions$next_session, dat_for_cor$session_id),
      ]

      all_rs[all_rs$data_set == data_set & all_rs$metric == metric, i] <-
        cor.test(rec1[, metric], rec2[, metric])$estimate
    }
  }
  write.csv(all_rs, "../data_output/all_rs.csv")