# ---- Load LENA metrics and find contiguous recording pairs ----
# Reads the scaled LENA metrics, orders them by corpus / child / age, asks
# define_contiguous() (defined elsewhere) for the eligible recordings and their
# following session, and attaches `next_session` to every LENA row.
mydat_lena <- read.csv(paste0("../data_output/", "lena", "_metrics_scaled.csv"))
# table(mydat_lena$session_id)[table(mydat_lena$session_id)>1]

# Order rows by corpus, child and age before looking for contiguous sessions.
mydat_lena <- mydat_lena[
  order(mydat_lena$experiment, mydat_lena$child_id, mydat_lena$age),
]
# dim(mydat_lena) # 1253 obs (original author's count)
key <- mydat_lena[, c("experiment", "child_id", "age")]

dist_contig_lena <- define_contiguous(mydat_lena)
# dim(dist_contig_lena) # 684 (original author's count)
# table(dist_contig_lena$session_id)[table(dist_contig_lena$session_id)>1]
# 0 = no repeats

# all.x = TRUE is required: sessions that only ever appear as somebody's
# "next session" must stay in the table even though they have no entry in
# dist_contig_lena. Note that merge() sorts its result by `session_id` by
# default, so the corpus / child / age ordering above no longer holds after
# this line.
mydat_lena <- merge(
  mydat_lena,
  dist_contig_lena[, c("session_id", "next_session")],
  by = "session_id",
  all.x = TRUE
)

# Number of eligible recordings per corpus:
# table(dist_contig_lena$experiment)
# Number of eligible children per corpus:
# table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)])
# ... and overall:
# sum(table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)]))
# Maximally, we'll have 148 rows in the samples below.
# Given those two numbers, with 5 draws we'd cover many combinations in winni,
# lucid, & trio; but we'll do more because there are a lot of recs in cougar &
# bergelson. Later increased to 20 because there was still a lot of variability
# in the average r.

# ---- Load ACLEW metrics ----
mydat_aclew <- read.csv(paste0("../data_output/", "aclew", "_metrics_scaled.csv"))
# 1254 (original author's count)
# mydat_aclew=mydat_aclew[order(mydat_aclew$experiment,mydat_aclew$child_id,mydat_aclew$age),]
# dim(mydat_aclew)
# dim(dist_contig_aclew) # 686 -- we have 2 more eligible recs here:
# length(dist_contig_aclew$session_id[!(dist_contig_aclew$session_id %in% dist_contig_lena$session_id)])
# In fact, we have lots of sessions not in common!
# length(dist_contig_lena$session_id[!(dist_contig_lena$session_id %in% dist_contig_aclew$session_id)])
# they are present in aclew but not in lena
# NOTE: I have "winnipeg C175 C175_20151201" "winnipeg C175 C175_20160301" for
# lena but not aclew; and I have "fausey-trio T066 T066/T066_000700"
# "quechua 1096 20190630_190025_009107" "quechua 1096 20190702_193551_008712"
# (sentence left unfinished by the original author -- presumably "for aclew
# but not lena"; TODO confirm).

# One thing that drove me crazy was that, probably because of the small
# differences in inclusion (2 recs in aclew & lena respectively), I was ending
# up with different lists of pairings across aclew & lena. So to simplify,
# I'll impose the same pairing across both, which involves losing a couple of
# additional recs in lena.

# ---- Impose the LENA session list on the ACLEW table ----
# Keep the ACLEW rows whose session also exists in LENA, then look them up by
# row name in the order of mydat_lena$session_id.
# NOTE(review): mydat_lena was re-sorted by session_id by the earlier merge();
# if define_contiguous() expects corpus / child / age order, confirm that this
# ordering is what is wanted here.
# NOTE(review): a LENA session_id that has no ACLEW row is still part of the
# lookup vector below; R does not drop such entries when indexing rows by
# name -- confirm define_contiguous() copes with that.
xxx <- mydat_aclew[mydat_aclew$session_id %in% mydat_lena$session_id, ]
rownames(xxx) <- xxx$session_id
xxx <- xxx[mydat_lena$session_id, ]
dist_contig_aclew <- define_contiguous(xxx)
mydat_aclew <- merge(
  mydat_aclew,
  dist_contig_aclew[, c("session_id", "next_session")],
  by = "session_id",
  all.x = TRUE
)

# dist_contig_lena=dist_contig_lena[((dist_contig_lena$session_id %in% dist_contig_aclew$session_id) & (dist_contig_lena$next_session %in% dist_contig_aclew$next_session)),]
# dist_contig_lena=dist_contig_lena[order(dist_contig_lena$session_id),]
#
# dist_contig_aclew=dist_contig_aclew[((dist_contig_aclew$session_id %in% dist_contig_lena$session_id) & (dist_contig_aclew$next_session %in% dist_contig_aclew$next_session)),]
# dist_contig_aclew=dist_contig_aclew[order(dist_contig_aclew$session_id),]

# ---- Result containers: one row per ICC entry, one column per draw ----
nsamples <- 20

# It should have as many rows as there are ICCs, namely number of metrics = 71
# (original author's count).
all_rs <- data.frame(matrix(NA, nrow = nrow(df.icc.mixed), ncol = nsamples))
# Name the draw columns. The previous form,
# `colnames(all_rs[, 1:nsamples]) <- ...`, assigned names through a `[`
# subset, which does not rename the columns of the data frame itself, so they
# silently stayed X1..Xn. Column positions are unchanged; only the header of
# the exported table changes (sample1..sampleN instead of X1..XN).
colnames(all_rs) <- paste0("sample", seq_len(nsamples))
all_rs$data_set <- df.icc.mixed$data_set
all_rs$metric <- df.icc.mixed$metric
# dim(all_rs)

all_iccs <- data.frame(matrix(NA, nrow = nrow(df.icc.mixed), ncol = nsamples))
colnames(all_iccs) <- paste0("sample", seq_len(nsamples))
all_iccs$data_set <- df.icc.mixed$data_set
all_iccs$metric <- df.icc.mixed$metric

# ---- Resampled correlations between a recording and its next session ----
# For each of `nsamples` draws: pick one eligible session per child, pair it
# with its `next_session`, and store the correlation between the two
# recordings for every (data_set, metric) row of df.icc.mixed in column i of
# all_rs.
# NOTE(review): no set.seed() is visible in this part of the file, so the
# draws (and all_rs.csv) differ between runs unless a seed is set earlier.
for (i in seq_len(nsamples)) { # i=1
  # For each child, sample 2 contiguous recordings that are less than 2 months
  # away (per the original author's description of dist_contig_lena).
  # Step 1: sample one session per child among the list of sessions that are
  # close by. We use the lena one because there is no difference across lena &
  # aclew in terms of which kids have which sessions, since in this paper all
  # recs have both lena and aclew, so it's just a question of using one of
  # them.
  close_sessions <- dist_contig_lena %>%
    group_by(child_id) %>%
    slice_sample(n = 1) %>%
    ungroup()
  # table(close_sessions$experiment) # these have to be similar
  # dim(close_sessions)

  # This loops over all the metrics.
  for (j in seq_len(nrow(df.icc.mixed))) { # j=1
    # as.character() guards against factor columns, which would otherwise be
    # used as integer codes when indexing by `metric` below.
    data_set <- as.character(df.icc.mixed[["data_set"]][j])
    metric <- as.character(df.icc.mixed[["metric"]][j])

    if (data_set == "aclew") {
      dat_for_cor <- mydat_aclew
    } else {
      dat_for_cor <- mydat_lena
    }

    # Steps 2 and 3: fetch the sampled session (rec1) and its next session
    # (rec2) with match(), so that row k of rec1 and row k of rec2 always
    # belong to the same sampled pair. Two independent `%in%` subsets return
    # rows in dat_for_cor order, which need not be the same for the first and
    # the following sessions, and can differ in length when a session is
    # missing from dat_for_cor. A missing session yields an NA row here, and
    # cor.test() drops incomplete pairs.
    rec1 <- dat_for_cor[
      match(close_sessions$session_id, dat_for_cor$session_id),
    ]
    rec2 <- dat_for_cor[
      match(close_sessions$next_session, dat_for_cor$session_id),
    ]

    all_rs[all_rs$data_set == data_set & all_rs$metric == metric, i] <-
      cor.test(rec1[, metric], rec2[, metric])$estimate
  }
}

# Export the per-draw correlations and the list of eligible sessions.
write.csv(all_rs, "../data_output/all_rs.csv", row.names = FALSE)
write.csv(
  dist_contig_lena,
  "../data_output/dist_contig_lena.csv",
  row.names = FALSE
)