1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
# ---- LENA metrics: load, order, and find contiguous recordings ----
mydat_lena <- read.csv(
  paste0("../data_output/", "lena", "_metrics_scaled.csv")
)
# table(mydat_lena$session_id)[table(mydat_lena$session_id) > 1]

# Order by corpus, child and age before passing the table to
# define_contiguous() (defined elsewhere in the project).
mydat_lena <- mydat_lena[
  order(mydat_lena$experiment, mydat_lena$child_id, mydat_lena$age),
]
# dim(mydat_lena) # 1253 obs
key <- mydat_lena[, c("experiment", "child_id", "age")]

dist_contig_lena <- define_contiguous(mydat_lena)
# dim(dist_contig_lena) # 684
# table(dist_contig_lena$session_id)[table(dist_contig_lena$session_id) > 1]
# ^ 0 = no repeats

# Attach next_session to every recording. all.x = TRUE is required because
# we need to keep recordings that are not eligible as a first session but
# are themselves the "next session" of another recording.
# NOTE: merge() sorts the result on session_id by default, so the
# experiment/child_id/age ordering above no longer holds after this line.
mydat_lena <- merge(
  mydat_lena,
  dist_contig_lena[, c("session_id", "next_session")],
  by = "session_id",
  all.x = TRUE
)

# table(dist_contig_lena$experiment)
# ^ number of eligible recordings per corpus
# table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)])
# ^ number of eligible children per corpus
# sum(table(dist_contig_lena$experiment[!duplicated(dist_contig_lena$child_id)]))
# ^ and overall
# Maximally, we'll have 148 rows in the samples below.
# Given those two numbers, with 5 draws we'd cover many combinations in
# winni, lucid, & trio; but we'll do more because there are a lot of recs in
# cougar & bergelson. Later increased to 20 because there was still a lot of
# variability in the average r.
# ---- ACLEW metrics: load and impose the LENA pairing ----
mydat_aclew <- read.csv(
  paste0("../data_output/", "aclew", "_metrics_scaled.csv")
) # 1254
# mydat_aclew <- mydat_aclew[
#   order(mydat_aclew$experiment, mydat_aclew$child_id, mydat_aclew$age), ]
# dim(mydat_aclew)
# dim(dist_contig_aclew) # 686 -- we have 2 more eligible recs here:
# length(dist_contig_aclew$session_id[
#   !(dist_contig_aclew$session_id %in% dist_contig_lena$session_id)])
# ^ in fact, we have lots of sessions not in common!
# length(dist_contig_lena$session_id[
#   !(dist_contig_lena$session_id %in% dist_contig_aclew$session_id)])
# They are present in aclew but not in lena.
# NOTE: I have "winnipeg C175 C175_20151201" "winnipeg C175 C175_20160301"
# for lena but not aclew; and I have "fausey-trio T066 T066/T066_000700"
# "quechua 1096 20190630_190025_009107" "quechua 1096 20190702_193551_008712".
#
# Probably because of the small differences in inclusion (2 recs in aclew &
# lena respectively), the lists of pairings ended up differing across aclew &
# lena. To simplify, the same pairing is imposed across both, which involves
# losing a couple of additional recs in lena.

# Keep only ACLEW sessions that also exist in LENA, then put them in the
# same row order as mydat_lena.
xxx <- mydat_aclew[mydat_aclew$session_id %in% mydat_lena$session_id, ]
rownames(xxx) <- xxx$session_id
# Index only by ids present in both tables: indexing a data frame by a row
# name that does not exist yields an all-NA row, which would otherwise be
# passed on to define_contiguous(). as.character() guards against
# session_id having been read as a factor (factor indices use integer codes).
lena_ids <- as.character(mydat_lena$session_id)
xxx <- xxx[lena_ids[lena_ids %in% rownames(xxx)], ]

dist_contig_aclew <- define_contiguous(xxx)
mydat_aclew <- merge(
  mydat_aclew,
  dist_contig_aclew[, c("session_id", "next_session")],
  by = "session_id",
  all.x = TRUE
)

# dist_contig_lena=dist_contig_lena[((dist_contig_lena$session_id %in% dist_contig_aclew$session_id) & (dist_contig_lena$next_session %in% dist_contig_aclew$next_session)),]
# dist_contig_lena=dist_contig_lena[order(dist_contig_lena$session_id),]
#
# dist_contig_aclew=dist_contig_aclew[((dist_contig_aclew$session_id %in% dist_contig_lena$session_id) & (dist_contig_aclew$next_session %in% dist_contig_aclew$next_session)),]
# dist_contig_aclew=dist_contig_aclew[order(dist_contig_aclew$session_id),]
# ---- Result tables: one row per (data_set, metric), one column per draw ----
nsamples <- 20

# It should have as many rows as there are ICCs, namely number of
# metrics = 71.
all_rs <- data.frame(
  matrix(NA, nrow = nrow(df.icc.mixed), ncol = nsamples)
)
# Rename the sample columns on the data frame itself.
# `colnames(all_rs[, 1:nsamples]) <- ...` silently leaves the default
# X1..Xn names in place, because assigning into a column subset of a data
# frame keeps the existing column names.
colnames(all_rs)[seq_len(nsamples)] <- paste0("sample", seq_len(nsamples))
all_rs$data_set <- df.icc.mixed$data_set
all_rs$metric <- df.icc.mixed$metric
# dim(all_rs)

# Same layout as all_rs.
all_iccs <- data.frame(
  matrix(NA, nrow = nrow(df.icc.mixed), ncol = nsamples)
)
colnames(all_iccs)[seq_len(nsamples)] <- paste0("sample", seq_len(nsamples))
all_iccs$data_set <- df.icc.mixed$data_set
all_iccs$metric <- df.icc.mixed$metric
# ---- Repeated draws: correlate each metric across paired recordings ----
# For every draw i, pick one eligible session per child together with its
# next session, then store the correlation of each metric across the two
# recordings in column i of all_rs.
for (i in seq_len(nsamples)) { # i = 1

  # For each child, sample 2 contiguous recordings that are less than
  # 2 months away.
  # Step 1: sample one session per child among the list of sessions that
  # are close by. We use the lena one because there is no difference across
  # lena & aclew in terms of which kids have which sessions, since in this
  # paper all recs have both lena and aclew, so it's just a question of
  # using one of them.
  close_sessions <- dist_contig_lena %>%
    group_by(child_id) %>%
    slice_sample(n = 1) %>%
    ungroup()
  # table(close_sessions$experiment) # these have to be similar
  # dim(close_sessions)

  for (j in seq_len(nrow(df.icc.mixed))) { # j = 1 # loops over all metrics
    # [[ ]] + as.character() yields a plain string even if the column was
    # read as a factor, so it can safely be used as a column name below.
    data_set <- as.character(df.icc.mixed[["data_set"]][j])
    metric <- as.character(df.icc.mixed[["metric"]][j])

    if (data_set == "aclew") {
      dat_for_cor <- mydat_aclew
    } else {
      dat_for_cor <- mydat_lena
    }

    # Steps 2 and 3: fetch the sampled session (rec1) and its next session
    # (rec2). match() keeps row k of rec1 and row k of rec2 on the same
    # sampled pair; filtering with %in% would return rows in table order,
    # which does not guarantee that the two recordings line up.
    # A session missing from dat_for_cor becomes an all-NA row, and
    # cor.test() drops incomplete pairs.
    rec1 <- dat_for_cor[
      match(close_sessions$session_id, dat_for_cor$session_id),
    ]
    rec2 <- dat_for_cor[
      match(close_sessions$next_session, dat_for_cor$session_id),
    ]

    all_rs[all_rs$data_set == data_set & all_rs$metric == metric, i] <-
      cor.test(rec1[, metric], rec2[, metric])$estimate
  }
}
# ---- Save the per-draw correlations and the eligible LENA pairings ----
# FALSE is spelled out because T/F are ordinary variables that can be
# reassigned.
write.csv(all_rs, "../data_output/all_rs.csv", row.names = FALSE)
write.csv(
  dist_contig_lena,
  "../data_output/dist_contig_lena.csv",
  row.names = FALSE
)
|