library(simstudy) ## Functions ###---------- check_small_var<-function(x,y,i) round(x[i],3)==round(y[i],3) & round(y[i],3) == 0 fit_child_model<-function(dataframe, metric){ # Fit formula where experiment is removed formula <- as.formula(paste0(metric, "~ (1|child_id)")) #removed age_s + model <- lmer(formula, data=dataframe) return (model) } extract_chi_variables<-function(model){ icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional) icc.result.split <- c(as.data.frame(icc(model, by_group=TRUE))$ICC, NA) ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"]) ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"]) # chi, NA, residual ranefs_vars <-c(ranefs_vars[1],NA,ranefs_vars[2]) ranefs_stdv <-c(ranefs_stdv[1],NA,ranefs_stdv[2]) # chi, NA ns<-c(unlist(summary(model)$n),NA) return (c(#coefficients(summary(model))["age_s",], icc.result.mixed, icc.result.split, ranefs_vars, ranefs_stdv, nobs(model),ns)) } extract_full_variables<-function(model){ icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional) icc.result.split <- t(as.data.frame(icc(model, by_group=TRUE))$ICC) ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"]) ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"]) ns<-t(data.frame(summary(model)$n)) return (c( # coefficients(summary(model))["age_s",], icc.result.mixed, icc.result.split, ranefs_vars, ranefs_stdv, nobs(model),ns)) } new_fit_models_sim<-function(dataframe, data_set, metric, fit_full = TRUE){ #age, iqr = quantile(dataframe[,metric],.75,na.rm=T)-quantile(dataframe[,metric],.25,na.rm=T) if(fit_full){ # Fit full model formula <- as.formula(paste0(metric, "~ (1|experiment/child_id)")) #removed age_s + model <- lmer(formula, data=dataframe) } if(fit_full & !isSingular(model)) # Fitted full model { form="full" sw=shapiro.test(resid(model))$p mod_variables <- extract_full_variables(model) # Build line } else { model <- fit_child_model(dataframe, metric) if(!isSingular(model)){ form = "no_exp" sw=shapiro.test(resid(model))$p mod_variables <- extract_chi_variables(model) } else { form='no_chi_effect' sw = NA mod_variables = c( # NA,NA,NA, # c(coefficients(summary(model))["age_s",] NA,NA, # icc.result.mixed NA,NA, # icc.result.split NA,NA,NA, # ranefs_vars NA,NA,NA, # raners_std NA,NA,NA) # nobs (child, corpus), nobs } } icc.row = c(data_set, metric, iqr, mod_variables, form, sw) #age, return (icc.row) } ## Main loop ###---------- #create matrix to hold info in df.vars.cols = c("data_set","experiment","n","metric", "mean","sd") df.vars = data.frame(matrix(ncol=length(df.vars.cols),nrow=0, dimnames=list(NULL, df.vars.cols)), stringsAsFactors=FALSE) ### Pull out the mean and SD for first day ###---------- for (data_set in data_sets){ # data_set = "aclew" mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv')) metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)] #select down to first recording by child mydat$uchild_id=paste(mydat$experiment,mydat$child_id) mydat$child_id_age=paste(mydat$experiment,mydat$child_id,mydat$age) mydat=mydat[order(mydat$experiment,mydat$child_id,mydat$age),] #sort by child ID & age mydat=mydat[!duplicated(mydat$uchild_id),] #keep only the first line for each child means=stack(aggregate(mydat[,metrics],by=list(mydat$experiment),mean,na.rm=T)[,-1]) sds=stack(aggregate(mydat[,metrics],by=list(mydat$experiment),sd,na.rm=T)[-1]) df.vars=rbind(df.vars, cbind(data_set,levels(factor(mydat$experiment)),data.frame(table(mydat$experiment))$Freq,means[,c(2,1)],sds[,-2])) } colnames(df.vars)<-df.vars.cols ### Generate second day with varying levels of association ###---------- alldays=NULL for(i in 1:nrow(df.vars)) for(myr in c(.1,.3,.5, .7, .9)){#i=2;myr=.5 #use a while loop to make sure data generated are close to the target r C <- matrix(c( 1, myr,myr,1), nrow=2) simulation_unsatisfactory <- TRUE while(simulation_unsatisfactory==TRUE){ try.this <- as.data.frame( genCorData(df.vars$n[i], mu = c(df.vars$mean[i],df.vars$mean[i]), sigma = df.vars$sd[i], corMatrix = C) ) try.this <- try.this[,-1] #remove the ID column, since it's useless sim.cor <- cor.test(try.this$V1,try.this$V2)$estimate simulation_unsatisfactory = !(abs(sim.cor-myr)<0.01) } thisdat=cbind(df.vars$data_set[i],df.vars$experiment[i],1:df.vars$n[i],stack(try.this),as.character(df.vars$metric[i]),myr) colnames(thisdat)<-c("data_set","experiment","child_id","value","day","metric","myr") alldays=rbind(alldays,thisdat) }#end i write.csv(alldays,"../output/simulated_correlated_2day.csv",row.names=F) ### Estimate ICCs for paired observations for the whole dataset ###---------- read.csv("../output/simulated_correlated_2day.csv")->alldays df.icc.mixed.cols = c("data_set", "metric", "iqr", # removed "age_bin", #"age_b","age_se","age_t", # beta, standard error, T #removed age "icc_adjusted", "icc_conditional", "icc_child_id", "icc_corpus", "child_id_var","corpus_var","residual_var", "child_id_sd","corpus_sd","residual_sd", "nobs","nchi", "ncor", "formula","sw","myr") df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)), stringsAsFactors = FALSE) for(myr in levels(factor(alldays$myr)))for(data_set in levels(factor(alldays$data_set))) for(metric in levels(factor(alldays$metric[alldays$data_set==data_set]))) { # myr=.5 ; data_set = "aclew"; metric="wc_adu_ph" #select data mydat=alldays[alldays$myr==myr & alldays$data_set==data_set & alldays$metric==metric,] #reshape mydat so that it fits expectation from the following function colnames(mydat)[4]<-metric icc.row <- new_fit_models_sim(mydat, data_set, metric, TRUE) #removed age NA, df.icc.mixed[nrow(df.icc.mixed) + 1,] <- cbind(icc.row,myr) } write.csv(df.icc.mixed,"../output/df.icc.simu.csv",row.names=F) ### Estimate ICCs for paired observations halving the dataset, to see if obs N affects results ###---------- df.icc.mixed.cols = c("data_set", "metric", "iqr", # removed "age_bin", #"age_b","age_se","age_t", # beta, standard error, T #removed age "icc_adjusted", "icc_conditional", "icc_child_id", "icc_corpus", "child_id_var","corpus_var","residual_var", "child_id_sd","corpus_sd","residual_sd", "nobs","nchi", "ncor", "formula","sw","myr") df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)), stringsAsFactors = FALSE) for(myr in levels(factor(alldays$myr)))for(data_set in levels(factor(alldays$data_set))) for(metric in levels(factor(alldays$metric[alldays$data_set==data_set]))) { # myr=.5 ; data_set = "aclew"; metric="wc_adu_ph" #select data mydat=alldays[alldays$myr==myr & alldays$data_set==data_set & alldays$metric==metric,] mydat$child_id=factor(mydat$child_id) mydat=mydat[mydat$child_id %in% sample(levels(mydat$child_id),size=round(length(levels(mydat$child_id))/2)),] #sample half of the kids #reshape mydat so that it fits expectation from the following function colnames(mydat)[4]<-metric icc.row <- new_fit_models_sim(mydat, data_set, metric, TRUE) #removed age NA, df.icc.mixed[nrow(df.icc.mixed) + 1,] <- cbind(icc.row,myr) } write.csv(df.icc.mixed,"../output/df.icc.simu_halved.csv",row.names=F) read.csv("../output/df.icc.simu.csv")->full plot(full$icc_child_id~df.icc.mixed$icc_child_id)