simulate-r-for-icc.R 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. library(simstudy)
  2. ## Functions ###----------
  3. check_small_var<-function(x,y,i) round(x[i],3)==round(y[i],3) & round(y[i],3) == 0
  4. fit_child_model<-function(dataframe, metric){
  5. # Fit formula where experiment is removed
  6. formula <- as.formula(paste0(metric, "~ (1|child_id)")) #removed age_s +
  7. model <- lmer(formula, data=dataframe)
  8. return (model)
  9. }
  10. extract_chi_variables<-function(model){
  11. icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
  12. icc.result.split <- c(as.data.frame(icc(model, by_group=TRUE))$ICC, NA)
  13. ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
  14. ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
  15. # chi, NA, residual
  16. ranefs_vars <-c(ranefs_vars[1],NA,ranefs_vars[2])
  17. ranefs_stdv <-c(ranefs_stdv[1],NA,ranefs_stdv[2])
  18. # chi, NA
  19. ns<-c(unlist(summary(model)$n),NA)
  20. return (c(#coefficients(summary(model))["age_s",],
  21. icc.result.mixed,
  22. icc.result.split,
  23. ranefs_vars,
  24. ranefs_stdv,
  25. nobs(model),ns))
  26. }
  27. extract_full_variables<-function(model){
  28. icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
  29. icc.result.split <- t(as.data.frame(icc(model, by_group=TRUE))$ICC)
  30. ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
  31. ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
  32. ns<-t(data.frame(summary(model)$n))
  33. return (c( # coefficients(summary(model))["age_s",],
  34. icc.result.mixed,
  35. icc.result.split,
  36. ranefs_vars,
  37. ranefs_stdv,
  38. nobs(model),ns))
  39. }
  40. new_fit_models_sim<-function(dataframe, data_set, metric, fit_full = TRUE){ #age,
  41. iqr = quantile(dataframe[,metric],.75,na.rm=T)-quantile(dataframe[,metric],.25,na.rm=T)
  42. if(fit_full){
  43. # Fit full model
  44. formula <- as.formula(paste0(metric, "~ (1|experiment/child_id)")) #removed age_s +
  45. model <- lmer(formula, data=dataframe)
  46. }
  47. if(fit_full & !isSingular(model)) # Fitted full model
  48. {
  49. form="full"
  50. sw=shapiro.test(resid(model))$p
  51. mod_variables <- extract_full_variables(model)
  52. # Build line
  53. } else {
  54. model <- fit_child_model(dataframe, metric)
  55. if(!isSingular(model)){
  56. form = "no_exp"
  57. sw=shapiro.test(resid(model))$p
  58. mod_variables <- extract_chi_variables(model)
  59. } else {
  60. form='no_chi_effect'
  61. sw = NA
  62. mod_variables = c(
  63. # NA,NA,NA, # c(coefficients(summary(model))["age_s",]
  64. NA,NA, # icc.result.mixed
  65. NA,NA, # icc.result.split
  66. NA,NA,NA, # ranefs_vars
  67. NA,NA,NA, # raners_std
  68. NA,NA,NA) # nobs (child, corpus), nobs
  69. }
  70. }
  71. icc.row = c(data_set, metric, iqr, mod_variables, form, sw) #age,
  72. return (icc.row)
  73. }
  74. ## Main loop ###----------
  75. #create matrix to hold info in
  76. df.vars.cols = c("data_set","experiment","n","metric", "mean","sd")
  77. df.vars = data.frame(matrix(ncol=length(df.vars.cols),nrow=0, dimnames=list(NULL, df.vars.cols)), stringsAsFactors=FALSE)
  78. ### Pull out the mean and SD for first day ###----------
  79. for (data_set in data_sets){ # data_set = "aclew"
  80. mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv'))
  81. metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
  82. #select down to first recording by child
  83. mydat$uchild_id=paste(mydat$experiment,mydat$child_id)
  84. mydat$child_id_age=paste(mydat$experiment,mydat$child_id,mydat$age)
  85. mydat=mydat[order(mydat$experiment,mydat$child_id,mydat$age),] #sort by child ID & age
  86. mydat=mydat[!duplicated(mydat$uchild_id),] #keep only the first line for each child
  87. means=stack(aggregate(mydat[,metrics],by=list(mydat$experiment),mean,na.rm=T)[,-1])
  88. sds=stack(aggregate(mydat[,metrics],by=list(mydat$experiment),sd,na.rm=T)[-1])
  89. df.vars=rbind(df.vars,
  90. cbind(data_set,levels(factor(mydat$experiment)),data.frame(table(mydat$experiment))$Freq,means[,c(2,1)],sds[,-2]))
  91. }
  92. colnames(df.vars)<-df.vars.cols
  93. ### Generate second day with varying levels of association ###----------
  94. alldays=NULL
  95. for(i in 1:nrow(df.vars)) for(myr in c(.1,.3,.5, .7, .9)){#i=2;myr=.5
  96. #use a while loop to make sure data generated are close to the target r
  97. C <- matrix(c( 1, myr,myr,1), nrow=2)
  98. simulation_unsatisfactory <- TRUE
  99. while(simulation_unsatisfactory==TRUE){
  100. try.this <- as.data.frame(
  101. genCorData(df.vars$n[i], mu = c(df.vars$mean[i],df.vars$mean[i]), sigma = df.vars$sd[i], corMatrix = C) )
  102. try.this <- try.this[,-1] #remove the ID column, since it's useless
  103. sim.cor <- cor.test(try.this$V1,try.this$V2)$estimate
  104. simulation_unsatisfactory = !(abs(sim.cor-myr)<0.01)
  105. }
  106. thisdat=cbind(df.vars$data_set[i],df.vars$experiment[i],1:df.vars$n[i],stack(try.this),as.character(df.vars$metric[i]),myr)
  107. colnames(thisdat)<-c("data_set","experiment","child_id","value","day","metric","myr")
  108. alldays=rbind(alldays,thisdat)
  109. }#end i
  110. write.csv(alldays,"../output/simulated_correlated_2day.csv",row.names=F)
  111. ### Estimate ICCs for paired observations for the whole dataset ###----------
  112. read.csv("../output/simulated_correlated_2day.csv")->alldays
  113. df.icc.mixed.cols = c("data_set", "metric", "iqr", # removed "age_bin",
  114. #"age_b","age_se","age_t", # beta, standard error, T #removed age
  115. "icc_adjusted", "icc_conditional",
  116. "icc_child_id", "icc_corpus",
  117. "child_id_var","corpus_var","residual_var",
  118. "child_id_sd","corpus_sd","residual_sd",
  119. "nobs","nchi", "ncor",
  120. "formula","sw","myr")
  121. df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)),
  122. stringsAsFactors = FALSE)
  123. for(myr in levels(factor(alldays$myr)))for(data_set in levels(factor(alldays$data_set))) for(metric in levels(factor(alldays$metric[alldays$data_set==data_set]))) { # myr=.5 ; data_set = "aclew"; metric="wc_adu_ph"
  124. #select data
  125. mydat=alldays[alldays$myr==myr & alldays$data_set==data_set & alldays$metric==metric,]
  126. #reshape mydat so that it fits expectation from the following function
  127. colnames(mydat)[4]<-metric
  128. icc.row <- new_fit_models_sim(mydat, data_set, metric, TRUE) #removed age NA,
  129. df.icc.mixed[nrow(df.icc.mixed) + 1,] <- cbind(icc.row,myr)
  130. }
  131. write.csv(df.icc.mixed,"../output/df.icc.simu.csv",row.names=F)
  132. ### Estimate ICCs for paired observations halving the dataset, to see if obs N affects results ###----------
  133. df.icc.mixed.cols = c("data_set", "metric", "iqr", # removed "age_bin",
  134. #"age_b","age_se","age_t", # beta, standard error, T #removed age
  135. "icc_adjusted", "icc_conditional",
  136. "icc_child_id", "icc_corpus",
  137. "child_id_var","corpus_var","residual_var",
  138. "child_id_sd","corpus_sd","residual_sd",
  139. "nobs","nchi", "ncor",
  140. "formula","sw","myr")
  141. df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)),
  142. stringsAsFactors = FALSE)
  143. for(myr in levels(factor(alldays$myr)))for(data_set in levels(factor(alldays$data_set))) for(metric in levels(factor(alldays$metric[alldays$data_set==data_set]))) { # myr=.5 ; data_set = "aclew"; metric="wc_adu_ph"
  144. #select data
  145. mydat=alldays[alldays$myr==myr & alldays$data_set==data_set & alldays$metric==metric,]
  146. mydat$child_id=factor(mydat$child_id)
  147. mydat=mydat[mydat$child_id %in% sample(levels(mydat$child_id),size=round(length(levels(mydat$child_id))/2)),] #sample half of the kids
  148. #reshape mydat so that it fits expectation from the following function
  149. colnames(mydat)[4]<-metric
  150. icc.row <- new_fit_models_sim(mydat, data_set, metric, TRUE) #removed age NA,
  151. df.icc.mixed[nrow(df.icc.mixed) + 1,] <- cbind(icc.row,myr)
  152. }
  153. write.csv(df.icc.mixed,"../output/df.icc.simu_halved.csv",row.names=F)
  154. read.csv("../output/df.icc.simu.csv")->full
  155. plot(full$icc_child_id~df.icc.mixed$icc_child_id)