123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255 |
- ---
- title: "Simulation test"
- output: html_document
- ---
- ```{r setup, include=FALSE}
- knitr::opts_chunk$set(echo = TRUE)
- library(simstudy)
- library(lme4)
- library("performance") # ICC
- library(ggplot2)
- RECALC=TRUE
- data_sets = c('aclew', 'lena')
- check_small_var<-function(x,y,i) round(x[i],3)==round(y[i],3) & round(y[i],3) == 0
- fit_child_model<-function(dataframe, metric){
- # Fit formula where experiment is removed
- formula <- as.formula(paste0(metric, "~ (1|child_id)")) #removed age_s +
- model <- lmer(formula, data=dataframe)
-
- return (model)
- }
- extract_chi_variables<-function(model){
- icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
- icc.result.split <- c(as.data.frame(icc(model, by_group=TRUE))$ICC, NA)
-
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
- # chi, NA, residual
- ranefs_vars <-c(ranefs_vars[1],NA,ranefs_vars[2])
- ranefs_stdv <-c(ranefs_stdv[1],NA,ranefs_stdv[2])
- # chi, NA
- ns<-c(unlist(summary(model)$n),NA)
- return (c(#coefficients(summary(model))["age_s",],
- icc.result.mixed,
- icc.result.split,
- ranefs_vars,
- ranefs_stdv,
- nobs(model),ns))
- }
- extract_full_variables<-function(model){
- icc.result.mixed <- c(icc(model)$ICC_adjusted,icc(model)$ICC_conditional)
- icc.result.split <- t(as.data.frame(icc(model, by_group=TRUE))$ICC)
- ranefs_vars <- t(as.data.frame(VarCorr(model))["vcov"])
- ranefs_stdv <- t(as.data.frame(VarCorr(model))["sdcor"])
-
- ns<-t(data.frame(summary(model)$n))
-
- return (c( # coefficients(summary(model))["age_s",],
- icc.result.mixed,
- icc.result.split,
- ranefs_vars,
- ranefs_stdv,
- nobs(model),ns))
- }
- new_fit_models<-function(dataframe, data_set, metric, fit_full = TRUE){ #age,
- iqr = quantile(dataframe[,metric],.75,na.rm=T)-quantile(dataframe[,metric],.25,na.rm=T)
-
- if(fit_full){
- # Fit full model
- formula <- as.formula(paste0(metric, "~ (1|experiment/child_id)")) #removed age_s +
- model <- lmer(formula, data=dataframe)
- }
-
-
- if(fit_full & !isSingular(model)) # Fitted full model
- {
- form="full"
-
- sw=shapiro.test(resid(model))$p
- mod_variables <- extract_full_variables(model)
-
- # Build line
- } else {
- model <- fit_child_model(dataframe, metric)
- if(!isSingular(model)){
-
- form = "no_exp"
- sw=shapiro.test(resid(model))$p
- mod_variables <- extract_chi_variables(model)
-
- } else {
- form='no_chi_effect'
- sw = NA
- mod_variables = c(
- # NA,NA,NA, # c(coefficients(summary(model))["age_s",]
- NA,NA, # icc.result.mixed
- NA,NA, # icc.result.split
- NA,NA,NA, # ranefs_vars
- NA,NA,NA, # raners_std
- NA,NA,NA) # nobs (child, corpus), nobs
- }
- }
-
- icc.row = c(data_set, metric, iqr, mod_variables, form, sw) #age,
- return (icc.row)
- }
- ```
- ## Assumptions behind the simulation
- We will inform the simulation by the data we have as follows:
- - we'll have the same N of corpora, and of children in each corpus
- - we'll have the same variables for each -- and these variables will have the same mean & SD for day 1 of recordings as in observed data
- We'll depart from reality as follows:
- - we will not consider the r across multiple days observed in the data, but instead generate data points to vary r from a small correlation (r=.1), a moderate one (r=.3), and a larger one (r=.5) -- higher values of r do not seem reasonable given what we know about infant measures
- - we will not consider child age, nor variable re-recording periods
- - we will have a single pair of recordings (rather than variable N of re-recordings)
- ## Implementation approach
- We use simstudy, a package created for such simulations, following the vignette https://cran.r-project.org/web/packages/simstudy/vignettes/correlated.html to create correlated data providing a correlation matrix
- ```{r extract-parameters, eval=RECALC}
- # Columns that should not be scaled or taken into account as metrics
- no.scale.columns <- c('experiment', 'session_id', 'child_id','child_id_unique','age_s',
- 'date_iso', 'child_dob', 'missing_audio',"age_bin","duration","usession_id",
- "normative","age","duration_alice", "duration_vcm" , "duration_vtc","duration_its" )
- #create matrix to hold info in
- df.vars.cols = c("data_set","experiment","n","metric", "mean","sd")
- df.vars = data.frame(matrix(ncol=length(df.vars.cols),nrow=0, dimnames=list(NULL, df.vars.cols)), stringsAsFactors=FALSE)
- for (data_set in data_sets){ # data_set = "aclew"
- mydat <- read.csv(paste0('../data_output/', data_set,'_metrics_scaled.csv')) # TO DISCUSS: scaled or unscaled?
- metrics = colnames(mydat)[!is.element(colnames(mydat), no.scale.columns)]
-
- #select down to first recording by child
- mydat$uchild_id=paste(mydat$experiment,mydat$child_id)
- mydat$child_id_age=paste(mydat$experiment,mydat$child_id,mydat$age)
- mydat=mydat[order(mydat$experiment,mydat$child_id,mydat$age),] #sort by child ID & age
- mydat=mydat[!duplicated(mydat$uchild_id),] #keep only the first line for each child
-
-
- means=stack(aggregate(mydat[,metrics],by=list(mydat$experiment),mean,na.rm=T)[,-1])
- sds=stack(aggregate(mydat[,metrics],by=list(mydat$experiment),sd,na.rm=T)[-1])
-
- df.vars=rbind(df.vars,
- cbind(data_set,levels(factor(mydat$experiment)),data.frame(table(mydat$experiment))$Freq,means[,c(2,1)],sds[,-2]))
-
- }
- colnames(df.vars)<-df.vars.cols
- ```
- ```{r generate-data, eval=RECALC}
- alldays=NULL
- for(i in 1:nrow(df.vars)) for(myr in c(.1,.3,.5, .7, .9)){#i=2;myr=.5
- #use a while loop to make sure data generated are close to the target r
- C <- matrix(c( 1, myr,myr,1), nrow=2)
- simulation_unsatisfactory <- TRUE
- while(simulation_unsatisfactory==TRUE){
- try.this <- as.data.frame(
- genCorData(df.vars$n[i], mu = c(df.vars$mean[i],df.vars$mean[i]), sigma = df.vars$sd[i], corMatrix = C) )
- try.this <- try.this[,-1] #remove the ID column, since it's useless
- sim.cor <- cor.test(try.this$V1,try.this$V2)$estimate
- simulation_unsatisfactory = !(abs(sim.cor-myr)<0.01)
- }
- thisdat=cbind(df.vars$data_set[i],df.vars$experiment[i],1:df.vars$n[i],stack(try.this),as.character(df.vars$metric[i]),myr)
- colnames(thisdat)<-c("data_set","experiment","child_id","value","day","metric","myr")
- alldays=rbind(alldays,thisdat)
- }#end i
- write.csv(alldays,"../output/simulated_correlated_2day.csv",row.names=F)
- ```
- ## Create ICC dataframe
- ```{r, eval=RECALC}
- read.csv("../output/simulated_correlated_2day.csv")->alldays
- df.icc.mixed.cols = c("data_set", "metric", "iqr", # removed "age_bin",
- #"age_b","age_se","age_t", # beta, standard error, T #removed age
- "icc_adjusted", "icc_conditional",
- "icc_child_id", "icc_corpus",
- "child_id_var","corpus_var","residual_var",
- "child_id_sd","corpus_sd","residual_sd",
- "nobs","nchi", "ncor",
- "formula","sw","myr")
- df.icc.mixed = data.frame(matrix(ncol=length(df.icc.mixed.cols),nrow=0, dimnames=list(NULL, df.icc.mixed.cols)),
- stringsAsFactors = FALSE)
- for(myr in levels(factor(alldays$myr)))for(data_set in levels(factor(alldays$data_set))) for(metric in levels(factor(alldays$metric[alldays$data_set==data_set]))) { # myr=.5 ; data_set = "aclew"; metric="wc_adu_ph"
- #select data
- mydat=alldays[alldays$myr==myr & alldays$data_set==data_set & alldays$metric==metric,]
- #reshape mydat so that it fits expectation from the following function
- colnames(mydat)[4]<-metric
- icc.row <- new_fit_models(mydat, data_set, metric, TRUE) #removed age NA,
- df.icc.mixed[nrow(df.icc.mixed) + 1,] <- cbind(icc.row,myr)
-
- }
- write.csv(df.icc.mixed,"../output/df.icc.simu.csv",row.names=F)
- ```
- ## Analyze ICCs
- What is the relationship between ICC and r? It looks like they are similar, but ICC undershoots. But just a little - nothing like squaring.
- ```{r}
- read.csv("../output/df.icc.simu.csv")->df.icc.mixed
- plot(df.icc.mixed$icc_child_id~df.icc.mixed$myr)
- ```
- What is the relationship between iqr & ICC? None
- ```{r}
- ggplot(df.icc.mixed, aes(iqr,icc_child_id, color=myr)) + geom_point()
- ```
- What is the relationship between iqr & SW? None
- ```{r}
- ggplot(df.icc.mixed, aes(icc_child_id, sw, color=myr)) + geom_point()
- ```
|