# SETUP ----
n_cores <- 40
library(lme4)
library(readr)
library(dplyr)
library(tidyr) # for crossing()
library(tibble)
library(purrr)
library(ggplot2)
library(faux)
library(Matrix)
library(parallel)
library(RhpcBLASctl)
theme_set(theme_bw() + theme(legend.position = "top"))
# restrict BLAS to one thread so the parallel cluster below does not oversubscribe cores
blas_set_num_threads(1)
# set.seed(1)
# rescale a vector to the range [0, 1]
norm01 <- function(x, ...) (x - min(x, ...)) / (max(x, ...) - min(x, ...))
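# e.g. norm01(c(2, 4, 6)) returns 0.0 0.5 1.0; the ... argument passes
# na.rm = TRUE (or similar) through to min() and max()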
# some labels for fixed effects, used for ease of reference in various places
fe_vars <- c("0", "cong", "pred", "int")
# function to convert a tibble or dataframe of correlations to a matrix (see tribbles below for an example)
cor_tbl2mat <- function(cor_tbl, vars_labs = fe_vars) {
  # initialise as matrix
  mat <- matrix(
    data = NA_real_,
    ncol = length(vars_labs),
    nrow = length(vars_labs)
  )
  # pull relevant values from the dataframe
  for (i_x in seq_along(vars_labs)) {
    for (j_x in seq_along(vars_labs)) {
      mat[i_x, j_x] <- if (i_x == j_x) {
        1
      } else {
        cor_tbl %>%
          filter(
            i %in% vars_labs[c(i_x, j_x)] &
              j %in% vars_labs[c(i_x, j_x)]
          ) %>%
          pull(cor) %>%
          unique()
      }
    }
  }
  # return result
  mat
}
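# usage sketch with illustrative values (not the simulation's parameters):
# an equal pairwise correlation of 0.2 gives a 4x4 matrix with 1s on the
# diagonal and 0.2 everywhere else
example_cors <- tribble(
  ~i,     ~j,     ~cor,
  "0",    "cong", 0.2,
  "0",    "pred", 0.2,
  "0",    "int",  0.2,
  "cong", "pred", 0.2,
  "cong", "int",  0.2,
  "pred", "int",  0.2
)
cor_tbl2mat(example_cors)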
# read the stimuli and derive predictability variables
stim_csv <- read_csv("stim_tidy_long.csv") %>%
  mutate(
    prop_agree = perc_name_agree / 100,
    pred_norm = norm01(prop_agree, na.rm = TRUE),
    pred_minned = prop_agree - min(prop_agree, na.rm = TRUE)
  )
# SIMULATION SETUP ----
# simulate the predictor coding, with expected effects
# (this is used to extract the predicted coefficients)
min_pred <- min(stim_csv$pred_norm, na.rm = TRUE)
max_pred <- max(stim_csv$pred_norm, na.rm = TRUE)
cells <- tribble(
  ~cong, ~pred,    ~uV,
  -0.5,  min_pred, -5,    # incongruent, lowest predictability
   0.5,  min_pred, -5,    # congruent, lowest predictability
  -0.5,  max_pred, -5,    # incongruent, highest predictability
   0.5,  max_pred, -4.25  # congruent, highest predictability
)
cells_m <- lm(uV ~ cong * pred, data = cells)
cells_coefs <- coef(cells_m) %>%
  round(10) # avoid floating point errors
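# sanity check: with four cells and four parameters, the model reproduces the
# assumed cell means exactly, so the coefficients encode the intended effects
stopifnot(isTRUE(all.equal(unname(predict(cells_m)), cells$uV)))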
# list which will contain values used in the simulation
sim <- list()
# set fixed effect parameters (values in uV)
sim$B0 <- cells_coefs[["(Intercept)"]] # intercept (typical peak N170 amplitude)
sim$Bcong <- cells_coefs[["cong"]] # effect of congruency
sim$Bpred <- cells_coefs[["pred"]] # effect of predictability
sim$Bint <- cells_coefs[["cong:pred"]] # interaction (effect of interest)
# set random intercepts
# subject intercepts are informed by prior N170 research
# item intercepts are informed by prior N170 research and a behavioural pilot,
# in which images (the highest level of the nested structure) captured most
# variance, while words and word-image combinations captured much smaller
# proportions of variance
# channel and subject-channel random effects are based on a reanalysis of the
# EEG data at 230 ms from Gagl (2020) using a maximal random effects structure
sim$S0_sd <- 2.5 # by-subject random intercept sd
sim$I0_sd <- 0.25 # by-image random intercept sd
sim$W0_sd <- 0.25 # by-word random intercept sd
# set random slopes, and correlations between random terms
# which random slopes are simulated depends on what the experiment's design allows us to model
# subject random slopes
sim$Scong_sd <- 0.75
sim$Spred_sd <- 1
sim$Sint_sd <- 1
# image random slopes
sim$Icong_sd <- 0.5
# # image random effects correlation
# sim$I0_cong_cor <- 0.74
# SD of residuals
sim$resid_sd <- 3
# sim$nS <- 50 # number of subjects (commented out as defined as a function argument)
sim$nI <- 200 # number of images
sim$nW <- sim$nI * 2 # number of words
# SIMULATE MODEL ----
# function for simulating data for N subjects
sim_experiment <- function(nS = 50, re_corr = 0) {

  sim$nS <- nS

  # image random effects correlation (set by the re_corr argument)
  sim$I0_cong_cor <- re_corr

  # subject random effects correlations (all set to re_corr)
  S0_cors <- tribble(
    ~i,     ~j,     ~cor,
    "0",    "cong", re_corr,
    "0",    "pred", re_corr,
    "0",    "int",  re_corr,
    "cong", "pred", re_corr,
    "cong", "int",  re_corr,
    "pred", "int",  re_corr
  )

  # initialise as matrix
  sim$Scor_mat <- cor_tbl2mat(S0_cors)

  # find the nearest positive-definite correlation matrix to the one generated above
  sim$Scor_mat <- Matrix::nearPD(sim$Scor_mat, corr = TRUE, keepDiag = TRUE, ensureSymmetry = TRUE, maxit = 10000) %>%
    with(mat)
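  # note: nearPD is a safeguard; a matrix assembled from arbitrary pairwise
  # correlations need not be positive definite, which faux::sim_design requires.
  # for the equal correlations used here (0 to 0.8), the matrix is already
  # positive definite, so nearPD should leave it essentially unchanged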

  # simulate image random effects
  images <- faux::sim_design(
    within = list(effect = c(
      I0 = "By-image random intercepts",
      Icong = "By-image congruency slopes"
    )),
    n = sim$nI,
    sd = c(sim$I0_sd, sim$Icong_sd),
    r = sim$I0_cong_cor,
    id = "image_id",
    plot = FALSE
  )

  # simulate subject random effects
  subjects <- faux::sim_design(
    within = list(effect = c(
      S0 = "By-subject random intercepts",
      Scong = "By-subject congruency slopes",
      Spred = "By-subject predictability slopes",
      Sint = "By-subject interaction slopes"
    )),
    n = sim$nS,
    sd = c(sim$S0_sd, sim$Scong_sd, sim$Spred_sd, sim$Sint_sd),
    r = sim$Scor_mat,
    id = "subj_id",
    plot = FALSE
  ) %>%
    mutate(stim_set = sample(rep(c(1, 2), length.out = nrow(.))))

  # get image-word combinations
  images_words <- stim_csv %>%
    mutate(
      item_id = row_number(),
      pred = pred_norm,
      condition = ifelse(condition == "A1", "congruent", "incongruent"),
      cong = as.numeric(scale(ifelse(condition == "incongruent", 0, 1), scale = FALSE, center = TRUE))
    ) %>%
    select(item_id, string, filename, condition, pred, cong) %>%
    rename(word_id = string, image_id = filename) %>%
    mutate(
      image_word_id = paste(image_id, word_id, sep = "."),
      stim_set = sample(rep(c(1, 2), length.out = nrow(.)))
    ) %>%
    group_by(image_id) %>%
    mutate(pred = max(pred, na.rm = TRUE)) %>%
    ungroup() %>%
    mutate(
      I0 = rep(images$I0, each = 2),
      Icong = rep(images$Icong, each = 2),
      W0 = rnorm(sim$nW, 0, sim$W0_sd)
    )
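  # note: rep(..., each = 2) above assumes stim_csv is ordered by image, with
  # exactly two word rows (one congruent, one incongruent) per image, so both
  # rows for an image share that image's random effects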

  # collate to simulate trials ----

  trials <- crossing(
    subj_id = subjects$subj_id,
    image_word_id = images_words$image_word_id
  ) %>%
    inner_join(subjects, by = "subj_id") %>%
    # joining by stim_set as well as image_word_id ensures each simulated
    # participant only sees the trials from their own stimulus set
    inner_join(images_words, by = c("image_word_id", "stim_set"))

  # simulate the outcome ----

  sim_data <- trials %>%
    mutate(
      resid = rnorm(nrow(.), mean = 0, sd = sim$resid_sd),
      uV = sim$B0 + I0 + W0 + S0 +
        (sim$Bcong + Icong + Scong) * cong +
        (sim$Bpred + Spred) * pred +
        (sim$Bint + Sint) * cong * pred +
        resid
    ) %>%
    select(image_id, word_id, image_word_id, subj_id, condition, cong, pred, uV)

  # simulate data loss ----

  sim_data_clean <- sim_data %>%
    # simulate 10% trial loss in total: 5% to errors and extreme RTs (based on
    # the behavioural pilot) and 5% to technical issues
    slice_sample(prop = 0.9) %>%
    # rebalance the deviation coding after data loss
    mutate(cong = as.numeric(scale(ifelse(condition == "incongruent", 0, 1), center = TRUE, scale = FALSE)))

  sim_data_clean
}
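# quick numerical check (illustrative): since the random effects have mean zero,
# a simple fixed-effects regression on one large simulated dataset should
# recover coefficients close to sim$B0, sim$Bcong, sim$Bpred, and sim$Bint
# coef(lm(uV ~ cong * pred, data = sim_experiment(nS = 200)))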
# visual check: by-subject regression lines for a small sample
sim_experiment(nS = 12) %>%
  ggplot(aes(pred, uV, colour = condition)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm", formula = y ~ x) +
  facet_wrap(vars(subj_id))
# visual check: average effects at two plausible sample sizes
sim_experiment(nS = 50) %>%
  ggplot(aes(pred, uV, colour = condition)) +
  geom_smooth(method = "lm", formula = y ~ x)
sim_experiment(nS = 150) %>%
  ggplot(aes(pred, uV, colour = condition)) +
  geom_smooth(method = "lm", formula = y ~ x)
# run a single simulation and extract the result
sim_results <- function(sim_nr = 1, nS = 50, re_corr = 0, alpha_lvl = 0.05, save = FALSE) {
  # simulate data
  d <- sim_experiment(nS = nS, re_corr = re_corr)
  # get the results and record how long it takes
  t <- system.time({
    # fit the model
    m <- lmer(
      uV ~ cong * pred +
        (cong * pred | subj_id) +
        (cong | image_id) +
        (1 | word_id),
      REML = FALSE,
      control = lmerControl(optimizer = "bobyqa"),
      data = d
    )
    # fit the model without the effect of interest
    m_no_int <- update(m, . ~ . - cong:pred)
    # if the comparison degenerates (a chi-square of exactly 0, with p of
    # exactly 1, is usually a good indicator of non-convergence), refit
    # without random correlations
    if (with(anova(m, m_no_int), Chisq)[[2]] == 0) {
      # refit the model with random correlations removed
      m <- lmer(
        uV ~ cong * pred +
          (cong * pred || subj_id) +
          (cong || image_id) +
          (1 | word_id),
        REML = FALSE,
        control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e8)),
        data = d
      )
      # fit the model without the effect of interest
      m_no_int <- update(m, . ~ . - cong:pred)
      # record that this has occurred
      corrs_removed <- TRUE
    } else {
      corrs_removed <- FALSE
    }
    # get the effect of interest
    int_estimate <- fixef(m)[["cong:pred"]]
    # likelihood-ratio model comparison to test the hypothesis
    chisq_int <- anova(m, m_no_int)
    chisq_value <- with(chisq_int, Chisq)[[2]]
    p_value <- with(chisq_int, `Pr(>Chisq)`)[[2]]
  })
  # record results
  res <- tibble(
    sim_nr = sim_nr,
    nS = nS,
    re_corr = re_corr,
    estimate = int_estimate,
    chisq = chisq_value,
    p = p_value,
    is_sig = p_value < alpha_lvl,
    corrs_removed = corrs_removed,
    time_taken = t[[3]]
  )

  if (save) {
    if (!dir.exists("simulations")) dir.create("simulations")
    out_path <- file.path("simulations", sprintf("power_%s_%s_%d.csv", Sys.info()[["nodename"]], round(as.numeric(Sys.time())), sim_nr))
    write_csv(res, out_path)
  }

  res
}
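# example of a single run (illustrative):
# sim_results(sim_nr = 1, nS = 20, re_corr = 0.2)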
# POWER ANALYSIS ----
# set up a cluster of workers, each with the required packages and objects
cl <- makeCluster(n_cores)
cl_packages <- clusterEvalQ(cl, {
  library(tidyverse)
  library(lme4)
  library(faux)
})
clusterExport(cl, c("sim_results", "sim_experiment", "stim_csv", "cor_tbl2mat", "fe_vars", "sim"))
subj_nrs <- rev(seq(10, 100, 5))
re_corr_vals <- seq(0, 0.8, 0.2)
n_sim <- 500
power <- map_df(subj_nrs, function(nS_i) {
  map_df(re_corr_vals, function(re_corr_i) {
    # run fewer simulations for non-zero random effects correlations
    n_sim_i <- if (re_corr_i == 0) n_sim else n_sim / 5
    cat(sprintf("Simulating %s results when N subjects = %s and random effects have corr of %s\n", n_sim_i, nS_i, re_corr_i))
    start <- Sys.time()
    res <- parLapply(cl, 1:n_sim_i, function(iter_i) sim_results(sim_nr = iter_i, nS = nS_i, re_corr = re_corr_i)) %>%
      reduce(bind_rows)
    end <- Sys.time()
    cat(" - finished in: ")
    print(end - start)
    # note: two-tailed p < .1 in the predicted (positive) direction is
    # equivalent to a one-tailed test at alpha = .05
    cat(sprintf(" - estimated power of %s\n", round(mean(res$p < 0.1 & res$estimate > 0), 3)))
    res
  })
})
stopCluster(cl)
write_csv(power, "power.csv")
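# a minimal sketch of how the saved results could be summarised and plotted
# (uses the same directional power criterion as the progress messages above)
power_summary <- power %>%
  group_by(nS, re_corr) %>%
  summarise(power = mean(p < 0.1 & estimate > 0), .groups = "drop")
ggplot(power_summary, aes(nS, power, colour = factor(re_corr))) +
  geom_line() +
  geom_point() +
  labs(x = "Number of subjects", y = "Estimated power", colour = "Random effects correlation")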