Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

01-power_analysis.R 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. # SETUP ----
  2. n_cores <- 40
  3. library(lme4)
  4. library(readr)
  5. library(dplyr)
  6. library(tibble)
  7. library(purrr)
  8. library(faux)
  9. library(Matrix)
  10. library(parallel)
  11. library(RhpcBLASctl)
  12. ggplot2::theme_set(theme_bw() + theme(legend.position="top"))
  13. blas_set_num_threads(1)
  14. # set.seed(1)
  15. norm01 <- function(x, ...) (x-min(x, ...))/(max(x, ...)-min(x, ...))
  16. # some labels for fixed effects used for ease of reference in various places
  17. fe_vars <- c("0", "cong", "pred", "int")
  18. # function to convert a tibble or dataframe of correlations to a matrix (see tribbles below for example)
  19. cor_tbl2mat <- function(cor_tbl, vars_labs=fe_vars) {
  20. # initialise as matrix
  21. mat <- matrix(
  22. data = rep(NA, length(vars_labs)**2),
  23. ncol = length(vars_labs),
  24. nrow = length(vars_labs)
  25. )
  26. # pull relevant values from dataframe
  27. for (i_x in 1:length(vars_labs)) {
  28. for (j_x in 1:length(vars_labs)) {
  29. mat[i_x, j_x] <- if (i_x==j_x) {
  30. 1
  31. } else {
  32. cor_tbl %>%
  33. filter(
  34. i %in% vars_labs[c(i_x, j_x)] &
  35. j %in% vars_labs[c(i_x, j_x)]
  36. ) %>%
  37. pull(cor) %>%
  38. unique()
  39. }
  40. }
  41. }
  42. # return result
  43. mat
  44. }
  45. stim_csv <- read_csv("stim_tidy_long.csv") %>%
  46. mutate(
  47. prop_agree = perc_name_agree/100,
  48. pred_norm = norm01(prop_agree, na.rm=TRUE),
  49. pred_minned = prop_agree - min(prop_agree, na.rm=TRUE)
  50. )
  51. # SIMULATION SETUP ----
  52. # simulate the predictor coding, with expected effects
  53. # (this is used to extract predicted coefficient)
  54. min_pred <- min(stim_csv$pred_norm, na.rm=T)
  55. max_pred <- max(stim_csv$pred_norm, na.rm=T)
  56. cells <- tribble(
  57. ~cong, ~pred, ~uV,
  58. -0.5, min_pred, -5, # incongruent, lowest predictability
  59. 0.5, min_pred, -5, # congruent, lowest predictability
  60. -0.5, max_pred, -5, # incongruent, highest predictability
  61. 0.5, max_pred, -4.25 # congruent, highest predictability
  62. )
  63. cells_m <- lm(uV ~ cong * pred, data=cells)
  64. cells_coefs <- coef(cells_m) %>%
  65. round(10) # avoid floating point errors
  66. # list which will contain values used in the simulation
  67. sim <- list()
  68. # set fixed effect parameters (values in uV)
  69. sim$B0 <- cells_coefs[["(Intercept)"]] # intercept (typical peak n170 amplitude)
  70. sim$Bcong <- cells_coefs[["cong"]] # effect of congruency
  71. sim$Bpred <- cells_coefs[["pred"]] # effect of predictability
  72. sim$Bint <- cells_coefs[["cong:pred"]] # interaction (effect of interest)
  73. # set random intercepts
  74. # subject intercepts are informed by prior n170 research
  75. # item intercepts are informed by prior n170 research and a behavioural pilot, in which images (the highest level of the nested structure) captured most variance, and words and word-image combinations captured much smaller proportions of variance
  76. # channel and subject-channel combination random effects are based on a reanalysis of the EEG data at 230 ms from Gagl (2020) using a maximal random effects structure
  77. sim$S0_sd <- 2.5 # by-subject random intercept sd
  78. sim$I0_sd <- 0.25 # by-image random intercept sd
  79. sim$W0_sd <- 0.25 # by-word random intercept sd
  80. # set random slopes, and correlations between random terms
  81. # Which random slopes are simulated depends on what the experiment's design allows us to model
  82. # subject random slopes
  83. sim$Scong_sd <- 0.75
  84. sim$Spred_sd <- 1
  85. sim$Sint_sd <- 1
  86. # image random slopes
  87. sim$Icong_sd <- 0.5
  88. # # image random effects correlation
  89. # sim$I0_cong_cor <- 0.74
  90. # SD of residuals
  91. sim$resid_sd <- 3
  92. # sim$nS <- 50 # number of subjects (commented out as defined as function argument)
  93. sim$nI <- 200 # number of images
  94. sim$nW <- sim$nI * 2 # number of words
  95. # SIMULATE MODEL ----
  96. # function for simulating N subjects' data
  97. sim_experiment <- function(nS=50, re_corr=0) {
  98. sim$nS <- nS
  99. # image random effects correlation (randomised)
  100. sim$I0_cong_cor <- re_corr
  101. # subject random correlations (randomised)
  102. S0_cors <- tribble(
  103. ~i, ~j, ~cor,
  104. "0", "cong", re_corr,
  105. "0", "pred", re_corr,
  106. "0", "int", re_corr,
  107. "cong", "pred", re_corr,
  108. "cong", "int", re_corr,
  109. "pred", "int", re_corr
  110. )
  111. # initialise as matrix
  112. sim$Scor_mat <- cor_tbl2mat(S0_cors)
  113. # find nearest positive definite var-covar matrix to that generated from random correlations
  114. sim$Scor_mat <- Matrix::nearPD(sim$Scor_mat, corr=TRUE, keepDiag=TRUE, ensureSymmetry=TRUE, maxit=10000) %>%
  115. with(mat)
  116. # simulate item random effects
  117. images <- faux::sim_design(
  118. within = list(effect = c(
  119. I0 = "By-image random intercepts",
  120. Icong = "By-subject interaction slopes"
  121. )),
  122. n = sim$nI,
  123. sd = c(sim$I0_sd, sim$Icong_sd),
  124. r = sim$I0_cong_cor,
  125. id = "image_id",
  126. plot = FALSE
  127. )
  128. # simulate subject random effects
  129. subjects <- faux::sim_design(
  130. within = list(effect = c(
  131. S0 = "By-subject random intercepts",
  132. Scong = "By-subject congruency slopes",
  133. Spred = "By-subject predictability slopes",
  134. Sint = "By-subject interaction slopes"
  135. )),
  136. n = sim$nS,
  137. sd = c(sim$S0_sd, sim$Scong_sd, sim$Spred_sd, sim$Sint_sd),
  138. r = sim$Scor_mat,
  139. id = "subj_id",
  140. plot = FALSE
  141. ) %>%
  142. mutate(stim_set = sample(rep(c(1, 2), length.out=nrow(.))))
  143. # get image-word combinations
  144. images_words <- stim_csv %>%
  145. mutate(
  146. item_id = row_number(),
  147. pred = pred_norm,
  148. condition = ifelse(condition=="A1", "congruent", "incongruent"),
  149. cong = scale(ifelse(condition=="incongruent", 0, 1), scale=FALSE, center=TRUE)
  150. ) %>%
  151. select(item_id, string, filename, condition, pred, cong) %>%
  152. rename(word_id=string, image_id=filename) %>%
  153. mutate(
  154. image_word_id = paste(image_id, word_id, sep="."),
  155. stim_set = sample(rep(c(1, 2), length.out=nrow(.)))
  156. ) %>%
  157. group_by(image_id) %>%
  158. mutate(pred = max(pred, na.rm=TRUE)) %>%
  159. ungroup() %>%
  160. mutate(
  161. I0 = rep(images$I0, each=2),
  162. Icong = rep(images$Icong, each=2),
  163. W0 = rnorm(sim$nW, 0, sim$W0_sd)
  164. )
  165. # collate to simulate trials ----
  166. trials <- crossing(
  167. subj_id = subjects$subj_id,
  168. image_word_id = images_words$image_word_id
  169. ) %>%
  170. inner_join(subjects, by="subj_id") %>%
  171. inner_join(images_words, by=c("image_word_id", "stim_set")) # joining by stim_set as well as the image_word_id ensures each simulated participant only sees their trials
  172. # simulate the outcome ----
  173. sim_data <- trials %>%
  174. mutate(
  175. resid = rnorm(nrow(.), mean=0, sd=sim$resid_sd),
  176. uV = sim$B0 + I0 + W0 + S0 +
  177. (sim$Bcong + Icong + Scong) * cong +
  178. (sim$Bpred + Spred) * pred +
  179. (sim$Bint + Sint) * cong * pred +
  180. resid
  181. ) %>%
  182. select(image_id, word_id, image_word_id, subj_id, condition, cong, pred, uV)
  183. # simulate data loss ----
  184. # use full model of trial accuracy to predict whether each trial is responded to correctly, then exclude incorrect responses
  185. sim_data_clean <- sim_data %>%
  186. # simulate 5% data loss to errors and extreme RTs (based on behavioural pilot) and 5% to technical issues
  187. slice_sample(prop = 0.9) %>%
  188. # rebalance deviation coding
  189. mutate(cong = scale(ifelse(condition=="incongruent", 0, 1), center=TRUE, scale=FALSE))
  190. sim_data_clean
  191. }
  192. sim_experiment(nS = 12) %>%
  193. ggplot(aes(pred, uV, colour=condition)) +
  194. geom_point(alpha = 0.1) +
  195. geom_smooth(method="lm", formula=y~x) +
  196. facet_wrap(vars(subj_id))
  197. sim_experiment(nS = 50) %>%
  198. ggplot(aes(pred, uV, colour=condition)) +
  199. geom_smooth(method="lm", formula=y~x)
  200. sim_experiment(nS = 150) %>%
  201. ggplot(aes(pred, uV, colour=condition)) +
  202. geom_smooth(method="lm", formula=y~x)
  203. sim_results <- function(sim_nr=1, nS=50, re_corr=0, alpha_lvl=0.05, save=FALSE) {
  204. # simulate data
  205. d <- sim_experiment(nS = nS, re_corr = re_corr)
  206. # get the results and record how long it takes
  207. t <- system.time({
  208. # fit model
  209. m <- lmer(
  210. uV ~ cong * pred +
  211. (cong * pred | subj_id) +
  212. (cong | image_id) +
  213. (1 | word_id),
  214. REML=FALSE,
  215. control = lmerControl(optimizer="bobyqa"),
  216. data=d
  217. )
  218. # fit model without the effect of interest
  219. m_no_int <- update(m, .~. -cong:pred)
  220. # if non-convergence, refit without random correlations - a chisquare of 0 (and p of exactly 1) is usually a good indicator
  221. if (with(anova(m, m_no_int), Chisq)[[2]]==0) {
  222. # fit model
  223. m <- lmer(
  224. uV ~ cong * pred +
  225. (cong * pred || subj_id) +
  226. (cong || image_id) +
  227. (1 | word_id),
  228. REML=FALSE,
  229. control = lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=1e8)),
  230. data=d
  231. )
  232. # fit model without the effect of interest
  233. m_no_int <- update(m, ~. -cong:pred)
  234. # record that this has occurred
  235. corrs_removed <- TRUE
  236. } else {
  237. corrs_removed <- FALSE
  238. }
  239. # get the effect of interest
  240. int_estimate <- fixef(m)[["cong:pred"]]
  241. # model comparison to test hypothesis
  242. chisq_int <- anova(m, m_no_int)
  243. chisq_value <- with(chisq_int, Chisq)[[2]]
  244. p_value <- with(chisq_int, `Pr(>Chisq)`)[[2]]
  245. })
  246. # record results
  247. res <- tibble(
  248. sim_nr = sim_nr,
  249. nS = nS,
  250. re_corr = re_corr,
  251. estimate = int_estimate,
  252. chisq = chisq_value,
  253. p = p_value,
  254. is_sig = p_value<alpha_lvl,
  255. corrs_removed = corrs_removed,
  256. time_taken = t[3]
  257. )
  258. if (save) {
  259. if (!dir.exists("simulations")) dir.create("simulations")
  260. out_path <- file.path("simulations", sprintf("power_%s_%s_%d.csv", Sys.info()[["nodename"]], round(as.numeric(Sys.time())), sim_nr))
  261. write_csv(res, out_path)
  262. }
  263. res
  264. }
  265. cl <- makeCluster(n_cores)
  266. cl_packages <- clusterEvalQ(cl, {library(tidyverse); library(lme4); library(faux)})
  267. clusterExport(cl, c("sim_results", "sim_experiment", "stim_csv", "cor_tbl2mat", "fe_vars", "sim"))
  268. subj_nrs <- rev(seq(10, 100, 5))
  269. re_corr_vals <- seq(0, 0.8, 0.2)
  270. n_sim <- 500
  271. power <- map_df(subj_nrs, function(nS_i) {
  272. map_df(re_corr_vals, function(re_corr_i) {
  273. n_sim_i <- if (re_corr_i==0) n_sim else n_sim/5
  274. cat(sprintf("Simulating %s results when N subjects = %s and random effects have corr of %s\n", n_sim_i, nS_i, re_corr_i))
  275. start <- Sys.time()
  276. res <- parLapply(cl, 1:n_sim_i, function(iter_i) sim_results(sim_nr=iter_i, nS=nS_i, re_corr=re_corr_i)) %>%
  277. reduce(bind_rows)
  278. end <- Sys.time()
  279. cat(" - finished in: ")
  280. print(end-start)
  281. cat(sprintf(" - estimated power of %s\n", round(mean(res$p<0.1 & res$estimate>0), 3)))
  282. res
  283. })
  284. })
  285. stopCluster(cl)
  286. write_csv(power, "power.csv")