# SETUP ----

n_cores <- 40

library(lme4)
library(readr)
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
library(ggplot2)
library(faux)
library(Matrix)
library(parallel)
library(RhpcBLASctl)

ggplot2::theme_set(theme_bw() + theme(legend.position="top"))

blas_set_num_threads(1)

# set.seed(1)

norm01 <- function(x, ...) (x-min(x, ...))/(max(x, ...)-min(x, ...))

# labels for the fixed effects, used for ease of reference in various places
fe_vars <- c("0", "cong", "pred", "int")

# function to convert a tibble or dataframe of correlations to a matrix
# (see the tribbles below for an example of the expected long format)
cor_tbl2mat <- function(cor_tbl, vars_labs=fe_vars) {
  # initialise as matrix
  mat <- matrix(
    data = rep(NA, length(vars_labs)**2),
    ncol = length(vars_labs),
    nrow = length(vars_labs)
  )
  # pull relevant values from the dataframe
  for (i_x in 1:length(vars_labs)) {
    for (j_x in 1:length(vars_labs)) {
      mat[i_x, j_x] <- if (i_x==j_x) {
        1
      } else {
        cor_tbl %>%
          filter(
            i %in% vars_labs[c(i_x, j_x)] &
              j %in% vars_labs[c(i_x, j_x)]
          ) %>%
          pull(cor) %>%
          unique()
      }
    }
  }
  # return result
  mat
}

stim_csv <- read_csv("stim_tidy_long.csv") %>%
  mutate(
    prop_agree = perc_name_agree/100,
    pred_norm = norm01(prop_agree, na.rm=TRUE),
    pred_minned = prop_agree - min(prop_agree, na.rm=TRUE)
  )

# SIMULATION SETUP ----

# simulate the predictor coding, with expected effects
# (this is used to extract the predicted coefficients)
min_pred <- min(stim_csv$pred_norm, na.rm=TRUE)
max_pred <- max(stim_csv$pred_norm, na.rm=TRUE)

cells <- tribble(
  ~cong, ~pred,    ~uV,
  -0.5,  min_pred, -5,    # incongruent, lowest predictability
   0.5,  min_pred, -5,    # congruent, lowest predictability
  -0.5,  max_pred, -5,    # incongruent, highest predictability
   0.5,  max_pred, -4.25  # congruent, highest predictability
)

cells_m <- lm(uV ~ cong * pred, data=cells)
cells_coefs <- coef(cells_m) %>% round(10)  # avoid floating point errors

# list which will contain values used in the simulation
sim <- list()

# set fixed effect parameters (values in uV)
sim$B0 <- cells_coefs[["(Intercept)"]]  # intercept (typical peak N170 amplitude)
sim$Bcong <- cells_coefs[["cong"]]  # effect of congruency
sim$Bpred <- cells_coefs[["pred"]]  # effect of predictability
sim$Bint <- cells_coefs[["cong:pred"]]  # interaction (effect of interest)

# set random intercepts
# - subject intercepts are informed by prior N170 research
# - item intercepts are informed by prior N170 research and a behavioural
#   pilot, in which images (the highest level of the nested structure)
#   captured most variance, while words and word-image combinations captured
#   much smaller proportions of variance
# - channel and subject-channel random effects are based on a reanalysis of
#   the EEG data at 230 ms from Gagl (2020), using a maximal random effects
#   structure
sim$S0_sd <- 2.5   # by-subject random intercept SD
sim$I0_sd <- 0.25  # by-image random intercept SD
sim$W0_sd <- 0.25  # by-word random intercept SD

# set random slopes, and correlations between random terms; which random
# slopes are simulated depends on what the experiment's design allows us
# to model

# subject random slopes
sim$Scong_sd <- 0.75
sim$Spred_sd <- 1
sim$Sint_sd <- 1

# image random slopes
sim$Icong_sd <- 0.5

# # image random effects correlation
# sim$I0_cong_cor <- 0.74

# SD of residuals
sim$resid_sd <- 3

# sim$nS <- 50  # number of subjects (commented out as defined as a function argument)
sim$nI <- 200  # number of images
sim$nW <- sim$nI * 2  # number of words
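# Quick illustrative check (not part of the pipeline): because the cell-means
# model is saturated, the extracted coefficients should reproduce the four
# expected amplitudes exactly, e.g. the congruent, highest-predictability
# cell should come back as -4.25 uV.
sim$B0 + sim$Bcong * 0.5 + sim$Bpred * max_pred + sim$Bint * 0.5 * max_pred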
# SIMULATE MODEL ----

# function for simulating nS subjects' data
sim_experiment <- function(nS=50, re_corr=0) {
  sim$nS <- nS

  # image random effects correlation (randomised)
  sim$I0_cong_cor <- re_corr

  # subject random correlations (randomised)
  S0_cors <- tribble(
    ~i,     ~j,     ~cor,
    "0",    "cong", re_corr,
    "0",    "pred", re_corr,
    "0",    "int",  re_corr,
    "cong", "pred", re_corr,
    "cong", "int",  re_corr,
    "pred", "int",  re_corr
  )

  # convert to a correlation matrix
  sim$Scor_mat <- cor_tbl2mat(S0_cors)

  # find the nearest positive definite var-covar matrix to that generated
  # from the random correlations
  sim$Scor_mat <- Matrix::nearPD(sim$Scor_mat, corr=TRUE, keepDiag=TRUE, ensureSymmetry=TRUE, maxit=10000) %>%
    with(mat)

  # simulate item random effects
  images <- faux::sim_design(
    within = list(effect = c(
      I0 = "By-image random intercepts",
      Icong = "By-image congruency slopes"
    )),
    n = sim$nI,
    sd = c(sim$I0_sd, sim$Icong_sd),
    r = sim$I0_cong_cor,
    id = "image_id",
    plot = FALSE
  )

  # simulate subject random effects
  subjects <- faux::sim_design(
    within = list(effect = c(
      S0 = "By-subject random intercepts",
      Scong = "By-subject congruency slopes",
      Spred = "By-subject predictability slopes",
      Sint = "By-subject interaction slopes"
    )),
    n = sim$nS,
    sd = c(sim$S0_sd, sim$Scong_sd, sim$Spred_sd, sim$Sint_sd),
    r = sim$Scor_mat,
    id = "subj_id",
    plot = FALSE
  ) %>%
    mutate(stim_set = sample(rep(c(1, 2), length.out=nrow(.))))

  # get image-word combinations
  images_words <- stim_csv %>%
    mutate(
      item_id = row_number(),
      pred = pred_norm,
      condition = ifelse(condition=="A1", "congruent", "incongruent"),
      cong = scale(ifelse(condition=="incongruent", 0, 1), scale=FALSE, center=TRUE)
    ) %>%
    select(item_id, string, filename, condition, pred, cong) %>%
    rename(word_id=string, image_id=filename) %>%
    mutate(
      image_word_id = paste(image_id, word_id, sep="."),
      stim_set = sample(rep(c(1, 2), length.out=nrow(.)))
    ) %>%
    group_by(image_id) %>%
    mutate(pred = max(pred, na.rm=TRUE)) %>%
    ungroup() %>%
    mutate(
      I0 = rep(images$I0, each=2),
      Icong = rep(images$Icong, each=2),
      W0 = rnorm(sim$nW, 0, sim$W0_sd)
    )

  # collate to simulate trials ----
  trials <- crossing(
    subj_id = subjects$subj_id,
    image_word_id = images_words$image_word_id
  ) %>%
    inner_join(subjects, by="subj_id") %>%
    # joining by stim_set as well as image_word_id ensures each simulated
    # participant only sees the trials from their own stimulus set
    inner_join(images_words, by=c("image_word_id", "stim_set"))

  # simulate the outcome ----
  sim_data <- trials %>%
    mutate(
      resid = rnorm(nrow(.), mean=0, sd=sim$resid_sd),
      uV = sim$B0 + I0 + W0 + S0 +
        (sim$Bcong + Icong + Scong) * cong +
        (sim$Bpred + Spred) * pred +
        (sim$Bint + Sint) * cong * pred +
        resid
    ) %>%
    select(image_id, word_id, image_word_id, subj_id, condition, cong, pred, uV)

  # simulate data loss ----
  # rather than modelling trial accuracy explicitly, drop a random 10% of
  # trials: ~5% lost to errors and extreme RTs (based on the behavioural
  # pilot) and ~5% to technical issues
  sim_data_clean <- sim_data %>%
    slice_sample(prop = 0.9) %>%
    # rebalance the deviation coding after data loss
    mutate(cong = scale(ifelse(condition=="incongruent", 0, 1), center=TRUE, scale=FALSE))

  sim_data_clean
}

# visualise some example simulated datasets at different sample sizes
sim_experiment(nS = 12) %>%
  ggplot(aes(pred, uV, colour=condition)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method="lm", formula=y~x) +
  facet_wrap(vars(subj_id))

sim_experiment(nS = 50) %>%
  ggplot(aes(pred, uV, colour=condition)) +
  geom_smooth(method="lm", formula=y~x)

sim_experiment(nS = 150) %>%
  ggplot(aes(pred, uV, colour=condition)) +
  geom_smooth(method="lm", formula=y~x)
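# Before committing to the full power run, it can help to fit the maximal
# model once to a single simulated dataset and check that the generating
# parameters are roughly recovered (illustrative check only; the nS and
# re_corr values here are arbitrary).
d_check <- sim_experiment(nS = 50, re_corr = 0.25)
m_check <- lmer(
  uV ~ cong * pred +
    (cong * pred | subj_id) +
    (cong | image_id) +
    (1 | word_id),
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa"),
  data = d_check
)
# fixed effects should sit near sim$B0, sim$Bcong, sim$Bpred, and sim$Bint
summary(m_check)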
# POWER ANALYSIS ----

# function to simulate a single experiment, fit the models, and record the
# result of the hypothesis test
sim_results <- function(sim_nr=1, nS=50, re_corr=0, alpha_lvl=0.05, save=FALSE) {
  # simulate data
  d <- sim_experiment(nS = nS, re_corr = re_corr)

  # get the results and record how long it takes
  t <- system.time({
    # fit model
    m <- lmer(
      uV ~ cong * pred +
        (cong * pred | subj_id) +
        (cong | image_id) +
        (1 | word_id),
      REML=FALSE,
      control = lmerControl(optimizer="bobyqa"),
      data=d
    )
    # fit model without the effect of interest
    m_no_int <- update(m, .~. -cong:pred)

    # if the fit failed to converge, refit without random correlations;
    # a chi-square of 0 (and a p-value of exactly 1) is usually a good
    # indicator of non-convergence here
    if (with(anova(m, m_no_int), Chisq)[[2]]==0) {
      # fit model
      m <- lmer(
        uV ~ cong * pred +
          (cong * pred || subj_id) +
          (cong || image_id) +
          (1 | word_id),
        REML=FALSE,
        control = lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=1e8)),
        data=d
      )
      # fit model without the effect of interest
      m_no_int <- update(m, .~. -cong:pred)
      # record that this has occurred
      corrs_removed <- TRUE
    } else {
      corrs_removed <- FALSE
    }

    # get the effect of interest
    int_estimate <- fixef(m)[["cong:pred"]]

    # model comparison to test the hypothesis
    chisq_int <- anova(m, m_no_int)
    chisq_value <- with(chisq_int, Chisq)[[2]]
    p_value <- with(chisq_int, `Pr(>Chisq)`)[[2]]
  })

  # record results
  res <- tibble(
    sim_nr = sim_nr,
    nS = nS,
    re_corr = re_corr,
    estimate = int_estimate,
    chisq = chisq_value,
    p = p_value,
    is_sig = p_value < alpha_lvl,
    corrs_removed = corrs_removed,
    time_elapsed = t[["elapsed"]]
  )

  # optionally write the single-run results to disk (filename pattern assumed)
  if (save) write_csv(res, sprintf("sim_results_%s.csv", sim_nr))

  res
}

# run the simulations in parallel over a grid of design cells; the grid
# values and the number of runs per cell below are assumed placeholders,
# so adjust them to the desired design
cl <- makeCluster(n_cores)

# load packages and limit BLAS threads on each worker
clusterEvalQ(cl, {
  library(lme4)
  library(readr)
  library(dplyr)
  library(tidyr)
  library(tibble)
  library(purrr)
  library(faux)
  library(Matrix)
  library(RhpcBLASctl)
  blas_set_num_threads(1)
})

# export the objects the simulation functions depend on
clusterExport(cl, c(
  "sim", "stim_csv", "fe_vars", "cor_tbl2mat",
  "sim_experiment", "sim_results"
))

n_sims <- 500  # simulations per design cell (assumed value)

power <- crossing(nS = c(25, 50, 100), re_corr = c(0, 0.5)) %>%  # assumed grid
  pmap(function(nS, re_corr) {
    cat(sprintf("nS = %s, re_corr = %s\n", nS, re_corr))
    start <- Sys.time()
    res <- parLapply(cl, 1:n_sims, function(i) {
      sim_results(sim_nr = i, nS = nS, re_corr = re_corr)
    }) %>%
      reduce(bind_rows)
    end <- Sys.time()
    cat(" - finished in: ")
    print(end-start)
    # estimated power for a directional effect (p < .1 with a positive estimate)
    cat(sprintf(
      " - estimated power of %s\n",
      round(mean(res$p<0.1 & res$estimate>0), 3)
    ))
    res
  }) %>%
  reduce(bind_rows)

stopCluster(cl)

write_csv(power, "power.csv")
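# A minimal sketch of summarising the saved results, assuming the power.csv
# written above (with the p, estimate, nS, and re_corr columns produced by
# sim_results); it mirrors the directional power criterion printed during
# the run and marks the conventional 80% power threshold.
power_summary <- read_csv("power.csv") %>%
  group_by(nS, re_corr) %>%
  summarise(power = mean(p < 0.1 & estimate > 0), .groups="drop")

ggplot(power_summary, aes(nS, power, colour = factor(re_corr))) +
  geom_point() +
  geom_line() +
  geom_hline(yintercept = 0.8, linetype = "dashed") +
  labs(colour = "re_corr")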