123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154 |
- # This script runs a sensitivity analysis, where we alter the value of the LKJ eta parameter and extract the posteriors
- library(readr)
- library(dplyr)
- library(purrr)
- library(tidyr)
- library(brms)
- library(tidybayes)
- library(parallel)
- n_cores <- parallel::detectCores() # total number of cores
- n_cores_per_model <- 4 # will be best if n_cores is divisible by this
- n_parallel_models <- floor(n_cores / n_cores_per_model) # number of models to be fit in parallel at one time
- set.seed(1)
- stim_sim <- left_join(
- read_csv(file.path("stim_sim", "preregistered", "jacc.csv")),
- read_csv(file.path("stim_sim", "preregistered", "ot.csv")),
- by = c("char1", "char2")
- ) |>
- mutate(
- rank_jacc = rank(jacc),
- rank_ot = rank(ot)
- ) |>
- rowwise() |>
- mutate(pair_id = paste(sort(c(char1, char2)), collapse="_")) |>
- ungroup()
- rdm_poi <- file.path("rdm_data", "period_of_interest") |>
- list.files(pattern=".*\\.csv", full.names=TRUE) |>
- map_df(read_csv, col_types=c("subj_id"="c")) |>
- left_join(stim_sim, by=c("char1", "char2")) |>
- group_by(subj_id) |>
- mutate(rank_eeg_dissim = rank(eeg_dissim)) |>
- ungroup()
- # short function for calculating the population SD
- pop_sd <- function(x) sqrt((length(x)-1)/length(x)) * sd(x)
- # function to fit the model
- fit_mod <- function(lkj_prior, sample_prior="no") {
- m_rho_prior_full <- c(
- set_prior(sprintf("lkj(%s)", lkj_prior), class="rescor"),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_eeg_dissim))), class="sigma", resp="rankeegdissim", ub=max(rdm_poi$rank_eeg_dissim)),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_jacc))), class="sigma", resp="rankjacc", ub=max(rdm_poi$rank_jacc)),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_ot))), class="sigma", resp="rankot", ub=max(rdm_poi$rank_ot))
- )
-
- m_rho_full_poi <- brm(
- bf(
- mvbind(rank_eeg_dissim, rank_ot, rank_jacc) ~ 0
- ) +
- set_rescor(rescor=TRUE),
- family = brmsfamily("gaussian"),
- iter = 24000,
- warmup = 6000,
- chains = 8,
- cores = n_cores_per_model,
- seed = 1,
- # centre each dimension since we don't model intercept
- data = mutate(rdm_poi, across(starts_with("rank"), function(x) x - mean(x))),
- prior = m_rho_prior_full,
- save_pars = save_pars(all=TRUE),
- sample_prior = sample_prior,
- control = list(adapt_delta = 0.99)
- )
-
- m_rho_full_poi
- }
- # function to fit the model and return the median and 89% HDI for Wasserstein and Jaccard for the posteriors
- sample_and_summarise_posteriors <- function(lkj_prior) {
- m_rho_full_poi <- fit_mod(lkj_prior, sample_prior="no")
-
- # get a vector of response variable names (will match order of correlation matrix)
- var_names <- colnames(get_y(m_rho_full_poi))
-
- # summarise the posteriors
- summ <- as_draws_df(m_rho_full_poi, "^Lrescor", regex=TRUE) |>
- # convert columns into covariance, correlation, and partial correlation matrix for each draw
- pivot_longer(cols=starts_with("Lrescor"), names_to="idx", values_to="r") |>
- mutate(idx = sub("Lrescor", "", idx, fixed=TRUE)) |>
- mutate(idx = gsub("(\\[)|(\\])", "", idx)) |>
- separate(idx, c("idx1", "idx2"), sep=",") |>
- mutate(idx1=as.integer(idx1), idx2=as.integer(idx2)) |>
- group_by(.chain, .iteration, .draw) |>
- nest() |>
- # get correlation and partial correlation matrix for each draw
- mutate(
- cor_mat = map(
- data, function(x) {
- cov_mat <- matrix(0, nrow=max(x$idx1), ncol=max(x$idx2))
- cov_mat[cbind(x$idx1, x$idx2)] <- x$r
- cov_mat[upper.tri(cov_mat, diag=FALSE)] <- t(cov_mat)[upper.tri(cov_mat, diag=FALSE)]
- colnames(cov_mat) <- var_names
- rownames(cov_mat) <- var_names
- cov2cor(cov_mat)
- }
- ),
- pcor_mat = map(
- cor_mat, function(x) {
- pcor_mat <- corpcor::cor2pcor(x)
- colnames(pcor_mat) <- var_names
- rownames(pcor_mat) <- var_names
- pcor_mat
- }
- ),
- cor_rankeegdissim_rankjacc = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankjacc"]),
- cor_rankeegdissim_rankot = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankot"]),
- pcor_rankeegdissim_rankjacc = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankjacc"]),
- pcor_rankeegdissim_rankot = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankot"])
- ) |>
- pivot_longer(c(starts_with("pcor_rankeegdissim"), starts_with("cor_rankeegdissim")), names_to="cor_lab", values_to="Rho") |>
- mutate(
- model = ifelse(grepl("jacc", cor_lab, fixed=TRUE), "Jaccard Distance", "Wasserstein Distance"),
- is_partial = grepl("^pcor", cor_lab)
- ) |>
- dplyr::select(-data, -cor_mat, -pcor_mat) |>
- group_by(model, is_partial) |>
- median_hdi(Rho, .width=.89) |>
- ungroup() |>
- mutate(eta = lkj_prior) |>
- select(eta, everything())
-
- summ
- }
- # set up a vector of eta values to use as the lkj prior
- n_models <- 50
- lkj_vals <- 1 * 10 ** seq(-3, 3, length.out=n_models)
- # make a cluster with as many workers as necessary
- cl <- makeCluster( n_parallel_models )
- cl_pkg <- clusterEvalQ(cl, {
- library(dplyr)
- library(purrr)
- library(tidyr)
- library(brms)
- library(tidybayes)
- })
- cl_obj <- clusterExport(cl, list(
- "pop_sd", "rdm_poi",
- "fit_mod",
- "n_cores_per_model"
- ))
- posterior_res <- parLapply(cl, lkj_vals, sample_and_summarise_posteriors) |>
- reduce(bind_rows)
- stopCluster(cl)
- saveRDS(posterior_res, file.path("estimates", "sensitivity_lkj_prior.rds"))
|