123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280 |
- library(readr)
- library(dplyr)
- library(purrr)
- library(tidyr)
- library(brms)
- library(ggplot2)
- library(ggdist)
- # library(fda)
- library(patchwork)
- library(parallel)
- n_cores <- parallel::detectCores(all.tests=FALSE, logical=TRUE)
- # n_cores <- 14
- # options(loo.cores = n_cores)
- set.seed(1)
- stim_sim <- left_join(
- read_csv(file.path("stim_sim", "preregistered", "jacc.csv")),
- read_csv(file.path("stim_sim", "preregistered", "ot.csv")),
- by = c("char1", "char2")
- ) |>
- left_join(
- read_csv(file.path("stim_sim", "complexity", "complexity.csv")),
- by = c("char1", "char2")
- ) |>
- left_join(
- read_csv(file.path("stim_sim", "frequency", "frequency.csv")),
- by = c("char1", "char2")
- ) |>
- left_join(
- read_csv(file.path("stim_sim", "phonology", "dominant_phonemes.csv")),
- by = c("char1", "char2")
- ) |>
- left_join(
- read_csv(file.path("stim_sim", "phonology", "letter_names.csv")),
- by = c("char1", "char2")
- ) |>
- mutate(
- rank_jacc = rank(jacc),
- rank_ot = rank(ot),
- rank_comp_dist = rank(comp_dist),
- rank_freq_dist = rank(freq_dist),
- rank_phon_dist = rank(phon_dist),
- rank_name_phon_dist = rank(name_phon_dist)
- ) |>
- rowwise() |>
- mutate(pair_id = paste(sort(c(char1, char2)), collapse="_")) |>
- ungroup()
- rdm <- file.path("rdm_data", "time_resolved") |>
- list.files(pattern=".*\\.csv", full.names=TRUE) |>
- map_df(read_csv, col_types=c("subj_id"="c")) |>
- left_join(stim_sim, by=c("char1", "char2")) |>
- group_by(subj_id, time) |>
- mutate(rank_eeg_dissim = rank(eeg_dissim)) |>
- ungroup() |>
- arrange(time)
- times <- sort(unique(rdm$time))
- rdm_poi <- file.path("rdm_data", "period_of_interest") |>
- list.files(pattern=".*\\.csv", full.names=TRUE) |>
- map_df(read_csv, col_types=c("subj_id"="c")) |>
- left_join(stim_sim, by=c("char1", "char2")) |>
- group_by(subj_id) |>
- mutate(rank_eeg_dissim = rank(eeg_dissim)) |>
- ungroup()
- # fit full model -------------------------------------------
- # this is a multivariate model that estimates correlations between all items
- # partial corr model
- # short function for calculating the population SD
- pop_sd <- function(x) sqrt((length(x)-1)/length(x)) * sd(x)
- # check correlation priors
- lkj_prior <- 1.5
- m_rho_prior_full <- c(
- # biased towards 0 for non-diagonal correlations, because lower correlations are more likely in RSA of EEG data
- set_prior(sprintf("lkj(%s)", lkj_prior), class="rescor"),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_eeg_dissim))), class="sigma", resp="rankeegdissim", ub=max(rdm_poi$rank_eeg_dissim)),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_jacc))), class="sigma", resp="rankjacc", ub=max(rdm_poi$rank_jacc)),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_ot))), class="sigma", resp="rankot", ub=max(rdm_poi$rank_ot)),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_comp_dist))), class="sigma", resp="rankcompdist", ub=max(rdm_poi$rank_comp_dist)),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_freq_dist))), class="sigma", resp="rankfreqdist", ub=max(rdm_poi$rank_freq_dist)),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_phon_dist))), class="sigma", resp="rankphondist", ub=max(rdm_poi$rank_phon_dist)),
- set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_name_phon_dist))), class="sigma", resp="ranknamephondist", ub=max(rdm_poi$rank_name_phon_dist))
- )
- m_rho_full_poi <- brm(
- bf(
- mvbind(rank_eeg_dissim, rank_jacc, rank_ot, rank_comp_dist, rank_freq_dist, rank_phon_dist, rank_name_phon_dist) ~ 0
- ) +
- set_rescor(rescor=TRUE),
- family = brmsfamily("gaussian"),
- iter = 24000,
- warmup = 6000,
- chains = 8,
- cores = n_cores,
- seed = 1,
- # centre each dimension since we don't model intercept
- data = mutate(rdm_poi, across(starts_with("rank"), function(x) x - mean(x))),
- prior = m_rho_prior_full,
- save_pars = save_pars(all=TRUE),
- sample_prior = FALSE,
- control = list(adapt_delta = 0.99)
- )
- # get a vector of response variable names (will match order of correlation matrix)
- var_names <- colnames(get_y(m_rho_full_poi))
- # get correlation matrices for each draw
- cor_samps <- as_draws_df(m_rho_full_poi, "^Lrescor", regex=TRUE) |>
- # convert columns into covariance, correlation, and partial correlation matrix for each draw
- pivot_longer(cols=starts_with("Lrescor"), names_to="idx", values_to="r") |>
- mutate(idx = sub("Lrescor", "", idx, fixed=TRUE)) |>
- mutate(idx = gsub("(\\[)|(\\])", "", idx)) |>
- separate(idx, c("idx1", "idx2"), sep=",") |>
- mutate(idx1=as.integer(idx1), idx2=as.integer(idx2)) |>
- group_by(.chain, .iteration, .draw) |>
- nest() |>
- # get correlation and partial correlation matrix for each draw
- mutate(
- cor_mat = map(
- data, function(x) {
- cov_mat <- matrix(0, nrow=max(x$idx1), ncol=max(x$idx2))
- cov_mat[cbind(x$idx1, x$idx2)] <- x$r
- cov_mat[upper.tri(cov_mat, diag=FALSE)] <- t(cov_mat)[upper.tri(cov_mat, diag=FALSE)]
- colnames(cov_mat) <- var_names
- rownames(cov_mat) <- var_names
- cov2cor(cov_mat)
- }
- ),
- pcor_mat = map(
- cor_mat, function(x) {
- pcor_mat <- corpcor::cor2pcor(x)
- colnames(pcor_mat) <- var_names
- rownames(pcor_mat) <- var_names
- pcor_mat
- }
- ),
- cor_rankeegdissim_rankjacc = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankjacc"]),
- cor_rankeegdissim_rankot = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankot"]),
- cor_rankeegdissim_rankcompdist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankcompdist"]),
- cor_rankeegdissim_rankfreqdist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankfreqdist"]),
- cor_rankeegdissim_rankphondist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankphondist"]),
- cor_rankeegdissim_ranknamephondist = map_dbl(cor_mat, function(x) x["rankeegdissim", "ranknamephondist"]),
- pcor_rankeegdissim_rankjacc = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankjacc"]),
- pcor_rankeegdissim_rankot = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankot"]),
- pcor_rankeegdissim_rankcompdist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankcompdist"]),
- pcor_rankeegdissim_rankfreqdist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankfreqdist"]),
- pcor_rankeegdissim_rankphondist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankphondist"]),
- pcor_rankeegdissim_ranknamephondist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "ranknamephondist"])
- )
- # calculate Evidence Ratios for each hypothesis
- bf_1 <- sum(cor_samps$cor_rankeegdissim_rankot > 0) / sum(cor_samps$cor_rankeegdissim_rankot < 0)
- bf_2a <- sum(cor_samps$cor_rankeegdissim_rankot > cor_samps$cor_rankeegdissim_rankjacc) /
- sum(cor_samps$cor_rankeegdissim_rankot <= cor_samps$cor_rankeegdissim_rankjacc)
- bf_2b <- sum(cor_samps$pcor_rankeegdissim_rankot > cor_samps$pcor_rankeegdissim_rankjacc) /
- sum(cor_samps$pcor_rankeegdissim_rankot <= cor_samps$pcor_rankeegdissim_rankjacc)
- # save the partial Rho estimates
- cor_samps_long <- cor_samps |>
- pivot_longer(c(starts_with("pcor_rankeegdissim"), starts_with("cor_rankeegdissim")), names_to="cor_lab", values_to="Rho") |>
- mutate(
- model = case_when(
- grepl("rankjacc", cor_lab, fixed=TRUE) ~ "Jaccard Distance",
- grepl("rankot", cor_lab, fixed=TRUE) ~ "Wasserstein Distance",
- grepl("rankcompdist", cor_lab, fixed=TRUE) ~ "Complexity Distance",
- grepl("rankfreqdist", cor_lab, fixed=TRUE) ~ "Frequency Distance",
- grepl("rankphondist", cor_lab, fixed=TRUE) ~ "Phonological Distance",
- grepl("ranknamephondist", cor_lab, fixed=TRUE) ~ "Letter Name Phonological Distance",
- .default = NA
- ),
- is_partial = grepl("^pcor", cor_lab)
- ) |>
- dplyr::select(-data, -cor_mat, -pcor_mat)
- cor_samps_long |>
- group_by(model, is_partial) |>
- median_hdi(Rho, .width=.89)
- saveRDS(cor_samps_long, file.path("estimates", "controls_cor_samps_long.rds"))
- # fit time-resolved models -----------------------------------------------
- # fit models for each time point
- # Note: fitting models to all subjects' data, where ranked within subject, gives roughly equivalent peak posteriors to averaging over per-subject Rho estimates
- # fit models in parallel for speed
- library(future)
- plan(multicore, workers=n_cores)
- message("Fitting time-resolved models in parallel")
- m_rho <- brm_multiple(
- bf(
- mvbind(rank_eeg_dissim, rank_jacc, rank_ot, rank_comp_dist, rank_freq_dist, rank_phon_dist, rank_name_phon_dist) ~ 0
- ) +
- set_rescor(rescor=TRUE),
- family = brmsfamily("gaussian"),
- iter = 10000,
- warmup = 5000,
- chains = 4,
- cores = 1,
- seed = 1,
- # centre each dimension since we don't model intercept, then split by time points into list
- data = rdm |>
- group_by(time) |>
- mutate(across(starts_with("rank"), function(x) x - mean(x))) |>
- ungroup() |>
- group_split(time),
- combine = FALSE,
- prior = m_rho_prior_full,
- control = list(adapt_delta = 0.99),
- save_pars = save_pars(all=TRUE)
- )
- # get posteriors for each time point
- tc <- map_df(1:length(m_rho), function(i) {
- t_i <- times[[i]]
- m_i <- m_rho[[i]]
- var_names_i <- colnames(get_y(m_i))
-
- as_draws_df(m_i, "^Lrescor", regex=TRUE) |>
- # convert columns into covariance, correlation, and partial correlation matrix for each draw
- pivot_longer(cols=starts_with("Lrescor"), names_to="idx", values_to="r") |>
- mutate(idx = sub("Lrescor", "", idx, fixed=TRUE)) |>
- mutate(idx = gsub("(\\[)|(\\])", "", idx)) |>
- separate(idx, c("idx1", "idx2"), sep=",") |>
- mutate(idx1=as.integer(idx1), idx2=as.integer(idx2)) |>
- group_by(.chain, .iteration, .draw) |>
- nest() |>
- # get correlation and partial correlation matrix for each draw
- mutate(
- cor_mat = map(
- data, function(x) {
- cov_mat <- matrix(0, nrow=max(x$idx1), ncol=max(x$idx2))
- cov_mat[cbind(x$idx1, x$idx2)] <- x$r
- cov_mat[upper.tri(cov_mat, diag=FALSE)] <- t(cov_mat)[upper.tri(cov_mat, diag=FALSE)]
- colnames(cov_mat) <- var_names_i
- rownames(cov_mat) <- var_names_i
- cov2cor(cov_mat)
- }
- ),
- pcor_mat = map(
- cor_mat, function(x) {
- pcor_mat <- corpcor::cor2pcor(x)
- colnames(pcor_mat) <- var_names_i
- rownames(pcor_mat) <- var_names_i
- pcor_mat
- }
- ),
- cor_rankeegdissim_rankjacc = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankjacc"]),
- cor_rankeegdissim_rankot = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankot"]),
- cor_rankeegdissim_rankcompdist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankcompdist"]),
- cor_rankeegdissim_rankfreqdist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankfreqdist"]),
- cor_rankeegdissim_rankphondist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankphondist"]),
- cor_rankeegdissim_ranknamephondist = map_dbl(cor_mat, function(x) x["rankeegdissim", "ranknamephondist"]),
- pcor_rankeegdissim_rankjacc = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankjacc"]),
- pcor_rankeegdissim_rankot = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankot"]),
- pcor_rankeegdissim_rankcompdist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankcompdist"]),
- pcor_rankeegdissim_rankfreqdist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankfreqdist"]),
- pcor_rankeegdissim_rankphondist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankphondist"]),
- pcor_rankeegdissim_ranknamephondist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "ranknamephondist"]),
- time = t_i
- ) |>
- dplyr::select(-data, -cor_mat, -pcor_mat)
- })
- saveRDS(tc, file.path("estimates", "controls_timecourse.rds"))
|