library(readr) library(dplyr) library(purrr) library(tidyr) library(brms) library(ggplot2) library(ggdist) # library(fda) library(patchwork) library(parallel) n_cores <- parallel::detectCores(all.tests=FALSE, logical=TRUE) # n_cores <- 14 # options(loo.cores = n_cores) set.seed(1) stim_sim <- left_join( read_csv(file.path("stim_sim", "preregistered", "jacc.csv")), read_csv(file.path("stim_sim", "preregistered", "ot.csv")), by = c("char1", "char2") ) |> left_join( read_csv(file.path("stim_sim", "complexity", "complexity.csv")), by = c("char1", "char2") ) |> left_join( read_csv(file.path("stim_sim", "frequency", "frequency.csv")), by = c("char1", "char2") ) |> left_join( read_csv(file.path("stim_sim", "phonology", "dominant_phonemes.csv")), by = c("char1", "char2") ) |> left_join( read_csv(file.path("stim_sim", "phonology", "letter_names.csv")), by = c("char1", "char2") ) |> mutate( rank_jacc = rank(jacc), rank_ot = rank(ot), rank_comp_dist = rank(comp_dist), rank_freq_dist = rank(freq_dist), rank_phon_dist = rank(phon_dist), rank_name_phon_dist = rank(name_phon_dist) ) |> rowwise() |> mutate(pair_id = paste(sort(c(char1, char2)), collapse="_")) |> ungroup() rdm <- file.path("rdm_data", "time_resolved") |> list.files(pattern=".*\\.csv", full.names=TRUE) |> map_df(read_csv, col_types=c("subj_id"="c")) |> left_join(stim_sim, by=c("char1", "char2")) |> group_by(subj_id, time) |> mutate(rank_eeg_dissim = rank(eeg_dissim)) |> ungroup() |> arrange(time) times <- sort(unique(rdm$time)) rdm_poi <- file.path("rdm_data", "period_of_interest") |> list.files(pattern=".*\\.csv", full.names=TRUE) |> map_df(read_csv, col_types=c("subj_id"="c")) |> left_join(stim_sim, by=c("char1", "char2")) |> group_by(subj_id) |> mutate(rank_eeg_dissim = rank(eeg_dissim)) |> ungroup() # fit full model ------------------------------------------- # this is a multivariate model that estimates correlations between all items # partial corr model # short function for calculating the population SD pop_sd <- function(x) sqrt((length(x)-1)/length(x)) * sd(x) # check correlation priors lkj_prior <- 1.5 m_rho_prior_full <- c( # biased towards 0 for non-diagonal correlations, because lower correlations are more likely in RSA of EEG data set_prior(sprintf("lkj(%s)", lkj_prior), class="rescor"), set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_eeg_dissim))), class="sigma", resp="rankeegdissim", ub=max(rdm_poi$rank_eeg_dissim)), set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_jacc))), class="sigma", resp="rankjacc", ub=max(rdm_poi$rank_jacc)), set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_ot))), class="sigma", resp="rankot", ub=max(rdm_poi$rank_ot)), set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_comp_dist))), class="sigma", resp="rankcompdist", ub=max(rdm_poi$rank_comp_dist)), set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_freq_dist))), class="sigma", resp="rankfreqdist", ub=max(rdm_poi$rank_freq_dist)), set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_phon_dist))), class="sigma", resp="rankphondist", ub=max(rdm_poi$rank_phon_dist)), set_prior(sprintf("constant(%s)", pop_sd(1:max(rdm_poi$rank_name_phon_dist))), class="sigma", resp="ranknamephondist", ub=max(rdm_poi$rank_name_phon_dist)) ) m_rho_full_poi <- brm( bf( mvbind(rank_eeg_dissim, rank_jacc, rank_ot, rank_comp_dist, rank_freq_dist, rank_phon_dist, rank_name_phon_dist) ~ 0 ) + set_rescor(rescor=TRUE), family = brmsfamily("gaussian"), iter = 24000, warmup = 6000, chains = 8, cores = n_cores, seed = 1, # centre each dimension since we don't model intercept data = mutate(rdm_poi, across(starts_with("rank"), function(x) x - mean(x))), prior = m_rho_prior_full, save_pars = save_pars(all=TRUE), sample_prior = FALSE, control = list(adapt_delta = 0.99) ) # get a vector of response variable names (will match order of correlation matrix) var_names <- colnames(get_y(m_rho_full_poi)) # get correlation matrices for each draw cor_samps <- as_draws_df(m_rho_full_poi, "^Lrescor", regex=TRUE) |> # convert columns into covariance, correlation, and partial correlation matrix for each draw pivot_longer(cols=starts_with("Lrescor"), names_to="idx", values_to="r") |> mutate(idx = sub("Lrescor", "", idx, fixed=TRUE)) |> mutate(idx = gsub("(\\[)|(\\])", "", idx)) |> separate(idx, c("idx1", "idx2"), sep=",") |> mutate(idx1=as.integer(idx1), idx2=as.integer(idx2)) |> group_by(.chain, .iteration, .draw) |> nest() |> # get correlation and partial correlation matrix for each draw mutate( cor_mat = map( data, function(x) { cov_mat <- matrix(0, nrow=max(x$idx1), ncol=max(x$idx2)) cov_mat[cbind(x$idx1, x$idx2)] <- x$r cov_mat[upper.tri(cov_mat, diag=FALSE)] <- t(cov_mat)[upper.tri(cov_mat, diag=FALSE)] colnames(cov_mat) <- var_names rownames(cov_mat) <- var_names cov2cor(cov_mat) } ), pcor_mat = map( cor_mat, function(x) { pcor_mat <- corpcor::cor2pcor(x) colnames(pcor_mat) <- var_names rownames(pcor_mat) <- var_names pcor_mat } ), cor_rankeegdissim_rankjacc = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankjacc"]), cor_rankeegdissim_rankot = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankot"]), cor_rankeegdissim_rankcompdist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankcompdist"]), cor_rankeegdissim_rankfreqdist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankfreqdist"]), cor_rankeegdissim_rankphondist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankphondist"]), cor_rankeegdissim_ranknamephondist = map_dbl(cor_mat, function(x) x["rankeegdissim", "ranknamephondist"]), pcor_rankeegdissim_rankjacc = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankjacc"]), pcor_rankeegdissim_rankot = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankot"]), pcor_rankeegdissim_rankcompdist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankcompdist"]), pcor_rankeegdissim_rankfreqdist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankfreqdist"]), pcor_rankeegdissim_rankphondist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankphondist"]), pcor_rankeegdissim_ranknamephondist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "ranknamephondist"]) ) # calculate Evidence Ratios for each hypothesis bf_1 <- sum(cor_samps$cor_rankeegdissim_rankot > 0) / sum(cor_samps$cor_rankeegdissim_rankot < 0) bf_2a <- sum(cor_samps$cor_rankeegdissim_rankot > cor_samps$cor_rankeegdissim_rankjacc) / sum(cor_samps$cor_rankeegdissim_rankot <= cor_samps$cor_rankeegdissim_rankjacc) bf_2b <- sum(cor_samps$pcor_rankeegdissim_rankot > cor_samps$pcor_rankeegdissim_rankjacc) / sum(cor_samps$pcor_rankeegdissim_rankot <= cor_samps$pcor_rankeegdissim_rankjacc) # save the partial Rho estimates cor_samps_long <- cor_samps |> pivot_longer(c(starts_with("pcor_rankeegdissim"), starts_with("cor_rankeegdissim")), names_to="cor_lab", values_to="Rho") |> mutate( model = case_when( grepl("rankjacc", cor_lab, fixed=TRUE) ~ "Jaccard Distance", grepl("rankot", cor_lab, fixed=TRUE) ~ "Wasserstein Distance", grepl("rankcompdist", cor_lab, fixed=TRUE) ~ "Complexity Distance", grepl("rankfreqdist", cor_lab, fixed=TRUE) ~ "Frequency Distance", grepl("rankphondist", cor_lab, fixed=TRUE) ~ "Phonological Distance", grepl("ranknamephondist", cor_lab, fixed=TRUE) ~ "Letter Name Phonological Distance", .default = NA ), is_partial = grepl("^pcor", cor_lab) ) |> dplyr::select(-data, -cor_mat, -pcor_mat) cor_samps_long |> group_by(model, is_partial) |> median_hdi(Rho, .width=.89) saveRDS(cor_samps_long, file.path("estimates", "controls_cor_samps_long.rds")) # fit time-resolved models ----------------------------------------------- # fit models for each time point # Note: fitting models to all subjects' data, where ranked within subject, gives roughly equivalent peak posteriors to averaging over per-subject Rho estimates # fit models in parallel for speed library(future) plan(multicore, workers=n_cores) message("Fitting time-resolved models in parallel") m_rho <- brm_multiple( bf( mvbind(rank_eeg_dissim, rank_jacc, rank_ot, rank_comp_dist, rank_freq_dist, rank_phon_dist, rank_name_phon_dist) ~ 0 ) + set_rescor(rescor=TRUE), family = brmsfamily("gaussian"), iter = 10000, warmup = 5000, chains = 4, cores = 1, seed = 1, # centre each dimension since we don't model intercept, then split by time points into list data = rdm |> group_by(time) |> mutate(across(starts_with("rank"), function(x) x - mean(x))) |> ungroup() |> group_split(time), combine = FALSE, prior = m_rho_prior_full, control = list(adapt_delta = 0.99), save_pars = save_pars(all=TRUE) ) # get posteriors for each time point tc <- map_df(1:length(m_rho), function(i) { t_i <- times[[i]] m_i <- m_rho[[i]] var_names_i <- colnames(get_y(m_i)) as_draws_df(m_i, "^Lrescor", regex=TRUE) |> # convert columns into covariance, correlation, and partial correlation matrix for each draw pivot_longer(cols=starts_with("Lrescor"), names_to="idx", values_to="r") |> mutate(idx = sub("Lrescor", "", idx, fixed=TRUE)) |> mutate(idx = gsub("(\\[)|(\\])", "", idx)) |> separate(idx, c("idx1", "idx2"), sep=",") |> mutate(idx1=as.integer(idx1), idx2=as.integer(idx2)) |> group_by(.chain, .iteration, .draw) |> nest() |> # get correlation and partial correlation matrix for each draw mutate( cor_mat = map( data, function(x) { cov_mat <- matrix(0, nrow=max(x$idx1), ncol=max(x$idx2)) cov_mat[cbind(x$idx1, x$idx2)] <- x$r cov_mat[upper.tri(cov_mat, diag=FALSE)] <- t(cov_mat)[upper.tri(cov_mat, diag=FALSE)] colnames(cov_mat) <- var_names_i rownames(cov_mat) <- var_names_i cov2cor(cov_mat) } ), pcor_mat = map( cor_mat, function(x) { pcor_mat <- corpcor::cor2pcor(x) colnames(pcor_mat) <- var_names_i rownames(pcor_mat) <- var_names_i pcor_mat } ), cor_rankeegdissim_rankjacc = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankjacc"]), cor_rankeegdissim_rankot = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankot"]), cor_rankeegdissim_rankcompdist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankcompdist"]), cor_rankeegdissim_rankfreqdist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankfreqdist"]), cor_rankeegdissim_rankphondist = map_dbl(cor_mat, function(x) x["rankeegdissim", "rankphondist"]), cor_rankeegdissim_ranknamephondist = map_dbl(cor_mat, function(x) x["rankeegdissim", "ranknamephondist"]), pcor_rankeegdissim_rankjacc = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankjacc"]), pcor_rankeegdissim_rankot = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankot"]), pcor_rankeegdissim_rankcompdist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankcompdist"]), pcor_rankeegdissim_rankfreqdist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankfreqdist"]), pcor_rankeegdissim_rankphondist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "rankphondist"]), pcor_rankeegdissim_ranknamephondist = map_dbl(pcor_mat, function(x) x["rankeegdissim", "ranknamephondist"]), time = t_i ) |> dplyr::select(-data, -cor_mat, -pcor_mat) }) saveRDS(tc, file.path("estimates", "controls_timecourse.rds"))