123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- # cluster permutation test in R
- # function to find clusters of time-points above a specified threshold with the
- # clustering being performed separately for samples with a positive and negative
- # value in the time-series.
- # references:
- # Maris, E., & Oostenveld, R. (2007). Nonparametric statistical testing of EEG- and MEG-data.
- # Journal of Neuroscience Methods, 164(1), 177–190. https://doi.org/10.1016/j.jneumeth.2007.03.024
- cluster_indices <- function(timeseries, threshold) {
- # step 1: clustering
- # we cluster adjacent time-samples that all exhibit a similar difference
- # (in sign and magnitude)
-
- # initialize an empty cluster index vector depending on the length of the timeseries:
- cluster_id = replicate(length(timeseries), 0)
- # get positions of cluster with t-values above or below the treshhold
- timepoints_thresh = which(abs(timeseries) > threshold)
- # set all positions of negative t-values to a negative sign
- timepoints = timepoints_thresh * sign(timeseries[timepoints_thresh])
- # split the position indices into clusters of consecutive numbers:
- clusters = split(abs(timepoints), cumsum(c(1, abs(diff(timepoints)) != 1)))
- # write cluster indices into the cluster index vector:
- for (i in seq(1,length(clusters))) {
- cluster_id[unlist(clusters[i])] = i
- }
- # return the cluster indices:
- return(cluster_id)
- }
- shuffle_timeseries <- function(timeseries) {
- # function to randomly shuffle (i.e., invert a timeseries) through
- # multiplication by -1 with a 0.5 probability:
- # participant-specific timesources are flipped (multiplied by -1) randomly:
-
- timeseries_shuffled = timeseries * sample(x = c(-1, 1), size = 1)
- return(timeseries_shuffled)
- }
- cluster_stats = function(data, cfg, shuffle = TRUE) {
-
- data_clustered = data %>%
-
- # shuffle time series for each participant, classification and speed
- # if shuffle = TRUE. If shuffle = FALSE, the data is just copied.
- # check if number of timepoints matches the true number of TRs (here, 13):
- .[, by = c(cfg$grouping, "id"), ":=" (
- n_trs = .N,
- variable_data = if(shuffle) {shuffle_timeseries(get(cfg$variable))} else {get(cfg$variable)}
- )] %>% verify(n_trs == cfg$n_trs) %>%
-
- # run a two-sided one-sample t-test against 0 at every TR to get a
- # timeseries of t-values (by taking out the participant factor):
- # check if there were as many data points as there were participants:
- .[, by = c(cfg$grouping, "seq_tr"), .(
- num_subs = .N,
- tvalue = t.test(variable_data, alternative = "two.sided", mu = cfg$baseline)$statistic
- )] %>% #verify(num_subs == 40) %>%
-
- # get the indices for clusters with consecutive t-values above / below threshold
- # check if number of timepoints matches the true number of TRs (here, 13):
- .[, by = c(cfg$grouping), ":=" (
- n_trs = .N,
- cluster_id = cluster_indices(timeseries = tvalue, threshold = cfg$threshold)
- )] %>% verify(n_trs == cfg$n_trs) %>%
-
- # calculate the cluster mass of each cluster, separately for each
- # classification and speed condition:
- .[, by = c(cfg$grouping, "cluster_id"), .(
- cluster_trs = list(seq_tr),
- cluster_length = .N,
- cluster_mass = sum(tvalue),
- cluster_type = ifelse(sum(tvalue) > 0, "positive", "negative")
- )] %>%
-
- # add marker to the maximum absolute cluster mass
- # rank the clusters according to their absolute cluster mass:
- .[, by = c(cfg$grouping), ":=" (
- cluster_max = as.integer(abs(cluster_mass) == max(abs(cluster_mass))),
- cluster_rank = rank(abs(cluster_mass))
- )]
-
- return(data_clustered)
- }
- cluster_test <- function(data_true, data_perm, cfg) {
- # subset the permutation data frame depending on the grouping:
- data_perm_select = data_perm %>%
- filter(classification == unique(data_true$classification)) %>%
- filter(tITI == unique(data_true$tITI)) %>%
- filter(cluster_max == 1) %>%
- # extract the cluster mass values of the maximum clusters:
- mutate(cluster_mass_perm = cluster_mass * cluster_id)
- # get the number of "maximum" clusters that were actually "zero"-clusters:
- n_zero = sum(data_perm_select$cluster_id == 0)
- # get the number of cluster mass values above the true cluster mass:
- n_above = sum(abs(data_perm_select$cluster_mass_perm) > abs(data_true$cluster_mass))
- # TODO: add column that specifies if above or below threshold (can be used for later plotting):
- # retrieve the number of permutations that was run:
- n_perms = nrow(data_perm_select)
- # get the monte carlo p-value by dividing:
- p_value = n_above/n_perms
- return(list("p_value" = p_value, "n_perms" = n_perms, "n_zero" = n_zero))
- }
- cluster_plot <- function(data_true, data_perm){
-
- plot = ggplot(data_perm, aes(x = abs(as.numeric(cluster_mass)))) +
- facet_wrap(facets = ~ as.factor(tITI), labeller = get_labeller(array = data_perm$tITI)) +
- geom_histogram(binwidth = 0.5) +
- geom_vline(data = data_true, aes(
- xintercept = abs(as.numeric(cluster_mass)), color = as.factor(cluster_type))) +
- xlab("Absolute cluster mass (summed t-values)") +
- ylab("Number of permutations") +
- scale_color_discrete(name = "Cluster type") +
- coord_capped_cart(left = "both", bottom = "both", expand = TRUE) +
- theme(legend.position = c(1, 0), legend.justification = c(1, 0)) +
- theme(axis.line = element_line(linetype = "solid", lineend = "round")) +
- theme(plot.margin = unit(c(t = 1, r = 3, b = 1, l = 1), "pt"))
-
- return(plot)
- }
- cluster_permutation <- function(data, cfg) {
- # wrapper function to combine all steps of the cluster permutation test
- # inputs: data = a data frame; variable = string, variable of interest,
- # cfg = list, containing important configuration parameters:
- # variables, threshold, baseline, grouping, n_perms, n_trs
-
- # get the cluster statistics for the true data:
- data_true = cluster_stats(data, cfg, shuffle = FALSE)
- # get the cluster statistics for the permuted data:
- data_perm = do.call(rbind, replicate(cfg$n_perms, list(cluster_stats(data, cfg, shuffle = TRUE))))
- # compare the cluster statistics of the true data to the permuted data:
- data_true = data_true %>%
- .[, c("p_value", "n_perms", "n_zero") := cluster_test(
- data_true = .SD, data_perm = data_perm, cfg = cfg),
- by = c(cfg$grouping, "cluster_id"),
- .SDcols = colnames(data_true)] %>%
- setorder(classification, tITI, cluster_id) %>%
- filter(cluster_id != 0) %>% setDT(.)
- # plot the permutation distribution and true data:
- plot = cluster_plot(
- subset(data_true, classification == "ovr"),
- subset(data_perm, classification == "ovr"))
- return(list(data_true, plot))
-
- }
|