highspeed-cluster-permutation.R 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. # cluster permutation test in R
  2. # function to find clusters of time-points above a specified threshold with the
  3. # clustering being performed separately for samples with a positive and negative
  4. # value in the time-series.
  5. # references:
  6. # Maris, E., & Oostenveld, R. (2007). Nonparametric statistical testing of EEG- and MEG-data.
  7. # Journal of Neuroscience Methods, 164(1), 177–190. https://doi.org/10.1016/j.jneumeth.2007.03.024
  8. cluster_indices <- function(timeseries, threshold) {
  9. # step 1: clustering
  10. # we cluster adjacent time-samples that all exhibit a similar difference
  11. # (in sign and magnitude)
  12. # initialize an empty cluster index vector depending on the length of the timeseries:
  13. cluster_id = replicate(length(timeseries), 0)
  14. # get positions of cluster with t-values above or below the treshhold
  15. timepoints_thresh = which(abs(timeseries) > threshold)
  16. # set all positions of negative t-values to a negative sign
  17. timepoints = timepoints_thresh * sign(timeseries[timepoints_thresh])
  18. # split the position indices into clusters of consecutive numbers:
  19. clusters = split(abs(timepoints), cumsum(c(1, abs(diff(timepoints)) != 1)))
  20. # write cluster indices into the cluster index vector:
  21. for (i in seq(1,length(clusters))) {
  22. cluster_id[unlist(clusters[i])] = i
  23. }
  24. # return the cluster indices:
  25. return(cluster_id)
  26. }
  27. shuffle_timeseries <- function(timeseries) {
  28. # function to randomly shuffle (i.e., invert a timeseries) through
  29. # multiplication by -1 with a 0.5 probability:
  30. # participant-specific timesources are flipped (multiplied by -1) randomly:
  31. timeseries_shuffled = timeseries * sample(x = c(-1, 1), size = 1)
  32. return(timeseries_shuffled)
  33. }
  34. cluster_stats = function(data, cfg, shuffle = TRUE) {
  35. data_clustered = data %>%
  36. # shuffle time series for each participant, classification and speed
  37. # if shuffle = TRUE. If shuffle = FALSE, the data is just copied.
  38. # check if number of timepoints matches the true number of TRs (here, 13):
  39. .[, by = c(cfg$grouping, "id"), ":=" (
  40. n_trs = .N,
  41. variable_data = if(shuffle) {shuffle_timeseries(get(cfg$variable))} else {get(cfg$variable)}
  42. )] %>% verify(n_trs == cfg$n_trs) %>%
  43. # run a two-sided one-sample t-test against 0 at every TR to get a
  44. # timeseries of t-values (by taking out the participant factor):
  45. # check if there were as many data points as there were participants:
  46. .[, by = c(cfg$grouping, "seq_tr"), .(
  47. num_subs = .N,
  48. tvalue = t.test(variable_data, alternative = "two.sided", mu = cfg$baseline)$statistic
  49. )] %>% #verify(num_subs == 40) %>%
  50. # get the indices for clusters with consecutive t-values above / below threshold
  51. # check if number of timepoints matches the true number of TRs (here, 13):
  52. .[, by = c(cfg$grouping), ":=" (
  53. n_trs = .N,
  54. cluster_id = cluster_indices(timeseries = tvalue, threshold = cfg$threshold)
  55. )] %>% verify(n_trs == cfg$n_trs) %>%
  56. # calculate the cluster mass of each cluster, separately for each
  57. # classification and speed condition:
  58. .[, by = c(cfg$grouping, "cluster_id"), .(
  59. cluster_trs = list(seq_tr),
  60. cluster_length = .N,
  61. cluster_mass = sum(tvalue),
  62. cluster_type = ifelse(sum(tvalue) > 0, "positive", "negative")
  63. )] %>%
  64. # add marker to the maximum absolute cluster mass
  65. # rank the clusters according to their absolute cluster mass:
  66. .[, by = c(cfg$grouping), ":=" (
  67. cluster_max = as.integer(abs(cluster_mass) == max(abs(cluster_mass))),
  68. cluster_rank = rank(abs(cluster_mass))
  69. )]
  70. return(data_clustered)
  71. }
  72. cluster_test <- function(data_true, data_perm, cfg) {
  73. # subset the permutation data frame depending on the grouping:
  74. data_perm_select = data_perm %>%
  75. filter(classification == unique(data_true$classification)) %>%
  76. filter(tITI == unique(data_true$tITI)) %>%
  77. filter(cluster_max == 1) %>%
  78. # extract the cluster mass values of the maximum clusters:
  79. mutate(cluster_mass_perm = cluster_mass * cluster_id)
  80. # get the number of "maximum" clusters that were actually "zero"-clusters:
  81. n_zero = sum(data_perm_select$cluster_id == 0)
  82. # get the number of cluster mass values above the true cluster mass:
  83. n_above = sum(abs(data_perm_select$cluster_mass_perm) > abs(data_true$cluster_mass))
  84. # TODO: add column that specifies if above or below threshold (can be used for later plotting):
  85. # retrieve the number of permutations that was run:
  86. n_perms = nrow(data_perm_select)
  87. # get the monte carlo p-value by dividing:
  88. p_value = n_above/n_perms
  89. return(list("p_value" = p_value, "n_perms" = n_perms, "n_zero" = n_zero))
  90. }
  91. cluster_plot <- function(data_true, data_perm){
  92. plot = ggplot(data_perm, aes(x = abs(as.numeric(cluster_mass)))) +
  93. facet_wrap(facets = ~ as.factor(tITI), labeller = get_labeller(array = data_perm$tITI)) +
  94. geom_histogram(binwidth = 0.5) +
  95. geom_vline(data = data_true, aes(
  96. xintercept = abs(as.numeric(cluster_mass)), color = as.factor(cluster_type))) +
  97. xlab("Absolute cluster mass (summed t-values)") +
  98. ylab("Number of permutations") +
  99. scale_color_discrete(name = "Cluster type") +
  100. coord_capped_cart(left = "both", bottom = "both", expand = TRUE) +
  101. theme(legend.position = c(1, 0), legend.justification = c(1, 0)) +
  102. theme(axis.line = element_line(linetype = "solid", lineend = "round")) +
  103. theme(plot.margin = unit(c(t = 1, r = 3, b = 1, l = 1), "pt"))
  104. return(plot)
  105. }
  106. cluster_permutation <- function(data, cfg) {
  107. # wrapper function to combine all steps of the cluster permutation test
  108. # inputs: data = a data frame; variable = string, variable of interest,
  109. # cfg = list, containing important configuration parameters:
  110. # variables, threshold, baseline, grouping, n_perms, n_trs
  111. # get the cluster statistics for the true data:
  112. data_true = cluster_stats(data, cfg, shuffle = FALSE)
  113. # get the cluster statistics for the permuted data:
  114. data_perm = do.call(rbind, replicate(cfg$n_perms, list(cluster_stats(data, cfg, shuffle = TRUE))))
  115. # compare the cluster statistics of the true data to the permuted data:
  116. data_true = data_true %>%
  117. .[, c("p_value", "n_perms", "n_zero") := cluster_test(
  118. data_true = .SD, data_perm = data_perm, cfg = cfg),
  119. by = c(cfg$grouping, "cluster_id"),
  120. .SDcols = colnames(data_true)] %>%
  121. setorder(classification, tITI, cluster_id) %>%
  122. filter(cluster_id != 0) %>% setDT(.)
  123. # plot the permutation distribution and true data:
  124. plot = cluster_plot(
  125. subset(data_true, classification == "ovr"),
  126. subset(data_perm, classification == "ovr"))
  127. return(list(data_true, plot))
  128. }