
add code and docs

Lennart Wittkuhn 3 years ago
parent
commit
3487380fbf

+ 2 - 1
.gitattributes

@@ -2,4 +2,5 @@
 * annex.backend=MD5E
 **/.git* annex.largefiles=nothing
 CHANGELOG.md annex.largefiles=nothing
-README.md annex.largefiles=nothing
+README.md annex.largefiles=nothing
+highspeed-decoding.Rproj annex.largefiles=nothing

+ 5 - 0
.gitignore

@@ -0,0 +1,5 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+code/*/*.html

+ 90 - 0
code/decoding/highspeed-decoding-cluster.sh

@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# ==============================================================================
+# SCRIPT INFORMATION:
+# ==============================================================================
+# SCRIPT: PARALLELIZE THE DECODING SCRIPT ON THE MPIB CLUSTER (TARDIS)
+# PROJECT: HIGHSPEED
+# WRITTEN BY LENNART WITTKUHN, 2018 - 2020
+# CONTACT: WITTKUHN AT MPIB HYPHEN BERLIN DOT MPG DOT DE
+# MAX PLANCK RESEARCH GROUP NEUROCODE
+# MAX PLANCK INSTITUTE FOR HUMAN DEVELOPMENT (MPIB)
+# MAX PLANCK UCL CENTRE FOR COMPUTATIONAL PSYCHIATRY AND AGEING RESEARCH
+# LENTZEALLEE 94, 14195 BERLIN, GERMANY
+# ==============================================================================
+# DEFINE ALL PATHS:
+# ==============================================================================
+# define the name of the current task:
+TASK_NAME="highspeed-decoding"
+# define the name of the project:
+PROJECT_NAME="highspeed"
+# path to the home directory:
+PATH_HOME=/home/mpib/wittkuhn
+# path to the current code folder:
+PATH_CODE=${PATH_HOME}/${PROJECT_NAME}/${TASK_NAME}/code/decoding
+# cd into the directory of the current script:
+cd ${PATH_CODE}
+# path to the current script:
+PATH_SCRIPT=${PATH_CODE}/highspeed_decoding.py
+# path to the output directory:
+PATH_OUT=${PATH_HOME}/${PROJECT_NAME}/${TASK_NAME}/decoding
+# path to the working directory:
+PATH_WORK=${PATH_HOME}/${PROJECT_NAME}/${TASK_NAME}/work
+# path to the log directory:
+PATH_LOG=${PATH_HOME}/${PROJECT_NAME}/${TASK_NAME}/logs/$(date '+%Y%m%d_%H%M%S')
+# ==============================================================================
+# CREATE RELEVANT DIRECTORIES:
+# ==============================================================================
+# create output directory:
+if [ ! -d ${PATH_OUT} ]; then
+	mkdir -p ${PATH_OUT}
+fi
+# create working directory:
+if [ ! -d ${PATH_WORK} ]; then
+	mkdir -p ${PATH_WORK}
+fi
+# create directory for log files:
+if [ ! -d ${PATH_LOG} ]; then
+	mkdir -p ${PATH_LOG}
+fi
+# ==============================================================================
+# DEFINE PARAMETERS:
+# ==============================================================================
+# maximum number of cpus per process:
+N_CPUS=1
+# memory demand in *GB*
+MEM=20GB
+# user-defined subject list (optional, space-separated; defaults to all subjects):
+PARTICIPANTS=$1
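+# usage sketch (illustration): "bash highspeed-decoding-cluster.sh" submits
+# all subjects; "bash highspeed-decoding-cluster.sh '1 2 3'" submits only
+# subjects 1, 2 and 3.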
+# ==============================================================================
+# RUN THE DECODING:
+# ==============================================================================
+# initialize a counter for the subjects:
+SUB_COUNT=0
+for i in ${PARTICIPANTS:-$(seq 1 40)}; do
+	# turn the subject id into a zero-padded number:
+	SUB=$(printf "%02d" ${i})
+	# name of the job (start a fresh job file for this subject):
+	echo "#PBS -N decoding_sub-${SUB}" > job
+	# set the expected maximum running time for the job:
+	echo "#PBS -l walltime=12:00:00" >> job
+	# determine how much RAM your operation needs:
+	echo "#PBS -l mem=${MEM}" >> job
+	# request multiple cpus:
+	echo "#PBS -l nodes=1:ppn=${N_CPUS}" >> job
+	# write (output) log to log folder:
+	echo "#PBS -o ${PATH_LOG}" >> job
+	# write (error) log to log folder:
+	echo "#PBS -e ${PATH_LOG}" >> job
+	# email notification on abort/end, use 'n' for no notification:
+	echo "#PBS -m n" >> job
+	# start in the current directory:
+	echo "#PBS -d ${PATH_CODE}" >> job
+	# activate virtual environment on cluster:
+	echo "source /etc/bash_completion.d/virtualenvwrapper" >> job
+	echo "workon decoding" >> job
+	# define the main command:
+	echo "python3 ${PATH_SCRIPT} ${SUB}" >> job
+	# submit job to cluster queue and remove it to avoid confusion:
+	qsub job
+	rm -f job
+done

+ 16 - 0
code/decoding/highspeed-parameters.yaml

@@ -0,0 +1,16 @@
+# highspeed parameters
+
+info:
+  experimenter: lennart wittkuhn
+  institution: max planck institute for human development
+  project: highspeed
+task:
+  num_trial_odd: 600  # number of oddball trials in the experiment
+  num_trial_seq: 75  # number of sequence trials in the experiment
+  num_trial_rep: 45  # number of repetition trials in the experiment
+mri:
+  tr: 1.25  # repetition time (TR), in seconds
+  thresh: 3  # t-value threshold for feature selection
+  num_sessions: 2  # number of experimental sessions in total
+  num_runs: 8  # number of functional task runs in total
+  num_trigger: 5  # number of scanner triggers before the experiment starts

+ 883 - 0
code/decoding/highspeed_decoding.py

@@ -0,0 +1,883 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ======================================================================
+# SCRIPT INFORMATION:
+# ======================================================================
+# SCRIPT: DECODING ANALYSIS
+# PROJECT: HIGHSPEED
+# WRITTEN BY LENNART WITTKUHN, 2018 - 2019
+# CONTACT: WITTKUHN AT MPIB HYPHEN BERLIN DOT MPG DOT DE
+# MAX PLANCK RESEARCH GROUP NEUROCODE
+# MAX PLANCK INSTITUTE FOR HUMAN DEVELOPMENT
+# MAX PLANCK UCL CENTRE FOR COMPUTATIONAL PSYCHIATRY AND AGEING RESEARCH
+# LENTZEALLEE 94, 14195 BERLIN, GERMANY
+'''
+========================================================================
+IMPORT RELEVANT PACKAGES:
+========================================================================
+'''
+import glob
+import os
+import yaml
+import logging
+import time
+from os.path import join as opj
+import sys
+import copy
+from pprint import pformat
+import numpy as np
+from nilearn.input_data import NiftiMasker
+from nilearn import plotting, image, masking
+import pandas as pd
+from matplotlib import pyplot as plt
+import warnings
+from nilearn.signal import clean
+from sklearn.svm import SVC
+from sklearn.svm import LinearSVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import LeaveOneGroupOut
+from sklearn.model_selection import cross_val_score, cross_val_predict
+from sklearn.preprocessing import StandardScaler
+from collections import Counter
+import datalad.api as dl
+'''
+========================================================================
+DEAL WITH ERRORS, WARNINGS, ETC.
+========================================================================
+'''
+warnings.filterwarnings('ignore', message='numpy.dtype size changed*')
+warnings.filterwarnings('ignore', message='numpy.ufunc size changed*')
+mpl_logger = logging.getLogger('matplotlib')
+mpl_logger.setLevel(logging.WARNING)
+'''
+========================================================================
+SETUP NECESSARY PATHS ETC:
+========================================================================
+'''
+# get start time of the script:
+start = time.time()
+# name of the current project:
+project = 'highspeed'
+# initialize empty variables:
+sub = None
+path_tardis = None
+path_server = None
+path_local = None
+# define paths depending on the operating system (OS) platform:
+if 'darwin' in sys.platform:
+    # define the path to the cluster:
+    path_tardis = opj(os.environ['HOME'], 'Volumes', 'tardis_beegfs', project)
+    # define the path to the server:
+    path_server = opj('/Volumes', 'MPRG-Neurocode', 'Data', project)
+    # define the path to the local computer:
+    path_local = opj(os.environ['HOME'], project, '*', '*')
+    # define the subject id:
+    sub = 'sub-14'
+elif 'linux' in sys.platform:
+    # path to the project root:
+    project_name = 'highspeed-decoding'
+    path_root = os.getcwd().split(project_name)[0] + project_name
+    # define the path to the cluster:
+    path_tardis = path_root
+    # define the path to the server:
+    path_server = path_tardis
+    # define the path to the local computer:
+    path_local = opj(path_tardis, 'code', 'decoding')
+    # define the subject id:
+    sub = 'sub-%s' % sys.argv[1]
+'''
+========================================================================
+LOAD PROJECT PARAMETERS:
+========================================================================
+'''
+path_params = glob.glob(opj(path_local, '*parameters.yaml'))[0]
+with open(path_params, 'rb') as f:
+    params = yaml.load(f, Loader=yaml.FullLoader)
+'''
+========================================================================
+CREATE PATHS TO OUTPUT DIRECTORIES:
+========================================================================
+'''
+path_decoding = opj(path_tardis, 'decoding')
+path_out = opj(path_decoding, sub)
+path_out_figs = opj(path_out, 'plots')
+path_out_data = opj(path_out, 'data')
+path_out_logs = opj(path_out, 'logs')
+path_out_masks = opj(path_out, 'masks')
+'''
+========================================================================
+CREATE OUTPUT DIRECTORIES IF THEY DO NOT EXIST YET:
+========================================================================
+'''
+for path in [path_out_figs, path_out_data, path_out_logs, path_out_masks]:
+    if not os.path.exists(path):
+        os.makedirs(path)
+'''
+========================================================================
+SETUP LOGGING:
+========================================================================
+'''
+# remove all handlers associated with the root logger object:
+for handler in logging.root.handlers[:]:
+    logging.root.removeHandler(handler)
+# get current date and time as a string:
+timestr = time.strftime("%Y-%m-%d-%H-%M-%S")
+# create path for the logging file
+log = opj(path_out_logs, '%s-%s.log' % (timestr, sub))
+# start logging:
+logging.basicConfig(
+    filename=log, level=logging.DEBUG, format='%(asctime)s %(message)s',
+    datefmt='%d/%m/%Y %H:%M:%S')
+'''
+========================================================================
+DEFINE DECODING SPECIFIC PARAMETERS:
+========================================================================
+'''
+# define the mask to be used:
+mask = 'visual'  # visual or whole
+# applied time-shift to account for the BOLD delay, in seconds:
+bold_delay = 4  # 4, 5 or 6 secs
+# define the degree of smoothing of the functional data
+smooth = 4
+'''
+========================================================================
+ADD BASIC SCRIPT INFORMATION TO THE LOGGER:
+========================================================================
+'''
+logging.info('Running decoding script')
+logging.info('operating system: %s' % sys.platform)
+logging.info('project name: %s' % project)
+logging.info('participant: %s' % sub)
+logging.info('mask: %s' % mask)
+logging.info('bold delay: %d secs' % bold_delay)
+logging.info('smoothing kernel: %d mm' % smooth)
+'''
+========================================================================
+DEFINE RELEVANT VARIABLES:
+========================================================================
+'''
+# time of repetition (TR), in seconds:
+t_tr = params['mri']['tr']
+# number of volumes (TRs) for each functional task run:
+n_tr_run = 530
+# acquisition time window of one sequence trial, in seconds:
+t_win = 16
+# number of measurements that are considered per sequence time window:
+n_tr_win = round(t_win / t_tr)
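+# e.g., with t_win = 16 s and t_tr = 1.25 s: round(16 / 1.25) = round(12.8) = 13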
+# number of oddball trials in the experiment:
+n_tr_odd = params['task']['num_trial_odd']
+# number of sequence trials in the experiment:
+n_tr_seq = params['task']['num_trial_seq']
+# number of repetition trials in the experiment:
+n_tr_rep = params['task']['num_trial_rep']
+# number of scanner triggers before the experiment starts:
+n_tr_wait = params['mri']['num_trigger']
+# number of functional task runs in total:
+n_run = params['mri']['num_runs']
+# number of experimental sessions in total:
+n_ses = params['mri']['num_sessions']
+# number of functional task runs per session:
+n_run_ses = int(n_run / n_ses)
+'''
+========================================================================
+LOAD BEHAVIORAL DATA (THE EVENTS.TSV FILES OF THE SUBJECT):
+========================================================================
+Load the events files for the subject. The 'oddball' data serves as the
+training set of the classifier. The 'sequence' data constitutes the
+test set.
+'''
+# paths to all events files of the current subject:
+path_events = opj(path_tardis, 'bids', sub, 'ses-*', 'func', '*tsv')
+dl.get(glob.glob(path_events))
+path_events = sorted(glob.glob(path_events), key=lambda f: os.path.basename(f))
+logging.info('found %d event files' % len(path_events))
+logging.info('paths to events files (sorted):\n%s' % pformat(path_events))
+# import events file and save data in dataframe:
+df_events = pd.concat((pd.read_csv(f, sep='\t') for f in path_events),
+                      ignore_index=True)
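+# note: the events files are expected to provide (among others) the columns
+# 'condition', 'trial_type', 'onset', 'session', 'run_study' and 'stim_label',
+# which are used below to build the task data subsets for decoding.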
+'''
+========================================================================
+CREATE PATHS TO THE MRI DATA:
+========================================================================
+'''
+# define path to input directories:
+path_fmriprep = opj(path_tardis, 'fmriprep', 'fmriprep', sub)
+path_level1 = opj(path_tardis, 'glm', 'l1pipeline')
+path_masks = opj(path_tardis, 'masks', 'masks')
+
+logging.info('path to fmriprep files: %s' % path_fmriprep)
+logging.info('path to level 1 files: %s' % path_level1)
+logging.info('path to mask files: %s' % path_masks)
+
+# load the visual mask task files:
+path_mask_vis_task = opj(path_masks, 'mask_visual', sub, '*', '*task-highspeed*.nii.gz')
+path_mask_vis_task = sorted(glob.glob(path_mask_vis_task), key=lambda f: os.path.basename(f))
+logging.info('found %d visual mask task files' % len(path_mask_vis_task))
+logging.info('paths to visual mask task files:\n%s' % pformat(path_mask_vis_task))
+dl.get(glob.glob(path_mask_vis_task))
+
+# load the hippocampus mask task files:
+path_mask_hpc_task = opj(path_masks, 'mask_hippocampus', sub, '*', '*task-highspeed*.nii.gz')
+path_mask_hpc_task = sorted(glob.glob(path_mask_hpc_task), key=lambda f: os.path.basename(f))
+logging.info('found %d hpc mask files' % len(path_mask_hpc_task))
+logging.info('paths to hpc mask task files:\n%s' % pformat(path_mask_hpc_task))
+dl.get(glob.glob(path_mask_hpc_task))
+
+# load the whole brain mask files:
+path_mask_whole_task = opj(path_fmriprep, '*', 'func', '*task-highspeed*T1w*brain_mask.nii.gz')
+path_mask_whole_task = sorted(glob.glob(path_mask_whole_task), key=lambda f: os.path.basename(f))
+logging.info('found %d whole-brain masks' % len(path_mask_whole_task))
+logging.info('paths to whole-brain mask files:\n%s' % pformat(path_mask_whole_task))
+dl.get(glob.glob(path_mask_whole_task))
+
+# load the functional mri task files:
+path_func_task = opj(path_level1, 'smooth', sub, '*', '*task-highspeed*nii.gz')
+path_func_task = sorted(glob.glob(path_func_task), key=lambda f: os.path.basename(f))
+logging.info('found %d functional mri task files' % len(path_func_task))
+logging.info('paths to functional mri task files:\n%s' % pformat(path_func_task))
+dl.get(glob.glob(path_func_task))
+
+# define path to the functional resting state runs:
+path_rest = opj(path_tardis, 'masks', 'masks', 'smooth', sub, '*', '*task-rest*nii.gz')
+path_rest = sorted(glob.glob(path_rest), key=lambda f: os.path.basename(f))
+logging.info('found %d functional mri rest files' % len(path_rest))
+logging.info('paths to functional mri rest files:\n%s' % pformat(path_rest))
+dl.get(glob.glob(path_rest))
+
+# load the anatomical mri file:
+path_anat = opj(path_fmriprep, 'anat', '%s_desc-preproc_T1w.nii.gz' % sub)
+path_anat = sorted(glob.glob(path_anat), key=lambda f: os.path.basename(f))
+logging.info('found %d anatomical mri file' % len(path_anat))
+logging.info('paths to anatomical mri files:\n%s' % pformat(path_anat))
+dl.get(glob.glob(path_anat))
+
+# load the confounds files:
+path_confs_task = opj(path_fmriprep, '*', 'func', '*task-highspeed*confounds_regressors.tsv')
+path_confs_task = sorted(glob.glob(path_confs_task), key=lambda f: os.path.basename(f))
+logging.info('found %d confounds files' % len(path_confs_task))
+logging.info('paths to confounds files:\n%s' % pformat(path_confs_task))
+dl.get(glob.glob(path_confs_task))
+
+# load the spm.mat files:
+path_spm_mat = opj(path_level1, 'contrasts', sub, '*', 'SPM.mat')
+path_spm_mat = sorted(glob.glob(path_spm_mat), key=lambda f: os.path.dirname(f))
+logging.info('found %d spm.mat files' % len(path_spm_mat))
+logging.info('paths to spm.mat files:\n%s' % pformat(path_spm_mat))
+dl.get(glob.glob(path_spm_mat))
+
+# load the t-maps of the first-level glm:
+path_tmap = opj(path_level1, 'contrasts', sub, '*', 'spmT*.nii')
+path_tmap = sorted(glob.glob(path_tmap), key=lambda f: os.path.dirname(f))
+logging.info('found %d t-maps' % len(path_tmap))
+logging.info('paths to t-maps files:\n%s' % pformat(path_tmap))
+dl.get(glob.glob(path_tmap))
+'''
+========================================================================
+LOAD THE MRI DATA:
+========================================================================
+'''
+anat = image.load_img(path_anat[0])
+logging.info('successfully loaded %s' % path_anat[0])
+# load visual mask:
+mask_vis = image.load_img(path_mask_vis_task[0]).get_fdata().astype(int)
+logging.info('successfully loaded one visual mask file!')
+# load tmap data:
+tmaps = [image.load_img(i).get_fdata().astype(float) for i in copy.deepcopy(path_tmap)]
+logging.info('successfully loaded the tmaps!')
+# load hippocampus mask:
+mask_hpc = [image.load_img(i).get_fdata().astype(int) for i in copy.deepcopy(path_mask_hpc_task)]
+logging.info('successfully loaded the hippocampus mask files!')
+'''
+FEATURE SELECTION
+'''
+# plot raw-tmaps on an anatomical background:
+logging.info('plotting raw tmaps with anatomical as background:')
+for i, path in enumerate(path_tmap):
+    logging.info('plotting raw tmap %s (%d of %d)' % (path, i+1, len(path_tmap)))
+    path_save = opj(path_out_figs, '%s_run-%02d_tmap_raw.png' % (sub, i+1))
+    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
+                      output_file=path_save, colorbar=True)
+
+'''
+========================================================================
+FEATURE SELECTION: MASK THE T-MAPS WITH THE ANATOMICAL MASKS
+We create a combination of the t-maps and the anatomical mask.
+To this end, we multiply the anatomical mask with each t-map.
+As the anatomical mask consists of binary values (zeros and ones),
+the multiplication results in t-maps masked by the anatomical ROI.
+========================================================================
+'''
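+# illustration (toy example, not part of the analysis): multiplying a t-map
+# with a binary mask zeroes out all voxels outside the ROI, e.g.:
+#   np.multiply([0, 1, 1, 0], [2.5, 3.1, 4.2, 1.9]) -> [0.0, 3.1, 4.2, 0.0]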
+#v= tmaps_masked[0]
+#fig = plt.figure()
+#ax = fig.add_subplot(111, projection='3d')
+#ax.scatter(v[:,0],v[:,1],v[:,2], zdir='z', c= 'red')
+#plt.show()
+
+# check if any value in the supposedly binary mask is bigger than 1:
+if np.any(mask_vis > 1):
+    logging.error('detected values > 1 in the anatomical ROI!')
+    sys.exit("Values > 1 in the anatomical ROI!")
+# get combination of anatomical mask and t-map
+tmaps_masked = [np.multiply(mask_vis, i) for i in copy.deepcopy(tmaps)]
+# masked tmap into image like object:
+tmaps_masked_img = [image.new_img_like(i, j) for (i, j) in zip(path_tmap, copy.deepcopy(tmaps_masked))]
+
+for i, path in enumerate(tmaps_masked_img):
+    path_save = opj(path_out_masks, '%s_run-%02d_tmap_masked.nii.gz' % (sub, i + 1))
+    path.to_filename(path_save)
+
+# plot masked t-maps
+logging.info('plotting masked tmaps with anatomical as background:')
+for i, path in enumerate(tmaps_masked_img):
+    logging.info('plotting masked tmap %d of %d' % (i+1, len(tmaps_masked_img)))
+    path_save = opj(path_out_figs, '%s_run-%02d_tmap_masked.png' % (sub, i+1))
+    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
+                      output_file=path_save, colorbar=True)
+'''
+========================================================================
+FEATURE SELECTION: THRESHOLD THE MASKED T-MAPS
+We threshold the masked t-maps, setting all voxels whose t-values
+fall below the threshold to zero. We then extract these data and
+count how many voxels were above or below the threshold in total.
+========================================================================
+'''
+# set the threshold:
+threshold = params['mri']['thresh']
+logging.info('thresholding t-maps with a threshold of %s' % str(threshold))
+# threshold the masked tmap image:
+tmaps_masked_thresh_img = [image.threshold_img(i, threshold) for i in copy.deepcopy(tmaps_masked_img)]
+
+logging.info('plotting thresholded tmaps with anatomical as background:')
+for i, path in enumerate(tmaps_masked_thresh_img):
+    path_save = opj(path_out_figs, '%s_run-%02d_tmap_masked_thresh.png' % (sub, i+1))
+    logging.info('plotting masked tmap %s (%d of %d)'
+                 % (path_save, i + 1, len(tmaps_masked_thresh_img)))
+    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
+                      output_file=path_save, colorbar=True)
+
+# extract data from the thresholded images
+tmaps_masked_thresh = [image.load_img(i).get_fdata().astype(float) for i in tmaps_masked_thresh_img]
+
+# calculate the number of tmap voxels:
+num_tmap_voxel = [np.size(i) for i in copy.deepcopy(tmaps_masked_thresh)]
+logging.info('number of feature selected voxels: %s' % pformat(num_tmap_voxel))
+
+num_above_voxel = [np.count_nonzero(i) for i in copy.deepcopy(tmaps_masked_thresh)]
+logging.info('number of voxels above threshold: %s' % pformat(num_above_voxel))
+
+num_below_voxel = [np.count_nonzero(i == 0) for i in copy.deepcopy(tmaps_masked_thresh)]
+logging.info('number of voxels below threshold: %s' % pformat(num_below_voxel))
+
+# plot the distribution of t-values:
+for i, run_mask in enumerate(tmaps_masked_thresh):
+    masked_tmap_flat = run_mask.flatten()
+    # remove NaNs and zeros (i.e., all voxels outside the thresholded mask):
+    masked_tmap_flat = masked_tmap_flat[~np.isnan(masked_tmap_flat) & (masked_tmap_flat != 0)]
+    path_save = opj(path_out_figs, '%s_run-%02d_tvalue_distribution.png' % (sub, i+1))
+    logging.info('plotting thresholded t-value distribution %s (%d of %d)'
+                 % (path_save, i+1, len(tmaps_masked_thresh)))
+    fig = plt.figure()
+    plt.hist(masked_tmap_flat, bins='auto')
+    plt.xlabel('t-values')
+    plt.ylabel('number')
+    plt.title('t-value distribution (%s, run-%02d)' % (sub, i+1))
+    plt.savefig(path_save)
+
+# create a dataframe with the number of voxels
+df_thresh = pd.DataFrame({
+    'id': [sub] * n_run,
+    'run': np.arange(1,n_run+1),
+    'n_total': num_tmap_voxel,
+    'n_above': num_above_voxel,
+    'n_below': num_below_voxel
+})
+file_name = opj(path_out_data, '%s_thresholding.csv' % (sub))
+df_thresh.to_csv(file_name, sep=',', index=False)
+
+'''
+========================================================================
+FEATURE SELECTION: BINARIZE THE THRESHOLDED MASKED T-MAPS
+We set all voxels that survived the thresholding to 1 and all voxels
+that were not selected to 0.
+========================================================================
+'''
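+# illustration (toy example): a thresholded t-map row [nan, 0.0, 3.4] becomes
+# [0.0, 0.0, 1.0] after the two np.where steps below and [False, False, True]
+# after the conversion to booleans.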
+# replace all NaNs with 0:
+tmaps_masked_thresh_bin = [np.where(np.isnan(i), 0, i) for i in copy.deepcopy(tmaps_masked_thresh)]
+# replace all other values with 1:
+tmaps_masked_thresh_bin = [np.where(i > 0, 1, i) for i in copy.deepcopy(tmaps_masked_thresh_bin)]
+# turn the 3D-array into booleans:
+tmaps_masked_thresh_bin = [i.astype(bool) for i in copy.deepcopy(tmaps_masked_thresh_bin)]
+# create image like object:
+masks_final = [image.new_img_like(path_func_task[0], i.astype(int)) for i in copy.deepcopy(tmaps_masked_thresh_bin)]
+
+logging.info('plotting final masks with anatomical as background:')
+for i, path in enumerate(masks_final):
+    filename = '%s_run-%02d_tmap_masked_thresh.nii.gz'\
+               % (sub, i + 1)
+    path_save = opj(path_out_masks, filename)
+    logging.info('saving final mask %s (%d of %d)'
+                 % (path_save, i+1, len(masks_final)))
+    path.to_filename(path_save)
+    path_save = opj(path_out_figs, '%s_run-%02d_visual_final_mask.png'
+                    % (sub, i + 1))
+    logging.info('plotting final mask %s (%d of %d)'
+                 % (path_save, i + 1, len(masks_final)))
+    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
+                      output_file=path_save, colorbar=True)
+'''
+========================================================================
+LOAD SMOOTHED FMRI DATA FOR ALL FUNCTIONAL TASK RUNS:
+========================================================================
+'''
+# load smoothed functional mri data for all eight task runs:
+logging.info('loading %d functional task runs ...' % len(path_func_task))
+data_task = [image.load_img(i) for i in path_func_task]
+logging.info('loading successful!')
+'''
+========================================================================
+DEFINE THE CLASSIFIERS
+========================================================================
+'''
+class_labels = ['cat', 'chair', 'face', 'house', 'shoe']
+# create a dictionary with all values as independent instances:
+# see here: https://bit.ly/2J1DvZm
+clf_set = {key: LogisticRegression(
+    C=1., penalty='l2', multi_class='ovr', solver='lbfgs',
+    max_iter=4000, class_weight='balanced', random_state=42) for key in class_labels}
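+# note: the dict comprehension above is used deliberately (rather than, e.g.,
+# dict.fromkeys), since dict.fromkeys would let all class labels share one
+# single estimator object instead of one independent classifier per class.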
+classifiers = {
+    'log_reg': LogisticRegression(
+        C=1., penalty='l2', multi_class='multinomial', solver='lbfgs',
+        max_iter=4000, class_weight='balanced', random_state=42)}
+clf_set.update(classifiers)
+'''
+========================================================================
+1. SPLIT THE EVENTS DATAFRAME FOR EACH TASK CONDITION
+2. RESET THE INDICES OF THE DATAFRAMES
+3. SORT THE ROWS OF ALL DATAFRAMES IN CHRONOLOGICAL ORDER
+4. PRINT THE NUMBER OF TRIALS OF EACH TASK CONDITION
+========================================================================
+'''
+
+
+class TaskData:
+    """Hold the data of one task condition subset for the decoding analysis.
+
+    Selects the relevant events of one task condition (oddball, sequence,
+    or repetition), computes the fMRI volumes (TRs) to be extracted for
+    each event, and provides methods for z-scoring and classifier
+    prediction on these data.
+    """
+    def __init__(
+            self, events, condition, name, trial_type, bold_delay=0,
+            interval=1, t_tr=1.25, num_vol_run=530):
+        # define name of the task data subset:
+        self.name = name
+        # define the task condition the task data subset is from:
+        self.condition = condition
+        # define the delay (in seconds) by which onsets are moved:
+        self.bold_delay = bold_delay
+        # define the number of TRs from event onset that should be selected:
+        self.interval = interval
+        # define the repetition time (TR) of the mri data acquisition:
+        self.t_tr = t_tr
+        # define the number of volumes per task run:
+        self.num_vol_run = num_vol_run
+        # select events: upright stimulus, correct answer trials only:
+        if trial_type == 'stimulus':
+            self.events = events.loc[
+                          (events['condition'] == condition) &
+                          (events['trial_type'] == trial_type) &
+                          (events['stim_orient'] == 0) &
+                          (events['serial_position'] == 1) &
+                          (events['accuracy'] != 0),
+                          :]
+        elif trial_type == 'cue':
+            self.events = events.loc[
+                          (events['condition'] == condition) &
+                          (events['trial_type'] == trial_type),
+                          :]
+        # reset the indices of the data frame:
+        self.events = self.events.reset_index(drop=True)
+        # sort all values by session and run:
+        self.events = self.events.sort_values(by=['session', 'run_session'])
+        # call further function upon initialization:
+        self.define_trs()
+        self.get_stats()
+
+    def define_trs(self):
+        # import relevant functions:
+        import numpy as np
+        # select all events onsets:
+        self.event_onsets = self.events['onset']
+        # add the selected delay to account for the peak of the hrf
+        self.bold_peaks = self.events['onset'] + self.bold_delay
+        # divide the expected time-point of bold peaks by the repetition time:
+        self.peak_trs = self.bold_peaks / self.t_tr
+        # add the number of run volumes to the tr indices:
+        run_volumes = (self.events['run_study']-1) * self.num_vol_run
+        # add the number of volumes of each run:
+        trs = round(self.peak_trs + run_volumes)
+        # copy the relevant trs as often as specified by the interval:
+        a = np.transpose(np.tile(trs, (self.interval, 1)))
+        # create same-sized matrix with trs to be added:
+        b = np.full((len(trs), self.interval), np.arange(self.interval))
+        # assign the final list of trs:
+        self.trs = np.array(np.add(a, b).flatten(), dtype=int)
+        # save the TRs of the stimulus presentations
+        self.stim_trs = round(self.event_onsets / self.t_tr + run_volumes)
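+        # worked example (illustration only): an event with onset = 100 s in
+        # run_study = 2, given bold_delay = 4 s and t_tr = 1.25 s, yields
+        # peak_trs = (100 + 4) / 1.25 = 83.2; with num_vol_run = 530 volumes
+        # per run, the selected volume index is round(83.2 + 530) = 613.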
+
+    def get_stats(self):
+        import numpy as np
+        self.num_trials = len(self.events)
+        self.runs = np.repeat(np.array(self.events['run_study'], dtype=int), self.interval)
+        self.trials = np.repeat(np.array(self.events['trial'], dtype=int), self.interval)
+        self.sess = np.repeat(np.array(self.events['session'], dtype=int), self.interval)
+        self.stim = np.repeat(np.array(self.events['stim_label'], dtype=object), self.interval)
+        self.itis = np.repeat(np.array(self.events['interval_time'], dtype=float), self.interval)
+        self.stim_trs = np.repeat(np.array(self.stim_trs, dtype=int), self.interval)
+
+    def zscore(self, signals, run_list, t_tr=1.25):
+        from nilearn.signal import clean
+        import numpy as np
+        # get boolean indices for all run indices in the run list:
+        run_indices = np.isin(self.runs, list(run_list))
+        # standardize data all runs in the run list:
+        self.data_zscored = clean(
+            signals=signals[self.trs[run_indices]],
+            sessions=self.runs[run_indices],
+            t_r=t_tr,
+            detrend=False,
+            standardize=True)
+
+    def predict(self, clf, run_list):
+        # import packages:
+        import pandas as pd
+        # get classifier class predictions:
+        pred_class = clf.predict(self.data_zscored)
+        # get classifier probabilistic predictions:
+        pred_proba = clf.predict_proba(self.data_zscored)
+        # get the classes of the classifier:
+        classes_names = clf.classes_
+        # get boolean indices for all run indices in the run list:
+        run_indices = np.isin(self.runs, list(run_list))
+        # create a dataframe with the probabilities of each class:
+        df = pd.DataFrame(pred_proba, columns=classes_names)
+        # get the number of predictions made:
+        num_pred = len(df)
+        # get the number of trials in the test set:
+        num_trials = int(num_pred / self.interval)
+        # add the predicted class label to the dataframe:
+        df['pred_label'] = pred_class
+        # add the true stimulus label to the dataframe:
+        df['stim'] = self.stim[run_indices]
+        # add the volume number (TR) to the dataframe:
+        df['tr'] = self.trs[run_indices]
+        # add the sequential TR to the dataframe:
+        df['seq_tr'] = np.tile(np.arange(1, self.interval + 1), num_trials)
+        # add the counter of trs on which the stimulus was presented
+        df['stim_tr'] = self.stim_trs[run_indices]
+        # add the trial number to the dataframe:
+        df['trial'] = self.trials[run_indices]
+        # add the run number to the dataframe:
+        df['run_study'] = self.runs[run_indices]
+        # add the session number to the dataframe:
+        df['session'] = self.sess[run_indices]
+        # add the inter trial interval to the dataframe:
+        df['tITI'] = self.itis[run_indices]
+        # add the participant id to the dataframe:
+        df['id'] = np.repeat(self.events['subject'].unique(), num_pred)
+        # add the name of the classifier to the dataframe:
+        df['test_set'] = np.repeat(self.name, num_pred)
+        # return the dataframe:
+        return df
+
+
+train_odd_peak = TaskData(
+        events=df_events, condition='oddball', trial_type='stimulus',
+        bold_delay=4, interval=1, name='train-odd_peak')
+test_odd_peak = TaskData(
+        events=df_events, condition='oddball', trial_type='stimulus',
+        bold_delay=4, interval=1, name='test-odd_peak')
+test_odd_long = TaskData(
+        events=df_events, condition='oddball', trial_type='stimulus',
+        bold_delay=0, interval=7, name='test-odd_long')
+test_seq_long = TaskData(
+        events=df_events, condition='sequence', trial_type='stimulus',
+        bold_delay=0, interval=13, name='test-seq_long')
+test_rep_long = TaskData(
+        events=df_events, condition='repetition', trial_type='stimulus',
+        bold_delay=0, interval=13, name='test-rep_long')
+test_seq_cue = TaskData(
+        events=df_events, condition='sequence', trial_type='cue',
+        bold_delay=0, interval=5, name='test-seq_cue')
+test_rep_cue = TaskData(
+        events=df_events, condition='repetition', trial_type='cue',
+        bold_delay=0, interval=5, name='test-rep_cue')
+test_sets = [
+        test_odd_peak, test_odd_long, test_seq_long, test_rep_long,
+        test_seq_cue, test_rep_cue
+        ]
+'''
+========================================================================
+SEPARATE RUN-WISE STANDARDIZATION (Z-SCORING) OF ALL TASK CONDITIONS
+Standardize features by removing the mean and scaling to unit variance.
+Centering and scaling happen independently on each feature by computing
+the relevant statistics on the samples in the training set. Here,
+we standardize the features of all trials run-wise but separated for
+each task condition (oddball, sequence, and repetition condition).
+========================================================================
+'''
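+# minimal sketch of the run-wise z-scoring performed by TaskData.zscore
+# (illustration only; X is assumed to be an array of shape
+# (n_volumes, n_voxels) and 'runs' labels the run of each volume):
+#   from nilearn.signal import clean
+#   z = clean(signals=X, sessions=runs, detrend=False, standardize=True)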
+
+
+def detrend(data, t_tr=1.25):
+    from nilearn.signal import clean
+    data_detrend = clean(
+        signals=data, t_r=t_tr, detrend=True, standardize=False)
+    return data_detrend
+
+
+def show_weights(array):
+    # https://stackoverflow.com/a/50154388
+    import numpy as np
+    import seaborn as sns
+    n_samples = array.shape[0]
+    classes, bins = np.unique(array, return_counts=True)
+    n_classes = len(classes)
+    weights = n_samples / (n_classes * bins)
+    sns.barplot(x=classes, y=weights)
+    plt.xlabel('class label')
+    plt.ylabel('weight')
+    plt.show()
+
+
+def melt_df(df, melt_columns):
+    # save the column names of the dataframe in a list:
+    column_names = df.columns.tolist()
+    # remove the stimulus classes from the column names;
+    id_vars = [x for x in column_names if x not in melt_columns]
+    # melt the dataframe creating one column with value_name and var_name:
+    df_melt = pd.melt(
+            df, value_name='probability', var_name='class', id_vars=id_vars)
+    # return the melted dataframe:
+    return df_melt
+
+
+data_list = []
+runs = list(range(1, n_run+1))
+#mask_label = 'cv'
+
+logging.info('starting leave-one-run-out cross-validation')
+for mask_label in ['cv', 'cv_hpc']:
+    logging.info('testing in mask %s' % (mask_label))
+    for run in runs:
+        logging.info('testing on run %d of %d ...' % (run, len(runs)))
+        # define the run indices for the training and test set:
+        train_runs = [x for x in runs if x != run]
+        test_runs = [x for x in runs if x == run]
+        # get the feature selection mask of the current run:
+        if mask_label == 'cv':
+            mask_run = masks_final[runs.index(run)]
+        elif mask_label == 'cv_hpc':
+            mask_run = path_mask_hpc_task[runs.index(run)]
+        # extract smoothed fMRI data from the mask for the cross-validation fold:
+        masked_data = [masking.apply_mask(i, mask_run) for i in data_task]
+        # detrend the masked fMRI data separately for each run:
+        data_detrend = [detrend(i) for i in masked_data]
+        # combine the detrended data of all runs:
+        data_detrend = np.vstack(data_detrend)
+        # loop through all classifiers in the classifier set:
+        for clf_name, clf in clf_set.items():
+            # print classifier:
+            logging.info('classifier: %s' % clf_name)
+            # fit the classifier to the training data:
+            train_odd_peak.zscore(signals=data_detrend, run_list=train_runs)
+            # get the example labels:
+            train_stim = copy.deepcopy(train_odd_peak.stim[train_odd_peak.runs != run])
+            # replace labels for single-label classifiers:
+            if clf_name in class_labels:
+                # replace all other labels with other
+                train_stim = ['other' if x != clf_name else x for x in train_stim]
+                # turn into a numpy array
+                train_stim = np.array(train_stim, dtype=object)
+            # check weights:
+            #show_weights(array=train_stim)
+            # train the classifier
+            clf.fit(train_odd_peak.data_zscored, train_stim)
+            # classifier prediction: predict on test data and save the data:
+            for test_set in test_sets:
+                logging.info('testing on test set %s' % test_set.name)
+                test_set.zscore(signals=data_detrend, run_list=test_runs)
+                if test_set.data_zscored.size == 0:
+                    continue
+                # create dataframe containing classifier predictions:
+                df_pred = test_set.predict(clf=clf, run_list=test_runs)
+                # add the current classifier as a new column:
+                df_pred['classifier'] = np.repeat(clf_name, len(df_pred))
+                # add a label that indicates the mask / training regime:
+                df_pred['mask'] = np.repeat(mask_label, len(df_pred))
+                # melt the data frame:
+                df_pred_melt = melt_df(df=df_pred, melt_columns=train_stim)
+                # append dataframe to list of dataframe results:
+                data_list.append(df_pred_melt)
+'''
+========================================================================
+DECODING ON RESTING STATE DATA:
+========================================================================
+'''
+logging.info('Loading fMRI data of %d resting state runs ...' % len(path_rest))
+data_rest = [image.load_img(i) for i in path_rest]
+logging.info('loading successful!')
+
+# combine all masks from the feature selection by intersection:
+def multimultiply(arrays):
+    import copy
+    # start with the first array:
+    array_intersect = copy.deepcopy(arrays[0].astype(int))
+    # loop through all remaining arrays:
+    for i in range(1, len(arrays)):
+        # multiply the current array with the intersection of all previous arrays:
+        array_intersect = np.multiply(array_intersect, copy.deepcopy(arrays[i].astype(int)))
+    # return the intersection of all arrays:
+    return array_intersect
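+# illustration: element-wise multiplication of binary masks yields their
+# intersection, e.g. np.multiply([1, 1, 0], [1, 0, 0]) -> [1, 0, 0]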
+
+
+for mask_label in ['union', 'union_hpc']:
+    # calculate the intersection of all masks by multiplication:
+    if mask_label == 'union':
+        masks_union = multimultiply(tmaps_masked_thresh_bin).astype(int).astype(bool)
+    elif mask_label == 'union_hpc':
+        masks_union = multimultiply(mask_hpc).astype(int).astype(bool)
+    # masked tmap into image like object:
+    masks_union_nii = image.new_img_like(path_func_task[0], masks_union)
+    path_save = opj(path_out_masks, '{}_task-rest_mask-{}.nii.gz'.format(sub, mask_label))
+    masks_union_nii.to_filename(path_save)
+    # mask all resting state runs with the intersected feature selection mask:
+    data_rest_masked = [masking.apply_mask(i, masks_union_nii) for i in data_rest]
+    # detrend and standardize each resting state run separately:
+    data_rest_final = [clean(i, detrend=True, standardize=True) for i in data_rest_masked]
+    # mask all functional task runs separately:
+    data_task_masked = [masking.apply_mask(i, masks_union_nii) for i in data_task]
+    # detrend each task run separately:
+    data_task_masked_detrend = [clean(i, detrend=True, standardize=False) for i in data_task_masked]
+    # combine the detrended data of all runs:
+    data_task_masked_detrend = np.vstack(data_task_masked_detrend)
+    # select only oddball data and standardize:
+    train_odd_peak.zscore(signals=data_task_masked_detrend, run_list=runs)
+    # write session and run labels:
+    ses_labels = [i.split(sub + "_")[1].split("_task")[0] for i in path_rest]
+    run_labels = [i.split("prenorm_")[1].split("_space")[0] for i in path_rest]
+    file_names = ['_'.join([a, b]) for (a, b) in zip(ses_labels, run_labels)]
+    rest_interval = 1
+    # save the voxel patterns:
+    num_voxels = len(train_odd_peak.data_zscored[0])
+    voxel_labels = ['v' + str(x) for x in range(num_voxels)]
+    df_patterns = pd.DataFrame(train_odd_peak.data_zscored, columns=voxel_labels)
+    # add the stimulus labels to the dataframe:
+    df_patterns['label'] = copy.deepcopy(train_odd_peak.stim)
+    # add the participant id to the dataframe:
+    df_patterns['id'] = np.repeat(df_events['subject'].unique(), len(train_odd_peak.stim))
+    # add the mask label:
+    df_patterns['mask'] = np.repeat(mask_label, len(train_odd_peak.stim))
+    # split pattern dataframe by label:
+    df_pattern_list = [df_patterns[df_patterns['label'] == i] for i in df_patterns['label'].unique()]
+    # create file path to save the dataframe:
+    file_paths = [opj(path_out_data, '{}_voxel_patterns_{}_{}'.format(sub, mask_label, i)) for i in df_patterns['label'].unique()]
+    # save the final dataframe to a .csv-file:
+    [i.to_csv(j + '.csv', sep=',', index=False) for (i, j) in zip(df_pattern_list, file_paths)]
+    # save only the voxel patterns as niimg-like objects
+    [masking.unmask(X=i.loc[:, voxel_labels].to_numpy(), mask_img=masks_union_nii).to_filename(j + '.nii.gz') for (i, j) in zip(df_pattern_list, file_paths)]
+    #[image.new_img_like(path_func_task[0], i.loc[:, voxel_labels].to_numpy()).to_filename(j + '.nii.gz') for (i, j) in zip(df_pattern_list, file_paths)]
+    # decoding on resting state data:
+    for clf_name, clf in clf_set.items():
+        # print classifier name:
+        logging.info('classifier: %s' % clf_name)
+        # get the example labels for all slow trials:
+        train_stim = copy.deepcopy(train_odd_peak.stim)
+        # replace labels for single-label classifiers:
+        if clf_name in class_labels:
+            # replace all other labels with other
+            train_stim = ['other' if x != clf_name else x for x in train_stim]
+            # turn into a numpy array
+            train_stim = np.array(train_stim, dtype=object)
+        # train the classifier
+        clf.fit(train_odd_peak.data_zscored, train_stim)
+        # classifier prediction: predict on test data and save the data:
+        pred_rest_class = [clf.predict(i) for i in data_rest_final]
+        pred_rest_proba = [clf.predict_proba(i) for i in data_rest_final]
+        # get the class names of the classifier:
+        classes_names = clf.classes_
+        # save classifier predictions on resting state scans
+        for t in range(len(pred_rest_proba)):
+            # create a dataframe with the probabilities of each class:
+            df_rest_pred = pd.DataFrame(
+                    pred_rest_proba[t], columns=classes_names)
+            # get the number of predictions made:
+            num_pred = len(df_rest_pred)
+            # get the number of trials in the test set:
+            num_trials = int(num_pred / rest_interval)
+            # add the predicted class label to the dataframe:
+            df_rest_pred['pred_label'] = pred_rest_class[t]
+            # add the true stimulus label to the dataframe:
+            df_rest_pred['stim'] = np.full(num_pred, np.nan)
+            # add the volume number (TR) to the dataframe:
+            df_rest_pred['tr'] = np.arange(1, num_pred + 1)
+            # add the sequential TR to the dataframe:
+            df_rest_pred['seq_tr'] = np.arange(1, num_pred + 1)
+            # add the trial number to the dataframe:
+            df_rest_pred['trial'] = np.tile(
+                    np.arange(1, rest_interval + 1), num_trials)
+            # add the run number to the dataframe:
+            df_rest_pred['run_study'] = np.repeat(run_labels[t], num_pred)
+            # add the session number to the dataframe:
+            df_rest_pred['session'] = np.repeat(ses_labels[t], len(df_rest_pred))
+            # add the inter trial interval to the dataframe:
+            df_rest_pred['tITI'] = np.tile('rest', num_pred)
+            # add the participant id to the dataframe:
+            df_rest_pred['id'] = np.repeat(df_events['subject'].unique(), num_pred)
+            # add the name of the classifier to the dataframe:
+            df_rest_pred['test_set'] = np.repeat('rest', num_pred)
+            # add the name of the classifier to the dataframe;
+            df_rest_pred['classifier'] = np.repeat(clf_name, num_pred)
+            # add a label that indicates the mask / training regime:
+            df_rest_pred['mask'] = np.repeat(mask_label, len(df_rest_pred))
+            # melt the data frame:
+            df_pred_melt = melt_df(df=df_rest_pred, melt_columns=train_stim)
+            # append dataframe to list of dataframe results:
+            data_list.append(df_pred_melt)
+        # run classifier trained on all runs across all test sets:
+        for test_set in test_sets:
+            logging.info('testing on test set %s' % test_set.name)
+            test_set.zscore(signals=data_task_masked_detrend, run_list=runs)
+            if test_set.data_zscored.size == 0:
+                continue
+            # create dataframe containing classifier predictions:
+            df_pred = test_set.predict(clf=clf, run_list=runs)
+            # add the current classifier as a new column:
+            df_pred['classifier'] = np.repeat(clf_name, len(df_pred))
+            # add a label that indicates the mask / training regime:
+            df_pred['mask'] = np.repeat(mask_label, len(df_pred))
+            # melt the data frame:
+            df_pred_melt = melt_df(df=df_pred, melt_columns=train_stim)
+            # append dataframe to list of dataframe results:
+            data_list.append(df_pred_melt)
+
+# concatenate all decoding dataframes in one final dataframe:
+df_all = pd.concat(data_list, sort=False)
+# create file path to save the dataframe:
+file_path = opj(path_out_data, '{}_decoding.csv'.format(sub))
+# save the final dataframe to a .csv-file:
+df_all.to_csv(file_path, sep=',', index=False)
+
+'''
+========================================================================
+STOP LOGGING:
+========================================================================
+'''
+end = time.time()
+total_time = (end - start)/60
+logging.info('total running time: %0.2f minutes' % total_time)
+logging.shutdown()

+ 41 - 0
code/docs/highspeed-decoding-docs.Rmd

@@ -0,0 +1,41 @@
+---
+title: "Faster than thought: Detecting sub-second activation sequences with sequential fMRI pattern analysis"
+subtitle: "Short project title: highspeed"
+author:
+  - Lennart Wittkuhn^[Max Planck Institute for Human Development, wittkuhn@mpib-berlin.mpg.de]
+  - Nicolas W. Schuck^[Max Planck Institute for Human Development, schuck@mpib-berlin.mpg.de]
+date: "Last update: `r format(Sys.time(), '%d %B, %Y')`"
+output:
+  html_document:
+    toc: true
+    self_contained: true
+    toc_float: true
+    toc_depth: 3
+    number_sections: true
+    highlight: pygments
+    theme: cosmo
+    df_print: paged
+    fig_caption: true
+fig.align: "center"
+header-includes:
+  - \usepackage{fontspec}
+  - \setmainfont{AgfaRotisSansSerif}
+email: wittkuhn@mpib-berlin.mpg.de
+---
+
+```{r, libraries, echo=FALSE, message=FALSE, include=FALSE}
+if (!requireNamespace("pacman")) install.packages("pacman")
+packages_cran <- c("here")
+pacman::p_load(char = packages_cran)
+if (basename(here::here()) == "highspeed"){
+  path_root = here::here("highspeed-decoding")
+} else {
+  path_root = here::here()
+}
+```
+
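+The code chunks below embed the Python decoding script and the bash cluster script verbatim; they are displayed but not evaluated (`eval=FALSE`) when this document is rendered.
+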
+```{python, echo=TRUE, code=readLines(file.path(path_root, "code", "decoding", "highspeed_decoding.py")), eval=FALSE, python.reticulate=FALSE}
+```
+
+```{bash, echo=TRUE, code=readLines(file.path(path_root, "code", "decoding", "highspeed-decoding-cluster.sh")), eval=FALSE}
+```

+ 13 - 0
highspeed-decoding.Rproj

@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: No
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX