lnnrtwttkhn
/
highspeed-decoding


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883
							#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ======================================================================
# SCRIPT INFORMATION:
# ======================================================================
# SCRIPT: DECODING ANALYSIS
# PROJECT: HIGHSPEED
# WRITTEN BY LENNART WITTKUHN, 2018 - 2019
# CONTACT: WITTKUHN AT MPIB HYPHEN BERLIN DOT MPG DOT DE
# MAX PLANCK RESEARCH GROUP NEUROCODE
# MAX PLANCK INSTITUTE FOR HUMAN DEVELOPMENT
# MAX PLANCK UCL CENTRE FOR COMPUTATIONAL PSYCHIATRY AND AGEING RESEARCH
# LENTZEALLEE 94, 14195 BERLIN, GERMANY
'''
========================================================================
IMPORT RELEVANT PACKAGES:
========================================================================
'''
import glob
import os
import yaml
import logging
import time
from os.path import join as opj
import sys
import copy
from pprint import pformat
import numpy as np
from nilearn.input_data import NiftiMasker
from nilearn import plotting, image, masking
import pandas as pd
from matplotlib import pyplot as plt
import warnings
from nilearn.signal import clean
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from collections import Counter
import datalad.api as dl
'''
========================================================================
DEAL WITH ERRORS, WARNING ETC.
========================================================================
'''
warnings.filterwarnings('ignore', message='numpy.dtype size changed*')
warnings.filterwarnings('ignore', message='numpy.ufunc size changed*')
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
'''
========================================================================
SETUP NECESSARY PATHS ETC:
========================================================================
'''
# get start time of the script:
start = time.time()
# name of the current project:
project = 'highspeed'
# initialize empty variables:
sub = None
path_tardis = None
path_server = None
path_local = None
# define paths depending on the operating system (OS) platform:
if 'darwin' in sys.platform:
    # define the path to the cluster:
    path_tardis = opj(os.environ['HOME'], 'Volumes', 'tardis_beegfs', project)
    # define the path to the server:
    path_server = opj('/Volumes', 'MPRG-Neurocode', 'Data', project)
    # define the path to the local computer:
    path_local = opj(os.environ['HOME'], project, '*', '*')
    # define the subject id:
    sub = 'sub-14'
elif 'linux' in sys.platform:
    # path to the project root:
    project_name = 'highspeed-decoding'
    path_root = os.getcwd().split(project_name)[0] + project_name
    # define the path to the cluster:
    path_tardis = path_root
    # define the path to the server:
    path_server = path_tardis
    # define the path to the local computer:
    path_local = opj(path_tardis, 'code', 'decoding')
    # define the subject id:
    sub = 'sub-%s' % sys.argv[1]
'''
========================================================================
LOAD PROJECT PARAMETERS:
========================================================================
'''
path_params = glob.glob(opj(path_local, '*parameters.yaml'))[0]
with open(path_params, 'rb') as f:
    params = yaml.load(f, Loader=yaml.FullLoader)
f.close()
'''
========================================================================
CREATE PATHS TO OUTPUT DIRECTORIES:
========================================================================
'''
path_decoding = opj(path_tardis, 'decoding')
path_out = opj(path_decoding, sub)
path_out_figs = opj(path_out, 'plots')
path_out_data = opj(path_out, 'data')
path_out_logs = opj(path_out, 'logs')
path_out_masks = opj(path_out, 'masks')
'''
========================================================================
CREATE OUTPUT DIRECTORIES IF THE DO NOT EXIST YET:
========================================================================
'''
for path in [path_out_figs, path_out_data, path_out_logs, path_out_masks]:
    if not os.path.exists(path):
        os.makedirs(path)
'''
========================================================================
SETUP LOGGING:
========================================================================
'''
# remove all handlers associated with the root logger object:
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
# get current data and time as a string:
timestr = time.strftime("%Y-%m-%d-%H-%M-%S")
# create path for the logging file
log = opj(path_out_logs, '%s-%s.log' % (timestr, sub))
# start logging:
logging.basicConfig(
    filename=log, level=logging.DEBUG, format='%(asctime)s %(message)s',
    datefmt='%d/%m/%Y %H:%M:%S')
'''
========================================================================
DEFINE DECODING SPECIFIC PARAMETERS:
========================================================================
'''
# define the mask to be used:
mask = 'visual'  # visual or whole
# applied time-shift to account for the BOLD delay, in seconds:
bold_delay = 4  # 4, 5 or 6 secs
# define the degree of smoothing of the functional data
smooth = 4
'''
========================================================================
ADD BASIC SCRIPT INFORMATION TO THE LOGGER:
========================================================================
'''
logging.info('Running decoding script')
logging.info('operating system: %s' % sys.platform)
logging.info('project name: %s' % project)
logging.info('participant: %s' % sub)
logging.info('mask: %s' % mask)
logging.info('bold delay: %d secs' % bold_delay)
logging.info('smoothing kernel: %d mm' % smooth)
'''
========================================================================
DEFINE RELEVANT VARIABLES:
========================================================================
'''
# time of repetition (TR), in seconds:
t_tr = params['mri']['tr']
# number of volumes (TRs) for each functional task run:
n_tr_run = 530
# acquisition time window of one sequence trial, in seconds:
t_win = 16
# number of measurements that are considered per sequence time window:
n_tr_win = round(t_win / t_tr)
# number of oddball trials in the experiment:
n_tr_odd = 600
# number of sequence trials in the experiment:
n_tr_seq = 75
# number of repetition trials in the experiment:
n_tr_rep = 45
# number of scanner triggers before the experiment starts:
n_tr_wait = params['mri']['num_trigger']
# number of functional task runs in total:
n_run = params['mri']['num_runs']
# number of experimental sessions in total:
n_ses = params['mri']['num_sessions']
# number of functional task runs per session:
n_run_ses = int(n_run / n_ses)
'''
========================================================================
LOAD BEHAVIORAL DATA (THE EVENTS.TSV FILES OF THE SUBJECT):
========================================================================
Load the events files for the subject. The 'oddball' data serves as the
training set of the classifier. The 'sequence' data constitutes the
test set.
'''
# paths to all events files of the current subject:
path_events = opj(path_tardis, 'bids', sub, 'ses-*', 'func', '*tsv')
dl.get(glob.glob(path_events))
path_events = sorted(glob.glob(path_events), key=lambda f: os.path.basename(f))
logging.info('found %d event files' % len(path_events))
logging.info('paths to events files (sorted):\n%s' % pformat(path_events))
# import events file and save data in dataframe:
df_events = pd.concat((pd.read_csv(f, sep='\t') for f in path_events),
                      ignore_index=True)
'''
========================================================================
CREATE PATHS TO THE MRI DATA:
========================================================================
'''
# define path to input directories:
path_fmriprep = opj(path_tardis, 'fmriprep', 'fmriprep', sub)
path_level1 = opj(path_tardis, 'glm', 'l1pipeline')
path_masks = opj(path_tardis, 'masks', 'masks')

logging.info('path to fmriprep files: %s' % path_fmriprep)
logging.info('path to level 1 files: %s' % path_level1)
logging.info('path to mask files: %s' % path_masks)
paths = {
    'fmriprep': opj(path_tardis, 'fmriprep', 'fmriprep', sub),
    'level1': opj(path_tardis, 'glm', 'l1pipeline'),
    'masks': opj(path_tardis, 'masks', 'masks')
}

# load the visual mask task files:
path_mask_vis_task = opj(path_masks, 'mask_visual', sub, '*', '*task-highspeed*.nii.gz')
path_mask_vis_task = sorted(glob.glob(path_mask_vis_task), key=lambda f: os.path.basename(f))
logging.info('found %d visual mask task files' % len(path_mask_vis_task))
logging.info('paths to visual mask task files:\n%s' % pformat(path_mask_vis_task))
dl.get(glob.glob(path_mask_vis_task))

# load the hippocampus mask task files:
path_mask_hpc_task = opj(path_masks, 'mask_hippocampus', sub, '*', '*task-highspeed*.nii.gz')
path_mask_hpc_task = sorted(glob.glob(path_mask_hpc_task), key=lambda f: os.path.basename(f))
logging.info('found %d hpc mask files' % len(path_mask_hpc_task))
logging.info('paths to hpc mask task files:\n%s' % pformat(path_mask_hpc_task))
dl.get(glob.glob(path_mask_hpc_task))

# load the whole brain mask files:
path_mask_whole_task = opj(path_fmriprep, '*', 'func', '*task-highspeed*T1w*brain_mask.nii.gz')
path_mask_whole_task = sorted(glob.glob(path_mask_whole_task), key=lambda f: os.path.basename(f))
logging.info('found %d whole-brain masks' % len(path_mask_whole_task))
logging.info('paths to whole-brain mask files:\n%s' % pformat(path_mask_whole_task))
dl.get(glob.glob(path_mask_whole_task))

# load the functional mri task files:
path_func_task = opj(path_level1, 'smooth', sub, '*', '*task-highspeed*nii.gz')
path_func_task = sorted(glob.glob(path_func_task), key=lambda f: os.path.basename(f))
logging.info('found %d functional mri task files' % len(path_func_task))
logging.info('paths to functional mri task files:\n%s' % pformat(path_func_task))
dl.get(glob.glob(path_func_task))

# define path to the functional resting state runs:
path_rest = opj(path_tardis, 'masks', 'masks', 'smooth', sub, '*', '*task-rest*nii.gz')
path_rest = sorted(glob.glob(path_rest), key=lambda f: os.path.basename(f))
logging.info('found %d functional mri rest files' % len(path_rest))
logging.info('paths to functional mri rest files:\n%s' % pformat(path_rest))
dl.get(glob.glob(path_rest))

# load the anatomical mri file:
path_anat = opj(path_fmriprep, 'anat', '%s_desc-preproc_T1w.nii.gz' % sub)
path_anat = sorted(glob.glob(path_anat), key=lambda f: os.path.basename(f))
logging.info('found %d anatomical mri file' % len(path_anat))
logging.info('paths to anatoimical mri files:\n%s' % pformat(path_anat))
dl.get(glob.glob(path_anat))

# load the confounds files:
path_confs_task = opj(path_fmriprep, '*', 'func', '*task-highspeed*confounds_regressors.tsv')
path_confs_task = sorted(glob.glob(path_confs_task), key=lambda f: os.path.basename(f))
logging.info('found %d confounds files' % len(path_confs_task))
logging.info('found %d confounds files' % len(path_confs_task))
logging.info('paths to confounds files:\n%s' % pformat(path_confs_task))
dl.get(glob.glob(path_confs_task))

# load the spm.mat files:
path_spm_mat = opj(path_level1, 'contrasts', sub, '*', 'SPM.mat')
path_spm_mat = sorted(glob.glob(path_spm_mat), key=lambda f: os.path.dirname(f))
logging.info('found %d spm.mat files' % len(path_spm_mat))
logging.info('paths to spm.mat files:\n%s' % pformat(path_spm_mat))
dl.get(glob.glob(path_spm_mat))

# load the t-maps of the first-level glm:
path_tmap = opj(path_level1, 'contrasts', sub, '*', 'spmT*.nii')
path_tmap = sorted(glob.glob(path_tmap), key=lambda f: os.path.dirname(f))
logging.info('found %d t-maps' % len(path_tmap))
logging.info('paths to t-maps files:\n%s' % pformat(path_tmap))
dl.get(glob.glob(path_tmap))
'''
========================================================================
LOAD THE MRI DATA:
========================================================================
'''
anat = image.load_img(path_anat[0])
logging.info('successfully loaded %s' % path_anat[0])
# load visual mask:
mask_vis = image.load_img(path_mask_vis_task[0]).get_data().astype(int)
logging.info('successfully loaded one visual mask file!')
# load tmap data:
tmaps = [image.load_img(i).get_data().astype(float) for i in copy.deepcopy(path_tmap)]
logging.info('successfully loaded the tmaps!')
# load hippocampus mask:
mask_hpc = [image.load_img(i).get_data().astype(int) for i in copy.deepcopy(path_mask_hpc_task)]
logging.info('successfully loaded one visual mask file!')
'''
FEATURE SELECTION
'''
# plot raw-tmaps on an anatomical background:
logging.info('plotting raw tmaps with anatomical as background:')
for i, path in enumerate(path_tmap):
    logging.info('plotting raw tmap %s (%d of %d)' % (path, i+1, len(path_tmap)))
    path_save = opj(path_out_figs, '%s_run-%02d_tmap_raw.png' % (sub, i+1))
    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
                      output_file=path_save, colorbar=True)

'''
========================================================================
FEATURE SELECTION: MASKS THE T-MAPS WITH THE ANATOMICAL MASKS
We create a combination of the t-maps and the anatomical mask.
To this end, we multiply the anatomical mask with each t-map.
As the anatomical conists of binary values (zeros and ones) a
multiplication results in t-map masked by the anatomical ROI.
========================================================================
'''
#v= tmaps_masked[0]
#fig = plt.figure()
#ax = fig.add_subplot(111, projection='3d')
#ax.scatter(v[:,0],v[:,1],v[:,2], zdir='z', c= 'red')
#plt.show()

# check if any value in the supposedly binary mask is bigger than 1:
if np.any(mask_vis > 1):
    logging.info('WARNING: detected values > 1 in the anatomical ROI!')
    sys.exit("Values > 1 in the anatomical ROI!")
# get combination of anatomical mask and t-map
tmaps_masked = [np.multiply(mask_vis, i) for i in copy.deepcopy(tmaps)]
# masked tmap into image like object:
tmaps_masked_img = [image.new_img_like(i, j) for (i, j) in zip(path_tmap, copy.deepcopy(tmaps_masked))]

for i, path in enumerate(tmaps_masked_img):
    path_save = opj(path_out_masks, '%s_run-%02d_tmap_masked.nii.gz' % (sub, i + 1))
    path.to_filename(path_save)

# plot masked t-maps
logging.info('plotting masked tmaps with anatomical as background:')
for i, path in enumerate(tmaps_masked_img):
    logging.info('plotting masked tmap %d of %d' % (i+1, len(tmaps_masked_img)))
    path_save = opj(path_out_figs, '%s_run-%02d_tmap_masked.png' % (sub, i+1))
    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
                      output_file=path_save, colorbar=True)
'''
========================================================================
FEATURE SELECTION: THRESHOLD THE MASKED T-MAPS
We threshold the masked t-maps, selecting only voxels above AND
below this threshold. We then extract these data and count how
many voxels were above and / or below the threshold in total.
========================================================================
'''
# set the threshold:
threshold = params['mri']['thresh']
logging.info('thresholding t-maps with a threshold of %s' % str(threshold))
# threshold the masked tmap image:
tmaps_masked_thresh_img = [image.threshold_img(i, threshold) for i in copy.deepcopy(tmaps_masked_img)]

logging.info('plotting thresholded tmaps with anatomical as background:')
for i, path in enumerate(tmaps_masked_thresh_img):
    path_save = opj(path_out_figs, '%s_run-%02d_tmap_masked_thresh.png' % (sub, i+1))
    logging.info('plotting masked tmap %s (%d of %d)'
                 % (path_save, i + 1, len(tmaps_masked_thresh_img)))
    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
                      output_file=path_save, colorbar=True)

# extract data from the thresholded images
tmaps_masked_thresh = [image.load_img(i).get_data().astype(float) for i in tmaps_masked_thresh_img]

# calculate the number of tmap voxels:
num_tmap_voxel = [np.size(i) for i in copy.deepcopy(tmaps_masked_thresh)]
logging.info('number of feature selected voxels: %s' % pformat(num_tmap_voxel))

num_above_voxel = [np.count_nonzero(i) for i in copy.deepcopy(tmaps_masked_thresh)]
logging.info('number of voxels above threshold: %s' % pformat(num_above_voxel))

num_below_voxel = [np.count_nonzero(i == 0) for i in copy.deepcopy(tmaps_masked_thresh)]
logging.info('number of voxels below threshold: %s' % pformat(num_below_voxel))

# plot the distribution of t-values:
for i, run_mask in enumerate(tmaps_masked_thresh):
    masked_tmap_flat = run_mask.flatten()
    masked_tmap_flat = masked_tmap_flat[~np.isnan(masked_tmap_flat)]
    masked_tmap_flat = masked_tmap_flat[~np.isnan(masked_tmap_flat) & ~(masked_tmap_flat == 0)]
    path_save = opj(path_out_figs, '%s_run-%02d_tvalue_distribution.png' % (sub, i+1))
    logging.info('plotting thresholded t-value distribution %s (%d of %d)'
                 % (path_save, i+1, len(tmaps_masked_thresh)))
    fig = plt.figure()
    plt.hist(masked_tmap_flat, bins='auto')
    plt.xlabel('t-values')
    plt.ylabel('number')
    plt.title('t-value distribution (%s, run-%02d)' % (sub, i+1))
    plt.savefig(path_save)

# create a dataframe with the number of voxels
df_thresh = pd.DataFrame({
    'id': [sub] * n_run,
    'run': np.arange(1,n_run+1),
    'n_total': num_tmap_voxel,
    'n_above': num_above_voxel,
    'n_below': num_below_voxel
})
file_name = opj(path_out_data, '%s_thresholding.csv' % (sub))
df_thresh.to_csv(file_name, sep=',', index=False)

'''
========================================================================
FEATURE SELECTION: BINARIZE THE THRESHOLDED MASKED T-MAPS
We set all voxels above and below the threshold to 1 and all voxels
that were not selected to 0.
========================================================================
'''
# replace all NaNs with 0:
tmaps_masked_thresh_bin = [np.where(np.isnan(i), 0, i) for i in copy.deepcopy(tmaps_masked_thresh)]
# replace all other values with 1:
tmaps_masked_thresh_bin = [np.where(i > 0, 1, i) for i in copy.deepcopy(tmaps_masked_thresh_bin)]
# turn the 3D-array into booleans:
tmaps_masked_thresh_bin = [i.astype(bool) for i in copy.deepcopy(tmaps_masked_thresh_bin)]
# create image like object:
masks_final = [image.new_img_like(path_func_task[0], i.astype(np.int)) for i in copy.deepcopy(tmaps_masked_thresh_bin)]

logging.info('plotting final masks with anatomical as background:')
for i, path in enumerate(masks_final):
    filename = '%s_run-%02d_tmap_masked_thresh.nii.gz'\
               % (sub, i + 1)
    path_save = opj(path_out_masks, filename)
    logging.info('saving final mask %s (%d of %d)'
                 % (path_save, i+1, len(masks_final)))
    path.to_filename(path_save)
    path_save = opj(path_out_figs, '%s_run-%02d_visual_final_mask.png'
                    % (sub, i + 1))
    logging.info('plotting final mask %s (%d of %d)'
                 % (path_save, i + 1, len(masks_final)))
    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
                      output_file=path_save, colorbar=True)
'''
========================================================================
LOAD SMOOTHED FMRI DATA FOR ALL FUNCTIONAL TASK RUNS:
========================================================================
'''
# load smoothed functional mri data for all eight task runs:
logging.info('loading %d functional task runs ...' % len(path_func_task))
data_task = [image.load_img(i) for i in path_func_task]
logging.info('loading successful!')
'''
========================================================================
DEFINE THE CLASSIFIERS
========================================================================
'''
class_labels = ['cat', 'chair', 'face', 'house', 'shoe']
# create a dictionary with all values as independent instances:
# see here: https://bit.ly/2J1DvZm
clf_set = {key: LogisticRegression(
    C=1., penalty='l2', multi_class='ovr', solver='lbfgs',
    max_iter=4000, class_weight='balanced', random_state=42) for key in class_labels}
classifiers = {
    'log_reg': LogisticRegression(
        C=1., penalty='l2', multi_class='multinomial', solver='lbfgs',
        max_iter=4000, class_weight='balanced', random_state=42)}
clf_set.update(classifiers)
'''
========================================================================
1. SPLIT THE EVENTS DATAFRAME FOR EACH TASK CONDITION
2. RESET THE INDICES OF THE DATAFRAMES
3. SORT THE ROWS OF ALL DATAFRAMES IN CHRONOLOGICAL ORDER
4. PRINT THE NUMBER OF TRIALS OF EACH TASK CONDITION
========================================================================
'''


class TaskData:
    """

    """
    def __init__(
            self, events, condition, name, trial_type, bold_delay=0,
            interval=1, t_tr=1.25, num_vol_run=530):
        import pandas as pd
        # define name of the task data subset:
        self.name = name
        # define the task condition the task data subset if from:
        self.condition = condition
        # define the delay (in seconds) by which onsets are moved:
        self.bold_delay = bold_delay
        # define the number of TRs from event onset that should be selected:
        self.interval = interval
        # define the repetition time (TR) of the mri data acquisition:
        self.t_tr = t_tr
        # define the number of volumes per task run:
        self.num_vol_run = num_vol_run
        # select events: upright stimulus, correct answer trials only:
        if trial_type == 'stimulus':
            self.events = events.loc[
                          (events['condition'] == condition) &
                          (events['trial_type'] == trial_type) &
                          (events['stim_orient'] == 0) &
                          (events['serial_position'] == 1) &
                          (events['accuracy'] != 0),
                          :]
        elif trial_type == 'cue':
            self.events = events.loc[
                          (events['condition'] == condition) &
                          (events['trial_type'] == trial_type),
                          :]
        # reset the indices of the data frame:
        self.events.reset_index()
        # sort all values by session and run:
        self.events.sort_values(by=['session', 'run_session'])
        # call further function upon initialization:
        self.define_trs()
        self.get_stats()

    def define_trs(self):
        # import relevant functions:
        import numpy as np
        # select all events onsets:
        self.event_onsets = self.events['onset']
        # add the selected delay to account for the peak of the hrf
        self.bold_peaks = self.events['onset'] + self.bold_delay
        # divide the expected time-point of bold peaks by the repetition time:
        self.peak_trs = self.bold_peaks / self.t_tr
        # add the number of run volumes to the tr indices:
        run_volumes = (self.events['run_study']-1) * self.num_vol_run
        # add the number of volumes of each run:
        trs = round(self.peak_trs + run_volumes)
        # copy the relevant trs as often as specified by the interval:
        a = np.transpose(np.tile(trs, (self.interval, 1)))
        # create same-sized matrix with trs to be added:
        b = np.full((len(trs), self.interval), np.arange(self.interval))
        # assign the final list of trs:
        self.trs = np.array(np.add(a, b).flatten(), dtype=int)
        # save the TRs of the stimulus presentations
        self.stim_trs = round(self.event_onsets / self.t_tr + run_volumes)

    def get_stats(self):
        import numpy as np
        self.num_trials = len(self.events)
        self.runs = np.repeat(np.array(self.events['run_study'], dtype=int), self.interval)
        self.trials = np.repeat(np.array(self.events['trial'], dtype=int), self.interval)
        self.sess = np.repeat(np.array(self.events['session'], dtype=int), self.interval)
        self.stim = np.repeat(np.array(self.events['stim_label'], dtype=object), self.interval)
        self.itis = np.repeat(np.array(self.events['interval_time'], dtype=float), self.interval)
        self.stim_trs = np.repeat(np.array(self.stim_trs, dtype=int), self.interval)

    def zscore(self, signals, run_list, t_tr=1.25):
        from nilearn.signal import clean
        import numpy as np
        # get boolean indices for all run indices in the run list:
        run_indices = np.isin(self.runs, list(run_list))
        # standardize data all runs in the run list:
        self.data_zscored = clean(
            signals=signals[self.trs[run_indices]],
            sessions=self.runs[run_indices],
            t_r=t_tr,
            detrend=False,
            standardize=True)

    def predict(self, clf, run_list):
        # import packages:
        import pandas as pd
        # get classifier class predictions:
        pred_class = clf.predict(self.data_zscored)
        # get classifier probabilistic predictions:
        pred_proba = clf.predict_proba(self.data_zscored)
        # get the classes of the classifier:
        classes_names = clf.classes_
        # get boolean indices for all run indices in the run list:
        run_indices = np.isin(self.runs, list(run_list))
        # create a dataframe with the probabilities of each class:
        df = pd.DataFrame(pred_proba, columns=classes_names)
        # get the number of predictions made:
        num_pred = len(df)
        # get the number of trials in the test set:
        num_trials = int(num_pred / self.interval)
        # add the predicted class label to the dataframe:
        df['pred_label'] = pred_class
        # add the true stimulus label to the dataframe:
        df['stim'] = self.stim[run_indices]
        # add the volume number (TR) to the dataframe:
        df['tr'] = self.trs[run_indices]
        # add the sequential TR to the dataframe:
        df['seq_tr'] = np.tile(np.arange(1, self.interval + 1), num_trials)
        # add the counter of trs on which the stimulus was presented
        df['stim_tr'] = self.stim_trs[run_indices]
        # add the trial number to the dataframe:
        df['trial'] = self.trials[run_indices]
        # add the run number to the dataframe:
        df['run_study'] = self.runs[run_indices]
        # add the session number to the dataframe:
        df['session'] = self.sess[run_indices]
        # add the inter trial interval to the dataframe:
        df['tITI'] = self.itis[run_indices]
        # add the participant id to the dataframe:
        df['id'] = np.repeat(self.events['subject'].unique(), num_pred)
        # add the name of the classifier to the dataframe:
        df['test_set'] = np.repeat(self.name, num_pred)
        # return the dataframe:
        return df


train_odd_peak = TaskData(
        events=df_events, condition='oddball', trial_type='stimulus',
        bold_delay=4, interval=1, name='train-odd_peak')
test_odd_peak = TaskData(
        events=df_events, condition='oddball', trial_type='stimulus',
        bold_delay=4, interval=1, name='test-odd_peak')
test_odd_long = TaskData(
        events=df_events, condition='oddball', trial_type='stimulus',
        bold_delay=0, interval=7, name='test-odd_long')
test_seq_long = TaskData(
        events=df_events, condition='sequence', trial_type='stimulus',
        bold_delay=0, interval=13, name='test-seq_long')
test_rep_long = TaskData(
        events=df_events, condition='repetition', trial_type='stimulus',
        bold_delay=0, interval=13, name='test-rep_long')
test_seq_cue = TaskData(
        events=df_events, condition='sequence', trial_type='cue',
        bold_delay=0, interval=5, name='test-seq_cue')
test_rep_cue = TaskData(
        events=df_events, condition='repetition', trial_type='cue',
        bold_delay=0, interval=5, name='test-rep_cue')
test_sets = [
        test_odd_peak, test_odd_long, test_seq_long, test_rep_long,
        test_seq_cue, test_rep_cue
        ]
'''
========================================================================
SEPARATE RUN-WISE STANDARDIZATION (Z-SCORING) OF ALL TASK CONDITIONS
Standardize features by removing the mean and scaling to unit variance.
Centering and scaling happen independently on each feature by computing
the relevant statistics on the samples in the training set. Here,
we standardize the features of all trials run-wise but separated for
each task condition (oddball, sequence, and repetition condition).
========================================================================
'''


def detrend(data, t_tr=1.25):
    from nilearn.signal import clean
    data_detrend = clean(
        signals=data, t_r=t_tr, detrend=True, standardize=False)
    return data_detrend


def show_weights(array):
    # https://stackoverflow.com/a/50154388
    import numpy as np
    import seaborn as sns
    n_samples = array.shape[0]
    classes, bins = np.unique(array, return_counts=True)
    n_classes = len(classes)
    weights = n_samples / (n_classes * bins)
    sns.barplot(classes, weights)
    plt.xlabel('class label')
    plt.ylabel('weight')
    plt.show()


def melt_df(df, melt_columns):
    # save the column names of the dataframe in a list:
    column_names = df.columns.tolist()
    # remove the stimulus classes from the column names;
    id_vars = [x for x in column_names if x not in melt_columns]
    # melt the dataframe creating one column with value_name and var_name:
    df_melt = pd.melt(
            df, value_name='probability', var_name='class', id_vars=id_vars)
    # return the melted dataframe:
    return df_melt


data_list = []
runs = list(range(1, n_run+1))
#mask_label = 'cv'

logging.info('starting leave-one-run-out cross-validation')
for mask_label in ['cv', 'cv_hpc']:
    logging.info('testing in mask %s' % (mask_label))
    for run in runs:
        logging.info('testing on run %d of %d ...' % (run, len(runs)))
        # define the run indices for the training and test set:
        train_runs = [x for x in runs if x != run]
        test_runs = [x for x in runs if x == run]
        # get the feature selection mask of the current run:
        if mask_label == 'cv':
            mask_run = masks_final[runs.index(run)]
        elif mask_label == 'cv_hpc':
            mask_run = path_mask_hpc_task[runs.index(run)]
        # extract smoothed fMRI data from the mask for the cross-validation fold:
        masked_data = [masking.apply_mask(i, mask_run) for i in data_task]
        # detrend the masked fMRI data separately for each run:
        data_detrend = [detrend(i) for i in masked_data]
        # combine the detrended data of all runs:
        data_detrend = np.vstack(data_detrend)
        # loop through all classifiers in the classifier set:
        for clf_name, clf in clf_set.items():
            # print classifier:
            logging.info('classifier: %s' % clf_name)
            # fit the classifier to the training data:
            train_odd_peak.zscore(signals=data_detrend, run_list=train_runs)
            # get the example labels:
            train_stim = copy.deepcopy(train_odd_peak.stim[train_odd_peak.runs != run])
            # replace labels for single-label classifiers:
            if clf_name in class_labels:
                # replace all other labels with other
                train_stim = ['other' if x != clf_name else x for x in train_stim]
                # turn into a numpy array
                train_stim = np.array(train_stim, dtype=object)
            # check weights:
            #show_weights(array=train_stim)
            # train the classifier
            clf.fit(train_odd_peak.data_zscored, train_stim)
            # classifier prediction: predict on test data and save the data:
            for test_set in test_sets:
                logging.info('testing on test set %s' % test_set.name)
                test_set.zscore(signals=data_detrend, run_list=test_runs)
                if test_set.data_zscored.size < 0:
                    continue
                # create dataframe containing classifier predictions:
                df_pred = test_set.predict(clf=clf, run_list=test_runs)
                # add the current classifier as a new column:
                df_pred['classifier'] = np.repeat(clf_name, len(df_pred))
                # add a label that indicates the mask / training regime:
                df_pred['mask'] = np.repeat(mask_label, len(df_pred))
                # melt the data frame:
                df_pred_melt = melt_df(df=df_pred, melt_columns=train_stim)
                # append dataframe to list of dataframe results:
                data_list.append(df_pred_melt)
'''
========================================================================
DECODING ON RESTING STATE DATA:
========================================================================
'''
logging.info('Loading fMRI data of %d resting state runs ...' % len(path_rest))
data_rest = [image.load_img(i) for i in path_rest]
logging.info('loading successful!')

# combine all masks from the feature selection by intersection:
def multimultiply(arrays):
    import copy
    # start with the first array:
    array_union = copy.deepcopy(arrays[0].astype(np.int))
    # loop through all arrays
    for i in range(len(arrays)):
        # multiply every array with all previous array
        array_union = np.multiply(array_union, copy.deepcopy(arrays[i].astype(np.int)))
    # return the union of all arrays:
    return(array_union)


for mask_label in ['union', 'union_hpc']:
    # calculate the union of all masks by multiplication:
    if mask_label == 'union':
        masks_union = multimultiply(tmaps_masked_thresh_bin).astype(int).astype(bool)
    elif mask_label == 'union_hpc':
        masks_union = multimultiply(mask_hpc).astype(int).astype(bool)
    # masked tmap into image like object:
    masks_union_nii = image.new_img_like(path_func_task[0], masks_union)
    path_save = opj(path_out_masks, '{}_task-rest_mask-{}.nii.gz'.format(sub, mask_label))
    masks_union_nii.to_filename(path_save)
    # mask all resting state runs with the averaged feature selection masks:
    data_rest_masked = [masking.apply_mask(i, masks_union_nii) for i in data_rest]
    # detrend and standardize each resting state run separately:
    data_rest_final = [clean(i, detrend=True, standardize=True) for i in data_rest_masked]
    # mask all functional task runs separately:
    data_task_masked = [masking.apply_mask(i, masks_union_nii) for i in data_task]
    # detrend each task run separately:
    data_task_masked_detrend = [clean(i, detrend=True, standardize=False) for i in data_task_masked]
    # combine the detrended data of all runs:
    data_task_masked_detrend = np.vstack(data_task_masked_detrend)
    # select only oddball data and standardize:
    train_odd_peak.zscore(signals=data_task_masked_detrend, run_list=runs)
    # write session and run labels:
    ses_labels = [i.split(sub + "_")[1].split("_task")[0] for i in path_rest]
    run_labels = [i.split("prenorm_")[1].split("_space")[0] for i in path_rest]
    file_names = ['_'.join([a, b]) for (a, b) in zip(ses_labels, run_labels)]
    rest_interval = 1
    # save the voxel patterns:
    num_voxels = len(train_odd_peak.data_zscored[0])
    voxel_labels = ['v' + str(x) for x in range(num_voxels)]
    df_patterns = pd.DataFrame(train_odd_peak.data_zscored, columns=voxel_labels)
    # add the stimulus labels to the dataframe:
    df_patterns['label'] = copy.deepcopy(train_odd_peak.stim)
    # add the participant id to the dataframe:
    df_patterns['id'] = np.repeat(df_events['subject'].unique(), len(train_odd_peak.stim))
    # add the mask label:
    df_patterns['mask'] = np.repeat(mask_label, len(train_odd_peak.stim))
    # split pattern dataframe by label:
    df_pattern_list = [df_patterns[df_patterns['label'] == i] for i in df_patterns['label'].unique()]
    # create file path to save the dataframe:
    file_paths = [opj(path_out_data, '{}_voxel_patterns_{}_{}'.format(sub, mask_label, i)) for i in df_patterns['label'].unique()]
    # save the final dataframe to a .csv-file:
    [i.to_csv(j + '.csv', sep=',', index=False) for (i, j) in zip(df_pattern_list, file_paths)]
    # save only the voxel patterns as niimg-like objects
    [masking.unmask(X=i.loc[:, voxel_labels].to_numpy(), mask_img=masks_union_nii).to_filename(j + '.nii.gz') for (i, j) in zip(df_pattern_list, file_paths)]
    #[image.new_img_like(path_func_task[0], i.loc[:, voxel_labels].to_numpy()).to_filename(j + '.nii.gz') for (i, j) in zip(df_pattern_list, file_paths)]
    # decoding on resting state data:
    for clf_name, clf in clf_set.items():
        # print classifier name:
        logging.info('classifier: %s' % clf_name)
        # get the example labels for all slow trials:
        train_stim = copy.deepcopy(train_odd_peak.stim)
        # replace labels for single-label classifiers:
        if clf_name in class_labels:
            # replace all other labels with other
            train_stim = ['other' if x != clf_name else x for x in train_stim]
            # turn into a numpy array
            train_stim = np.array(train_stim, dtype=object)
        # train the classifier
        clf.fit(train_odd_peak.data_zscored, train_stim)
        # classifier prediction: predict on test data and save the data:
        pred_rest_class = [clf.predict(i) for i in data_rest_final]
        pred_rest_proba = [clf.predict_proba(i) for i in data_rest_final]
        # get the class names of the classifier:
        classes_names = clf.classes_
        # save classifier predictions on resting state scans
        for t, name in enumerate(pred_rest_proba):
            # create a dataframe with the probabilities of each class:
            df_rest_pred = pd.DataFrame(
                    pred_rest_proba[t], columns=classes_names)
            # get the number of predictions made:
            num_pred = len(df_rest_pred)
            # get the number of trials in the test set:
            num_trials = int(num_pred / rest_interval)
            # add the predicted class label to the dataframe:
            df_rest_pred['pred_label'] = pred_rest_class[t]
            # add the true stimulus label to the dataframe:
            df_rest_pred['stim'] = np.full(num_pred, np.nan)
            # add the volume number (TR) to the dataframe:
            df_rest_pred['tr'] = np.arange(1, num_pred + 1)
            # add the sequential TR to the dataframe:
            df_rest_pred['seq_tr'] = np.arange(1, num_pred + 1)
            # add the trial number to the dataframe:
            df_rest_pred['trial'] = np.tile(
                    np.arange(1, rest_interval + 1), num_trials)
            # add the run number to the dataframe:
            df_rest_pred['run_study'] = np.repeat(run_labels[t], num_pred)
            # add the session number to the dataframe:
            df_rest_pred['session'] = np.repeat(ses_labels[t], len(df_rest_pred))
            # add the inter trial interval to the dataframe:
            df_rest_pred['tITI'] = np.tile('rest', num_pred)
            # add the participant id to the dataframe:
            df_rest_pred['id'] = np.repeat(df_events['subject'].unique(), num_pred)
            # add the name of the classifier to the dataframe:
            df_rest_pred['test_set'] = np.repeat('rest', num_pred)
            # add the name of the classifier to the dataframe;
            df_rest_pred['classifier'] = np.repeat(clf_name, num_pred)
            # add a label that indicates the mask / training regime:
            df_rest_pred['mask'] = np.repeat(mask_label, len(df_rest_pred))
            # melt the data frame:
            df_pred_melt = melt_df(df=df_rest_pred, melt_columns=train_stim)
            # append dataframe to list of dataframe results:
            data_list.append(df_pred_melt)
        # run classifier trained on all runs across all test sets:
        for test_set in test_sets:
            logging.info('testing on test set %s' % test_set.name)
            test_set.zscore(signals=data_task_masked_detrend, run_list=runs)
            if test_set.data_zscored.size < 0:
                continue
            # create dataframe containing classifier predictions:
            df_pred = test_set.predict(clf=clf, run_list=runs)
            # add the current classifier as a new column:
            df_pred['classifier'] = np.repeat(clf_name, len(df_pred))
            # add a label that indicates the mask / training regime:
            df_pred['mask'] = np.repeat(mask_label, len(df_pred))
            # melt the data frame:
            df_pred_melt = melt_df(df=df_pred, melt_columns=train_stim)
            # append dataframe to list of dataframe results:
            data_list.append(df_pred_melt)

# concatenate all decoding dataframes in one final dataframe:
df_all = pd.concat(data_list, sort=False)
# create file path to save the dataframe:
file_path = opj(path_out_data, '{}_decoding.csv'.format(sub))
# save the final dataframe to a .csv-file:
df_all.to_csv(file_path, sep=',', index=False)

'''
========================================================================
STOP LOGGING:
========================================================================
'''
end = time.time()
total_time = (end - start)/60
logging.info('total running time: %0.2f minutes' % total_time)
logging.shutdown()