#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ======================================================================
# SCRIPT INFORMATION:
# ======================================================================
# SCRIPT: DECODING ANALYSIS
# PROJECT: HIGHSPEED
# WRITTEN BY LENNART WITTKUHN, 2018 - 2019
# CONTACT: WITTKUHN AT MPIB HYPHEN BERLIN DOT MPG DOT DE
# MAX PLANCK RESEARCH GROUP NEUROCODE
# MAX PLANCK INSTITUTE FOR HUMAN DEVELOPMENT
# MAX PLANCK UCL CENTRE FOR COMPUTATIONAL PSYCHIATRY AND AGEING RESEARCH
# LENTZEALLEE 94, 14195 BERLIN, GERMANY
'''
========================================================================
IMPORT RELEVANT PACKAGES:
========================================================================
'''
import glob
import os
import yaml
import logging
import time
from os.path import join as opj
import sys
import copy
from pprint import pformat
import numpy as np
from nilearn.input_data import NiftiMasker
from nilearn import plotting, image, masking
import pandas as pd
from matplotlib import pyplot as plt
import warnings
from nilearn.signal import clean
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from collections import Counter
import datalad.api as dl
'''
========================================================================
DEAL WITH ERRORS, WARNINGS ETC.
========================================================================
'''
warnings.filterwarnings('ignore', message='numpy.dtype size changed*')
warnings.filterwarnings('ignore', message='numpy.ufunc size changed*')
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
'''
========================================================================
SETUP NECESSARY PATHS ETC.:
========================================================================
'''
# get start time of the script:
start = time.time()
# name of the current project:
project = 'highspeed'
# initialize empty variables:
sub = None
path_tardis = None
path_server = None
path_local = None
# define paths depending on the operating system (OS) platform:
if 'darwin' in sys.platform:
    # define the path to the cluster:
    path_tardis = opj(os.environ['HOME'], 'Volumes', 'tardis_beegfs', project)
    # define the path to the server:
    path_server = opj('/Volumes', 'MPRG-Neurocode', 'Data', project)
    # define the path to the local computer:
    path_local = opj(os.environ['HOME'], project, '*', '*')
    # define the subject id:
    sub = 'sub-14'
elif 'linux' in sys.platform:
    # path to the project root:
    project_name = 'highspeed-decoding'
    path_root = os.getenv('PWD').split(project_name)[0] + project_name
    # define the path to the cluster:
    path_tardis = path_root
    # define the path to the server:
    path_server = path_tardis
    # define the path to the local computer:
    path_local = opj(path_tardis, 'code', 'decoding')
    # define the subject id (passed as the first command line argument):
    sub = 'sub-%s' % sys.argv[1]
'''
========================================================================
LOAD PROJECT PARAMETERS:
========================================================================
'''
path_params = glob.glob(opj(path_local, '*parameters.yaml'))[0]
with open(path_params, 'rb') as f:
    params = yaml.load(f, Loader=yaml.FullLoader)
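# note: the script expects (at least) the following keys in the parameters
# file, all of which are accessed below: params['mri']['tr'],
# ['num_trigger'], ['num_runs'], ['num_sessions'] and ['thresh'].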
'''
========================================================================
CREATE PATHS TO OUTPUT DIRECTORIES:
========================================================================
'''
path_decoding = opj(path_tardis, 'decoding')
path_out = opj(path_decoding, sub)
path_out_figs = opj(path_out, 'plots')
path_out_data = opj(path_out, 'data')
path_out_logs = opj(path_out, 'logs')
path_out_masks = opj(path_out, 'masks')
'''
========================================================================
CREATE OUTPUT DIRECTORIES IF THEY DO NOT EXIST YET:
========================================================================
'''
for path in [path_out_figs, path_out_data, path_out_logs, path_out_masks]:
    if not os.path.exists(path):
        os.makedirs(path)
'''
========================================================================
SETUP LOGGING:
========================================================================
'''
# remove all handlers associated with the root logger object:
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
# get current date and time as a string:
timestr = time.strftime("%Y-%m-%d-%H-%M-%S")
# create path for the logging file:
log = opj(path_out_logs, '%s-%s.log' % (timestr, sub))
# start logging:
logging.basicConfig(
    filename=log, level=logging.DEBUG, format='%(asctime)s %(message)s',
    datefmt='%d/%m/%Y %H:%M:%S')
'''
========================================================================
DEFINE DECODING-SPECIFIC PARAMETERS:
========================================================================
'''
# define the mask to be used:
mask = 'visual'  # visual or whole
# applied time-shift to account for the BOLD delay, in seconds:
bold_delay = 4  # 4, 5 or 6 secs
# define the degree of smoothing of the functional data, in mm:
smooth = 4
'''
========================================================================
ADD BASIC SCRIPT INFORMATION TO THE LOGGER:
========================================================================
'''
logging.info('Running decoding script')
logging.info('operating system: %s' % sys.platform)
logging.info('project name: %s' % project)
logging.info('participant: %s' % sub)
logging.info('mask: %s' % mask)
logging.info('bold delay: %d secs' % bold_delay)
logging.info('smoothing kernel: %d mm' % smooth)
'''
========================================================================
DEFINE RELEVANT VARIABLES:
========================================================================
'''
# time of repetition (TR), in seconds:
t_tr = params['mri']['tr']
# number of volumes (TRs) for each functional task run:
n_tr_run = 530
# acquisition time window of one sequence trial, in seconds:
t_win = 16
# number of measurements that are considered per sequence time window:
n_tr_win = round(t_win / t_tr)
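# worked example (assuming the default TR of 1.25 s used further below):
# n_tr_win = round(16 / 1.25) = round(12.8) = 13 volumes per sequence trial.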
# number of oddball trials in the experiment:
n_tr_odd = 600
# number of sequence trials in the experiment:
n_tr_seq = 75
# number of repetition trials in the experiment:
n_tr_rep = 45
# number of scanner triggers before the experiment starts:
n_tr_wait = params['mri']['num_trigger']
# number of functional task runs in total:
n_run = params['mri']['num_runs']
# number of experimental sessions in total:
n_ses = params['mri']['num_sessions']
# number of functional task runs per session:
n_run_ses = int(n_run / n_ses)
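# e.g., assuming the eight task runs mentioned below were split over two
# sessions, this gives n_run_ses = int(8 / 2) = 4 runs per session.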
'''
========================================================================
LOAD BEHAVIORAL DATA (THE EVENTS.TSV FILES OF THE SUBJECT):
========================================================================
Load the events files for the subject. The 'oddball' data serves as the
training set of the classifier. The 'sequence' data constitutes the
test set.
'''
# paths to all events files of the current subject:
path_events = opj(path_tardis, 'bids', sub, 'ses-*', 'func', '*tsv')
dl.get(glob.glob(path_events))
path_events = sorted(glob.glob(path_events), key=lambda f: os.path.basename(f))
logging.info('found %d event files' % len(path_events))
logging.info('paths to events files (sorted):\n%s' % pformat(path_events))
# import the events files and save their data in one dataframe:
df_events = pd.concat((pd.read_csv(f, sep='\t') for f in path_events),
                      ignore_index=True)
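# note: the columns of the events files that are accessed below are
# 'onset', 'condition', 'trial_type', 'stim_orient', 'serial_position',
# 'accuracy', 'session', 'run_session', 'run_study', 'trial', 'stim_label',
# 'interval_time' and 'subject'.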
'''
========================================================================
CREATE PATHS TO THE MRI DATA:
========================================================================
'''
# define paths to the input directories:
path_fmriprep = opj(path_tardis, 'fmriprep', 'fmriprep', sub)
path_level1 = opj(path_tardis, 'glm', 'l1pipeline')
path_masks = opj(path_tardis, 'masks', 'masks')
logging.info('path to fmriprep files: %s' % path_fmriprep)
logging.info('path to level 1 files: %s' % path_level1)
logging.info('path to mask files: %s' % path_masks)
paths = {
    'fmriprep': opj(path_tardis, 'fmriprep', 'fmriprep', sub),
    'level1': opj(path_tardis, 'glm', 'l1pipeline'),
    'masks': opj(path_tardis, 'masks', 'masks')
}
# load the visual mask task files:
path_mask_vis_task = opj(path_masks, 'mask_visual', sub, '*', '*task-highspeed*.nii.gz')
path_mask_vis_task = sorted(glob.glob(path_mask_vis_task), key=lambda f: os.path.basename(f))
logging.info('found %d visual mask task files' % len(path_mask_vis_task))
logging.info('paths to visual mask task files:\n%s' % pformat(path_mask_vis_task))
dl.get(path_mask_vis_task)
# load the hippocampus mask task files:
path_mask_hpc_task = opj(path_masks, 'mask_hippocampus', sub, '*', '*task-highspeed*.nii.gz')
path_mask_hpc_task = sorted(glob.glob(path_mask_hpc_task), key=lambda f: os.path.basename(f))
logging.info('found %d hpc mask files' % len(path_mask_hpc_task))
logging.info('paths to hpc mask task files:\n%s' % pformat(path_mask_hpc_task))
dl.get(path_mask_hpc_task)
# load the whole brain mask files:
path_mask_whole_task = opj(path_fmriprep, '*', 'func', '*task-highspeed*T1w*brain_mask.nii.gz')
path_mask_whole_task = sorted(glob.glob(path_mask_whole_task), key=lambda f: os.path.basename(f))
logging.info('found %d whole-brain masks' % len(path_mask_whole_task))
logging.info('paths to whole-brain mask files:\n%s' % pformat(path_mask_whole_task))
dl.get(path_mask_whole_task)
# load the functional mri task files:
path_func_task = opj(path_level1, 'smooth', sub, '*', '*task-highspeed*nii.gz')
path_func_task = sorted(glob.glob(path_func_task), key=lambda f: os.path.basename(f))
logging.info('found %d functional mri task files' % len(path_func_task))
logging.info('paths to functional mri task files:\n%s' % pformat(path_func_task))
dl.get(path_func_task)
# define paths to the functional resting state runs:
path_rest = opj(path_tardis, 'masks', 'masks', 'smooth', sub, '*', '*task-rest*nii.gz')
path_rest = sorted(glob.glob(path_rest), key=lambda f: os.path.basename(f))
logging.info('found %d functional mri rest files' % len(path_rest))
logging.info('paths to functional mri rest files:\n%s' % pformat(path_rest))
dl.get(path_rest)
# load the anatomical mri file:
path_anat = opj(path_fmriprep, 'anat', '%s_desc-preproc_T1w.nii.gz' % sub)
path_anat = sorted(glob.glob(path_anat), key=lambda f: os.path.basename(f))
logging.info('found %d anatomical mri file' % len(path_anat))
logging.info('paths to anatomical mri files:\n%s' % pformat(path_anat))
dl.get(path_anat)
# load the confounds files:
path_confs_task = opj(path_fmriprep, '*', 'func', '*task-highspeed*confounds_regressors.tsv')
path_confs_task = sorted(glob.glob(path_confs_task), key=lambda f: os.path.basename(f))
logging.info('found %d confounds files' % len(path_confs_task))
logging.info('paths to confounds files:\n%s' % pformat(path_confs_task))
dl.get(path_confs_task)
# load the spm.mat files:
path_spm_mat = opj(path_level1, 'contrasts', sub, '*', 'SPM.mat')
path_spm_mat = sorted(glob.glob(path_spm_mat), key=lambda f: os.path.dirname(f))
logging.info('found %d spm.mat files' % len(path_spm_mat))
logging.info('paths to spm.mat files:\n%s' % pformat(path_spm_mat))
dl.get(path_spm_mat)
# load the t-maps of the first-level glm:
path_tmap = opj(path_level1, 'contrasts', sub, '*', 'spmT*.nii')
path_tmap = sorted(glob.glob(path_tmap), key=lambda f: os.path.dirname(f))
logging.info('found %d t-maps' % len(path_tmap))
logging.info('paths to t-map files:\n%s' % pformat(path_tmap))
dl.get(path_tmap)
'''
========================================================================
LOAD THE MRI DATA:
========================================================================
'''
anat = image.load_img(path_anat[0])
logging.info('successfully loaded %s' % path_anat[0])
# load the visual mask:
mask_vis = image.load_img(path_mask_vis_task[0]).get_data().astype(int)
logging.info('successfully loaded one visual mask file!')
# load the tmap data:
tmaps = [image.load_img(i).get_data().astype(float) for i in copy.deepcopy(path_tmap)]
logging.info('successfully loaded the tmaps!')
# load the hippocampus masks:
mask_hpc = [image.load_img(i).get_data().astype(int) for i in copy.deepcopy(path_mask_hpc_task)]
logging.info('successfully loaded the hippocampus mask files!')
'''
FEATURE SELECTION
'''
# plot raw t-maps on an anatomical background:
logging.info('plotting raw tmaps with anatomical as background:')
for i, path in enumerate(path_tmap):
    logging.info('plotting raw tmap %s (%d of %d)' % (path, i+1, len(path_tmap)))
    path_save = opj(path_out_figs, '%s_run-%02d_tmap_raw.png' % (sub, i+1))
    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
                      output_file=path_save, colorbar=True)
'''
========================================================================
FEATURE SELECTION: MASK THE T-MAPS WITH THE ANATOMICAL MASKS
We create a combination of the t-maps and the anatomical mask.
To this end, we multiply the anatomical mask with each t-map.
As the anatomical mask consists of binary values (zeros and ones), the
multiplication results in a t-map that is masked by the anatomical ROI.
========================================================================
'''
# v = tmaps_masked[0]
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(v[:, 0], v[:, 1], v[:, 2], zdir='z', c='red')
# plt.show()
# check if any value in the supposedly binary mask is bigger than 1:
if np.any(mask_vis > 1):
    logging.info('WARNING: detected values > 1 in the anatomical ROI!')
    sys.exit("Values > 1 in the anatomical ROI!")
# get the combination of the anatomical mask and each t-map:
tmaps_masked = [np.multiply(mask_vis, i) for i in copy.deepcopy(tmaps)]
# turn the masked tmaps into image-like objects:
tmaps_masked_img = [image.new_img_like(i, j) for (i, j) in zip(path_tmap, copy.deepcopy(tmaps_masked))]
for i, path in enumerate(tmaps_masked_img):
    path_save = opj(path_out_masks, '%s_run-%02d_tmap_masked.nii.gz' % (sub, i + 1))
    path.to_filename(path_save)
# plot the masked t-maps:
logging.info('plotting masked tmaps with anatomical as background:')
for i, path in enumerate(tmaps_masked_img):
    logging.info('plotting masked tmap %d of %d' % (i+1, len(tmaps_masked_img)))
    path_save = opj(path_out_figs, '%s_run-%02d_tmap_masked.png' % (sub, i+1))
    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
                      output_file=path_save, colorbar=True)
'''
========================================================================
FEATURE SELECTION: THRESHOLD THE MASKED T-MAPS
We threshold the masked t-maps, keeping only voxels with t-values
above the threshold. We then extract these data and count how many
voxels were above and below the threshold in total.
========================================================================
'''
# set the threshold:
threshold = params['mri']['thresh']
logging.info('thresholding t-maps with a threshold of %s' % str(threshold))
# threshold the masked tmap images:
tmaps_masked_thresh_img = [image.threshold_img(i, threshold) for i in copy.deepcopy(tmaps_masked_img)]
logging.info('plotting thresholded tmaps with anatomical as background:')
for i, path in enumerate(tmaps_masked_thresh_img):
    path_save = opj(path_out_figs, '%s_run-%02d_tmap_masked_thresh.png' % (sub, i+1))
    logging.info('plotting masked tmap %s (%d of %d)'
                 % (path_save, i + 1, len(tmaps_masked_thresh_img)))
    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
                      output_file=path_save, colorbar=True)
# extract the data from the thresholded images:
tmaps_masked_thresh = [image.load_img(i).get_data().astype(float) for i in tmaps_masked_thresh_img]
# calculate the total number of tmap voxels:
num_tmap_voxel = [np.size(i) for i in copy.deepcopy(tmaps_masked_thresh)]
logging.info('number of feature selected voxels: %s' % pformat(num_tmap_voxel))
num_above_voxel = [np.count_nonzero(i) for i in copy.deepcopy(tmaps_masked_thresh)]
logging.info('number of voxels above threshold: %s' % pformat(num_above_voxel))
num_below_voxel = [np.count_nonzero(i == 0) for i in copy.deepcopy(tmaps_masked_thresh)]
logging.info('number of voxels below threshold: %s' % pformat(num_below_voxel))
# plot the distribution of t-values:
for i, run_mask in enumerate(tmaps_masked_thresh):
    masked_tmap_flat = run_mask.flatten()
    # remove NaNs and zeros (i.e., sub-threshold voxels):
    masked_tmap_flat = masked_tmap_flat[~np.isnan(masked_tmap_flat) & ~(masked_tmap_flat == 0)]
    path_save = opj(path_out_figs, '%s_run-%02d_tvalue_distribution.png' % (sub, i+1))
    logging.info('plotting thresholded t-value distribution %s (%d of %d)'
                 % (path_save, i+1, len(tmaps_masked_thresh)))
    fig = plt.figure()
    plt.hist(masked_tmap_flat, bins='auto')
    plt.xlabel('t-values')
    plt.ylabel('number')
    plt.title('t-value distribution (%s, run-%02d)' % (sub, i+1))
    plt.savefig(path_save)
# create a dataframe with the number of voxels:
df_thresh = pd.DataFrame({
    'id': [sub] * n_run,
    'run': np.arange(1, n_run + 1),
    'n_total': num_tmap_voxel,
    'n_above': num_above_voxel,
    'n_below': num_below_voxel
})
file_name = opj(path_out_data, '%s_thresholding.csv' % sub)
df_thresh.to_csv(file_name, sep=',', index=False)
'''
========================================================================
FEATURE SELECTION: BINARIZE THE THRESHOLDED MASKED T-MAPS
We set all voxels that survived the threshold to 1 and all voxels
that were not selected to 0.
========================================================================
'''
# replace all NaNs with 0:
tmaps_masked_thresh_bin = [np.where(np.isnan(i), 0, i) for i in copy.deepcopy(tmaps_masked_thresh)]
# replace all other values with 1:
tmaps_masked_thresh_bin = [np.where(i > 0, 1, i) for i in copy.deepcopy(tmaps_masked_thresh_bin)]
# turn the 3D-arrays into booleans:
tmaps_masked_thresh_bin = [i.astype(bool) for i in copy.deepcopy(tmaps_masked_thresh_bin)]
# create image-like objects:
masks_final = [image.new_img_like(path_func_task[0], i.astype(int)) for i in copy.deepcopy(tmaps_masked_thresh_bin)]
logging.info('plotting final masks with anatomical as background:')
for i, path in enumerate(masks_final):
    filename = '%s_run-%02d_tmap_masked_thresh.nii.gz' % (sub, i + 1)
    path_save = opj(path_out_masks, filename)
    logging.info('saving final mask %s (%d of %d)'
                 % (path_save, i+1, len(masks_final)))
    path.to_filename(path_save)
    path_save = opj(path_out_figs, '%s_run-%02d_visual_final_mask.png'
                    % (sub, i + 1))
    logging.info('plotting final mask %s (%d of %d)'
                 % (path_save, i + 1, len(masks_final)))
    plotting.plot_roi(path, anat, title=os.path.basename(path_save),
                      output_file=path_save, colorbar=True)
'''
========================================================================
LOAD SMOOTHED FMRI DATA FOR ALL FUNCTIONAL TASK RUNS:
========================================================================
'''
# load smoothed functional mri data for all eight task runs:
logging.info('loading %d functional task runs ...' % len(path_func_task))
data_task = [image.load_img(i) for i in path_func_task]
logging.info('loading successful!')
'''
========================================================================
DEFINE THE CLASSIFIERS
========================================================================
'''
class_labels = ['cat', 'chair', 'face', 'house', 'shoe']
# create a dictionary with all values as independent instances:
# see here: https://bit.ly/2J1DvZm
clf_set = {key: LogisticRegression(
    C=1., penalty='l2', multi_class='ovr', solver='lbfgs',
    max_iter=4000, class_weight='balanced', random_state=42) for key in class_labels}
classifiers = {
    'log_reg': LogisticRegression(
        C=1., penalty='l2', multi_class='multinomial', solver='lbfgs',
        max_iter=4000, class_weight='balanced', random_state=42)}
clf_set.update(classifiers)
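# clf_set now contains six independent classifiers: one binary one-vs-rest
# logistic regression per stimulus class ('cat', 'chair', 'face', 'house',
# 'shoe'; trained below on labels recoded as class vs. 'other') and one
# multinomial logistic regression ('log_reg') over all five classes.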
'''
========================================================================
1. SPLIT THE EVENTS DATAFRAME FOR EACH TASK CONDITION
2. RESET THE INDICES OF THE DATAFRAMES
3. SORT THE ROWS OF ALL DATAFRAMES IN CHRONOLOGICAL ORDER
4. PRINT THE NUMBER OF TRIALS OF EACH TASK CONDITION
========================================================================
'''
class TaskData:
    """Hold the events of one task condition and the corresponding fMRI
    volume (TR) indices, and provide z-scoring and prediction helpers.
    """
    def __init__(
            self, events, condition, name, trial_type, bold_delay=0,
            interval=1, t_tr=1.25, num_vol_run=530):
        import pandas as pd
        # define the name of the task data subset:
        self.name = name
        # define the task condition the task data subset is from:
        self.condition = condition
        # define the delay (in seconds) by which onsets are moved:
        self.bold_delay = bold_delay
        # define the number of TRs from event onset that should be selected:
        self.interval = interval
        # define the repetition time (TR) of the mri data acquisition:
        self.t_tr = t_tr
        # define the number of volumes per task run:
        self.num_vol_run = num_vol_run
        # select events: upright stimulus, correct answer trials only:
        if trial_type == 'stimulus':
            self.events = events.loc[
                (events['condition'] == condition) &
                (events['trial_type'] == trial_type) &
                (events['stim_orient'] == 0) &
                (events['serial_position'] == 1) &
                (events['accuracy'] != 0),
                :]
        elif trial_type == 'cue':
            self.events = events.loc[
                (events['condition'] == condition) &
                (events['trial_type'] == trial_type),
                :]
        # sort all values by session and run:
        self.events = self.events.sort_values(by=['session', 'run_session'])
        # reset the indices of the data frame:
        self.events = self.events.reset_index(drop=True)
        # call further functions upon initialization:
        self.define_trs()
        self.get_stats()

    def define_trs(self):
        # import relevant functions:
        import numpy as np
        # select all event onsets:
        self.event_onsets = self.events['onset']
        # add the selected delay to account for the peak of the hrf:
        self.bold_peaks = self.events['onset'] + self.bold_delay
        # divide the expected time-points of the bold peaks by the repetition time:
        self.peak_trs = self.bold_peaks / self.t_tr
        # calculate the number of volumes of all preceding runs:
        run_volumes = (self.events['run_study'] - 1) * self.num_vol_run
        # add the number of volumes of the preceding runs and round:
        trs = round(self.peak_trs + run_volumes)
        # copy the relevant trs as often as specified by the interval:
        a = np.transpose(np.tile(trs, (self.interval, 1)))
        # create a same-sized matrix with the trs to be added:
        b = np.full((len(trs), self.interval), np.arange(self.interval))
        # assign the final list of trs:
        self.trs = np.array(np.add(a, b).flatten(), dtype=int)
        # save the TRs of the stimulus presentations:
        self.stim_trs = round(self.event_onsets / self.t_tr + run_volumes)
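    # worked example for define_trs (using the default arguments above):
    # an event with onset 10.0 s in run_study 2 and bold_delay = 4 s peaks
    # at (10.0 + 4) / 1.25 = 11.2 TRs; adding the (2 - 1) * 530 volumes of
    # the preceding run and rounding yields volume index 541. with
    # interval = 3, the selected TRs would be [541, 542, 543].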
    def get_stats(self):
        import numpy as np
        self.num_trials = len(self.events)
        self.runs = np.repeat(np.array(self.events['run_study'], dtype=int), self.interval)
        self.trials = np.repeat(np.array(self.events['trial'], dtype=int), self.interval)
        self.sess = np.repeat(np.array(self.events['session'], dtype=int), self.interval)
        self.stim = np.repeat(np.array(self.events['stim_label'], dtype=object), self.interval)
        self.itis = np.repeat(np.array(self.events['interval_time'], dtype=float), self.interval)
        self.stim_trs = np.repeat(np.array(self.stim_trs, dtype=int), self.interval)

    def zscore(self, signals, run_list, t_tr=1.25):
        from nilearn.signal import clean
        import numpy as np
        # get boolean indices for all run indices in the run list:
        run_indices = np.isin(self.runs, list(run_list))
        # standardize the data of all runs in the run list:
        self.data_zscored = clean(
            signals=signals[self.trs[run_indices]],
            sessions=self.runs[run_indices],
            t_r=t_tr,
            detrend=False,
            standardize=True)
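    # note: passing the run labels via the sessions argument makes
    # nilearn.signal.clean() process each run independently, so every
    # voxel is z-scored within (not across) runs.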
    def predict(self, clf, run_list):
        # import packages:
        import pandas as pd
        # get the classifier class predictions:
        pred_class = clf.predict(self.data_zscored)
        # get the classifier probabilistic predictions:
        pred_proba = clf.predict_proba(self.data_zscored)
        # get the classes of the classifier:
        classes_names = clf.classes_
        # get boolean indices for all run indices in the run list:
        run_indices = np.isin(self.runs, list(run_list))
        # create a dataframe with the probabilities of each class:
        df = pd.DataFrame(pred_proba, columns=classes_names)
        # get the number of predictions made:
        num_pred = len(df)
        # get the number of trials in the test set:
        num_trials = int(num_pred / self.interval)
        # add the predicted class labels to the dataframe:
        df['pred_label'] = pred_class
        # add the true stimulus labels to the dataframe:
        df['stim'] = self.stim[run_indices]
        # add the volume numbers (TRs) to the dataframe:
        df['tr'] = self.trs[run_indices]
        # add the sequential TR within each trial to the dataframe:
        df['seq_tr'] = np.tile(np.arange(1, self.interval + 1), num_trials)
        # add the TRs on which the stimuli were presented to the dataframe:
        df['stim_tr'] = self.stim_trs[run_indices]
        # add the trial numbers to the dataframe:
        df['trial'] = self.trials[run_indices]
        # add the run numbers to the dataframe:
        df['run_study'] = self.runs[run_indices]
        # add the session numbers to the dataframe:
        df['session'] = self.sess[run_indices]
        # add the inter-trial intervals to the dataframe:
        df['tITI'] = self.itis[run_indices]
        # add the participant id to the dataframe:
        df['id'] = np.repeat(self.events['subject'].unique(), num_pred)
        # add the name of the test set to the dataframe:
        df['test_set'] = np.repeat(self.name, num_pred)
        # return the dataframe:
        return df
train_odd_peak = TaskData(
    events=df_events, condition='oddball', trial_type='stimulus',
    bold_delay=4, interval=1, name='train-odd_peak')
test_odd_peak = TaskData(
    events=df_events, condition='oddball', trial_type='stimulus',
    bold_delay=4, interval=1, name='test-odd_peak')
test_odd_long = TaskData(
    events=df_events, condition='oddball', trial_type='stimulus',
    bold_delay=0, interval=7, name='test-odd_long')
test_seq_long = TaskData(
    events=df_events, condition='sequence', trial_type='stimulus',
    bold_delay=0, interval=13, name='test-seq_long')
test_rep_long = TaskData(
    events=df_events, condition='repetition', trial_type='stimulus',
    bold_delay=0, interval=13, name='test-rep_long')
test_seq_cue = TaskData(
    events=df_events, condition='sequence', trial_type='cue',
    bold_delay=0, interval=5, name='test-seq_cue')
test_rep_cue = TaskData(
    events=df_events, condition='repetition', trial_type='cue',
    bold_delay=0, interval=5, name='test-rep_cue')
test_sets = [
    test_odd_peak, test_odd_long, test_seq_long, test_rep_long,
    test_seq_cue, test_rep_cue
]
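# note: the training set uses only the single TR at the assumed BOLD peak
# (bold_delay=4, interval=1), while the test sets cover longer windows of
# 7, 13 or 5 TRs starting at event onset (bold_delay=0).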
'''
========================================================================
SEPARATE RUN-WISE STANDARDIZATION (Z-SCORING) OF ALL TASK CONDITIONS
Standardize features by removing the mean and scaling to unit variance.
Centering and scaling happen independently on each feature by computing
the relevant statistics on the samples in the training set. Here,
we standardize the features of all trials run-wise but separately for
each task condition (oddball, sequence, and repetition condition).
========================================================================
'''
def detrend(data, t_tr=1.25):
    from nilearn.signal import clean
    data_detrend = clean(
        signals=data, t_r=t_tr, detrend=True, standardize=False)
    return data_detrend


def show_weights(array):
    # https://stackoverflow.com/a/50154388
    import numpy as np
    import seaborn as sns
    n_samples = array.shape[0]
    classes, bins = np.unique(array, return_counts=True)
    n_classes = len(classes)
    weights = n_samples / (n_classes * bins)
    sns.barplot(x=classes, y=weights)
    plt.xlabel('class label')
    plt.ylabel('weight')
    plt.show()
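# note: show_weights() reproduces the weighting scheme that scikit-learn
# applies for class_weight='balanced', i.e. n_samples / (n_classes *
# n_samples_per_class) for each class.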
def melt_df(df, melt_columns):
    # save the column names of the dataframe in a list:
    column_names = df.columns.tolist()
    # remove the stimulus classes from the column names:
    id_vars = [x for x in column_names if x not in melt_columns]
    # melt the dataframe, creating one column each for value_name and var_name:
    df_melt = pd.melt(
        df, value_name='probability', var_name='class', id_vars=id_vars)
    # return the melted dataframe:
    return df_melt
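# example: for a dataframe with the class-probability columns 'cat', ...,
# 'shoe' plus metadata columns such as 'tr' and 'trial', melt_df() keeps
# the metadata as identifiers and stacks the five probability columns into
# two long-format columns, 'class' and 'probability'.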
data_list = []
runs = list(range(1, n_run + 1))
# mask_label = 'cv'
logging.info('starting leave-one-run-out cross-validation')
for mask_label in ['cv', 'cv_hpc']:
    logging.info('testing in mask %s' % mask_label)
    for run in runs:
        logging.info('testing on run %d of %d ...' % (run, len(runs)))
        # define the run indices for the training and test set:
        train_runs = [x for x in runs if x != run]
        test_runs = [x for x in runs if x == run]
        # get the feature selection mask of the current run:
        if mask_label == 'cv':
            mask_run = masks_final[runs.index(run)]
        elif mask_label == 'cv_hpc':
            mask_run = path_mask_hpc_task[runs.index(run)]
        # extract smoothed fMRI data from the mask for the cross-validation fold:
        masked_data = [masking.apply_mask(i, mask_run) for i in data_task]
        # detrend the masked fMRI data separately for each run:
        data_detrend = [detrend(i) for i in masked_data]
        # combine the detrended data of all runs:
        data_detrend = np.vstack(data_detrend)
        # loop through all classifiers in the classifier set:
        for clf_name, clf in clf_set.items():
            # log the current classifier:
            logging.info('classifier: %s' % clf_name)
            # standardize the training data:
            train_odd_peak.zscore(signals=data_detrend, run_list=train_runs)
            # get the training example labels:
            train_stim = copy.deepcopy(train_odd_peak.stim[train_odd_peak.runs != run])
            # replace labels for single-label classifiers:
            if clf_name in class_labels:
                # replace all other labels with 'other':
                train_stim = ['other' if x != clf_name else x for x in train_stim]
                # turn the labels into a numpy array:
                train_stim = np.array(train_stim, dtype=object)
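                # e.g. for clf_name = 'cat', labels like ['cat', 'face',
                # 'chair', 'cat'] become ['cat', 'other', 'other', 'cat'],
                # turning the problem into binary one-vs-rest decoding.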
            # check the class weights:
            # show_weights(array=train_stim)
            # train the classifier:
            clf.fit(train_odd_peak.data_zscored, train_stim)
            # classifier prediction: predict on the test data and save the data:
            for test_set in test_sets:
                logging.info('testing on test set %s' % test_set.name)
                test_set.zscore(signals=data_detrend, run_list=test_runs)
                # skip test sets without any data in the current test runs:
                if test_set.data_zscored.size == 0:
                    continue
                # create a dataframe containing the classifier predictions:
                df_pred = test_set.predict(clf=clf, run_list=test_runs)
                # add the current classifier as a new column:
                df_pred['classifier'] = np.repeat(clf_name, len(df_pred))
                # add a label that indicates the mask / training regime:
                df_pred['mask'] = np.repeat(mask_label, len(df_pred))
                # melt the data frame:
                df_pred_melt = melt_df(df=df_pred, melt_columns=train_stim)
                # append the dataframe to the list of dataframe results:
                data_list.append(df_pred_melt)
'''
========================================================================
DECODING ON RESTING STATE DATA:
========================================================================
'''
logging.info('Loading fMRI data of %d resting state runs ...' % len(path_rest))
data_rest = [image.load_img(i) for i in path_rest]
logging.info('loading successful!')


# combine all masks from the feature selection by intersection:
def multimultiply(arrays):
    import copy
    # start with the first array:
    array_combined = copy.deepcopy(arrays[0].astype(int))
    # loop through all arrays:
    for i in range(len(arrays)):
        # multiply the running product with the current array; for binary
        # masks this keeps only voxels that are 1 in every array:
        array_combined = np.multiply(array_combined, copy.deepcopy(arrays[i].astype(int)))
    # return the intersection of all arrays:
    return array_combined
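# equivalent sketch: for a list of binary masks, the same intersection can
# be computed as np.prod(np.stack(arrays), axis=0).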
for mask_label in ['union', 'union_hpc']:
    # calculate the combination of all masks by multiplication (note: the
    # 'union' label is kept for the output file names, but multiplying
    # binary masks yields their intersection):
    if mask_label == 'union':
        masks_union = multimultiply(tmaps_masked_thresh_bin).astype(int).astype(bool)
    elif mask_label == 'union_hpc':
        masks_union = multimultiply(mask_hpc).astype(int).astype(bool)
    # turn the combined mask into an image-like object:
    masks_union_nii = image.new_img_like(path_func_task[0], masks_union)
    path_save = opj(path_out_masks, '{}_task-rest_mask-{}.nii.gz'.format(sub, mask_label))
    masks_union_nii.to_filename(path_save)
    # mask all resting state runs with the combined feature selection mask:
    data_rest_masked = [masking.apply_mask(i, masks_union_nii) for i in data_rest]
    # detrend and standardize each resting state run separately:
    data_rest_final = [clean(i, detrend=True, standardize=True) for i in data_rest_masked]
    # mask all functional task runs separately:
    data_task_masked = [masking.apply_mask(i, masks_union_nii) for i in data_task]
    # detrend each task run separately:
    data_task_masked_detrend = [clean(i, detrend=True, standardize=False) for i in data_task_masked]
    # combine the detrended data of all runs:
    data_task_masked_detrend = np.vstack(data_task_masked_detrend)
    # select only the oddball data and standardize:
    train_odd_peak.zscore(signals=data_task_masked_detrend, run_list=runs)
    # write session and run labels:
    ses_labels = [i.split(sub + "_")[1].split("_task")[0] for i in path_rest]
    run_labels = [i.split("prenorm_")[1].split("_space")[0] for i in path_rest]
    file_names = ['_'.join([a, b]) for (a, b) in zip(ses_labels, run_labels)]
    rest_interval = 1
    # save the voxel patterns:
    num_voxels = len(train_odd_peak.data_zscored[0])
    voxel_labels = ['v' + str(x) for x in range(num_voxels)]
    df_patterns = pd.DataFrame(train_odd_peak.data_zscored, columns=voxel_labels)
    # add the stimulus labels to the dataframe:
    df_patterns['label'] = copy.deepcopy(train_odd_peak.stim)
    # add the participant id to the dataframe:
    df_patterns['id'] = np.repeat(df_events['subject'].unique(), len(train_odd_peak.stim))
    # add the mask label:
    df_patterns['mask'] = np.repeat(mask_label, len(train_odd_peak.stim))
    # split the pattern dataframe by label:
    df_pattern_list = [df_patterns[df_patterns['label'] == i] for i in df_patterns['label'].unique()]
    # create the file paths to save the dataframes:
    file_paths = [opj(path_out_data, '{}_voxel_patterns_{}_{}'.format(sub, mask_label, i)) for i in df_patterns['label'].unique()]
    # save the final dataframes to .csv-files:
    [i.to_csv(j + '.csv', sep=',', index=False) for (i, j) in zip(df_pattern_list, file_paths)]
    # save only the voxel patterns as niimg-like objects:
    [masking.unmask(X=i.loc[:, voxel_labels].to_numpy(), mask_img=masks_union_nii).to_filename(j + '.nii.gz') for (i, j) in zip(df_pattern_list, file_paths)]
    # [image.new_img_like(path_func_task[0], i.loc[:, voxel_labels].to_numpy()).to_filename(j + '.nii.gz') for (i, j) in zip(df_pattern_list, file_paths)]
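    # note: masking.unmask() is the inverse of masking.apply_mask(): each
    # row of z-scored voxel values is written back into the voxels of
    # masks_union_nii, yielding one 3D volume per voxel pattern.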
    # decoding on resting state data:
    for clf_name, clf in clf_set.items():
        # log the current classifier name:
        logging.info('classifier: %s' % clf_name)
        # get the example labels for all slow trials:
        train_stim = copy.deepcopy(train_odd_peak.stim)
        # replace labels for single-label classifiers:
        if clf_name in class_labels:
            # replace all other labels with 'other':
            train_stim = ['other' if x != clf_name else x for x in train_stim]
            # turn the labels into a numpy array:
            train_stim = np.array(train_stim, dtype=object)
        # train the classifier:
        clf.fit(train_odd_peak.data_zscored, train_stim)
        # classifier prediction: predict on the test data and save the data:
        pred_rest_class = [clf.predict(i) for i in data_rest_final]
        pred_rest_proba = [clf.predict_proba(i) for i in data_rest_final]
        # get the class names of the classifier:
        classes_names = clf.classes_
        # save the classifier predictions on the resting state scans:
        for t, _ in enumerate(pred_rest_proba):
            # create a dataframe with the probabilities of each class:
            df_rest_pred = pd.DataFrame(
                pred_rest_proba[t], columns=classes_names)
            # get the number of predictions made:
            num_pred = len(df_rest_pred)
            # get the number of trials in the test set:
            num_trials = int(num_pred / rest_interval)
            # add the predicted class labels to the dataframe:
            df_rest_pred['pred_label'] = pred_rest_class[t]
            # add the true stimulus labels to the dataframe (undefined at rest):
            df_rest_pred['stim'] = np.full(num_pred, np.nan)
            # add the volume numbers (TRs) to the dataframe:
            df_rest_pred['tr'] = np.arange(1, num_pred + 1)
            # add the sequential TRs to the dataframe:
            df_rest_pred['seq_tr'] = np.arange(1, num_pred + 1)
            # add the trial numbers to the dataframe:
            df_rest_pred['trial'] = np.tile(
                np.arange(1, rest_interval + 1), num_trials)
            # add the run numbers to the dataframe:
            df_rest_pred['run_study'] = np.repeat(run_labels[t], num_pred)
            # add the session numbers to the dataframe:
            df_rest_pred['session'] = np.repeat(ses_labels[t], len(df_rest_pred))
            # add the inter-trial intervals to the dataframe:
            df_rest_pred['tITI'] = np.tile('rest', num_pred)
            # add the participant id to the dataframe:
            df_rest_pred['id'] = np.repeat(df_events['subject'].unique(), num_pred)
            # add the name of the test set to the dataframe:
            df_rest_pred['test_set'] = np.repeat('rest', num_pred)
            # add the name of the classifier to the dataframe:
            df_rest_pred['classifier'] = np.repeat(clf_name, num_pred)
            # add a label that indicates the mask / training regime:
            df_rest_pred['mask'] = np.repeat(mask_label, len(df_rest_pred))
            # melt the data frame:
            df_pred_melt = melt_df(df=df_rest_pred, melt_columns=train_stim)
            # append the dataframe to the list of dataframe results:
            data_list.append(df_pred_melt)
        # run the classifier trained on all runs across all test sets:
        for test_set in test_sets:
            logging.info('testing on test set %s' % test_set.name)
            test_set.zscore(signals=data_task_masked_detrend, run_list=runs)
            # skip test sets without any data:
            if test_set.data_zscored.size == 0:
                continue
            # create a dataframe containing the classifier predictions:
            df_pred = test_set.predict(clf=clf, run_list=runs)
            # add the current classifier as a new column:
            df_pred['classifier'] = np.repeat(clf_name, len(df_pred))
            # add a label that indicates the mask / training regime:
            df_pred['mask'] = np.repeat(mask_label, len(df_pred))
            # melt the data frame:
            df_pred_melt = melt_df(df=df_pred, melt_columns=train_stim)
            # append the dataframe to the list of dataframe results:
            data_list.append(df_pred_melt)
# concatenate all decoding dataframes into one final dataframe:
df_all = pd.concat(data_list, sort=False)
# create the file path to save the dataframe:
file_path = opj(path_out_data, '{}_decoding.csv'.format(sub))
# save the final dataframe to a .csv-file:
df_all.to_csv(file_path, sep=',', index=False)
'''
========================================================================
STOP LOGGING:
========================================================================
'''
end = time.time()
total_time = (end - start) / 60
logging.info('total running time: %0.2f minutes' % total_time)
logging.shutdown()