JackEdTaylor
/
Picture-Word-N1


			
			
				
					
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516
							%% Setup

% paths to data
eeg_path = fullfile('raw_data', 'eeg-pc', 'pictureword');
beh_path = fullfile('raw_data', 'stim-pc', 'data', 'pictureword');

% import eeglab (assumes eeglab has been added to path), e.g.
addpath('C:/EEGLAB/eeglab2020_0')
[ALLEEG, EEG, CURRENTSET, ALLCOM] = eeglab;

% This script uses fastica algorithm for ICA, so FastICA needs to be on the path, e.g.
addpath('C:/EEGLAB/FastICA_25')

% region of interest for trial-level ROI average
roi = {'TP7', 'CP5', 'P7', 'P5', 'P9', 'PO7', 'PO3', 'O1'};

% cutoff probability for identifying eye and muscle related ICA components with ICLabel
icl_cutoff = 0.85;

% sigma parameter for ASR
asr_sigma = 20;

%% Clear output folders

delete(fullfile('sample_data_picture', '*.csv'))

%% Import lab book

% handle commas in vectors
lab_book_file = fullfile('raw_data', 'stim-pc', 'participants.csv');
lab_book_raw_dat = fileread(lab_book_file);

[regstart, regend] = regexp(lab_book_raw_dat, '\[.*?\]');

for regmatch_i = 1:numel(regstart)
    str_i = lab_book_raw_dat(regstart(regmatch_i):regend(regmatch_i));
    str_i(str_i==',') = '.';
    lab_book_raw_dat(regstart(regmatch_i):regend(regmatch_i)) = str_i;
end

lab_book_fixed_file = fullfile('raw_data', 'stim-pc', 'participants_tmp.csv');
lab_book_fixed_conn = fopen(lab_book_fixed_file, 'w');
fprintf(lab_book_fixed_conn, lab_book_raw_dat);
fclose(lab_book_fixed_conn);

lab_book_readopts = detectImportOptions(lab_book_fixed_file, 'VariableNamesLine', 1, 'Delimiter', ',');
% read subject ids as class character
lab_book_readopts.VariableTypes{strcmp(lab_book_readopts.SelectedVariableNames, 'subj_id')} = 'char';
lab_book = readtable(lab_book_fixed_file, lab_book_readopts);

delete(lab_book_fixed_file)

%% Count the total number of excluded electrodes

n_bads = 0;
n_bads_per_s = zeros(size(lab_book, 1), 0);

for subject_nr = 1:size(lab_book, 1)
    bad_channels = eval(strrep(strrep(strrep(lab_book.pw_bad_channels{subject_nr}, '[', '{'), ']', '}'), '.', ','));
    n_bads_per_s(subject_nr) = numel(bad_channels);
    n_bads = n_bads + numel(bad_channels);
end

perc_bads = n_bads / (64 * size(lab_book, 1)) * 100;

%% Import max electrode info

% this contains participants' maximal electrodes for the N170 from the
% localisation task
max_elecs = readtable('max_elecs.csv');

%% Iterate over subjects

% record trial exclusions
total_excl_trials_incorr = zeros(1, size(lab_book, 1));
total_excl_trials_rt = zeros(1, size(lab_book, 1));

n_bad_ica = zeros(size(lab_book, 1), 0);

for subject_nr = 1:size(lab_book, 1)
    
    subject_id = lab_book.subj_id{subject_nr};
    fprintf('\n\n Subject Iteration %g/%g, ID: %s\n', subject_nr, size(lab_book, 1), subject_id)
    
    %% get subject-specific info from lab book
    exclude = lab_book.exclude(subject_nr);
    bad_channels = eval(strrep(strrep(strrep(lab_book.pw_bad_channels{subject_nr}, '[', '{'), ']', '}'), '.', ','));
    bad_trigger_indices = eval(strrep(lab_book.pw_bad_trigger_indices{subject_nr}, '.', ','));
    
    % add PO4 to bad channels, which seems to be consistently noisy, even when not marked as bad
    if sum(strcmp('PO4', bad_channels))==0
        bad_channels(numel(bad_channels)+1) = {'PO4'};
    end

    %% abort if excluded
    
    if exclude
        fprintf('Subject %s excluded. Preprocessing aborted.\n', subject_id)
        fprintf('Lab book note: %s\n', lab_book.note{subject_nr})
        continue
    end
    
    %% load participant's data
    
    % load raw eeg
    raw_datapath = fullfile(eeg_path, append(subject_id, '.bdf'));
    
    % abort if no EEG data collected yet
    if ~isfile(raw_datapath)
        fprintf('Subject %s skipped: no EEG data found\n', subject_id)
        continue
    end
    
    EEG = pop_biosig(raw_datapath, 'importevent', 'on', 'rmeventchan', 'off');
    
    % load behavioural
    all_beh_files = dir(beh_path);
    beh_regex_matches = regexpi({all_beh_files.name}, append('^', subject_id, '_.+\.csv$'), 'match');
    regex_emptymask = cellfun('isempty', beh_regex_matches);
    beh_regex_matches(regex_emptymask) = [];
    subj_beh_files = cellfun(@(x) x{:}, beh_regex_matches, 'UniformOutput', false);
    
    if size(subj_beh_files)>1
        fprintf('%g behavioural files found?\n', size(subj_beh_files))
        break
    end
    
    beh_datapath = fullfile(beh_path, subj_beh_files{1});
    beh = readtable(beh_datapath);
    
    %% Set data features
    
    % set channel locations
    
    orig_locs = EEG.chanlocs;
    EEG.chanlocs = pop_chanedit(EEG.chanlocs, 'load', {'BioSemi64.loc', 'filetype', 'loc'});  % doesn't match order for the data
    
    % set channel types
    for ch_nr = 1:64
        EEG.chanlocs(ch_nr).type = 'EEG';
    end
    
    for ch_nr = 65:72
        EEG.chanlocs(ch_nr).type = 'EOG';
    end
    
    for ch_nr = 73:79
        EEG.chanlocs(ch_nr).type = 'MISC';
    end
    
    for ch_nr = 65:79
        EEG.chanlocs(ch_nr).theta = [];
        EEG.chanlocs(ch_nr).radius = [];
        EEG.chanlocs(ch_nr).sph_theta = [];
        EEG.chanlocs(ch_nr).sph_phi = [];
        EEG.chanlocs(ch_nr).X = [];
        EEG.chanlocs(ch_nr).Y = [];
        EEG.chanlocs(ch_nr).Z = [];
    end
    
    % change the order of channels in EEG.data to match the new order in chanlocs
    data_reordered = EEG.data;
    for ch_nr = 1:64        
        % make sure the new eeg data array matches the listed order
        ch_lab = EEG.chanlocs(ch_nr).labels;
        orig_locs_idx = find(strcmp(lower({orig_locs.labels}), lower(ch_lab)));
        data_reordered(ch_nr, :) = EEG.data(orig_locs_idx, :);
    end
    EEG.data = data_reordered;
    
    % remove unused channels
    EEG = pop_select(EEG, 'nochannel', 69:79);
    
    % remove bad channels
    ur_chanlocs = EEG.chanlocs;  % store a copy of the full channel locations before removing (for later interpolation)
    bad_channels_indices = find(ismember(lower({EEG.chanlocs.labels}), lower(bad_channels)));
    EEG = pop_select(EEG, 'nochannel', bad_channels_indices);
    
    %% Identify events (trials) - getting the picture as the trigger instead of the word
    
    % make the sopen function happy
    x = fileparts( which('sopen') );
    rmpath(x);
    addpath(x,'-begin');
    
    % build the events manually from the raw eeg file (pop_biosig removes event offsets)
    % NB: this assumes no resampling between reading the BDF file and now
    bdf_dat = sopen(raw_datapath, 'r', [0, Inf], 'OVERFLOWDETECTION:OFF');
    event_types = bdf_dat.BDF.Trigger.TYP;
    event_pos = bdf_dat.BDF.Trigger.POS;
    event_time = EEG.times(event_pos);
    sclose(bdf_dat);
    clear bdf_dat;
    
    triggers = struct(...
        'off', 0,...
        'A1', 1,...
        'A2', 2,...
        'practice', 25,...
        'image', 99);
    
    % add 61440 to each trigger value (because of number of bits in pp)
    trigger_labels = fieldnames(triggers);
    for field_nr = 1:numel(trigger_labels)
        triggers.(trigger_labels{field_nr}) = triggers.(trigger_labels{field_nr}) + 61440;
    end
    
    % remove the first trigger if it is at time 0 and has a value which isn't a recognised trigger
    if (event_time(1)==0 && ~ismember(event_types(1), [triggers.off, triggers.A1, triggers.A2, triggers.practice, triggers.image]))
        event_types(1) = [];
        event_pos(1) = [];
        event_time(1) = [];
    end
    
    % remove the new first trigger if it has a value of off
    if (event_types(1)==triggers.off)
        event_types(1) = [];
        event_pos(1) = [];
        event_time(1) = [];
    end
    
    % check every second trigger is an offset
    offset_locs = find(event_types==triggers.off);
    if any(offset_locs' ~= 2:2:numel(event_types))
        fprintf('Expected each second trigger to be an off?')
        break
    end
    
    % check every first trigger is non-zero
    onset_locs = find(event_types~=triggers.off);
    if any(onset_locs' ~= 1:2:numel(event_types))
        fprintf('Expected each first trigger to be an event?')
        break
    end
    
    % create the events struct manually    
    events_onset_types = event_types(onset_locs);
    events_onsets = event_pos(onset_locs);
    events_offsets = event_pos(offset_locs);
    events_durations = events_offsets - events_onsets;
    
    EEG.event = struct();
    for event_nr = 1:numel(events_onsets)
        EEG.event(event_nr).type = events_onset_types(event_nr);
        EEG.event(event_nr).latency = events_onsets(event_nr);
        EEG.event(event_nr).offset = events_offsets(event_nr);
        EEG.event(event_nr).duration = events_durations(event_nr);
    end
    
    % copy the details over to urevent
    EEG.urevent = EEG.event;
    
    % record the urevent
    for event_nr = 1:numel(events_onsets)
        EEG.event(event_nr).urevent = event_nr;
    end
    
    % remove bad events recorded in lab book (misfired triggers)
    EEG = pop_editeventvals(EEG, 'delete', find(ismember([EEG.event.urevent], bad_trigger_indices)));
    
    % remove practice trials
    EEG = pop_editeventvals(EEG, 'delete', find(ismember([EEG.event.type], triggers.practice)));
    
    % remove triggers for words
    EEG = pop_editeventvals(EEG, 'delete', find(ismember([EEG.event.type], [triggers.A1, triggers.A2])));

    % remove triggers for all but the last 200 triggers (i.e., remove the practice images)
    EEG = pop_editeventvals(EEG, 'delete', fliplr( 1:numel([EEG.event.type]) ) > 200);
    
    % check the events make sense
    if sum(~ismember([EEG.event.type], triggers.image)) > 0
        fprintf('Unexpected trial types?\n')
        break
    end
    
    if numel({EEG.event.type})~=200
        fprintf('%g trial triggers detected?\n',  numel({EEG.event.type}))
        break
    end
    
    % add the trials' onsets, offsets, durations, and triggers to the behavioural data
    beh.event = zeros(size(beh, 1), 1);
    beh.latency = zeros(size(beh, 1), 1);
    for row_nr = 1:size(beh, 1)
        cond_i = beh.condition(row_nr);
        beh.event(row_nr) = triggers.(cond_i{:});
        beh.latency(row_nr) = EEG.event(row_nr).latency;
        beh.offset(row_nr) = EEG.event(row_nr).offset;
        beh.duration(row_nr) = EEG.event(row_nr).duration;
        beh.duration_ms(row_nr) = (EEG.event(row_nr).duration * 1000/EEG.srate) - 500;  % minus 500 as event timer starts at word presentation, but rt timer starts once word turns green
    end
    
    % record trial numbers in EEG.event
    for row_nr = 1:size(beh, 1)
        EEG.event(row_nr).trl_nr = beh.trl_nr(row_nr);
    end
    
    %% Remove segments of data that fall outside of blocks
    
    % record block starts
    beh.is_block_start(1) = 1;
    for row_nr = 2:size(beh, 1)
        beh.is_block_start(row_nr) = beh.block_nr(row_nr) - beh.block_nr(row_nr-1) == 1;
    end
    % record block ends
    beh.is_block_end(size(beh, 1)) = 1;
    for row_nr = 1:(size(beh, 1)-1)
        beh.is_block_end(row_nr) = beh.block_nr(row_nr+1) - beh.block_nr(row_nr) == 1;
    end

    % record block boundaries (first start and last end point of each block, with 1 seconds buffer)
    beh.block_boundary = zeros(size(beh, 1), 1);
    for row_nr = 1:size(beh, 1)
        if beh.is_block_start(row_nr)
            beh.block_boundary(row_nr) = beh.latency(row_nr) - (EEG.srate * 1);
        elseif beh.is_block_end(row_nr)
            beh.block_boundary(row_nr) = beh.offset(row_nr) + (EEG.srate * 1);
        end
    end
    
    % get the boundary indices in required format (start1, end1; start2, end2; start3, end3)
    block_boundaries = reshape(beh.block_boundary(beh.block_boundary~=0), 2, [])';
    
    % remove anything outside of blocks
    EEG = pop_select(EEG, 'time', (block_boundaries / EEG.srate));
    
    %% Trial selection
    
    % include only correct responses
    beh_filt_acc_only = beh(beh.acc==1, :);
    excl_trials_incorr = size(beh, 1)-size(beh_filt_acc_only, 1);
    total_excl_trials_incorr(subject_nr) = excl_trials_incorr;
    fprintf('Lost %g trials to incorrect responses\n', excl_trials_incorr)
    
    % include only responses between 100 and 1500 ms
    beh_filt = beh_filt_acc_only(beh_filt_acc_only.rt<=1500, :);
    excl_trials_rt = size(beh_filt_acc_only, 1)-size(beh_filt, 1);
    total_excl_trials_rt(subject_nr) = excl_trials_rt;
    fprintf('Lost %g trials to RTs above 1500\n', excl_trials_rt)
    
    fprintf('Lost %g trials in total to behavioural data\n', size(beh, 1)-size(beh_filt, 1))
    
    % filter the events structure
    discarded_trls = beh.trl_nr(~ismember(beh.trl_nr, beh_filt.trl_nr));
    discarded_events_indices = [];  % (collect in a for loop, as [EEG.event.trl_nr] would remove missing data)
    for event_nr = 1:size(EEG.event, 2)
        if ismember(EEG.event(event_nr).trl_nr, discarded_trls)
            discarded_events_indices = [discarded_events_indices, event_nr];
        end
    end
    EEG = pop_editeventvals(EEG, 'delete', discarded_events_indices);
    
    % check the discarded trials are the expected length
    if numel(discarded_trls) ~= size(beh, 1)-size(beh_filt, 1)
        fprintf('Mismatch between behavioural data and EEG events in the number of trials to discard?')
        break
    end
    
    % check the sizes match
    if numel([EEG.event.trl_nr]) ~= size(beh_filt, 1)
        fprintf('Inconsistent numbers of trials between events structure and behavioural data after discarding trials?')
        break
    end
    
    % check the trl numbers match
    if any([EEG.event.trl_nr]' ~= beh_filt.trl_nr)
        fprintf('Trial IDs mmismatch between events structure and behavioural data after discarding trials?')
        break
    end
    
    %% Rereference, downsample, and filter
    
    % rereference
    EEG = pop_reref(EEG, []);
    
    % downsample
    EEG = pop_resample(EEG, 512);
    
    % filter
    % EEG = eeglab_butterworth(EEG, 0.5, 40, 4, 1:size(EEG.chanlocs, 2));  % preregistered filter
    EEG = eeglab_butterworth(EEG, 0.1, 40, 4, 1:size(EEG.chanlocs, 2));  % filter with lower highpass
    
    %% ICA
    
    % apply ASR
    %EEG_no_asr = EEG;
    %EEG = clean_asr(EEG, asr_sigma, [], [], [], [], [], [], [], [], 1024);  % The last number is available memory in mb, needed for reproducibility
    
    % ASR is not used in this exploratory analysis

    rng(3101)  % set seed for reproducibility
    EEG = pop_runica(EEG, 'icatype', 'fastica', 'approach', 'symm');

    % classify components with ICLabel
    EEG = iclabel(EEG);

    % store results for easy indexing
    icl_res = EEG.etc.ic_classification.ICLabel.classifications;
    icl_classes = EEG.etc.ic_classification.ICLabel.classes;
    
    % identify and remove artefact components
    artefact_comps = find(icl_res(:, strcmp(icl_classes, 'Eye')) >= icl_cutoff | icl_res(:, strcmp(icl_classes, 'Muscle')) >= icl_cutoff);
    fprintf('Removing %g artefact-related ICA components\n', numel(artefact_comps))
    n_bad_ica(subject_nr) = numel(artefact_comps);
    %EEG_no_iclabel = EEG;
    EEG = pop_subcomp(EEG, artefact_comps);
    
    %% Interpolate bad channels
    
    % give the original chanlocs structure so EEGLAB interpolates the missing electrode(s)
    if numel(bad_channels)>0
        EEG = pop_interp(EEG, ur_chanlocs);
    end
    
    %% Get sample level microvolts for exploratory analysis checking image ERPs
    
    disp('Getting sample-level results...')
    
    % resample to 256 Hz
    EEG_256 = pop_resample(EEG, 256);
    
    % get epochs of low-srate data
    EEG_epo_256 = pop_epoch(EEG_256, {triggers.image}, [-0.25, 1.8]);
    
    % remove baseline
    EEG_epo_256 = pop_rmbase(EEG_epo_256, [-200, 0]);
    
    % pre-allocate the table
    var_names = {'subj_id', 'stim_grp', 'resp_grp', 'item_nr', 'ch_name', 'time', 'uV'};
    var_types = {'string', 'string', 'string', 'double', 'string', 'double', 'double'};
    nrows = 64 * size(EEG_epo_256.times, 2) * size(beh_filt, 1);
    sample_res = table('Size',[nrows, numel(var_names)], 'VariableTypes',var_types, 'VariableNames',var_names);
    
    sample_res.subj_id = repmat(beh_filt.subj_id, 64*size(EEG_epo_256.times, 2), 1);
    sample_res.stim_grp = repmat(beh_filt.stim_grp, 64*size(EEG_epo_256.times, 2), 1);
    sample_res.resp_grp = repmat(beh_filt.resp_grp, 64*size(EEG_epo_256.times, 2), 1);
    
    % get the 64 channel eeg data as an array
    eeg_arr = EEG_epo_256.data(1:64, :, :);
    
    % a vector of all eeg data
    eeg_vec = squeeze(reshape(eeg_arr, 1, 1, []));
    
    % array and vector of the channel labels for each value in EEG.data
    channel_labels_arr = cell(size(eeg_arr));
    channel_label_lookup = {EEG_epo_256.chanlocs.labels};
    for chan_nr = 1:size(eeg_arr, 1)
        channel_labels_arr(chan_nr, :, :) = repmat(channel_label_lookup(chan_nr), size(channel_labels_arr, 2), size(channel_labels_arr, 3));
    end
    
    channel_labels_vec = squeeze(reshape(channel_labels_arr, 1, 1, []));
    
    % array and vector of the item numbers for each value in EEG.data
    times_arr = zeros(size(eeg_arr));
    times_lookup = EEG_epo_256.times;
    for time_idx = 1:size(eeg_arr, 2)
        times_arr(:, time_idx, :) = repmat(times_lookup(time_idx), size(times_arr, 1), size(times_arr, 3));
    end
    
    times_vec = squeeze(reshape(times_arr, 1, 1, []));
    
    % array and vector of the trial numbers
    trials_arr = zeros(size(eeg_arr));
    trials_lookup = beh_filt.item_nr;
    for trl_idx = 1:size(eeg_arr, 3)
        trials_arr(:, :, trl_idx) = repmat(trials_lookup(trl_idx), size(trials_arr, 1), size(trials_arr, 2));
    end
    
    trials_vec = squeeze(reshape(trials_arr, 1, 1, []));
    
    % store sample-level results in the table
    sample_res.ch_name = channel_labels_vec;
    sample_res.item_nr = trials_vec;
    sample_res.time = times_vec;
    sample_res.uV = eeg_vec;
    
    % look up and store some info about the trials
    trial_info_lookup = beh_filt(:, {'item_nr', 'condition', 'image', 'string'});
    sample_res = outerjoin(sample_res, trial_info_lookup, 'MergeKeys', true);
    
    % sort by time, channel, item_nr
    sample_res = sortrows(sample_res, {'time', 'ch_name', 'item_nr'});
    
    %% save the results
    
    disp('Saving results...')
    writetable(sample_res, fullfile('sample_data_picture', [subject_id, '.csv']));
    
end

fprintf('\nFinished preprocessing picture-word data!\n')

%% Functions

% custom function for applying a Butterworth filter to EEGLAB data
function EEG = eeglab_butterworth(EEG, low, high, order, chanind)
    fprintf('Applying Butterworth filter between %g and %g Hz (order of %g)\n', low, high, order)
    % create filter
    [b, a] = butter(order, [low, high]/(EEG.srate/2));
    % apply to data (requires transposition for filtfilt)
    data_trans = single(filtfilt(b, a, double(EEG.data(chanind, :)')));
    EEG.data(chanind, :) = data_trans';
end

% custom function for finding the closest timepoint in an EEG dataset
function [idx, closesttime] = eeglab_closest_time(EEG, time)
    dists = abs(EEG.times - time);
    idx = find(dists == min(dists));
    % in the unlikely case there are two equidistant times, select one randomly
    if numel(idx) > 1
        fprintf('Two equidistant times! Selecting one randomly.')
        idx = idx(randperm(numel(idx)));
        idx = idx(1);
    end
    closesttime = EEG.times(idx);
end