
add script that creates event files for FSL from the segmented speech annotation

Christian O. Häusler 4 years ago
parent
commit
8404a00f09
1 file changed with 275 additions and 0 deletions
1. code/speechanno2onsets.py +275 −0

+ 275 - 0
code/speechanno2onsets.py

@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+'''Creates event files for FSL from the segmented speech annotation.
+
+Usage (defaults shown):
+    speechanno2onsets.py -ind events/segments/aomovie \
+        -inp 'fg_rscut_ad_ger_speech_tagged_run-*.tsv' \
+        -outd events/onsets
+'''
+from collections import defaultdict
+from glob import glob
+import argparse
+import csv
+import os.path
+
+
+# a whitelist of all annotated categories, just to check for errors in the
+# annotation; not used programmatically
+DESCRNOUNS_ALL = [
+    'body', 'bodypart',
+    'face', 'head',
+    'female', 'females', 'fname',
+    'male', 'males', 'mname',
+    'persons',
+    'setting_new', 'setting_rec',
+    'geo', 'geo-room',
+    'object', 'objects', 'furniture',
+    'time', '-', '+', '++'
+]
+
+# dictionary mapping the used events to the conditions they belong to;
+# not used: 'time', '-', '+', '++'
+DESCRNOUNS = {
+    'body': 'body',
+    'bodypart': 'bpart',
+    'face': 'fahead',
+    'head': 'fahead',
+    'fname': 'sex_f',
+    'female': 'sex_f',
+    'females': 'sex_f',
+    'mname': 'sex_m',
+    'male': 'sex_m',
+    'males': 'sex_m',
+    'person': 'sex_u',
+    'persons': 'sex_u',
+    'setting_new': 'se_new',
+    'setting_rec': 'se_old',
+    'geo': 'geo',
+    'geo-room': 'groom',
+    'object': 'obj',
+    'objects': 'obj',
+    'furniture': 'furn'
+}
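+# for illustration: the raw categories 'female', 'females', and 'fname' are
+# all pooled into the single condition 'sex_f', i.e.
+# DESCRNOUNS['females'] == 'sex_f'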
+
+def parse_arguments():
+    '''Parses the command line arguments and returns the input directory,
+    the input filename pattern, and the output directory.
+    '''
+    parser = argparse.ArgumentParser(
+        description='''converts annotated events to
+        event files to be used in FSL''')
+    parser.add_argument('-ind',
+                        default='events/segments/aomovie',
+                        help='''directory that contains the segmented
+                        annotation; e.g. 'events/segments/aomovie' ''')
+
+    parser.add_argument('-inp',
+                        default='fg_rscut_ad_ger_speech_tagged_run-*.tsv',
+                        help='''input pattern of the segmented
+                        annotation files ''')
+
+    parser.add_argument('-outd',
+                        default='events/onsets',
+                        help='''output directory; e.g. 'events/onsets' ''')
+
+    args = parser.parse_args()
+
+    inDir = args.ind
+    inPat = args.inp
+    outDir = args.outd
+
+    return inDir, inPat, outDir
+
+
+def get_anno_segments(directory, fname_pattern):
+    '''Returns the sorted paths of the segmented annotation files
+    matching the given pattern.
+    '''
+    path_pattern = os.path.join(directory, fname_pattern)
+    anno_pathes = glob(path_pattern)
+
+    return sorted(anno_pathes)
+
+
+def get_run_number(path):
+    '''Extracts the run number from a segment's filename.
+    '''
+    fname = os.path.basename(path)
+    # take what follows 'run-', up to the next underscore or the extension
+    run = fname.split('run-')[1].split('_')[0].split('.')[0]
+
+    return run
+
+
+def read_n_clean(path):
+    '''Reads a segment's annotation and returns the relevant columns
+    as a list of rows.
+    '''
+    with open(path, 'r') as csvfile:
+        all_rows = csv.reader(csvfile, delimiter='\t')
+
+        # skip the header row
+        next(all_rows, None)
+
+        # put the file's content into a list
+        anno = []
+        for row in all_rows:
+            # ignore rows with an onset smaller than 0 (at the beginning
+            # of the last run, such onsets are a result of timing
+            # corrections made by researchcut2segments.py in the speech
+            # annotation dataset)
+            if float(row[0]) < 0:
+                continue
+
+            # convert the onset from str to float
+            row[0] = float(row[0])
+            # convert the duration to float and round to 3 decimals
+            row[1] = round(float(row[1]), 3)
+
+            # take only the first category if the column contains
+            # multiple (separator ';')
+            if ';' in row[9]:
+                row[9] = row[9].split(';')[0]
+
+            # choose columns for onset, duration, text, tag, and noun
+            # (dropping person, pos, dep, lemma, stop, word vector)
+            anno.append([row[0], row[1], row[3], row[4], row[5], row[9]])
+
+    return anno
+
+
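+# note: read_n_clean() returns six columns per row with the noun category in
+# the last position, so the functions below address it as row[5] (column 9
+# of the raw annotation)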
+def extract_descr_nouns(row, run, events_dict):
+    '''Appends a row's event timing to the list of the row's category
+    in the events dictionary.
+    '''
+    timing = row[0:2]
+    descrEntry = row[5]
+    # take the first category if the column contains multiple (separator ';')
+    if ';' in descrEntry:
+        category = descrEntry.split(';')[0]
+    else:
+        category = descrEntry
+
+    if category not in DESCRNOUNS:
+        print(category, 'is an unknown localizer')
+    else:
+        # populate the dict
+        events_dict[run][category].append(timing)
+
+    return events_dict
+
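+# e.g. extract_descr_nouns() lets events_dict[1]['female'] grow into the
+# list of [onset, duration] pairs (in seconds) of all 'female' events in
+# run 1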
+
+def convert_noun2cond(events_dict):
+    '''Pools the annotated categories into conditions (regressors)
+    following the mapping in DESCRNOUNS.
+    '''
+    conds_dict = {}
+
+    # loop over the events dict
+    for run in sorted(events_dict.keys()):
+        # make keys from run number with an empty default dict as value
+        conds_dict[run] = defaultdict(list)
+
+        for category in sorted(events_dict[run].keys()):
+            timings = events_dict[run][category]
+
+            # pool the events into conditions using the mapping in the
+            # dictionary DESCRNOUNS; the conditions' abbreviated names are
+            # later used for the filenames
+            condition = DESCRNOUNS[category]
+            conds_dict[run][condition].extend(timings)
+
+    # sort the pooled lists such that event timings are in temporal order
+    for run in conds_dict.keys():
+        for cond in conds_dict[run].keys():
+            conds_dict[run][cond] = sorted(conds_dict[run][cond])
+
+    return conds_dict
+
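+# convert_noun2cond() returns a mapping run -> condition -> temporally
+# sorted [onset, duration] pairs, e.g. conds_dict[1]['sex_f'] might look
+# like [[12.5, 0.48], [31.2, 0.31], ...] (illustrative values)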
+
+def count_events_conds(events_dict):
+    '''Prints the number of events per condition, per run and for the
+    whole stimulus.
+    '''
+    all_segments_dict = defaultdict(int)
+
+    # print events per condition per run
+    for run in sorted(events_dict.keys()):
+        print('\nrun %s:' % run)
+        for cond in sorted(events_dict[run].keys()):
+            count = len(events_dict[run][cond])
+            # mark conditions with 5 or fewer events with '###'
+            if count > 5:
+                print('%s\t%s' % (cond, count))
+            else:
+                print('%s\t%s\t###' % (cond, count))
+
+            # add the event count of the current run to the dict for the
+            # whole stimulus
+            all_segments_dict[cond] += count
+
+    print('\n\nwhole stimulus:')
+    cond_count = [[count, cond] for cond, count in all_segments_dict.items()]
+    cond_count.sort(key=lambda x: x[0], reverse=True)
+    for count, cond in cond_count:
+        print('%s\t%s' % (cond, count))
+
+    return None
+
+
+def write_event_files(conds_dict, out_dir):
+    '''Writes one file per condition and run in FSL's EV3 format
+    (onset, duration, weight).
+    '''
+    print('\nWriting onset files')
+
+    for run in conds_dict.keys():
+        print('run', run)
+        for cond in conds_dict[run].keys():
+
+            out_fname = os.path.join(out_dir,
+                                     'run-%i' % run,
+                                     cond + '.txt')
+
+            path = os.path.dirname(out_fname)
+            os.makedirs(path, exist_ok=True)
+
+            # write lines in FSL's EV3 format
+            lines = ['%.3f\t%.3f\t1\n' % (timing[0], timing[1])
+                     for timing in conds_dict[run][cond]]
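+            # e.g. an event starting at 12.5 s and lasting 0.48 s yields
+            # the line '12.500\t0.480\t1' (onset, duration, weight;
+            # illustrative values)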
+
+            with open(out_fname, 'w') as outfile:
+                outfile.writelines(lines)
+
+    return None
+
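+# with the default arguments, write_event_files() produces one EV3 file per
+# condition and run, e.g. events/onsets/aomovie/run-1/sex_f.txt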
+
+if __name__ == "__main__":
+    inDir, inPat, outDir = parse_arguments()
+    # build the name of the output directory from the input directory;
+    # this handles whether the input has the timing of the audio-description
+    # or the audio-visual movie
+    outDir = os.path.join(outDir, os.path.basename(inDir))
+    # search for files that contain the desired annotation
+    annoSegments = get_anno_segments(inDir, inPat)
+
+    # initialize the dict for the events drawn from the noun column
+    descrEvents = {}
+
+    # loop over the segmented annotation files
+    for segment in annoSegments:
+        # read the annotation; durations are rounded on the fly
+        anno = read_n_clean(segment)
+        run = int(get_run_number(segment))
+
+        # in the dict, make a new sub-dict for every run with the
+        # categories as keys
+        descrEvents[run] = {category: [] for category in DESCRNOUNS}
+
+        # EXTRACTING
+        # loop over the rows in the current segment's annotation
+        for row in anno:
+            # extract events for 'descriptive nouns'
+            if row[5] in DESCRNOUNS:
+                descrEvents = extract_descr_nouns(row, run, descrEvents)
+
+    # convert annotated nouns to regressors
+    descrConditions = convert_noun2cond(descrEvents)
+
+    # counts for the annotated categories
+    print('Counts per category:')
+    count_events_conds(descrEvents)
+
+    # counts for the regressors built from the (pooled) categories
+    print('\n\nCounts per regressor:')
+    print('Descriptive Nouns:')
+    count_events_conds(descrConditions)
+
+    # write the event timings of all runs to files
+    write_event_files(descrConditions, outDir)