
add script that creates event files for FSL from the segmented speech annotation

Christian O. Häusler 4 years ago
parent
commit
8404a00f09
1 file changed with 275 additions and 0 deletions
1. code/speechanno2onsets.py +275 −0

+ 275 - 0
code/speechanno2onsets.py

@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+'''Creates event files for FSL from the segmented speech annotation.
+
+Usage (defaults shown):
+    speechanno2onsets.py -ind events/segments/aomovie \
+        -inp 'fg_rscut_ad_ger_speech_tagged_run-*.tsv' \
+        -outd events/onsets
+'''
+from collections import defaultdict
+from glob import glob
+import argparse
+import csv
+import os.path
+
+
+# a whitelist of all annotated categories, just to check for errors in the
+# annotation; not used programmatically
+DESCRNOUNS_ALL = [
+    'body', 'bodypart',
+    'face', 'head',
+    'female', 'females', 'fname',
+    'male', 'males', 'mname',
+    'persons',
+    'setting_new', 'setting_rec',
+    'geo', 'geo-room',
+    'object', 'objects', 'furniture',
+    'time', '-', '+', '++'
+]
+
+# dictionary mapping the used events to the conditions they belong to;
+# not used: 'time', '-', '+', '++'
+DESCRNOUNS = {
+    'body': 'body',
+    'bodypart': 'bpart',
+    'face': 'fahead',
+    'head': 'fahead',
+    'fname': 'sex_f',
+    'female': 'sex_f',
+    'females': 'sex_f',
+    'mname': 'sex_m',
+    'male': 'sex_m',
+    'males': 'sex_m',
+    'person': 'sex_u',
+    'persons': 'sex_u',
+    'setting_new': 'se_new',
+    'setting_rec': 'se_old',
+    'geo': 'geo',
+    'geo-room': 'groom',
+    'object': 'obj',
+    'objects': 'obj',
+    'furniture': 'furn'
+}
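+# for illustration: the raw categories 'female', 'females', and 'fname' are
+# all pooled into the single condition 'sex_f', i.e.
+# DESCRNOUNS['females'] == 'sex_f'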
+
+def parse_arguments():
+    '''Parses the command line arguments and returns the input directory,
+    the input filename pattern, and the output directory.
+    '''
+    parser = argparse.ArgumentParser(
+        description='''converts annotated events to
+        event files to be used in FSL''')
+    parser.add_argument('-ind',
+                        default='events/segments/aomovie',
+                        help='''directory that contains the segmented
+                        annotation; e.g. 'events/segments/aomovie' ''')
+
+    parser.add_argument('-inp',
+                        default='fg_rscut_ad_ger_speech_tagged_run-*.tsv',
+                        help='''input pattern of the segmented
+                        annotation files ''')
+
+    parser.add_argument('-outd',
+                        default='events/onsets',
+                        help='''output directory; e.g. 'events/onsets' ''')
+
+    args = parser.parse_args()
+
+    inDir = args.ind
+    inPat = args.inp
+    outDir = args.outd
+
+    return inDir, inPat, outDir
+
+
+def get_anno_segments(directory, fname_pattern):
+    '''Returns the sorted paths of the segmented annotation files
+    matching the given pattern.
+    '''
+    path_pattern = os.path.join(directory, fname_pattern)
+    anno_pathes = glob(path_pattern)
+
+    return sorted(anno_pathes)
+
+
+def get_run_number(path):
+    '''Extracts the run number from a segment's filename.
+    '''
+    fname = os.path.basename(path)
+    # take what follows 'run-', up to the next underscore or the extension
+    run = fname.split('run-')[1].split('_')[0].split('.')[0]
+
+    return run
+
+
+def read_n_clean(path):
+    '''Reads a segment's annotation and returns the relevant columns
+    as a list of rows.
+    '''
+    with open(path, 'r') as csvfile:
+        all_rows = csv.reader(csvfile, delimiter='\t')
+
+        # skip the header row
+        next(all_rows, None)
+
+        # put the file's content into a list
+        anno = []
+        for row in all_rows:
+            # ignore rows with an onset smaller than 0 (at the beginning
+            # of the last run, such onsets are a result of timing
+            # corrections made by researchcut2segments.py in the speech
+            # annotation dataset)
+            if float(row[0]) < 0:
+                continue
+
+            # convert the onset from str to float
+            row[0] = float(row[0])
+            # convert the duration to float and round to 3 decimals
+            row[1] = round(float(row[1]), 3)
+
+            # take only the first category if the column contains
+            # multiple (separator ';')
+            if ';' in row[9]:
+                row[9] = row[9].split(';')[0]
+
+            # choose columns for onset, duration, text, tag, and noun
+            # (dropping person, pos, dep, lemma, stop, word vector)
+            anno.append([row[0], row[1], row[3], row[4], row[5], row[9]])
+
+    return anno
+
+
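+# note: read_n_clean() returns six columns per row with the noun category in
+# the last position, so the functions below address it as row[5] (column 9
+# of the raw annotation)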
+def extract_descr_nouns(row, run, events_dict):
+    '''Appends a row's event timing to the list of the row's category
+    in the events dictionary.
+    '''
+    timing = row[0:2]
+    descrEntry = row[5]
+    # take the first category if the column contains multiple (separator ';')
+    if ';' in descrEntry:
+        category = descrEntry.split(';')[0]
+    else:
+        category = descrEntry
+
+    if category not in DESCRNOUNS:
+        print(category, 'is an unknown localizer')
+    else:
+        # populate the dict
+        events_dict[run][category].append(timing)
+
+    return events_dict
+
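+# e.g. extract_descr_nouns() lets events_dict[1]['female'] grow into the
+# list of [onset, duration] pairs (in seconds) of all 'female' events in
+# run 1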
+
+def convert_noun2cond(events_dict):
+    '''Pools the annotated categories into conditions (regressors)
+    following the mapping in DESCRNOUNS.
+    '''
+    conds_dict = {}
+
+    # loop over the events dict
+    for run in sorted(events_dict.keys()):
+        # make keys from run number with an empty default dict as value
+        conds_dict[run] = defaultdict(list)
+
+        for category in sorted(events_dict[run].keys()):
+            timings = events_dict[run][category]
+
+            # pool the events into conditions using the mapping in the
+            # dictionary DESCRNOUNS; the conditions' abbreviated names are
+            # later used for the filenames
+            condition = DESCRNOUNS[category]
+            conds_dict[run][condition].extend(timings)
+
+    # sort the pooled lists such that event timings are in temporal order
+    for run in conds_dict.keys():
+        for cond in conds_dict[run].keys():
+            conds_dict[run][cond] = sorted(conds_dict[run][cond])
+
+    return conds_dict
+
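+# convert_noun2cond() returns a mapping run -> condition -> temporally
+# sorted [onset, duration] pairs, e.g. conds_dict[1]['sex_f'] might look
+# like [[12.5, 0.48], [31.2, 0.31], ...] (illustrative values)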
+
+def count_events_conds(events_dict):
+    '''Prints the number of events per condition, per run and for the
+    whole stimulus.
+    '''
+    all_segments_dict = defaultdict(int)
+
+    # print events per condition per run
+    for run in sorted(events_dict.keys()):
+        print('\nrun %s:' % run)
+        for cond in sorted(events_dict[run].keys()):
+            count = len(events_dict[run][cond])
+            # mark conditions with 5 or fewer events with '###'
+            if count > 5:
+                print('%s\t%s' % (cond, count))
+            else:
+                print('%s\t%s\t###' % (cond, count))
+
+            # add the event count of the current run to the dict for the
+            # whole stimulus
+            all_segments_dict[cond] += count
+
+    print('\n\nwhole stimulus:')
+    cond_count = [[count, cond] for cond, count in all_segments_dict.items()]
+    cond_count.sort(key=lambda x: x[0], reverse=True)
+    for count, cond in cond_count:
+        print('%s\t%s' % (cond, count))
+
+    return None
+
+
+def write_event_files(conds_dict, out_dir):
+    '''Writes one file per condition and run in FSL's EV3 format
+    (onset, duration, weight).
+    '''
+    print('\nWriting onset files')
+
+    for run in conds_dict.keys():
+        print('run', run)
+        for cond in conds_dict[run].keys():
+
+            out_fname = os.path.join(out_dir,
+                                     'run-%i' % run,
+                                     cond + '.txt')
+
+            path = os.path.dirname(out_fname)
+            os.makedirs(path, exist_ok=True)
+
+            # write lines in FSL's EV3 format
+            lines = ['%.3f\t%.3f\t1\n' % (timing[0], timing[1])
+                     for timing in conds_dict[run][cond]]
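+            # e.g. an event starting at 12.5 s and lasting 0.48 s yields
+            # the line '12.500\t0.480\t1' (onset, duration, weight;
+            # illustrative values)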
+
+            with open(out_fname, 'w') as outfile:
+                outfile.writelines(lines)
+
+    return None
+
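+# with the default arguments, write_event_files() produces one EV3 file per
+# condition and run, e.g. events/onsets/aomovie/run-1/sex_f.txt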
+
+if __name__ == "__main__":
+    inDir, inPat, outDir = parse_arguments()
+    # build the name of the output directory from the input directory;
+    # this handles whether the input has the timing of the audio-description
+    # or the audio-visual movie
+    outDir = os.path.join(outDir, os.path.basename(inDir))
+    # search for files that contain the desired annotation
+    annoSegments = get_anno_segments(inDir, inPat)
+
+    # initialize the dict for the events drawn from the noun column
+    descrEvents = {}
+
+    # loop over the segmented annotation files
+    for segment in annoSegments:
+        # read the annotation; durations are rounded on the fly
+        anno = read_n_clean(segment)
+        run = int(get_run_number(segment))
+
+        # in the dict, make a new sub-dict for every run with the
+        # categories as keys
+        descrEvents[run] = {category: [] for category in DESCRNOUNS}
+
+        # EXTRACTING
+        # loop over the rows in the current segment's annotation
+        for row in anno:
+            # extract events for 'descriptive nouns'
+            if row[5] in DESCRNOUNS:
+                descrEvents = extract_descr_nouns(row, run, descrEvents)
+
+    # convert annotated nouns to regressors
+    descrConditions = convert_noun2cond(descrEvents)
+
+    # counts for the annotated categories
+    print('Counts per category:')
+    count_events_conds(descrEvents)
+
+    # counts for the regressors built from the (pooled) categories
+    print('\n\nCounts per regressor:')
+    print('Descriptive Nouns:')
+    count_events_conds(descrConditions)
+
+    # write the event timings of all runs to files
+    write_event_files(descrConditions, outDir)