  1. #!/usr/bin/env python3
  2. '''
  3. To Do:
  4. '''
  5. from collections import defaultdict
  6. from glob import glob
  7. import argparse
  8. import csv
  9. import os.path
  10. # a whiteliste just to check for errors in the annotation
  11. DESCRNOUNS = [
  12. 'body', 'bodypart',
  13. 'face', 'head',
  14. 'female', 'females', 'fname',
  15. 'male', 'males', 'mname',
  16. 'persons',
  17. 'setting_new', 'setting_rec',
  18. 'geo', 'geo-room',
  19. 'object', 'objects', 'furniture',
  20. 'time', '-', '+', '++'
  21. ]
  22. # dictionary with the used events and the mapping to the condition they
  23. # belong to; not used: 'time', '-', '+', '++'
  24. DESCRNOUNS = {
  25. 'body': 'body',
  26. 'bodypart': 'bpart',
  27. 'face': 'fahead',
  28. 'head': 'fahead',
  29. 'fname': 'sex_f',
  30. 'female': 'sex_f',
  31. 'females': 'sex_f',
  32. 'mname': 'sex_m',
  33. 'male': 'sex_m',
  34. 'males': 'sex_m',
  35. 'person': 'sex_u',
  36. 'persons': 'sex_u',
  37. 'setting_new': 'se_new',
  38. 'setting_rec': 'se_old',
  39. 'geo': 'geo',
  40. 'geo-room': 'groom',
  41. 'object': 'obj',
  42. 'objects': 'obj',
  43. 'furniture': 'furn'
  44. }
  45. def parse_arguments():
  46. '''
  47. '''
  48. parser = argparse.ArgumentParser(
  49. description='''converts annotated events to
  50. event files to be used in FSL''')
  51. parser.add_argument('-ind',
  52. default='events/segments/aomovie',
  53. help='''directory that contains the segmented
  54. annotation; e.g. 'events/segments/aomovie' ''')
  55. parser.add_argument('-inp',
  56. default='fg_rscut_ad_ger_speech_tagged_run-*.tsv',
  57. help='''input pattern of the segmented
  58. annotation files ''')
  59. parser.add_argument('-outd',
  60. default='events/onsets',
  61. help='''output directory; e.g. 'events/onsets' ''')
  62. args = parser.parse_args()
  63. inDir = args.ind
  64. inPat = args.inp
  65. outDir = args.outd
  66. return inDir, inPat, outDir
  67. def get_anno_segments(directory, fname_pattern):
  68. '''
  69. '''
  70. path_pattern = os.path.join(directory, fname_pattern)
  71. anno_pathes = glob(path_pattern)
  72. return sorted(anno_pathes)
  73. def get_run_number(path):
  74. '''
  75. '''
  76. fname = os.path.basename(path)
  77. run = fname.split('run-')[1].split('_events')[0]
  78. return run
  79. def read_n_clean(path):
  80. '''
  81. '''
  82. with open(path, 'r') as csvfile:
  83. all_rows = csv.reader(csvfile, delimiter='\t')
  84. # skip the headers
  85. next(all_rows, None)
  86. # put files' content into a list
  87. anno = []
  88. for row in all_rows:
  89. # ignore lines with whole sentences,
  90. # and onsets with an values smaller than 0
  91. # (which is, at the beginnign of the last run, an result of
  92. # timing corrections made by researchcut2segments.py
  93. # in data speech annotation data paper)
  94. if float(row[0]) < 0:
  95. continue
  96. # convert onset from str to float
  97. row[0] = float(row[0])
  98. # convert duration to offset
  99. row[1] = round(float(row[1]), 3)
  100. if ';' in row[9]:
  101. row[9] = row[9].split(';')[0]
  102. # choose columns for onset, duration, text, tag, and noun
  103. # (dropping person, pos, dep, lemma, stop, word vector)
  104. anno.append([row[0], row[1], row[3], row[4], row[5], row[9]])
  105. return anno
  106. def extract_descr_nouns(row, run, events_dict):
  107. '''
  108. '''
  109. timing = row[0:2]
  110. descrEntry = row[5]
  111. # take the first category if column contains multiple (seperator=';')
  112. if ';' in descrEntry:
  113. category = descrEntry.split(';')[0]
  114. else:
  115. category = descrEntry
  116. if category not in DESCRNOUNS:
  117. print(category, 'is a unknown localizer')
  118. else:
  119. # populate the dict
  120. events_dict[run][category].append(timing)
  121. return events_dict
  122. def convert_noun2cond(events_dict):
  123. '''
  124. '''
  125. conds_dict = defaultdict(int)
  126. # loop over the events dict
  127. for run in sorted(events_dict.keys()):
  128. # make keys from run number with an empty default dict as value
  129. conds_dict[run] = defaultdict(list)
  130. for category in sorted(events_dict[run].keys()):
  131. timings = events_dict[run][category]
  132. # abbreviations of the categories to be used for the filenames
  133. # pool the events to conditions by using the mapping in the
  134. # dictionary DESCRNOUNS
  135. condition = DESCRNOUNS[category]
  136. conds_dict[run][condition].extend(timings)
  137. # sort the pooled lists such that event timings are in temporal order
  138. for run in conds_dict.keys():
  139. for cond in conds_dict[run].keys():
  140. conds_dict[run][cond] = sorted(conds_dict[run][cond])
  141. return conds_dict
  142. def count_events_conds(events_dict):
  143. '''
  144. '''
  145. all_segments_dict = defaultdict(int)
  146. # print events per condition per run
  147. for run in sorted(events_dict.keys()):
  148. print('\nrun %s:' % run)
  149. for cond in sorted(events_dict[run].keys()):
  150. count = len(events_dict[run][cond])
  151. if count > 5:
  152. print('%s\t%s' % (cond, count))
  153. else:
  154. print('%s\t%s\t###' % (cond, count))
  155. # add the event count of the current run to the dict for the
  156. # whole stimulus
  157. all_segments_dict[cond] += count
  158. print('\n\nwhole stimulus:')
  159. cond_count = [[count, cond] for cond, count in all_segments_dict.items()]
  160. cond_count.sort(key=lambda x: int(x[0]), reverse=True)
  161. for count, cond in cond_count:
  162. print('%s\t%s' % (cond, count))
  163. return None
  164. def write_event_files(conds_dict, out_dir):
  165. '''
  166. '''
  167. print('\nWriting onset files')
  168. for run in conds_dict.keys():
  169. print('run', run)
  170. for cond in conds_dict[run].keys():
  171. out_fname = os.path.join(out_dir,
  172. 'run-%i' % run,
  173. cond + '.txt')
  174. path = os.path.dirname(out_fname)
  175. if not os.path.exists(path):
  176. os.makedirs(path)
  177. # write lines in FSL's EV3 format
  178. lines = ['%.3f\t%.3f\t1\n' % (timing[0], timing[1]) for timing in conds_dict[run][cond]]
  179. outfile = open(out_fname, 'w')
  180. outfile.writelines(lines)
  181. outfile.close()
  182. return None
  183. if __name__ == "__main__":
  184. inDir, inPat, outDir = parse_arguments()
  185. # build the name of the output directory from the input directory
  186. # handles if input has timing of audio-description or audio-visual movie
  187. outDir = os.path.join(outDir, os.path.basename(inDir))
  188. # search for files that contain the desired annotation
  189. annoSegments = get_anno_segments(inDir, inPat)
  190. # initialize the dicts for the tags drawn from their columns
  191. descrEvents = {}
  192. # looper over the segmented annotation
  193. for segment in annoSegments:
  194. # read annotation and convert duration to offset on the fly
  195. lastWordEnd = 0.0
  196. anno = read_n_clean(segment)
  197. run = int(get_run_number(segment))
  198. # in the dict for every run, make a new dict with keys=conditions
  199. descrEvents[run] = {category:[] for category in DESCRNOUNS}
  200. # EXTRACTING
  201. # loop over the rows in the current segment's annotation
  202. for row in anno:
  203. # extract events for 'descriptive nouns'
  204. if row[5] in DESCRNOUNS.keys():
  205. descrEvents = extract_descr_nouns(row, run, descrEvents)
  206. # convert annotated nouns to regressors
  207. descrConditions = convert_noun2cond(descrEvents)
  208. # counts for annotated categories
  209. print('Counts per Events:')
  210. count_events_conds(descrEvents)
  211. # counts for the regressors build from (pooled) categories
  212. print('\n\nCounts per regressor:')
  213. print('Descriptive Nouns:')
  214. count_events_conds(descrConditions)
  215. # write data of current run to file
  216. write_event_files(descrConditions, outDir)