# researchcut2segments.py
  1. #!/usr/bin/python3
  2. """
  3. created on Wed Jan 30 2018
  4. author: Christian Olaf Haeusler
  5. Parameters:
  6. ----------
  7. input:
  8. 'aomovie' means that input is an annotation of the AUDIOTRACK
  9. 'avmovie' means that input is an annotation of the MOVIE FRAMES
  10. """
  11. from collections import defaultdict
  12. import os
  13. from os.path import basename
  14. from os.path import join as opj
  15. from os.path import exists
  16. import re
  17. import sys
  18. import pandas as pd
  19. SEGMENTS_OFFSETS = (
  20. (0.00, 0.00),
  21. (886.00, 0.00),
  22. (1752.08, 0.08), # third segment's start
  23. (2612.16, 0.16),
  24. (3572.20, 0.20),
  25. (4480.28, 0.28),
  26. (5342.36, 0.36),
  27. (6410.44, 0.44), # last segment's start
  28. (7086.00, 0.00)) # movie's last time point
  29. # dictionaries with paired touples containing time (2sec steps) and offset
  30. # in respect to the audiovisual movie (forrestgump_researchcut_ger_mono.mkv)
  31. AUDIO_AV_OFFSETS = {
  32. 0: { 0: 21.33},
  33. 1: { 0: 37.33,
  34. 408: 21.33},
  35. 2: { 0: 69.33,
  36. 199: 61.33},
  37. 3: { 0: 93.33,
  38. 320: 101.33},
  39. 4: { 0: 109.33,
  40. 401: 101.33},
  41. 5: { 0: 141.33},
  42. 6: { 0: 189.31,
  43. 61: 181.31},
  44. 7: { 0: 205.33}}
  45. AUDIO_AO_OFFSETS = {
  46. 0: { 0: 47.02},
  47. 1: { 0: 36.35,
  48. 203: 47.02},
  49. 2: { 0: 87.02,
  50. 199: 92.35},
  51. 3: { 0: 124.35,
  52. 320: 132.35},
  53. 4: { 0: 105.69,
  54. 401: 92.35},
  55. 5: { 0: 137.69,
  56. 364: 167.02},
  57. 6: { 0: 201.67,
  58. 61: 543.00},
  59. 7: { 0:-1422.31}}
  60. def time_stamp_to_msec(t_stamp='01:50:34:01'):
  61. '''
  62. Input:
  63. time stamp (str) in format HH:MM:SS:Frame
  64. Output:
  65. time point in milliseconds (int)
  66. '''
  67. splitted_stamp = t_stamp.split(':')
  68. milliseconds = (int(splitted_stamp[0]) * 60 * 60 * 1000) +\
  69. (int(splitted_stamp[1]) * 60 * 1000) +\
  70. (int(splitted_stamp[2]) * 1000) +\
  71. (int(splitted_stamp[3]) * 40)
  72. return milliseconds
  73. def msec_to_time_stamp(milliseconds=6634040):
  74. '''
  75. Input:
  76. a time point in milliseconds (int)
  77. Output:
  78. a time stamp (str) in format HH:MM:SS:Frame
  79. '''
  80. # convert in case function was called from the command line with the
  81. # timing given as a string
  82. milliseconds = int(milliseconds)
  83. hours = (milliseconds / (60 * 60 * 1000))
  84. minutes = (milliseconds % (60 * 60 * 1000) / (60 * 1000))
  85. seconds = (milliseconds % (60 * 60 * 1000) % (60 * 1000) / 1000)
  86. frame = (milliseconds % (60 * 60 * 1000) % (60 * 1000) % (1000) // 40)
  87. time_stamp = '%02d:%02d:%02d:%02d' % (hours, minutes, seconds, frame)
  88. return time_stamp
  89. def get_run_number(starts, onset):
  90. '''
  91. '''
  92. for start in sorted(starts, reverse=True):
  93. if onset >= start:
  94. run = starts.index(start)
  95. break
  96. return run
  97. def whole_anno_to_segments(seg_starts, run_nr, anno_time):
  98. '''
  99. "The position of an event from a movie annotation with respect to the
  100. cropped fMRI time series can now be determined by substracting the
  101. start time of the respective segment as listed in Table 1"
  102. http://studyforrest.org/annotation_timing.html
  103. '''
  104. seg_time = anno_time - seg_starts[run_nr]
  105. return seg_time
  106. def fix_audio_movie_segments(AUDIO_AV_OFFSETS, run, uncorrected):
  107. '''corrects the segments' audio offsets
  108. in respect to the unsegmented movie
  109. '''
  110. critical_time_points = sorted(AUDIO_AV_OFFSETS[run].keys(), reverse=True)
  111. for crit in critical_time_points:
  112. if uncorrected >= crit * 2.0:
  113. corrected = uncorrected + (AUDIO_AV_OFFSETS[run][crit] / 1000.0)
  114. break
  115. return corrected
  116. def fix_audio_descr_segments(AUDIO_AO_OFFSETS, run, uncorrected):
  117. '''corrects the segments' audio offsets
  118. in respect to the unsegmented audiobook
  119. '''
  120. critical_time_points = sorted(AUDIO_AO_OFFSETS[run].keys(), reverse=True)
  121. for crit in critical_time_points:
  122. if uncorrected >= crit * 2.0:
  123. corrected = uncorrected + (AUDIO_AO_OFFSETS[run][crit] / 1000.0)
  124. break
  125. return corrected
  126. def write_segmented_annos(infilename, stimulus, run_dict, out_dir):
  127. '''
  128. '''
  129. basefilename = basename(infilename)[:-4]
  130. # rename the speech annotation (since it did not run through the importer
  131. # script that renamed e.g. structure.csv to locations.tsv
  132. # name it 'speech_ad' because the narrator will also be in it if
  133. # segmented into timings of the movie (-> e.g. for control contrasts)
  134. print(basefilename)
  135. if basefilename == 'fg_rscut_ad_ger_speech_tagged':
  136. basefilename = 'speech_ad'
  137. outdir = opj(out_dir, stimulus)
  138. if not exists(outdir):
  139. os.makedirs(outdir)
  140. for run in sorted(run_dict.keys()):
  141. outname = opj(out_dir, stimulus, '{}_run-{}_events.tsv'.format(
  142. basefilename,
  143. run + 1))
  144. pd.DataFrame.from_records(
  145. run_dict[run],
  146. columns=run_dict[run][0].dtype.names).to_csv(
  147. outname,
  148. sep='\t',
  149. index=False,
  150. encoding='utf-8')
  151. #### main program #####
  152. if __name__ == "__main__":
  153. # constants #
  154. infile = sys.argv[1]
  155. annotated_time = sys.argv[2]
  156. target_time = sys.argv[3]
  157. outdir = sys.argv[4]
  158. # with launch_ipdb_on_exception():
  159. # read the annotation file
  160. anno = pd.read_csv(infile, sep='\t', encoding='utf-8').to_records(index=False)
  161. segment_starts = [start for start, offset in SEGMENTS_OFFSETS]
  162. run_events = defaultdict(list)
  163. for row in anno:
  164. # get the run number
  165. run = get_run_number(segment_starts, row['onset'])
  166. # convert the timings of a continuous annotation
  167. # to timings in respect to the start of the corresponding segment
  168. onset_in_seg = whole_anno_to_segments(
  169. segment_starts,
  170. run,
  171. float(row['onset']))
  172. # correct for the stimulus used to annotate the audiotrack
  173. # 'aomovie' means that input is an annotation of the AUDIOTRACK
  174. # 'avmovie' means that input is an annotation of the MOVIE FRAMES
  175. if annotated_time == 'aomovie':
  176. # first, correct for the offset between the (unshifted) audio
  177. # description and audiovisual movie
  178. # it turned out the offset is varying +/- one frame (40 ms) around 0
  179. # across the course of the whole stimuli
  180. onset_in_seg -= 0.000
  181. # second, correct for the offset between whole stimulus
  182. # (audiovisual or audio-description) and its segments
  183. if target_time == 'avmovie':
  184. onset_in_seg = fix_audio_movie_segments(
  185. AUDIO_AV_OFFSETS,
  186. run,
  187. onset_in_seg)
  188. elif target_time == 'aomovie':
  189. onset_in_seg = fix_audio_descr_segments(
  190. AUDIO_AO_OFFSETS,
  191. run,
  192. onset_in_seg)
  193. else:
  194. raise ValueError('Unknown time label %s', target_time)
  195. elif annotated_time == 'avmovie':
  196. # all splendid for now
  197. pass
  198. row['onset'] = round(onset_in_seg, 3)
  199. row['duration'] = round(row['duration'], 3)
  200. # append that shit
  201. run_events[run].append(row)
  202. write_segmented_annos(infile, target_time, run_events, outdir)