researchcut2segments.py

#!/usr/bin/python3
"""
created on Wed Jan 30 2018
author: Christian Olaf Haeusler

To Do:
    argparser
    filter out the narrator when MOVIE = True
"""
from collections import defaultdict
import os
from os.path import basename
from os.path import join as opj
from os.path import exists
import re
import sys

import pandas as pd

SEGMENTS_OFFSETS = (
    (0.00, 0.00),
    (886.00, 0.00),
    (1752.08, 0.08),  # third segment's start
    (2612.16, 0.16),
    (3572.20, 0.20),
    (4480.28, 0.28),
    (5342.36, 0.36),
    (6410.44, 0.44),  # last segment's start
    (7086.00, 0.00))  # movie's last time point
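
# note (illustrative, not part of the original header): each pair above gives a
# segment's start time and its timing offset, both in seconds; in this script
# only the start times are consumed (see segment_starts in the main program)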

# dictionaries that map, per run, a time point (in 2 s steps) to the audio
# offset (in ms) with respect to the audiovisual movie
# (forrestgump_researchcut_ger_mono.mkv) and the audio-description, respectively
AUDIO_AV_OFFSETS = {
    0: {0: 21.33},
    1: {0: 37.33,
        408: 21.33},
    2: {0: 69.33,
        199: 61.33},
    3: {0: 93.33,
        320: 101.33},
    4: {0: 109.33,
        401: 101.33},
    5: {0: 141.33},
    6: {0: 189.31,
        61: 181.31},
    7: {0: 205.33}}

AUDIO_AO_OFFSETS = {
    0: {0: 47.02},
    1: {0: 36.35,
        203: 47.02},
    2: {0: 87.02,
        199: 92.35},
    3: {0: 124.35,
        320: 132.35},
    4: {0: 105.69,
        401: 92.35},
    5: {0: 137.69,
        364: 167.02},
    6: {0: 201.67,
        61: 543.00},
    7: {0: -1422.31}}


def time_stamp_to_msec(t_stamp='01:50:34:01'):
    '''
    Input:
        time stamp (str) in format HH:MM:SS:Frame
    Output:
        time point in milliseconds (int)
    '''
    splitted_stamp = t_stamp.split(':')
    milliseconds = (int(splitted_stamp[0]) * 60 * 60 * 1000) +\
        (int(splitted_stamp[1]) * 60 * 1000) +\
        (int(splitted_stamp[2]) * 1000) +\
        (int(splitted_stamp[3]) * 40)

    return milliseconds


def msec_to_time_stamp(milliseconds=6634040):
    '''
    Input:
        a time point in milliseconds (int)
    Output:
        a time stamp (str) in format HH:MM:SS:Frame
    '''
    # convert in case the function was called from the command line with the
    # timing given as a string
    milliseconds = int(milliseconds)

    hours = (milliseconds / (60 * 60 * 1000))
    minutes = (milliseconds % (60 * 60 * 1000) / (60 * 1000))
    seconds = (milliseconds % (60 * 60 * 1000) % (60 * 1000) / 1000)
    frame = (milliseconds % (60 * 60 * 1000) % (60 * 1000) % (1000) // 40)

    time_stamp = '%02d:%02d:%02d:%02d' % (hours, minutes, seconds, frame)

    return time_stamp
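
# illustration (not part of the original script): the two converters above are
# inverses of each other, assuming the 25 fps frame rate implied by the 40 ms
# per frame used in the arithmetic, e.g.
#   time_stamp_to_msec('01:50:34:01')  ->  6634040
#   msec_to_time_stamp(6634040)        ->  '01:50:34:01'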


def get_run_number(starts, onset):
    '''returns the index of the run/segment that contains the given onset (s)
    '''
    for start in sorted(starts, reverse=True):
        if onset >= start:
            run = starts.index(start)
            break

    return run


def whole_anno_to_segments(seg_starts, run_nr, anno_time):
    '''
    "The position of an event from a movie annotation with respect to the
    cropped fMRI time series can now be determined by subtracting the
    start time of the respective segment as listed in Table 1"
    http://studyforrest.org/annotation_timing.html
    '''
    seg_time = anno_time - seg_starts[run_nr]

    return seg_time
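
# worked example (illustrative values, not from the script): with segment
# starts taken from SEGMENTS_OFFSETS, an annotated onset of 900.0 s falls into
# run 1 (get_run_number), because 886.00 <= 900.0 < 1752.08, and
# whole_anno_to_segments then yields 900.0 - 886.00 = 14.0 s within that run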


def fix_audio_movie_segments(AUDIO_AV_OFFSETS, run, uncorrected):
    '''corrects a segment's audio offset
    with respect to the unsegmented movie
    '''
    critical_time_points = sorted(AUDIO_AV_OFFSETS[run].keys(), reverse=True)
    for crit in critical_time_points:
        if uncorrected >= crit * 2.0:
            corrected = uncorrected + (AUDIO_AV_OFFSETS[run][crit] / 1000.0)
            break

    return corrected


def fix_audio_descr_segments(AUDIO_AO_OFFSETS, run, uncorrected):
    '''corrects a segment's audio offset
    with respect to the unsegmented audiobook
    '''
    critical_time_points = sorted(AUDIO_AO_OFFSETS[run].keys(), reverse=True)
    for crit in critical_time_points:
        if uncorrected >= crit * 2.0:
            corrected = uncorrected + (AUDIO_AO_OFFSETS[run][crit] / 1000.0)
            break

    return corrected
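
# worked example (illustrative values, not from the script): for run 1 and an
# uncorrected onset of 14.0 s, 14.0 lies below the critical point 408 * 2.0 s,
# so the offset stored at key 0 applies:
#   fix_audio_movie_segments(AUDIO_AV_OFFSETS, 1, 14.0) -> 14.0 + 37.33 / 1000.0
#   fix_audio_descr_segments(AUDIO_AO_OFFSETS, 1, 14.0) -> 14.0 + 36.35 / 1000.0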


def write_segmented_annos(infilename, stimulus, run_dict, out_dir):
    '''writes one annotation file (TSV) per run/segment
    '''
    basefilename = basename(infilename)[:-4]
    outdir = opj(out_dir, stimulus)
    if not exists(outdir):
        os.makedirs(outdir)

    for run in sorted(run_dict.keys()):
        outname = opj(out_dir, stimulus, '{}_run-{}_events.tsv'.format(
            basefilename,
            run + 1))

        pd.DataFrame.from_records(
            run_dict[run],
            columns=run_dict[run][0].dtype.names).to_csv(
                outname,
                sep='\t',
                index=False,
                encoding='utf-8')


#### main program #####
if __name__ == "__main__":
    # constants #
    infile = sys.argv[1]
    annotated_time = sys.argv[2]
    target_time = sys.argv[3]
    outdir = sys.argv[4]

    # with launch_ipdb_on_exception():
    # read the annotation file
    anno = pd.read_csv(infile, sep='\t', encoding='utf-8').to_records(index=False)
    segment_starts = [start for start, offset in SEGMENTS_OFFSETS]

    run_events = defaultdict(list)
    for row in anno:
        # get the run number
        run = get_run_number(segment_starts, row['onset'])
        # convert the timings of a continuous annotation to timings
        # with respect to the start of the corresponding segment
        onset_in_seg = whole_anno_to_segments(
            segment_starts,
            run,
            float(row['onset']))

        # correct for the stimulus used to annotate the audiotrack
        if annotated_time == 'aomovie':
            # first, correct for the offset between the (unshifted) audio
            # description and the audiovisual movie; it turned out this offset
            # varies by +/- one frame (40 ms) around 0 across the course of
            # the whole stimulus
            onset_in_seg -= 0.000
            # second, correct for the offset between the whole stimulus
            # (audiovisual movie or audio-description) and its segments
            if target_time == 'avmovie':
                onset_in_seg = fix_audio_movie_segments(
                    AUDIO_AV_OFFSETS,
                    run,
                    onset_in_seg)
            elif target_time == 'aomovie':
                onset_in_seg = fix_audio_descr_segments(
                    AUDIO_AO_OFFSETS,
                    run,
                    onset_in_seg)
            else:
                raise ValueError('Unknown time label %s' % target_time)
        elif annotated_time == 'avmovie':
            # nothing to correct for
            pass

        row['onset'] = round(onset_in_seg, 3)
        row['duration'] = round(row['duration'], 3)
        # append the corrected row to its run
        run_events[run].append(row)

    write_segmented_annos(infile, target_time, run_events, outdir)
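
# example invocation (hypothetical file name; the annotation TSV needs 'onset'
# and 'duration' columns):
#   ./researchcut2segments.py some_annotation.tsv aomovie avmovie events/
# i.e. input file, the stimulus the timings were annotated on ('aomovie' or
# 'avmovie'), the stimulus timeline the output should refer to, and the
# output directory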