6 years ago · 4fbaf7e895
--- a/code/researchcut2segments.py
+++ b/code/researchcut2segments.py
@@ -1,12 +1,12 @@
 
				-#!/usr/bin/python
			
 
				+#!/usr/bin/python3
			
 
				 """
			
 
				 created on Wed Jan 30 2018
			
 
				 author: Christian Olaf Haeusler
			
 
				 
			
 
				 To Do:
			
 
				     argparser
			
 
				+    filter the narrator (Erzaehler) when MOVIE = True
			
 
				 """
			
 
				-from __future__ import print_function
			
 
				 from collections import defaultdict
			
 
				 import os
			
 
				 from os.path import basename
			
@@ -17,12 +17,6 @@ import sys
 
				 import pandas as pd
			
 
				 
			
 
				 
			
 
				-# constants #
			
 
				-MOVIE = True
			
 
				-CROPPED = 0 # in sec; is a concatenated time series with cropped volumes used?
			
 
				-INPUT_FILES = sys.argv[1:]
			
 
				-OUT_DIR = 'segments'
			
 
				-
			
 
				 SEGMENTS_OFFSETS = (
			
 
				     (0.00, 0.00),
			
 
				     (886.00, 0.00),
			
@@ -34,6 +28,39 @@ SEGMENTS_OFFSETS = (
 
				     (6410.44, 0.44),  # last segment's start
			
 
				     (7086.00, 0.00))  # movie's last time point
			
 
				 
			
 
				+# dictionaries that pair, for each segment, a time point (in 2 sec steps) with an offset (in msec)
			
 
				+# in respect to the audiovisual movie (forrestgump_researchcut_ger_mono.mkv)
			
 
				+AUDIO_AV_OFFSETS = {
			
 
				+    0: {  0:  21.33},
			
 
				+    1: {  0:  37.33,
			
 
				+        408:  21.33},
			
 
				+    2: {  0:  69.33,
			
 
				+        199:  61.33},
			
 
				+    3: {  0:  93.33,
			
 
				+        320: 101.33},
			
 
				+    4: {  0: 109.33,
			
 
				+        401: 101.33},
			
 
				+    5: {  0: 141.33},
			
 
				+    6: {  0: 189.31,
			
 
				+         61: 181.31},
			
 
				+    7: {  0: 205.33}}
			
 
				+
			
 
				+AUDIO_AO_OFFSETS = {
			
 
				+    0: {  0:  47.02},
			
 
				+    1: {  0:  36.35,
			
 
				+        203:  47.02},
			
 
				+    2: {  0:  87.02,
			
 
				+        199:  92.35},
			
 
				+    3: {  0: 124.35,
			
 
				+        320: 132.35},
			
 
				+    4: {  0: 105.69,
			
 
				+        401:  92.35},
			
 
				+    5: {  0: 137.69,
			
 
				+        364: 167.02},
			
 
				+    6: {  0: 201.67,
			
 
				+         61: 543.00},
			
 
				+    7: {  0:-1422.31}}
			
 
				+
			
 
				 
			
 
				 def time_stamp_to_msec(t_stamp='01:50:34:01'):
			
 
				     '''
			
@@ -84,60 +111,47 @@ def get_run_number(starts, onset):
 
				     return run
			
 
				 
			
 
				 
			
 
				-def fix_segment_shift(timing_in_anno, cropped_time):
			
 
				+def whole_anno_to_segments(seg_starts, run_nr, anno_time):
			
 
				     '''
			
 
				-    the function is not necessary anymore since the correction
			
 
				-    is implicitly done by additionally given offsets in SEGMENTS_OFFSETS
			
 
				-
			
 
				-
			
 
				-    fixes the timing of the 8 stimulus movie sigments
			
 
				-    https://github.com/psychoinformatics-de/studyforrest-data-phase2/blob/master/code/stimulus/movie/segment_timing.csv
			
 
				+    "The position of an event from a movie annotation with respect to the
			
 
				+    cropped fMRI time series can now be determined by substracting the
			
 
				+    start time of the respective segment as listed in Table 1"
			
 
				+    http://studyforrest.org/annotation_timing.html
			
 
				     '''
			
 
				-    # regular case which will be kept in runs 1 and 2
			
 
				-    timing_in_segment = timing_in_anno
			
 
				-
			
 
				-    # correct for the accumulating offsets in segments 3 to 8
			
 
				-    for segment_start, offset in sorted(SEGMENTS_OFFSETS, reverse = True):
			
 
				-        # if timing is in a critical segment, correct the timing
			
 
				-        if timing_in_anno >= segment_start + cropped_time:
			
 
				-            timing_in_segment = round(timing_in_anno - offset, 3)
			
 
				-            break
			
 
				+    seg_time = anno_time - seg_starts[run_nr]
			
 
				 
			
 
				-    return timing_in_segment
			
 
				+    return seg_time
			
 
				 
			
 
				 
			
 
				-def fix_audio_timing(uncorrected_audio):
			
 
				-    '''the movie's audiotrack lacks behind the visual frames
			
 
				-    there is an slightly increasing offset (but problably no continuous drift)
			
 
				-    over the movie segments
			
 
				+def fix_audio_movie_segments(AUDIO_AV_OFFSETS, run, uncorrected):
			
 
				+    '''adds the segment's audio offset (stored in msec) to a time point (in sec)
			
 
				+    in respect to the unsegmented movie
			
 
				     '''
			
 
				-    corrected_audio = uncorrected_audio
			
 
				-    return corrected_audio
			
 
				+    critical_time_points = sorted(AUDIO_AV_OFFSETS[run].keys(), reverse=True)
			
 
				+    for crit in critical_time_points:
			
 
				+       if uncorrected >= crit * 2.0:
			
 
				+           corrected = uncorrected + (AUDIO_AV_OFFSETS[run][crit] / 1000.0)
			
 
				+           break
			
 
				 
			
 
				+    return corrected
			
 
				 
			
 
				-def anno_time_to_seg_time(seg_starts, run_nr, anno_time, cropped_time):
			
 
				-    '''
			
 
				-    "The position of an event from a movie annotation with respect to the
			
 
				-    cropped fMRI time series can now be determined by substracting the
			
 
				-    start time of the respective segment as listed in Table 1"
			
 
				-    http://studyforrest.org/annotation_timing.html
			
 
				 
			
 
				-    events occur earlier in the cropped stimulus segments.
			
 
				-    hence the cropped ammount is additionally substracted from the anno timing
			
 
				+def fix_audio_descr_segments(AUDIO_AO_OFFSETS, run, uncorrected):
			
 
				+    '''adds the segment's audio offset (stored in msec) to a time point (in sec)
			
 
				+    in respect to the unsegmented audiobook
			
 
				     '''
			
 
				-    seg_time = round(anno_time - (seg_starts[run_nr] + cropped_time), 2)
			
 
				+    critical_time_points = sorted(AUDIO_AO_OFFSETS[run].keys(), reverse=True)
			
 
				+    for crit in critical_time_points:
			
 
				+       if uncorrected >= crit * 2.0:
			
 
				+           corrected = uncorrected + (AUDIO_AO_OFFSETS[run][crit] / 1000.0)
			
 
				+           break
			
 
				 
			
 
				-    return seg_time
			
 
				+    return corrected
			
 
				 
			
 
				 
			
 
				-def write_segmented_annos(infilename, movie, cropped, run_dict, out_dir, ):
			
 
				+def write_segmented_annos(infilename, stimulus, run_dict, out_dir):
			
 
				     '''
			
 
				     '''
			
 
				-    if MOVIE is True:
			
 
				-        stimulus = 'avmovie'
			
 
				-    else:
			
 
				-        stimulus = 'aomovie'
			
 
				-
			
 
				     basefilename = basename(infilename)[:-4]
			
 
				     outdir = opj(out_dir, stimulus)
			
 
				     if not exists(outdir):
			
@@ -153,49 +167,82 @@ def write_segmented_annos(infilename, movie, cropped, run_dict, out_dir, ):
 
				             columns=run_dict[run][0].dtype.names).to_csv(
			
 
				                 outname,
			
 
				                 sep='\t',
			
 
				-                index=False)
			
 
				+                index=False,
			
 
				+                encoding='utf-8')
			
 
				 
			
 
				 
			
 
				 #### main program #####
			
 
				 if __name__ == "__main__":
			
 
				+    # command line arguments: input file, stimulus used for annotating, stimulus to convert to, output directory
			
 
				+    infile = sys.argv[1]
			
 
				+    annotated_time = sys.argv[2]
			
 
				+    target_time = sys.argv[3]
			
 
				+    outdir = sys.argv[4]
			
 
				 
			
 
				+#     with launch_ipdb_on_exception():
			
 
				     # read the annotation file
			
 
				-    for input_file in INPUT_FILES[:1]:
			
 
				-        anno = pd.read_csv(input_file, sep='\t').to_records(index=False)
			
 
				-        segment_starts = [start for start, offset in SEGMENTS_OFFSETS]
			
 
				-
			
 
				-        run_events = defaultdict(list)
			
 
				-        for row in anno:
			
 
				-            # get the run number
			
 
				-            run = get_run_number(segment_starts, row['onset'])
			
 
				-
			
 
				-            # SEGMENT SHIFT correction
			
 
				-            # is now implicitly done by func 'anno_time_to_seg_time'
			
 
				-            # using the adjusted segment starts (s. SEGMENTS_OFFSETS)
			
 
				-#             row[0] = fix_segment_shift(row[0], CROPPED)
			
 
				-#             if type(row[1]) == float:
			
 
				-#                 row[1] = fix_segment_shift(row[1], CROPPED)
			
 
				-
			
 
				-            # finally convert the timings of the continouos annotation
			
 
				-            # to timings in respect to the start of the corresponding segment
			
 
				-            onset = anno_time_to_seg_time(
			
 
				-                segment_starts,
			
 
				-                run,
			
 
				-                float(row['onset']),
			
 
				-                CROPPED)
			
 
				-            row['onset'] = onset
			
 
				-
			
 
				-            # AUDIO TIMING (MOVIE) correction
			
 
				-            # Dialoge im Film kommen 1/2 frame spater als das Hoerspiel,
			
 
				-            # das einem frame (40ms) nach vorn gezogen wurde
			
 
				-            if MOVIE is True:
			
 
				-                pass
			
 
				-
			
 
				-            # AUDIO TIMING (AUDIOBOOK) correction
			
 
				-            if MOVIE is False:
			
 
				-                pass
			
 
				-
			
 
				-            # append that shit
			
 
				-            run_events[run].append(row)
			
 
				-
			
 
				-        write_segmented_annos(input_file, MOVIE, CROPPED, run_events, OUT_DIR)
			
 
				+    anno = pd.read_csv(infile, sep='\t', encoding='utf-8').to_records(index=False)
			
 
				+    segment_starts = [start for start, offset in SEGMENTS_OFFSETS]
			
 
				+
			
 
				+    run_events = defaultdict(list)
			
 
				+    for row in anno:
			
 
				+        # get the run number
			
 
				+        run = get_run_number(segment_starts, row['onset'])
			
 
				+
			
 
				+        # convert the timings of a continuous annotation
			
 
				+        # to timings in respect to the start of the corresponding segment
			
 
				+        onset_in_seg = whole_anno_to_segments(
			
 
				+            segment_starts,
			
 
				+            run,
			
 
				+            float(row['onset']))
			
 
				+
			
 
				+
			
 
				+        # correct for the stimulus used to annotate the audiotrack
			
 
				+        if annotated_time == 'aomovie':
			
 
				+            # the files
			
 
				+            # forrestgump_researchcut_ad_ger.flac and
			
 
				+            # german_dvd_5.1_48000hz_488kb_research_cut_aligned_cutted_narrator_muted_48000Hz.flac
			
 
				+            # (that contain the audio description) were originally lagging
			
 
				+            # behind for XYZ msec and were shifted forward
			
 
				+            # by one frame (40ms) in respect to the reference file
			
 
				+            # forrestgump_researchcut_ger.mkv
			
 
				+
			
 
				+            # 1st, correct for shifting the narrator (incl. dialogue) 40ms
			
 
				+            # to the front before annotating the narrator/dialogue
			
 
				+            onset_in_seg += 0.040
			
 
				+
			
 
				+            # 2nd, correct for the offset between the (unshifted) audio
			
 
				+            # description and the audiovisual movie
			
 
				+            # -> the offset is varying +/- one frame (40 ms) around 0
			
 
				+            onset_in_seg -= 0.000
			
 
				+
			
 
				+            # 3rd, correct for the offset between whole stimulus
			
 
				+            # (audiovisual or audio-only) and its segments
			
 
				+            if target_time == 'avmovie':
			
 
				+                onset_in_seg = fix_audio_movie_segments(
			
 
				+                    AUDIO_AV_OFFSETS,
			
 
				+                    run,
			
 
				+                    onset_in_seg)
			
 
				+
			
 
				+            elif target_time == 'aomovie':
			
 
				+                onset_in_seg = fix_audio_descr_segments(
			
 
				+                    AUDIO_AO_OFFSETS,
			
 
				+                    run,
			
 
				+                    onset_in_seg)
			
 
				+
			
 
				+            else:
			
 
				+                raise ValueError('Unknown time label %s' % target_time)
			
 
				+
			
 
				+        elif annotated_time == 'avmovie':
			
 
				+            # all splendid for now
			
 
				+            pass
			
 
				+
			
 
				+        else:
			
 
				+            raise ValueError('%s is an unknown annotation' % annotated_time)
			
 
				+
			
 
				+        row['onset'] = round(onset_in_seg, 3)
			
 
				+
			
 
				+        # append the corrected row to the events of its run
			
 
				+        run_events[run].append(row)
			
 
				+
			
 
				+    write_segmented_annos(infile, target_time, run_events, outdir)