Browse Source

RF: Segment splitter to use Pandas and work based on cmdline arguments

Michael Hanke 6 years ago
parent
commit
dc7b55c746
1 changed files with 31 additions and 33 deletions
  1. 31 33
      code/researchcut2segments.py

+ 31 - 33
code/researchcut2segments.py

@@ -8,20 +8,20 @@ To Do:
 """
 from __future__ import print_function
 from collections import defaultdict
-from datetime import datetime
-import csv
 import os
+from os.path import basename
+from os.path import join as opj
+from os.path import exists
 import re
 import sys
+import pandas as pd
 
 
 # constants #
 MOVIE = True
 CROPPED = 0 # in sec; is a concatenated time series with cropped volumes used?
-INPUT_FILES = ['structure.csv',
-               'speech_vocalization.csv',
-               'speech_google_narrator.csv'] # sys.argv[1]
-OUT_DIR = './annos_segmented/output'
+INPUT_FILES = sys.argv[1:]
+OUT_DIR = 'segments'
 
 SEGMENTS_OFFSETS = (
     (0.00, 0.00),
@@ -189,35 +189,30 @@ def anno_time_to_seg_time(seg_starts, run_nr, anno_time, cropped_time):
     return seg_time
 
 
-def write_segmented_annos(source_anno, movie, cropped, run_dict, out_dir, ):
+def write_segmented_annos(infilename, movie, cropped, run_dict, out_dir, ):
     '''
     '''
     if MOVIE is True:
-        stimulus = 'movie'
+        stimulus = 'avmovie'
     else:
-        stimulus = 'audio'
+        stimulus = 'aomovie'
 
-    old_anno_name = os.path.splitext(os.path.basename(source_anno))[0]
-    new_anno_name = '%s_%s_%scr' % ((old_anno_name, stimulus, cropped))
-
-    print('Writing results to %s' % new_anno_name)
+    basefilename = basename(infilename)[:-4]
+    outdir = opj(out_dir, stimulus)
+    if not exists(outdir):
+        os.makedirs(outdir)
 
     for run in sorted(run_dict.keys()):
-        print(run)
-
-        tnow = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
-        out_fname = '%s_%s_run_%s.csv' % (new_anno_name, tnow, run + 1)
-        out_path = os.path.join(out_dir, out_fname)
-        print(out_path)
-
-        # in case the OUT_DIR changes to a directory incl. subdirectories
-        path = os.path.dirname(out_path)
-        if not os.path.exists(path):
-            os.makedirs(path)
+        outname = opj(out_dir, stimulus, '{}_run-{}_events.tsv'.format(
+            basefilename,
+            run + 1))
 
-        with open(out_path, 'w') as csv_file:
-            writer = csv.writer(csv_file)
-            writer.writerows(run_dict[run])
+        pd.DataFrame.from_records(
+            run_dict[run],
+            columns=run_dict[run][0].dtype.names).to_csv(
+                outname,
+                sep='\t',
+                index=False)
 
 
 #### main program #####
@@ -225,13 +220,13 @@ if __name__ == "__main__":
 
     # read the annotation file
     for input_file in INPUT_FILES[:1]:
-        anno = read_anno(input_file)
+        anno = pd.read_csv(input_file, sep='\t').to_records(index=False)
         segment_starts = [start for start, offset in SEGMENTS_OFFSETS]
 
         run_events = defaultdict(list)
         for row in anno:
-        # get the run number
-            run = get_run_number(segment_starts, row[0])
+            # get the run number
+            run = get_run_number(segment_starts, row['onset'])
 
             # SEGMENT SHIFT correction
             # is now implicitly done by func 'anno_time_to_seg_time'
@@ -242,9 +237,12 @@ if __name__ == "__main__":
 
             # finally convert the timings of the continouos annotation
             # to timings in respect to the start of the corresponding segment
-            row[0] = anno_time_to_seg_time(segment_starts, run, row[0], CROPPED)
-            if type(row[1]) == float:
-                row[1] = anno_time_to_seg_time(segment_starts, run, row[1], CROPPED)
+            onset = anno_time_to_seg_time(
+                segment_starts,
+                run,
+                float(row['onset']),
+                CROPPED)
+            row['onset'] = onset
 
             # AUDIO TIMING (MOVIE) correction
             # Dialoge im Film kommen 1/2 frame spater als das Hoerspiel,