
sampler_polish.py update; sampling episode has to be within one recording

AgataKoziol 4 months ago
parent commit d51a648c3b
1 changed file with 57 additions and 5 deletions

+ 57 - 5
scripts/sampler_polish.py

@@ -8,7 +8,11 @@ from yaml import dump
 
 
 def find_recording(row, df):
-    idx = df[row['segment_onset'] < df['cumulative_time']]
+    idx = df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] < df['cumulative_time'])]
+    try:
+        assert df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] >= df['cumulative_time'])].empty
+    except AssertionError:
+        return np.nan
     return idx['recording_filename'].reset_index(drop=True)[0]
 
 def sample(
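
The commit message states that a sampling episode has to fall within one recording, and the stricter filter above enforces it: a segment is attributed to a recording only when both its onset and its offset lie before that recording's cumulative end, and NaN is returned when a recording boundary falls between them. A minimal sketch of this behaviour, using the function as committed plus hypothetical toy data (the filenames and durations below are illustrative, not taken from the repository):

    import numpy as np
    import pandas as pd

    def find_recording(row, df):
        # recordings whose cumulative end lies past both the onset and the offset
        idx = df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] < df['cumulative_time'])]
        try:
            # no recording may end between the onset and the offset, otherwise the segment straddles two recordings
            assert df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] >= df['cumulative_time'])].empty
        except AssertionError:
            return np.nan
        return idx['recording_filename'].reset_index(drop=True)[0]

    # one session split into two recordings, with cumulative ends at 600000 ms and 1200000 ms
    group = pd.DataFrame({'recording_filename': ['rec_a.wav', 'rec_b.wav'],
                          'cumulative_time': [600000, 1200000]})
    print(find_recording({'segment_onset': 10000, 'segment_offset': 40000}, group))    # rec_a.wav
    print(find_recording({'segment_onset': 590000, 'segment_offset': 620000}, group))  # nan, the segment spans both recordings
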
@@ -22,7 +26,7 @@ def sample(
         exclude: pd.DataFrame = None
     ):
     """Periodic sampling of a recording if there are multiple recordings per session.
-    :param destination: segments destination, defaults sto ./sampler
+    :param destination: segments destination, defaults to ./sampler
     :type destination: str
     :param length: length of each segment, in milliseconds, defaults to 30000
     :type length: int
@@ -32,6 +36,9 @@ def sample(
     :type offset: int
     :param recordings: path to recordings to sample from; if None, all recordings will be sampled, defaults to './metadata/recordings.csv'
     :type recordings: str, optional
+
+    If the sampled 30 seconds span more than one recording, 5 minutes is subtracted from segment_onset and segment_offset until the whole sampled
+    period falls within one recording. If the 5-minute subtraction leaves less than 5 minutes of the session, this sample is dropped.
     """
     #self.project.get_recordings_from_list(self.recordings)
     recordings = pd.read_csv(recordings)
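
The back-shift rule described in the docstring is applied further down with pandas over the whole segments frame; as a plainer illustration, here is a minimal standalone sketch of the same rule on a single segment (back_shift, straddles and ends are hypothetical names, and 300000 ms mirrors the constant used in the loop below):

    FIVE_MIN = 300000  # ms

    def back_shift(onset, offset, cumulative_ends):
        # the segment straddles two recordings when some cumulative end falls in (onset, offset]
        def straddles(on, off):
            return any(on < end <= off for end in cumulative_ends)
        while straddles(onset, offset):
            onset -= FIVE_MIN
            offset -= FIVE_MIN
            if onset <= FIVE_MIN:
                return None  # less than 5 minutes of the session left: drop this sample
        return onset, offset

    # cumulative ends (in ms) of three recordings belonging to one session
    ends = [600000, 1200000, 1800000]
    print(back_shift(1195000, 1225000, ends))  # (895000, 925000), now inside the second recording
    print(back_shift(590000, 620000, ends))    # None, the shifted onset lands at 290000 <= 300000
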
@@ -40,10 +47,35 @@ def sample(
 
     recordings["duration"].fillna(0, inplace=True)
 
-    # create timeline
     segments = pd.DataFrame(columns=['recording_filename', 'segment_onset', 'segment_offset'], dtype=object)
+    #segments = recordings[['recording_filename']].copy() #ONLY FOR FASTER VERSION
 
     recordings["cumulative_time"] = recordings.groupby(['child_id', 'date_iso'])['duration'].cumsum()
+
+
+#     ## FASTER
+#     segments['segment_onset'] = recordings.groupby(['child_id', 'date_iso'], group_keys=False).apply(
+#         lambda session: np.arange(
+#             offset,
+#             session.iloc[-1]["cumulative_time"] - length + 1e-4,
+#             period + length,
+#         )
+#     ).reset_index().iloc[:, -1]
+#     #print(segments)
+#     segments = segments.explode("segment_onset").reset_index(drop=True)
+#     segments.rename(
+#         columns={"segment_onset": "segment_onset"}, inplace=True
+#     )
+#     segments = segments.dropna(subset=['segment_onset']).reset_index(drop=True)
+#     segments["segment_onset"] = segments["segment_onset"].astype(int)
+#     segments["segment_offset"] = segments["segment_onset"] + length
+#     segments.rename(
+#         columns={"recording_filename": "recording_filename"}, inplace=True
+#     )
+#     #print(segments)
+#     #segments['recording_filename'] = segments.apply(lambda row: find_recording(row, group), axis=1)
+# ## FASTER
+
     for name, group in recordings.groupby(['child_id', 'date_iso']):
         group = group.reset_index(drop=True)
 
@@ -64,18 +96,38 @@ def sample(
                                                              group.iloc[-1]['date_iso'],
                                                              group.iloc[-1]['cumulative_time']/60000))
             continue
+
         segment = pd.DataFrame(data=onsets, columns=['segment_onset'])
         segment["segment_onset"] = segment["segment_onset"].astype(int)
         segment["segment_offset"] = segment["segment_onset"] + length
         segment['recording_filename'] = segment.apply(lambda row: find_recording(row, group), axis=1)
 
+        while segment.isna().any(axis=1).sum() > 0:  # rows whose recording_filename is NaN straddle a recording boundary
+            index = segment.isna().any(axis=1)
+
+            segment[index] = segment[index].apply(lambda x: x-300000)  # shift the flagged rows' onset and offset back by 5 minutes
+
+            try:
+                assert (segment.loc[index, 'segment_onset'] <= 300000).sum() == 0  # fails once a shifted onset falls at or below 300000 ms; the drop is handled below
+
+            except AssertionError:
+                segment = segment.drop((segment.loc[index, 'segment_onset'] <= 300000).index, axis='index').reset_index(drop=True)
+                print("Recordings of participant {0}, "
+                      "session {1} not sampled; "
+                      "5-minute shift failed".format(group.iloc[-1]['child_id'],
+                                                     group.iloc[-1]['date_iso']))
+                continue
+
+            segment.loc[index, 'recording_filename'] = segment[index].apply(lambda row: find_recording(row, group), axis=1)
 
         segments = pd.concat([segments, segment], ignore_index=True)
-    #segments = segments["recording_filename", "segment_onset", "segment_offset"]
+
     segments.rename(
         columns={"recording_filename": "recording_filename",
-                 "segment_onset": "segment_onset"}, inplace=True
+                 "segment_onset": "segment_onset",
+                 "segment_offset": "segment_offset"}, inplace=True
     )
+
     date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 
     os.makedirs(destination, exist_ok=True)
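
For context, a hypothetical invocation of the sampler; the parameter names come from the docstring, the values are purely illustrative, and any remaining keyword arguments keep the defaults defined in the script:

    # sketch: 30-second segments sampled periodically from the recordings metadata (values illustrative only)
    sample(
        destination='./sampler',
        length=30000,
        period=3300000,
        offset=0,
        recordings='./metadata/recordings.csv',
    )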