
sampler_polish.py update; sampling episode has to be within one recording

AgataKoziol 4 months ago
parent commit d51a648c3b
1 changed file with 57 additions and 5 deletions

+ 57 - 5
scripts/sampler_polish.py

@@ -8,7 +8,11 @@ from yaml import dump
 
 
 def find_recording(row, df):
-    idx = df[row['segment_onset'] < df['cumulative_time']]
+    idx = df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] < df['cumulative_time'])]
+    try:
+        assert df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] >= df['cumulative_time'])].empty
+    except AssertionError:
+        return np.nan
     return idx['recording_filename'].reset_index(drop=True)[0]
 
 def sample(
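
The commit message states that a sampling episode has to fall within one recording, and the stricter filter above enforces it: a segment is attributed to a recording only when both its onset and its offset lie before that recording's cumulative end, and NaN is returned when a recording boundary falls between them. A minimal sketch of this behaviour, using the function as committed plus hypothetical toy data (the filenames and durations below are illustrative, not taken from the repository):

    import numpy as np
    import pandas as pd

    def find_recording(row, df):
        # recordings whose cumulative end lies past both the onset and the offset
        idx = df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] < df['cumulative_time'])]
        try:
            # no recording may end between the onset and the offset, otherwise the segment straddles two recordings
            assert df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] >= df['cumulative_time'])].empty
        except AssertionError:
            return np.nan
        return idx['recording_filename'].reset_index(drop=True)[0]

    # one session split into two recordings, with cumulative ends at 600000 ms and 1200000 ms
    group = pd.DataFrame({'recording_filename': ['rec_a.wav', 'rec_b.wav'],
                          'cumulative_time': [600000, 1200000]})
    print(find_recording({'segment_onset': 10000, 'segment_offset': 40000}, group))    # rec_a.wav
    print(find_recording({'segment_onset': 590000, 'segment_offset': 620000}, group))  # nan, the segment spans both recordings
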
@@ -22,7 +26,7 @@ def sample(
         exclude: pd.DataFrame = None
     ):
     """Periodic sampling of a recording if there are multiple recordings per session.
-    :param destination: segments destination, defaults sto ./sampler
+    :param destination: segments destination, defaults to ./sampler
     :type destination: str
     :param length: length of each segment, in milliseconds, defaults to 30000
     :type length: int
@@ -32,6 +36,9 @@ def sample(
     :type offset: int
     :param recordings: path to recordings to sample from; if None, all recordings will be sampled, defaults to './metadata/recordings.csv'
     :type recordings: str, optional
+
+    If the sampled 30 seconds span more than one recording, 5 minutes is subtracted from segment_onset and segment_offset until the whole sampled
+    period falls within one recording. If the 5-minute subtraction leaves less than 5 minutes of the session, this sample is dropped.
     """
     #self.project.get_recordings_from_list(self.recordings)
     recordings = pd.read_csv(recordings)
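
The back-shift rule described in the docstring is applied further down with pandas over the whole segments frame; as a plainer illustration, here is a minimal standalone sketch of the same rule on a single segment (back_shift, straddles and ends are hypothetical names, and 300000 ms mirrors the constant used in the loop below):

    FIVE_MIN = 300000  # ms

    def back_shift(onset, offset, cumulative_ends):
        # the segment straddles two recordings when some cumulative end falls in (onset, offset]
        def straddles(on, off):
            return any(on < end <= off for end in cumulative_ends)
        while straddles(onset, offset):
            onset -= FIVE_MIN
            offset -= FIVE_MIN
            if onset <= FIVE_MIN:
                return None  # less than 5 minutes of the session left: drop this sample
        return onset, offset

    # cumulative ends (in ms) of three recordings belonging to one session
    ends = [600000, 1200000, 1800000]
    print(back_shift(1195000, 1225000, ends))  # (895000, 925000), now inside the second recording
    print(back_shift(590000, 620000, ends))    # None, the shifted onset lands at 290000 <= 300000
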
@@ -40,10 +47,35 @@ def sample(
 
     recordings["duration"].fillna(0, inplace=True)
 
-    # create timeline
     segments = pd.DataFrame(columns=['recording_filename', 'segment_onset', 'segment_offset'], dtype=object)
+    #segments = recordings[['recording_filename']].copy() #ONLY FOR FASTER VERSION
 
     recordings["cumulative_time"] = recordings.groupby(['child_id', 'date_iso'])['duration'].cumsum()
+
+
+#     ## FASTER
+#     segments['segment_onset'] = recordings.groupby(['child_id', 'date_iso'], group_keys=False).apply(
+#         lambda session: np.arange(
+#             offset,
+#             session.iloc[-1]["cumulative_time"] - length + 1e-4,
+#             period + length,
+#         )
+#     ).reset_index().iloc[:, -1]
+#     #print(segments)
+#     segments = segments.explode("segment_onset").reset_index(drop=True)
+#     segments.rename(
+#         columns={"segment_onset": "segment_onset"}, inplace=True
+#     )
+#     segments = segments.dropna(subset=['segment_onset']).reset_index(drop=True)
+#     segments["segment_onset"] = segments["segment_onset"].astype(int)
+#     segments["segment_offset"] = segments["segment_onset"] + length
+#     segments.rename(
+#         columns={"recording_filename": "recording_filename"}, inplace=True
+#     )
+#     #print(segments)
+#     #segments['recording_filename'] = segments.apply(lambda row: find_recording(row, group), axis=1)
+# ## FASTER
+
     for name, group in recordings.groupby(['child_id', 'date_iso']):
         group = group.reset_index(drop=True)
 
@@ -64,18 +96,38 @@ def sample(
                                                              group.iloc[-1]['date_iso'],
                                                              group.iloc[-1]['cumulative_time']/60000))
             continue
+
         segment = pd.DataFrame(data=onsets, columns=['segment_onset'])
         segment["segment_onset"] = segment["segment_onset"].astype(int)
         segment["segment_offset"] = segment["segment_onset"] + length
         segment['recording_filename'] = segment.apply(lambda row: find_recording(row, group), axis=1)
 
+        while segment.isna().any(axis=1).sum() > 0:  # rows whose recording_filename is NaN straddle a recording boundary
+            index = segment.isna().any(axis=1)
+
+            segment[index] = segment[index].apply(lambda x: x-300000)  # shift the flagged rows' onset and offset back by 5 minutes
+
+            try:
+                assert (segment.loc[index, 'segment_onset'] <= 300000).sum() == 0  # fails once a shifted onset falls at or below 300000 ms; the drop is handled below
+
+            except AssertionError:
+                segment = segment.drop((segment.loc[index, 'segment_onset'] <= 300000).index, axis='index').reset_index(drop=True)
+                print("Recordings of participant {0}, "
+                      "session {1} not sampled; "
+                      "5-minute shift failed".format(group.iloc[-1]['child_id'],
+                                                     group.iloc[-1]['date_iso']))
+                continue
+
+            segment.loc[index, 'recording_filename'] = segment[index].apply(lambda row: find_recording(row, group), axis=1)
 
         segments = pd.concat([segments, segment], ignore_index=True)
-    #segments = segments["recording_filename", "segment_onset", "segment_offset"]
+
     segments.rename(
         columns={"recording_filename": "recording_filename",
-                 "segment_onset": "segment_onset"}, inplace=True
+                 "segment_onset": "segment_onset",
+                 "segment_offset": "segment_offset"}, inplace=True
     )
+
     date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 
     os.makedirs(destination, exist_ok=True)
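
For context, a hypothetical invocation of the sampler; the parameter names come from the docstring, the values are purely illustrative, and any remaining keyword arguments keep the defaults defined in the script:

    # sketch: 30-second segments sampled periodically from the recordings metadata (values illustrative only)
    sample(
        destination='./sampler',
        length=30000,
        period=3300000,
        offset=0,
        recordings='./metadata/recordings.csv',
    )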