|
@@ -8,7 +8,11 @@ from yaml import dump
|
|
|
|
|
|
|
|
|
def find_recording(row, df):
|
|
|
- idx = df[row['segment_onset'] < df['cumulative_time']]
|
|
|
+ idx = df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] < df['cumulative_time'])]
|
|
|
+ try:
|
|
|
+ assert df[(row['segment_onset'] < df['cumulative_time']) & (row['segment_offset'] >= df['cumulative_time'])].empty
|
|
|
+ except AssertionError:
|
|
|
+ return np.nan
|
|
|
return idx['recording_filename'].reset_index(drop=True)[0]
|
|
|
|
|
|
def sample(
|
|
@@ -22,7 +26,7 @@ def sample(
|
|
|
exclude: pd.DataFrame = None
|
|
|
):
|
|
|
"""Periodic sampling of a recording if there are multiple recordings per session.
|
|
|
- :param destination: segments destination, defaults sto ./sampler
|
|
|
+ :param destination: segments destination, defaults to ./sampler
|
|
|
:type destination: str
|
|
|
:param length: length of each segment, in milliseconds, defaults 30000
|
|
|
:type length: int
|
|
@@ -32,6 +36,9 @@ def sample(
|
|
|
:type offset: int
|
|
|
:param recordings: path to recordings to sample from; if None, all recordings will be sampled, defaults to './metadata/recordings.csv'
|
|
|
:type recordings: str, optional
|
|
|
+
|
|
|
+ If a sampled 30-second segment spans more than one recording, 5 minutes is subtracted from the segment_onset and segment_offset until the whole sampled period
|
|
|
+ is within a single recording. If the 5-minute subtraction continues until less than 5 minutes of the session remain, this sample is dropped.
|
|
|
"""
|
|
|
#self.project.get_recordings_from_list(self.recordings)
|
|
|
recordings = pd.read_csv(recordings)
|
|
@@ -40,10 +47,35 @@ def sample(
|
|
|
|
|
|
recordings["duration"].fillna(0, inplace=True)
|
|
|
|
|
|
- # create timeline
|
|
|
segments = pd.DataFrame(columns=['recording_filename', 'segment_onset', 'segment_offset'], dtype=object)
|
|
|
+ #segments = recordings[['recording_filename']].copy() #ONLY FOR FASTER VERSION
|
|
|
|
|
|
recordings["cumulative_time"] = recordings.groupby(['child_id', 'date_iso'])['duration'].cumsum()
|
|
|
+
|
|
|
+
|
|
|
+# ## FASTER
|
|
|
+# segments['segment_onset'] = recordings.groupby(['child_id', 'date_iso'], group_keys=False).apply(
|
|
|
+# lambda session: np.arange(
|
|
|
+# offset,
|
|
|
+# session.iloc[-1]["cumulative_time"] - length + 1e-4,
|
|
|
+# period + length,
|
|
|
+# )
|
|
|
+# ).reset_index().iloc[:, -1]
|
|
|
+# #print(segments)
|
|
|
+# segments = segments.explode("segment_onset").reset_index(drop=True)
|
|
|
+# segments.rename(
|
|
|
+# columns={"segment_onset": "segment_onset"}, inplace=True
|
|
|
+# )
|
|
|
+# segments = segments.dropna(subset=['segment_onset']).reset_index(drop=True)
|
|
|
+# segments["segment_onset"] = segments["segment_onset"].astype(int)
|
|
|
+# segments["segment_offset"] = segments["segment_onset"] + length
|
|
|
+# segments.rename(
|
|
|
+# columns={"recording_filename": "recording_filename"}, inplace=True
|
|
|
+# )
|
|
|
+# #print(segments)
|
|
|
+# #segments['recording_filename'] = segments.apply(lambda row: find_recording(row, group), axis=1)
|
|
|
+# ## FASTER
|
|
|
+
|
|
|
for name, group in recordings.groupby(['child_id', 'date_iso']):
|
|
|
group = group.reset_index(drop=True)
|
|
|
|
|
@@ -64,18 +96,38 @@ def sample(
|
|
|
group.iloc[-1]['date_iso'],
|
|
|
group.iloc[-1]['cumulative_time']/60000))
|
|
|
continue
|
|
|
+
|
|
|
segment = pd.DataFrame(data=onsets, columns=['segment_onset'])
|
|
|
segment["segment_onset"] = segment["segment_onset"].astype(int)
|
|
|
segment["segment_offset"] = segment["segment_onset"] + length
|
|
|
segment['recording_filename'] = segment.apply(lambda row: find_recording(row, group), axis=1)
|
|
|
|
|
|
+ while segment.isna().any(axis=1).sum() > 0:
|
|
|
+ index = segment.isna().any(axis=1)
|
|
|
+
|
|
|
+ segment[index] = segment[index].apply(lambda x: x-300000)
|
|
|
+
|
|
|
+ try:
|
|
|
+ assert (segment.loc[index, 'segment_onset'] <= 300000).sum() == 0
|
|
|
+
|
|
|
+ except AssertionError:
|
|
|
+ segment = segment.drop((segment.loc[index, 'segment_onset'] <= 300000).index, axis='index').reset_index(drop=True)
|
|
|
+ print("Recordings of participant {0}, "
|
|
|
+ "session {1} not sampled; "
|
|
|
+ "5-minute shift failed".format(group.iloc[-1]['child_id'],
|
|
|
+ group.iloc[-1]['date_iso']))
|
|
|
+ continue
|
|
|
+
|
|
|
+ segment.loc[index, 'recording_filename'] = segment[index].apply(lambda row: find_recording(row, group), axis=1)
|
|
|
|
|
|
segments = pd.concat([segments, segment], ignore_index=True)
|
|
|
- #segments = segments["recording_filename", "segment_onset", "segment_offset"]
|
|
|
+
|
|
|
segments.rename(
|
|
|
columns={"recording_filename": "recording_filename",
|
|
|
- "segment_onset": "segment_onset"}, inplace=True
|
|
|
+ "segment_onset": "segment_onset",
|
|
|
+ "segment_offset": "segment_offset"}, inplace=True
|
|
|
)
|
|
|
+
|
|
|
date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
|
|
|
|
os.makedirs(destination, exist_ok=True)
|