@@ -12,6 +12,7 @@ import numpy as np
 from os.path import join as opj
 from os.path import basename, exists, splitext
 import pandas as pd
+from pyannote.core import Annotation, Segment, Timeline
 import soundfile

 def extrude(self, removed, mode: str = 'intersection'):
@@ -27,39 +28,53 @@ def extrude(self, removed, mode: str = 'intersection'):

     return self.crop(truncating_support, mode=mode)

-def process_clip(am, clip):
+def process_clip(am, annotator, clip):
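+    # clip is one group from the groupby in generate_stats:
+    # ((recording_filename, range_onset, range_offset, clip_path), annotations)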
+    index, ann = clip
+    recording_filename, range_onset, range_offset, clip_path = index
+
     speakers = ['CHI', 'OCH', 'FEM', 'MAL']
     vtc_speakers = ['KCHI', 'CHI', 'FEM', 'MAL']

-    onset = clip['range_onset']
-    offset = clip['range_offset']
+    print('reading scores...')

-    rows = []
+    data = None
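+    # load each speaker's score track and stack them into one (speaker, frame) array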
     for i, speaker in enumerate(vtc_speakers):
         signal, sr = librosa.load(
-            clip['clip_path'] + f'_{speaker}',
+            clip_path + f'/{speaker}.wav',
             sr = 16000,
             offset = 2,
-            duration = (offset-onset)/1000
+            duration = (range_offset-range_onset)/1000
         )
-        rows.append(signal)
+        print(pd.DataFrame(signal).describe())
+        if data is not None:
+            data = np.vstack((data, signal))
+        else:
+            data = signal

-    data = np.array(rows)
+    print('done.')

-    ann = pd.concat([intersection, intersection.assign(set = annotator)])
-    segments = am.get_segments(ann)
+    print(data.shape)
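+    # score frames per millisecond (range_onset/range_offset are in ms,
+    # hence the duration of (range_offset-range_onset)/1000 seconds above)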
+    time_scale = data.shape[1]/(range_offset-range_onset)
+    print(time_scale)

+    segments = am.get_segments(ann)
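+    # no annotated segments for this clip: nothing to score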
+    if not len(segments):
+        return None
     segments = segments[segments['speaker_type'].isin(speakers)]

     vtc = {
         speaker: segments_to_annotation(segments[(segments['set'] == 'vtc') & (segments['speaker_type'] == speaker)], 'speaker_type').get_timeline()
         for speaker in speakers
     }
+
+    print(vtc)

     truth = {
         speaker: segments_to_annotation(segments[(segments['set'] == annotator) & (segments['speaker_type'] == speaker)], 'speaker_type').get_timeline()
         for speaker in speakers
     }
+
+    print(truth)

     stats = {}
     for i, speaker_A in enumerate(speakers):
@@ -71,9 +86,16 @@ def process_clip(am, clip):
             vtc[f'{speaker_A}_vocs_fp_{speaker_B}'] = vtc[f'{speaker_A}_vocs_fp'].crop(truth[speaker_B], mode = 'loose')

         s = []
-        for explained in vtc[f'{speaker_A}_vocs_explained']:
+        for explained in vtc[speaker_A]:
             onset, offset = explained
-            s.append(np.mean(data[i, 16*onset:16*offset]))
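+            # segment times are absolute; make them clip-relative before indexing the scores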
+            onset -= range_onset
+            offset -= range_onset
+
+            print(speaker_A, onset, offset)
+            print(data[i, int(onset*time_scale):int(offset*time_scale)])
+            print(np.mean(data[i, int(onset*time_scale):int(offset*time_scale)]))
+
+            s.append(np.mean(data[i, int(onset*time_scale):int(offset*time_scale)]))

         stats[f'{speaker_A}_vocs_explained'] = s

@@ -83,12 +105,18 @@ def process_clip(am, clip):

             for fp in vtc[f'{speaker_A}_vocs_fp_{speaker_B}']:
                 onset, offset = fp
-                sA.append(np.mean(data[i, 16*onset:16*offset]))
-                sB.append(np.mean(data[j, 16*onset:16*offset]))
+                onset -= range_onset
+                offset -= range_onset
+
+                sA.append(np.mean(data[i, int(onset*time_scale):int(offset*time_scale)]))
+                sB.append(np.mean(data[j, int(onset*time_scale):int(offset*time_scale)]))
+

             stats[f'{speaker_A}_vocs_fp_{speaker_B}_false'] = sA
             stats[f'{speaker_A}_vocs_fp_{speaker_B}_true'] = sB

+    print(stats)
+
     return stats


@@ -104,13 +132,17 @@ def generate_stats(parameters):
     )

     intersection['clip_path'] = intersection.apply(
-        lambda f: opj('output/scores/', splitext(basename(clip['recording_filename']))[0], str(clip['range_onset']) + '_' + str(clip['range_offset'])),
+        lambda r: opj('output/scores/', splitext(basename(r['recording_filename']))[0] + '_' + str(r['range_onset']) + '_' + str(r['range_offset'])),
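+        # per-clip score directory: output/scores/<recording>_<range_onset>_<range_offset>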
         axis = 1
     )

-    clips = intersection[intersection['set'] == 'vtc']
-    with mp.Pool(processes = 32) as mp:
-        stats = pool.map(partial(process_clip, am), clips)
+    with mp.Pool(processes = 4) as pool:
+        stats = pool.map(
+            partial(process_clip, am, annotator),
+            intersection.groupby([
+                'recording_filename', 'range_onset', 'range_offset', 'clip_path'
+            ])
+        )

     return stats

@@ -118,3 +150,4 @@ if __name__ == '__main__':
     annotators = pd.read_csv('input/annotators.csv')
     annotators['path'] = annotators['corpus'].apply(lambda c: opj('input', c))
     stats = [generate_stats(annotator) for annotator in annotators.to_dict(orient = 'records')]
+