12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- from ChildProject.projects import ChildProject
- from ChildProject.annotations import AnnotationManager
- from lxml import etree
- import os
- import re
- import pandas as pd
- import multiprocessing as mp
def extract_from_regex(pattern, subject):
    """Return the first capture group of *pattern* found in *subject*, or '' on no match."""
    found = pattern.search(subject)
    if found is None:
        return ''
    return found.group(1)
class AnnotationImporter:
    """Imports raw annotation sets (VTC, ALICE, VCM, ITS) into a ChildProject dataset.

    Reads raw annotation files, computes per-recording time offsets from the
    companion ITS file, and registers the annotations through an
    AnnotationManager.
    """

    # Maps each supported annotation kind to its destination set name and
    # the importer format string understood by AnnotationManager.
    parameters = {
        'vtc': {'set': 'vtc', 'format': 'vtc_rttm'},
        'alice': {'set': 'alice/output', 'format': 'alice'},
        'vcm': {'set': 'vcm', 'format': 'vcm_rttm'},
        'its': {'set': 'its', 'format': 'its'}
    }

    def __init__(self, path, threads = 1):
        """Open the ChildProject dataset located at *path*.

        :param path: root directory of the ChildProject dataset
        :param threads: default thread count (kept for reference; process()
            takes its own ``threads`` argument)
        """
        self.path = path
        self.threads = threads
        # FIX: the original hard-coded ChildProject('.') and silently ignored
        # the `path` argument it stored.
        self.project = ChildProject(path)
        self.am = AnnotationManager(self.project)

    def extract_offsets(self, row):
        """Fill ``range_onset``/``range_offset``/``time_seek`` (milliseconds) for one
        recording row, parsed from the matching <Recording> element of its ITS file.

        :param row: dict with at least 'its_filename' and 'filter' (recording num)
        :returns: the same dict, augmented in place with the three offset keys
        """
        its = row['its_filename']
        xml = etree.parse(os.path.join('annotations/its/raw', its))
        # ITS timestamps look like 'PT1234.56S' (ISO-8601 duration, seconds only).
        timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
        recording = xml.xpath('/ITS/ProcessingUnit/Recording[@num="{}"]'.format(row['filter']))[0]
        row['range_onset'] = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('startTime'))))
        row['range_offset'] = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('endTime'))))
        row['time_seek'] = -int(row['range_onset'])
        return row

    def process(self, set, filter_func = None, raw_name_func = None, threads = 1):
        """Import one annotation set into the project.

        :param set: annotation kind key into ``self.parameters`` ('vtc', 'alice',
            'vcm' or 'its'). NOTE: the name shadows the builtin, but renaming it
            would break callers passing it as a keyword argument.
        :param filter_func: optional row -> value callable overriding the
            'filter' column (default: trailing digits of recording_filename)
        :param raw_func: —
        :param raw_name_func: optional row -> value callable producing the raw
            annotation filename; rows mapped to NaN are dropped
        :param threads: worker count; any value < 1 means "use all CPUs"
        """
        threads = threads if threads >= 1 else mp.cpu_count()

        # FIX: renamed the local from `input` (shadowed the builtin) and take an
        # explicit copy so we never mutate a view of project.recordings.
        annotations = self.project.recordings[['recording_filename', 'its_filename', 'duration']].copy()
        annotations['set'] = self.parameters[set]['set']
        annotations['format'] = self.parameters[set]['format']
        # Default filter: the trailing digit run of the recording filename,
        # which matches the ITS <Recording num="..."> attribute.
        annotations['filter'] = annotations['recording_filename'].str.extract(r"_([0-9]{1,})$")
        annotations['raw_filename'] = ''

        # FIX: the original leaked the pool (never close()d/join()ed); the
        # context manager terminates workers deterministically.
        with mp.Pool(processes = threads) as pool:
            annotations = pd.DataFrame(
                pool.map(self.extract_offsets, annotations.to_dict(orient = 'records'))
            )

        if set == 'its':
            # The ITS file is itself the raw annotation input.
            annotations['raw_filename'] = annotations['its_filename']

        if callable(filter_func):
            annotations['filter'] = annotations.apply(filter_func, axis = 1)

        if callable(raw_name_func):
            annotations['raw_filename'] = annotations.apply(raw_name_func, axis = 1)

        # Drop rows the raw_name_func could not resolve (NaN only; '' is kept,
        # preserving the original behavior).
        annotations.dropna(subset = ['raw_filename'], inplace = True)

        # Re-import from scratch: wipe any previous version of this set first.
        self.am.remove_set(set, recursive = True)
        self.am.import_annotations(annotations, threads = threads)

        if set == 'alice':
            # ALICE output only makes sense merged with VTC speaker segments.
            self.am.merge_sets(
                left_set = 'vtc',
                right_set = 'alice/output',
                left_columns = ['speaker_type'],
                right_columns = ['phonemes', 'syllables', 'words'],
                output_set = 'alice',
                threads = threads
            )

    def matching_recordings(self, annotation):
        """Return the recordings of the annotation's session that overlap its
        time range, as a list of dicts with clipped 'on'/'off' bounds (same
        time base as the annotation, i.e. session-relative milliseconds —
        assumes 'duration' is in the same unit; TODO confirm).

        :param annotation: dict-like with 'session_id', 'time_seek',
            'range_onset', 'range_offset'
        :returns: list of {'recording_filename', 'on', 'off'} records
        """
        # .copy(): we add columns below, and the filtered slice is a view.
        recordings = self.project.recordings[
            self.project.recordings['session_id'] == annotation['session_id']
        ].copy()
        recordings.sort_values(['session_offset'], ascending = True, inplace = True)

        if not len(recordings):
            # FIX: the original returned {} here but a list of records below;
            # [] keeps the return type consistent (both are falsy).
            return []

        on = annotation['time_seek'] + annotation['range_onset']
        off = annotation['time_seek'] + annotation['range_offset']

        # Cumulative session timeline: each recording spans [on, off) within
        # the concatenated session.
        recordings['on'] = recordings['duration'].cumsum().shift(periods = 1, fill_value = 0)
        recordings['off'] = recordings['duration'].cumsum()

        # Clamp both bounds into the annotation's range.
        # FIX: Series.clip(inplace=True) is deprecated/removed in modern pandas.
        recordings['on'] = recordings['on'].clip(lower = on, upper = off)
        recordings['off'] = recordings['off'].clip(lower = on, upper = off)

        # Keep only recordings with a strictly positive overlap.
        overlap = (recordings['off'] - recordings['on']).astype(int) > 0
        return recordings[overlap][['recording_filename', 'on', 'off']].to_dict(orient = 'records')
|