# LAAC-LSCP / tools
import multiprocessing as mp
import os
import re

import pandas as pd
from ChildProject.annotations import AnnotationManager
from ChildProject.projects import ChildProject
from lxml import etree

def extract_from_regex(pattern, subject):
    """Return the first capture group of ``pattern`` found in ``subject``.

    ``pattern`` is a compiled regular expression; it is applied with
    ``search``, so the match may start anywhere unless the pattern itself is
    anchored. An empty string is returned when nothing matches.
    """
    found = pattern.search(subject)
    if found is None:
        return ''
    return found.group(1)

class AnnotationImporter:
    """Import automated annotations into a ChildProject dataset.

    One row of import instructions is built per recording of the project,
    using the ITS file attached to each recording to find the time range the
    recording covers, and handed to the project's ``AnnotationManager``.
    """

    # Annotator key -> annotation set the files are imported into and the
    # converter format name handed to the annotation manager.
    parameters = {
        'vtc': {'set': 'vtc', 'format': 'vtc_rttm'},
        'alice': {'set': 'alice/output', 'format': 'alice'},
        'vcm': {'set': 'vcm', 'format': 'vcm_rttm'},
        'its': {'set': 'its', 'format': 'its'}
    }

    # ITS start/end times look like "PT123.45S" (or "P123.45S"); the first
    # group captures the number of seconds. Compiled once instead of per row.
    _TIMESTAMP_PATTERN = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")

    def __init__(self, path, threads=1):
        """Open the project located at ``path``.

        Args:
            path: root directory of the dataset. It is used both to open the
                project and to locate the raw ITS files.
            threads: accepted for backward compatibility; the number of
                workers actually used is the ``threads`` argument of
                :meth:`process`.
        """
        self.path = path

        # Honour the requested path rather than always opening the current
        # working directory.
        self.project = ChildProject(path)
        self.am = AnnotationManager(self.project)

    def extract_offsets(self, row):
        """Fill in the time range of one recording from its ITS file.

        Args:
            row: mapping with at least ``its_filename`` (file name under
                ``annotations/its/raw`` in the project) and ``filter`` (the
                ``num`` attribute of the ``Recording`` element to read).

        Returns:
            The same ``row`` with ``range_onset`` and ``range_offset``
            (milliseconds, truncated to int) and ``time_seek`` (the negated
            onset) added.

        Raises:
            IndexError: no ``Recording`` element carries the requested ``num``.
            ValueError: a start/end time does not match the expected format.
        """
        its_path = os.path.join(self.path, 'annotations/its/raw', row['its_filename'])
        xml = etree.parse(its_path)

        recording = xml.xpath(
            '/ITS/ProcessingUnit/Recording[@num="{}"]'.format(row['filter'])
        )[0]

        onset_s = extract_from_regex(self._TIMESTAMP_PATTERN, recording.get('startTime'))
        offset_s = extract_from_regex(self._TIMESTAMP_PATTERN, recording.get('endTime'))

        # Seconds -> integer milliseconds.
        row['range_onset'] = int(1000 * float(onset_s))
        row['range_offset'] = int(1000 * float(offset_s))
        row['time_seek'] = -int(row['range_onset'])
        return row

    def process(self, set, filter_func=None, raw_name_func=None, threads=1):
        """Rebuild the annotations of one annotator for every recording.

        Args:
            set: key of :attr:`parameters` ('vtc', 'alice', 'vcm' or 'its').
            filter_func: optional callable applied to each row (``axis=1``)
                whose result replaces the ``filter`` column.
            raw_name_func: optional callable applied to each row (``axis=1``)
                whose result becomes ``raw_filename``; rows for which it
                yields a missing value are dropped.
            threads: number of worker processes; any value below 1 means one
                per CPU.

        Raises:
            KeyError: ``set`` is not a key of :attr:`parameters`.
        """
        threads = threads if threads >= 1 else mp.cpu_count()
        settings = self.parameters[set]

        # Work on a copy: the columns added below must not be written onto
        # (a view of) the project's own recordings table.
        annotations = self.project.recordings[
            ['recording_filename', 'its_filename', 'duration']
        ].copy()
        annotations['set'] = settings['set']
        annotations['format'] = settings['format']
        # Trailing "_<digits>" of the recording name selects the ITS Recording.
        annotations['filter'] = annotations['recording_filename'].str.extract(
            r"_([0-9]{1,})$", expand=False
        )
        annotations['raw_filename'] = ''

        # The context manager releases the worker processes once the map is
        # done, instead of leaving the pool open.
        with mp.Pool(processes=threads) as pool:
            rows = pool.map(self.extract_offsets, annotations.to_dict(orient='records'))
        annotations = pd.DataFrame(rows)

        if set == 'its':
            annotations['raw_filename'] = annotations['its_filename']

        if callable(filter_func):
            annotations['filter'] = annotations.apply(filter_func, axis=1)

        if callable(raw_name_func):
            annotations['raw_filename'] = annotations.apply(raw_name_func, axis=1)

        annotations = annotations.dropna(subset=['raw_filename'])

        # The annotator key itself is removed recursively (for 'alice' that is
        # the parent of the 'alice/output' set being re-imported).
        self.am.remove_set(set, recursive=True)
        self.am.import_annotations(annotations, threads=threads)

        if set == 'alice':
            # Combine the speaker labels of 'vtc' with the unit counts of
            # 'alice/output' into the 'alice' set.
            self.am.merge_sets(
                left_set='vtc',
                right_set='alice/output',
                left_columns=['speaker_type'],
                right_columns=['phonemes', 'syllables', 'words'],
                output_set='alice',
                threads=threads
            )

    def matching_recordings(self, annotation):
        """List the recordings of a session overlapped by an annotation.

        The recordings sharing the annotation's ``session_id`` are laid end to
        end in ``session_offset`` order (positions are cumulative durations),
        and each one is intersected with the interval
        ``[time_seek + range_onset, time_seek + range_offset]``.

        Args:
            annotation: mapping with ``session_id``, ``time_seek``,
                ``range_onset`` and ``range_offset``.

        Returns:
            A list of ``{'recording_filename', 'on', 'off'}`` dicts, one per
            recording whose overlap is at least one whole unit long; an empty
            list when the session has no recording.
        """
        all_recordings = self.project.recordings
        recordings = all_recordings[
            all_recordings['session_id'] == annotation['session_id']
        ]

        if recordings.empty:
            return []

        # sort_values returns a new frame, so the columns added below never
        # touch the project's recordings table.
        recordings = recordings.sort_values(['session_offset'], ascending=True)

        on = annotation['time_seek'] + annotation['range_onset']
        off = annotation['time_seek'] + annotation['range_offset']

        ends = recordings['duration'].cumsum()
        starts = ends.shift(periods=1, fill_value=0)

        # Assign the clipped series back explicitly: an in-place clip on a
        # column accessor is not guaranteed to reach the parent frame.
        recordings['on'] = starts.clip(lower=on, upper=off)
        recordings['off'] = ends.clip(lower=on, upper=off)

        overlapping = (recordings['off'] - recordings['on']).astype(int) > 0
        return recordings[overlapping][['recording_filename', 'on', 'off']] \
            .to_dict(orient='records')