annotations.py 3.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. from ChildProject.projects import ChildProject
  2. from ChildProject.annotations import AnnotationManager
  3. from lxml import etree
  4. import os
  5. import re
  6. import pandas as pd
  7. import multiprocessing as mp
  8. def extract_from_regex(pattern, subject):
  9. match = pattern.search(subject)
  10. return match.group(1) if match else ''
  11. class AnnotationImporter:
  12. parameters = {
  13. 'vtc': {'set': 'vtc', 'format': 'vtc_rttm'},
  14. 'alice': {'set': 'alice/output', 'format': 'alice'},
  15. 'vcm': {'set': 'vcm', 'format': 'vcm_rttm'},
  16. 'its': {'set': 'its', 'format': 'its'}
  17. }
  18. def __init__(self, path, threads = 1):
  19. self.path = path
  20. self.project = ChildProject('.')
  21. self.am = AnnotationManager(self.project)
  22. def extract_offsets(self, row):
  23. its = row['its_filename']
  24. xml = etree.parse(os.path.join('annotations/its/raw', its))
  25. timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
  26. recording = xml.xpath('/ITS/ProcessingUnit/Recording[@num="{}"]'.format(row['filter']))[0]
  27. row['range_onset'] = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('startTime'))))
  28. row['range_offset'] = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('endTime'))))
  29. row['time_seek'] = -int(row['range_onset'])
  30. return row
  31. def process(self, set, filter_func = None, raw_name_func = None, threads = 1):
  32. threads = threads if threads >= 1 else mp.cpu_count()
  33. input = self.project.recordings[['recording_filename', 'its_filename', 'duration']]
  34. input['set'] = self.parameters[set]['set']
  35. input['format'] = self.parameters[set]['format']
  36. input['filter'] = input['recording_filename'].str.extract(r"_([0-9]{1,})$")
  37. input['raw_filename'] = ''
  38. pool = mp.Pool(processes = threads)
  39. input = pd.DataFrame(pool.map(self.extract_offsets, input.to_dict(orient = 'records')))
  40. if set == 'its':
  41. input['raw_filename'] = input['its_filename']
  42. if callable(filter_func):
  43. input['filter'] = input.apply(filter_func, axis = 1)
  44. if callable(raw_name_func):
  45. input['raw_filename'] = input.apply(raw_name_func, axis = 1)
  46. input.dropna(subset = ['raw_filename'], inplace = True)
  47. self.am.remove_set(set, recursive = True)
  48. self.am.import_annotations(input, threads = threads)
  49. if set == 'alice':
  50. self.am.merge_sets(
  51. left_set = 'vtc',
  52. right_set = 'alice/output',
  53. left_columns = ['speaker_type'],
  54. right_columns = ['phonemes','syllables','words'],
  55. output_set = 'alice',
  56. threads = threads
  57. )
  58. def matching_recordings(self, annotation):
  59. recordings = self.project.recordings[self.project.recordings['session_id'] == annotation['session_id']]
  60. recordings.sort_values(['session_offset'], ascending = True, inplace = True)
  61. if not len(recordings):
  62. return {}
  63. on = annotation['time_seek'] + annotation['range_onset']
  64. off = annotation['time_seek'] + annotation['range_offset']
  65. recordings['on'] = recordings['duration'].cumsum().shift(periods = 1, fill_value = 0)
  66. recordings['off'] = recordings['duration'].cumsum()
  67. recordings['on'].clip(lower = on, upper = off, inplace = True)
  68. recordings['off'].clip(lower = on, upper = off, inplace = True)
  69. return recordings[(recordings['off']-recordings['on']).astype(int) > 0][['recording_filename', 'on', 'off']]\
  70. .to_dict(orient = 'records')