annotations.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. from ChildProject.projects import ChildProject
  2. from ChildProject.annotations import AnnotationManager
  3. from lxml import etree
  4. import os
  5. import re
  6. import pandas as pd
  7. import multiprocessing as mp
  8. def extract_from_regex(pattern, subject):
  9. match = pattern.search(subject)
  10. return match.group(1) if match else ''
  11. class AnnotationImporter:
  12. parameters = {
  13. 'vtc': {'set': 'vtc', 'format': 'vtc_rttm'},
  14. 'alice': {'set': 'alice/output', 'format': 'alice'},
  15. 'vcm': {'set': 'vcm', 'format': 'vcm_rttm'},
  16. 'its': {'set': 'its', 'format': 'its'}
  17. }
  18. def __init__(self, path, threads = 1):
  19. self.path = path
  20. self.project = ChildProject('.')
  21. self.am = AnnotationManager(self.project)
  22. def extract_offsets(self, row):
  23. its = row['its_filename']
  24. xml = etree.parse(os.path.join('annotations/its/raw', its))
  25. timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
  26. recording = xml.xpath('/ITS/ProcessingUnit/Recording[@num="{}"]'.format(row['filter']))[0]
  27. row['range_onset'] = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('startTime'))))
  28. row['range_offset'] = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('endTime'))))
  29. row['time_seek'] = -int(row['range_onset'])
  30. return row
  31. def process(self, set, filter_func = None, raw_name_func = None, format = None, threads = 1):
  32. threads = threads if threads >= 1 else mp.cpu_count()
  33. input = self.project.recordings[['recording_filename', 'its_filename', 'duration']]
  34. input['set'] = self.parameters[set]['set'] if set in self.parameters else set
  35. input['format'] = self.parameters[set]['format'] if set in self.parameters else format
  36. input['filter'] = input['recording_filename'].str.extract(r"_([0-9]{1,})$")
  37. input['raw_filename'] = ''
  38. pool = mp.Pool(processes = threads)
  39. input = pd.DataFrame(pool.map(self.extract_offsets, input.to_dict(orient = 'records')))
  40. if set == 'its':
  41. input['raw_filename'] = input['its_filename']
  42. if callable(filter_func):
  43. input['filter'] = input.apply(filter_func, axis = 1)
  44. if callable(raw_name_func):
  45. input['raw_filename'] = input.apply(raw_name_func, axis = 1)
  46. input.dropna(subset = ['raw_filename'], inplace = True)
  47. self.am.remove_set(set, recursive = True)
  48. self.am.import_annotations(input, threads = threads)
  49. if set == 'alice':
  50. self.am.merge_sets(
  51. left_set = 'vtc',
  52. right_set = 'alice/output',
  53. left_columns = ['speaker_type'],
  54. right_columns = ['phonemes','syllables','words'],
  55. output_set = 'alice',
  56. threads = threads
  57. )
  58. def matching_recordings(self, annotation):
  59. recordings = self.project.recordings[self.project.recordings['session_id'] == annotation['session_id']].copy()
  60. recordings = recordings[['recording_filename', 'duration', 'session_offset']]
  61. recordings = recordings.assign(**annotation)
  62. recordings.sort_values(['session_offset'], ascending = True, inplace = True)
  63. if not len(recordings):
  64. return {}
  65. on = annotation['time_seek'] + annotation['range_onset']
  66. off = annotation['time_seek'] + annotation['range_offset']
  67. recordings['start'] = recordings['duration'].cumsum().shift(periods = 1, fill_value = 0)
  68. recordings['on'] = recordings['start']
  69. recordings['off'] = recordings['duration'].cumsum()
  70. recordings['on'].clip(lower = on, upper = off, inplace = True)
  71. recordings['off'].clip(lower = on, upper = off, inplace = True)
  72. recordings = recordings[(recordings['off']-recordings['on']).astype(int) > 0]
  73. recordings['time_seek'] = annotation['time_seek'] - recordings['start']
  74. recordings['range_onset'] = annotation['range_onset'] + (recordings['on'] - on)
  75. recordings['range_offset'] = annotation['range_offset'] + (recordings['off'] - off)
  76. return recordings