annotations.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. from ChildProject.projects import ChildProject
  2. from ChildProject.annotations import AnnotationManager
  3. from lxml import etree
  4. import os
  5. import re
  6. import pandas as pd
  7. import multiprocessing as mp
  8. def extract_from_regex(pattern, subject):
  9. match = pattern.search(subject)
  10. return match.group(1) if match else ''
  11. class AnnotationImporter:
  12. parameters = {
  13. 'vtc': {'set': 'vtc', 'format': 'vtc_rttm'},
  14. 'alice': {'set': 'alice/output', 'format': 'alice'},
  15. 'vcm': {'set': 'vcm', 'format': 'vcm_rttm'},
  16. 'its': {'set': 'its', 'format': 'its'}
  17. }
  18. def __init__(self, path, threads = 1):
  19. self.path = path
  20. self.project = ChildProject('.')
  21. self.am = AnnotationManager(self.project)
  22. def extract_offsets(self, row):
  23. its = row['its_filename']
  24. xml = etree.parse(os.path.join('annotations/its/raw', its))
  25. timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
  26. recording = xml.xpath('/ITS/ProcessingUnit/Recording[@num="{}"]'.format(row['filter']))[0]
  27. start_time = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('startTime'))))
  28. end_time = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('endTime'))))
  29. row['range_onset'] = 0
  30. row['range_offset'] = end_time - start_time
  31. row['time_seek'] = -start_time
  32. return row
  33. def process(self, set, filter_func = None, raw_name_func = None, format = None, threads = 1):
  34. threads = threads if threads >= 1 else mp.cpu_count()
  35. input = self.project.recordings[['recording_filename', 'its_filename', 'duration']]
  36. input['set'] = self.parameters[set]['set'] if set in self.parameters else set
  37. input['format'] = self.parameters[set]['format'] if set in self.parameters else format
  38. input['filter'] = input['recording_filename'].str.extract(r"_([0-9]{1,})$")
  39. input['raw_filename'] = ''
  40. pool = mp.Pool(processes = threads)
  41. input = pd.DataFrame(pool.map(self.extract_offsets, input.to_dict(orient = 'records')))
  42. if set == 'its':
  43. input['raw_filename'] = input['its_filename']
  44. if callable(filter_func):
  45. input['filter'] = input.apply(filter_func, axis = 1)
  46. if callable(raw_name_func):
  47. input['raw_filename'] = input.apply(raw_name_func, axis = 1)
  48. input.dropna(subset = ['raw_filename'], inplace = True)
  49. self.am.remove_set(set, recursive = True)
  50. self.am.import_annotations(input, threads = threads)
  51. if set == 'alice':
  52. self.am.merge_sets(
  53. left_set = 'vtc',
  54. right_set = 'alice/output',
  55. left_columns = ['speaker_type'],
  56. right_columns = ['phonemes','syllables','words'],
  57. output_set = 'alice',
  58. threads = threads
  59. )
  60. def matching_recordings(self, annotation):
  61. recordings = self.project.recordings[self.project.recordings['session_id'] == annotation['session_id']].copy()
  62. recordings = recordings[['recording_filename', 'duration', 'session_offset']]
  63. recordings = recordings.assign(**annotation)
  64. recordings['n'] = recordings['recording_filename'].str.extract(r"_([0-9]+)$").astype(int)
  65. recordings.sort_values(['n'], ascending = True, inplace = True)
  66. if not len(recordings):
  67. return {}
  68. recordings['start'] = recordings['duration'].cumsum().shift(periods = 1, fill_value = 0)
  69. # segments of the session covered by each recording
  70. recordings['range_onset'] = recordings['start']
  71. recordings['range_offset'] = recordings['duration'].cumsum()
  72. # segments of the annotation covered by each recording, timestamps relative to beginning of the session
  73. recordings['range_onset'].clip(lower = annotation['range_onset'], upper = annotation['range_offset'], inplace = True)
  74. recordings['range_offset'].clip(lower = annotation['range_onset'], upper = annotation['range_offset'], inplace = True)
  75. # remove recordings that do not intersect with the annotation
  76. recordings = recordings[(recordings['range_offset']-recordings['range_onset']).astype(int) > 0]
  77. # translate session timestamps to recording-level timestamps.
  78. recordings['time_seek'] = annotation['time_seek'] - recordings['start']
  79. recordings['range_onset'] -= recordings['start']
  80. recordings['range_offset'] -= recordings['start']
  81. return recordings