Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

annotations.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. from ChildProject.projects import ChildProject
  2. from ChildProject.annotations import AnnotationManager
  3. from lxml import etree
  4. import os
  5. import re
  6. import pandas as pd
  7. import multiprocessing as mp
  8. def extract_from_regex(pattern, subject):
  9. match = pattern.search(subject)
  10. return match.group(1) if match else ''
  11. class AnnotationImporter:
  12. parameters = {
  13. 'vtc': {'set': 'vtc', 'format': 'vtc_rttm'},
  14. 'alice': {'set': 'alice/output', 'format': 'alice'},
  15. 'vcm': {'set': 'vcm', 'format': 'vcm_rttm'},
  16. 'its': {'set': 'its', 'format': 'its'}
  17. }
  18. def __init__(self, path, threads = 1):
  19. self.path = path
  20. self.project = ChildProject('.')
  21. self.am = AnnotationManager(self.project)
  22. def extract_offsets(self, row):
  23. its = row['its_filename']
  24. xml = etree.parse(os.path.join('annotations/its/raw', its))
  25. timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
  26. recording = xml.xpath('/ITS/ProcessingUnit/Recording[@num="{}"]'.format(row['filter']))[0]
  27. start_time = round(1000*float(extract_from_regex(timestamp_pattern, recording.get('startTime'))))
  28. end_time = round(1000*float(extract_from_regex(timestamp_pattern, recording.get('endTime'))))
  29. row['range_onset'] = 0
  30. row['range_offset'] = end_time - start_time
  31. row['time_seek'] = -start_time
  32. return row
  33. def process(self, set, filter_func = None, raw_name_func = None, format = None, threads = 1):
  34. threads = threads if threads >= 1 else mp.cpu_count()
  35. input = self.project.recordings[['recording_filename', 'its_filename', 'duration']]
  36. input['set'] = self.parameters[set]['set'] if set in self.parameters else set
  37. input['format'] = self.parameters[set]['format'] if set in self.parameters else format
  38. input['filter'] = input['recording_filename'].str.extract(r"_([0-9]{1,})(?:\.wav)?$")
  39. input['raw_filename'] = ''
  40. pool = mp.Pool(processes = threads)
  41. input = pd.DataFrame(pool.map(self.extract_offsets, input.to_dict(orient = 'records')))
  42. if set == 'its':
  43. input['raw_filename'] = input['its_filename']
  44. if callable(filter_func):
  45. input['filter'] = input.apply(filter_func, axis = 1)
  46. if callable(raw_name_func):
  47. input['raw_filename'] = input.apply(raw_name_func, axis = 1)
  48. input.dropna(subset = ['raw_filename'], inplace = True)
  49. self.am.remove_set(set, recursive = True)
  50. self.am.import_annotations(input, threads = threads)
  51. if set == 'alice':
  52. self.am.merge_sets(
  53. left_set = 'vtc',
  54. right_set = 'alice/output',
  55. left_columns = ['speaker_type'],
  56. right_columns = ['phonemes','syllables','words'],
  57. output_set = 'alice',
  58. threads = threads
  59. )
  60. def matching_recordings(self, annotation):
  61. recordings = self.project.recordings[self.project.recordings['session_id'] == annotation['session_id']].copy()
  62. recordings = recordings[['recording_filename', 'duration', 'session_offset']]
  63. recordings = recordings.assign(**annotation)
  64. recordings['n'] = recordings['recording_filename'].str.extract(r"_([0-9]+)(?:\.wav)?$").astype(int)
  65. recordings.sort_values(['n'], ascending = True, inplace = True)
  66. if not len(recordings):
  67. return {}
  68. recordings['start'] = recordings['duration'].cumsum().shift(periods = 1, fill_value = 0)
  69. # segments of the session covered by each recording
  70. recordings['range_onset'] = recordings['start']
  71. recordings['range_offset'] = recordings['duration'].cumsum()
  72. # segments of the annotation covered by each recording, timestamps relative to beginning of the session
  73. recordings['range_onset'].clip(lower = annotation['range_onset'], upper = annotation['range_offset'], inplace = True)
  74. recordings['range_offset'].clip(lower = annotation['range_onset'], upper = annotation['range_offset'], inplace = True)
  75. # remove recordings that do not intersect with the annotation
  76. recordings = recordings[(recordings['range_offset']-recordings['range_onset']).astype(int) > 0]
  77. # translate session timestamps to recording-level timestamps.
  78. recordings['time_seek'] = annotation['time_seek'] - recordings['start']
  79. recordings['range_onset'] -= recordings['start']
  80. recordings['range_offset'] -= recordings['start']
  81. return recordings