Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

annotations.py 3.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. from ChildProject.projects import ChildProject
  2. from ChildProject.annotations import AnnotationManager
  3. from lxml import etree
  4. import os
  5. import re
  6. import pandas as pd
  7. import multiprocessing as mp
  8. def extract_from_regex(pattern, subject):
  9. match = pattern.search(subject)
  10. return match.group(1) if match else ''
  11. class AnnotationImporter:
  12. parameters = {
  13. 'vtc': {'set': 'vtc', 'format': 'vtc_rttm'},
  14. 'alice': {'set': 'alice/output', 'format': 'alice'},
  15. 'vcm': {'set': 'vcm', 'format': 'vcm_rttm'},
  16. 'its': {'set': 'its', 'format': 'its'}
  17. }
  18. def __init__(self, path, threads = 1):
  19. self.path = path
  20. self.project = ChildProject('.')
  21. self.am = AnnotationManager(self.project)
  22. def extract_offsets(self, row):
  23. its = row['its_filename']
  24. xml = etree.parse(os.path.join('annotations/its/raw', its))
  25. timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
  26. recording = xml.xpath('/ITS/ProcessingUnit/Recording[@num="{}"]'.format(row['filter']))[0]
  27. row['range_onset'] = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('startTime'))))
  28. row['range_offset'] = int(1000*float(extract_from_regex(timestamp_pattern, recording.get('endTime'))))
  29. row['time_seek'] = -int(row['range_onset'])
  30. return row
  31. def process(self, set, filter_func = None, raw_name_func = None, threads = 1):
  32. threads = threads if threads >= 1 else mp.cpu_count()
  33. input = self.project.recordings[['recording_filename', 'its_filename', 'duration']]
  34. input['set'] = self.parameters[set]['set']
  35. input['format'] = self.parameters[set]['format']
  36. input['filter'] = input['recording_filename'].str.extract(r"_([0-9]{1,})$")
  37. input['raw_filename'] = ''
  38. pool = mp.Pool(processes = threads)
  39. input = pd.DataFrame(pool.map(self.extract_offsets, input.to_dict(orient = 'records')))
  40. if set == 'its':
  41. input['raw_filename'] = input['its_filename']
  42. if callable(filter_func):
  43. input['filter'] = input.apply(filter_func, axis = 1)
  44. if callable(raw_name_func):
  45. input['raw_filename'] = input.apply(raw_name_func, axis = 1)
  46. input.dropna(subset = ['raw_filename'], inplace = True)
  47. self.am.remove_set(set, recursive = True)
  48. self.am.import_annotations(input, threads = threads)
  49. if set == 'alice':
  50. self.am.merge_sets(
  51. left_set = 'vtc',
  52. right_set = 'alice/output',
  53. left_columns = ['speaker_type'],
  54. right_columns = ['phonemes','syllables','words'],
  55. output_set = 'alice',
  56. threads = threads
  57. )
  58. def matching_recordings(self, annotation):
  59. recordings = self.project.recordings[self.project.recordings['session_id'] == annotation['session_id']]
  60. recordings.sort_values(['session_offset'], ascending = True, inplace = True)
  61. if not len(recordings):
  62. return {}
  63. on = annotation['time_seek'] + annotation['range_onset']
  64. off = annotation['time_seek'] + annotation['range_offset']
  65. recordings['on'] = recordings['duration'].cumsum().shift(periods = 1, fill_value = 0)
  66. recordings['off'] = recordings['duration'].cumsum()
  67. recordings['on'].clip(lower = on, upper = off, inplace = True)
  68. recordings['off'].clip(lower = on, upper = off, inplace = True)
  69. return recordings[(recordings['off']-recordings['on']).astype(int) > 0][['recording_filename', 'on', 'off']]\
  70. .to_dict(orient = 'records')