import_eaf_poland_safe.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Mon Jan 2 14:53:20 2023
  5. @author: lpeurey
  6. Manage the importation of eaf 2022 annotation campaign
  7. custom converter to import properly
  8. """
  9. import glob
  10. from pathlib import Path
  11. import numpy as np
  12. import pandas as pd
  13. import pympi
  14. from collections import defaultdict
  15. import os
  16. from ChildProject.projects import ChildProject
  17. from ChildProject.annotations import AnnotationManager
  18. SPEAKER_ID_TO_TYPE = defaultdict(
  19. lambda: "NA",
  20. {
  21. "CHI": "CHI",
  22. "FEM": "FEM",
  23. "MAL": "MAL",
  24. "OCH": "OCH",
  25. },
  26. )
  27. VCM_MAPPING = {
  28. 'A':'N',
  29. 'P':'N',
  30. 'W':'C',
  31. 'V':'C',
  32. 'L':'L',
  33. 'Y':'Y',
  34. 'U':'U',
  35. }
  36. XDS_MAPPING = {
  37. 'T':'T',
  38. 'C':'C',
  39. 'B':'A,C',
  40. 'A':'A',
  41. 'P':'P',
  42. 'O':'O',
  43. 'U':'U',
  44. }
  45. BP_RECS = ['77033_5/77033_5.WAV', '77021_5/V20230127-070014.WAV']
  46. def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
  47. eaf = pympi.Elan.Eaf(filename)
  48. segments = {}
  49. for tier_name in eaf.tiers:
  50. print(tier_name)
  51. annotations = eaf.tiers[tier_name][0]
  52. if (
  53. tier_name not in SPEAKER_ID_TO_TYPE
  54. and len(annotations) > 0
  55. ):
  56. print(
  57. "warning: unknown tier '{}' will be ignored in '{}'".format(
  58. tier_name, filename
  59. )
  60. )
  61. continue
  62. for aid in annotations:
  63. (start_ts, end_ts, value, svg_ref) = annotations[aid]
  64. (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
  65. segment = {
  66. "segment_onset": int(round(start_t)),
  67. "segment_offset": int(round(end_t)),
  68. "speaker_id": tier_name,
  69. "speaker_type": SPEAKER_ID_TO_TYPE[tier_name],
  70. "vcm_type": "NA",
  71. "vcm_type_precise": "NA",
  72. "msc_type": "NA",
  73. "xds_type": "NA",
  74. "gra_type": "NA",
  75. "addressee": "NA",
  76. }
  77. segments[aid] = segment
  78. for tier_name in eaf.tiers:
  79. if "@" in tier_name:
  80. label, ref = tier_name.split("@")
  81. else:
  82. label, ref = tier_name, None
  83. reference_annotations = eaf.tiers[tier_name][1]
  84. if ref not in SPEAKER_ID_TO_TYPE:
  85. continue
  86. for aid in reference_annotations:
  87. (ann, value, prev, svg) = reference_annotations[aid]
  88. ann = aid
  89. parentTier = eaf.tiers[eaf.annotations[ann]]
  90. while (
  91. "PARENT_REF" in parentTier[2]
  92. and parentTier[2]["PARENT_REF"]
  93. and len(parentTier[2]) > 0
  94. ):
  95. ann = parentTier[1][ann][0]
  96. parentTier = eaf.tiers[eaf.annotations[ann]]
  97. if ann not in segments:
  98. print(
  99. "warning: annotation '{}' not found in segments for '{}'".format(
  100. ann, filename
  101. )
  102. )
  103. continue
  104. segment = segments[ann]
  105. if value: #discard segments that have no label (kept NA)
  106. if label == "vcm":
  107. segment["vcm_type"] = VCM_MAPPING[value]
  108. segment["vcm_type_precise"] = value
  109. elif label == "msc":
  110. segment["msc_type"] = value
  111. elif label == "gra":
  112. segment["gra_type"] = value
  113. elif label == "xds":
  114. segment["addressee"] = XDS_MAPPING[value]
  115. return pd.DataFrame(segments.values())
  116. BP_REC = ['77033_5.WAV']
  117. chunk_break = 300000 #here put in miliseconds approximately how long is the shortest break between annotation chunks
  118. if __name__ == '__main__' :
  119. project = ChildProject('.')
  120. am = AnnotationManager(project)
  121. files = pd.DataFrame([
  122. {'raw_filename': f}
  123. for f in glob.glob('./annotations/eaf_2023/ak/raw/*/*.eaf') if f.split('/')[-1] in BP_REC
  124. ])
  125. files['time_seek'] = 0
  126. print(files['raw_filename'])
  127. files['raw_filename'] = files['raw_filename'].apply(os.path.basename)
  128. print(files['raw_filename'])
  129. files['recording_filename'] = files['raw_filename'].apply(lambda x: x.split('.')[-2] + '/' + x.split('.')[-2] + '.WAV')
  130. # files = files[files['recording_filename'].isin(project.recordings['recording_filename'])]
  131. files['set'] = 'eaf_2023/ak'
  132. files['format'] = 'eaf'
  133. print(files)
  134. _files = []
  135. for f in files.to_dict(orient='records'):
  136. eaf = pympi.Elan.Eaf(Path('./annotations') / 'eaf_2023' / 'ak' / 'raw' / '77033_5/77033_5.eaf')
  137. print(f['raw_filename'])
  138. df = pd.DataFrame(columns=['range_onset', 'range_offset'])
  139. eaf.get_full_time_interval()
  140. for tier in eaf.get_tier_names():
  141. for ann in eaf.get_annotation_data_for_tier(tier):
  142. df2 = pd.DataFrame({'range_onset': ann[0], 'range_offset': ann[1]}, index=[0])
  143. df = pd.concat([df, df2], ignore_index=True)
  144. df = df.sort_values('range_onset').reset_index(drop=True)
  145. dif_st = np.diff(df['range_onset'].to_numpy())
  146. idx_st = [x + 1 for x, val in enumerate(dif_st) if val >= chunk_break]
  147. start_times = df['range_onset'].to_numpy()[idx_st]
  148. start_times = np.insert(start_times, 0, df['range_onset'].to_numpy()[0])
  149. df = df.sort_values('range_offset').reset_index(drop=True)
  150. dif_end = np.diff(df['range_offset'].to_numpy())
  151. idx_end = [x for x, val in enumerate(dif_end) if val >= chunk_break]
  152. end_times = df['range_offset'].to_numpy()[idx_end]
  153. end_times = np.append(end_times, df['range_offset'].to_numpy()[-1])
  154. final = pd.DataFrame(columns=['range_onset', 'range_offset'])
  155. final['range_onset'] = start_times
  156. final['range_offset'] = end_times
  157. final['time_seek'] = 0
  158. final['raw_filename'] = '77033_5/77033_5.eaf'
  159. final['recording_filename'] = '77033_5/77033_5.WAV'
  160. final['format'] = 'eaf'
  161. final['set'] = 'eaf_2023/ak'
  162. #_files.append(pd.DataFrame([f]))
  163. # for tier_name in ['CHI', 'FEM', 'MAL', 'OCH']:
  164. # portions = eaf.tiers[tier_name][0] #tier names
  165. #
  166. # for pid in portions:
  167. # (start_ts, end_ts, value, svg_ref) = portions[pid]
  168. # (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
  169. #
  170. # # if value.upper() != 'Y':
  171. # # continue
  172. # f['tier'] = tier_name
  173. # f['range_onset'] = start_t
  174. # f['range_offset'] = end_t
  175. #
  176. # _files.append(pd.DataFrame([f]))
  177. #import_df = pd.concat(_files).reset_index(drop=True)
  178. import_df = final.reset_index(drop=True)
  179. print(import_df)
  180. # import_df = project.recordings[['recording_filename', 'duration']]
  181. # import_df = import_df[import_df["recording_filename"].isin(BP_RECS)] #only keep bp recs
  182. # import_df.rename(columns={'duration':'range_offset'}, inplace=True)
  183. #
  184. # #import_df['set'] = 'eaf_2022/an1' # first batch
  185. # import_df['set'] = 'eaf_2023/ak' #import bautista's annotations
  186. #
  187. # import_df['time_seek'] = 0
  188. # import_df['range_onset'] = 0
  189. # import_df['format'] = 'eaf'
  190. #
  191. # #import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: RECORDINGS_MAPPING[x])
  192. # import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: os.path.basename(x.replace(".WAV",".eaf")))
  193. #print(import_df)
  194. #am.remove_set('eaf_2023/ak') #JESLI NIE DZIALA
  195. am.import_annotations(import_df, threads=1, import_function=convert, overwrite_existing=True)