AgataKoziol преди 6 месеца
родител
ревизия
2d9d0c1368
променени са 33 файла, в които са добавени 320 реда и са изтрити 106 реда
  1. 1 0
      annotations/alice/converted/.DS_Store
  2. 1 1
      annotations/eaf_2023/ak/converted/.DS_Store
  3. 1 1
      annotations/eaf_2023/ak/converted/77033_5/77033_5_18623929_18668843.csv
  4. 1 0
      annotations/joined/77021_5_vtc+eaf.eaf
  5. 1 0
      annotations/joined/77021_5_vtc+eaf.pfsx
  6. 1 0
      annotations/joined/77033_5_vtc+eaf.eaf
  7. 1 0
      annotations/joined/77033_5_vtc+eaf.pfsx
  8. 0 1
      conf_matrix.png
  9. 0 1
      conf_matrix_normalized.png
  10. 1 0
      confusion_matrices/.DS_Store
  11. 1 0
      confusion_matrices/77021+77033_conf_matrix.png
  12. 1 0
      confusion_matrices/77021+77033_conf_matrix_normalized.png
  13. 1 0
      confusion_matrices/77021+77033_scores.txt
  14. 1 0
      confusion_matrices/77033_5_conf_matrix.png
  15. 1 0
      confusion_matrices/77033_5_normalized.png
  16. 1 0
      confusion_matrices/77033_5_scores.txt
  17. 1 0
      confusion_matrices/V20230127-070014_conf_matrix.png
  18. 1 0
      confusion_matrices/V20230127-070014_conf_matrix_normalized.png
  19. 1 0
      confusion_matrices/V20230127-070014_scores.txt
  20. 1 0
      extra/errors_import_20231024-143103.csv
  21. 1 0
      extra/errors_import_20231024-145554.csv
  22. 11 12
      metadata/annotations.csv
  23. 0 1
      scores.txt
  24. BIN
      scripts/.DS_Store
  25. 3 0
      scripts/.idea/.gitignore
  26. 1 0
      scripts/.idea/inspectionProfiles/profiles_settings.xml
  27. 1 0
      scripts/.idea/misc.xml
  28. 1 0
      scripts/.idea/modules.xml
  29. 1 0
      scripts/.idea/scripts.iml
  30. 1 0
      scripts/.idea/vcs.xml
  31. 6 3
      scripts/confusion_matrix2.py
  32. 41 86
      scripts/import_eaf_poland.py
  33. 235 0
      scripts/import_eaf_poland_safe.py

+ 1 - 0
annotations/alice/converted/.DS_Store

@@ -0,0 +1 @@
+../../../.git/annex/objects/0x/mJ/MD5E-s6148--3c37a1be6541f69cc00ad97ae4d861dc/MD5E-s6148--3c37a1be6541f69cc00ad97ae4d861dc

+ 1 - 1
annotations/eaf_2023/ak/converted/.DS_Store

@@ -1 +1 @@
-../../../../.git/annex/objects/5P/W9/MD5E-s6148--5dd6486a90acc6f6787dee9f770b7fbc/MD5E-s6148--5dd6486a90acc6f6787dee9f770b7fbc
+../../../../.git/annex/objects/p9/40/MD5E-s6148--92aed08864d62d1198b10f4f6aa0d279/MD5E-s6148--92aed08864d62d1198b10f4f6aa0d279

+ 1 - 1
annotations/eaf_2023/ak/converted/77033_5/77033_5_18623929_18668843.csv

@@ -1 +1 @@
-../../../../../.git/annex/objects/8Z/kp/MD5E-s2177--615c45205e56fea6994f1baeccb63723.csv/MD5E-s2177--615c45205e56fea6994f1baeccb63723.csv
+../../../../../.git/annex/objects/Mm/Fg/MD5E-s2049--be7535d17d7bc6c774a5822350d90899.csv/MD5E-s2049--be7535d17d7bc6c774a5822350d90899.csv

+ 1 - 0
annotations/joined/77021_5_vtc+eaf.eaf

@@ -0,0 +1 @@
+../../.git/annex/objects/23/34/MD5E-s167157--ee333c782d3c1afa578091011fe31646.eaf/MD5E-s167157--ee333c782d3c1afa578091011fe31646.eaf

+ 1 - 0
annotations/joined/77021_5_vtc+eaf.pfsx

@@ -0,0 +1 @@
+../../.git/annex/objects/3P/Jz/MD5E-s4866--15f83ed589aafd7e53a66934f79dd2f2.pfsx/MD5E-s4866--15f83ed589aafd7e53a66934f79dd2f2.pfsx

+ 1 - 0
annotations/joined/77033_5_vtc+eaf.eaf

@@ -0,0 +1 @@
+../../.git/annex/objects/0f/xJ/MD5E-s4187493--8a6fc4e0834bb7dab0c751c3f18d7677.eaf/MD5E-s4187493--8a6fc4e0834bb7dab0c751c3f18d7677.eaf

+ 1 - 0
annotations/joined/77033_5_vtc+eaf.pfsx

@@ -0,0 +1 @@
+../../.git/annex/objects/jX/QW/MD5E-s4918--628e68886b78beae4b8e8f395ceeb5fc.pfsx/MD5E-s4918--628e68886b78beae4b8e8f395ceeb5fc.pfsx

+ 0 - 1
conf_matrix.png

@@ -1 +0,0 @@
-.git/annex/objects/wz/jF/MD5E-s41831--9f7bf87dafe197a2a77172d0110d72e5.png/MD5E-s41831--9f7bf87dafe197a2a77172d0110d72e5.png

+ 0 - 1
conf_matrix_normalized.png

@@ -1 +0,0 @@
-.git/annex/objects/m0/9w/MD5E-s54528--6510a946809a440c5d7ca1311cd8a30e.png/MD5E-s54528--6510a946809a440c5d7ca1311cd8a30e.png

+ 1 - 0
confusion_matrices/.DS_Store

@@ -0,0 +1 @@
+../.git/annex/objects/f8/xk/MD5E-s6148--8f9b2022689daadc3b199b10d14211d3/MD5E-s6148--8f9b2022689daadc3b199b10d14211d3

+ 1 - 0
confusion_matrices/77021+77033_conf_matrix.png

@@ -0,0 +1 @@
+../.git/annex/objects/wz/jF/MD5E-s41831--9f7bf87dafe197a2a77172d0110d72e5.png/MD5E-s41831--9f7bf87dafe197a2a77172d0110d72e5.png

+ 1 - 0
confusion_matrices/77021+77033_conf_matrix_normalized.png

@@ -0,0 +1 @@
+../.git/annex/objects/m0/9w/MD5E-s54528--6510a946809a440c5d7ca1311cd8a30e.png/MD5E-s54528--6510a946809a440c5d7ca1311cd8a30e.png

+ 1 - 0
confusion_matrices/77021+77033_scores.txt

@@ -0,0 +1 @@
+../.git/annex/objects/v5/5K/MD5E-s456--623782ee688701a2a4a087d4b970fed5.txt/MD5E-s456--623782ee688701a2a4a087d4b970fed5.txt

+ 1 - 0
confusion_matrices/77033_5_conf_matrix.png

@@ -0,0 +1 @@
+../.git/annex/objects/wm/qv/MD5E-s36394--df4b2ba29f0729f54dbc999c1d6cabf9.png/MD5E-s36394--df4b2ba29f0729f54dbc999c1d6cabf9.png

+ 1 - 0
confusion_matrices/77033_5_normalized.png

@@ -0,0 +1 @@
+../.git/annex/objects/8q/Q5/MD5E-s41337--bbca154b290ab8bdc9914ae992184712.png/MD5E-s41337--bbca154b290ab8bdc9914ae992184712.png

+ 1 - 0
confusion_matrices/77033_5_scores.txt

@@ -0,0 +1 @@
+../.git/annex/objects/0X/47/MD5E-s348--bdefb12599ba68fcd62ee4d4f1bc1d7b.txt/MD5E-s348--bdefb12599ba68fcd62ee4d4f1bc1d7b.txt

+ 1 - 0
confusion_matrices/V20230127-070014_conf_matrix.png

@@ -0,0 +1 @@
+../.git/annex/objects/3M/qg/MD5E-s33270--67de2ebe6367628f980cf32cc108ac15.png/MD5E-s33270--67de2ebe6367628f980cf32cc108ac15.png

+ 1 - 0
confusion_matrices/V20230127-070014_conf_matrix_normalized.png

@@ -0,0 +1 @@
+../.git/annex/objects/gw/8v/MD5E-s43589--f1d2a8084f8e62425c6c27d0d5497964.png/MD5E-s43589--f1d2a8084f8e62425c6c27d0d5497964.png

+ 1 - 0
confusion_matrices/V20230127-070014_scores.txt

@@ -0,0 +1 @@
+../.git/annex/objects/kq/67/MD5E-s365--01d8a336db03795b57ce8975da0f67f6.txt/MD5E-s365--01d8a336db03795b57ce8975da0f67f6.txt

+ 1 - 0
extra/errors_import_20231024-143103.csv

@@ -0,0 +1 @@
+../.git/annex/objects/2w/FV/MD5E-s2806--505927defcd0f4217878cb5dbcc942e5.csv/MD5E-s2806--505927defcd0f4217878cb5dbcc942e5.csv

+ 1 - 0
extra/errors_import_20231024-145554.csv

@@ -0,0 +1 @@
+../.git/annex/objects/zg/p5/MD5E-s2806--9501437cea7af8f9e8b52c17cb0956fb.csv/MD5E-s2806--9501437cea7af8f9e8b52c17cb0956fb.csv

+ 11 - 12
metadata/annotations.csv

@@ -1,19 +1,18 @@
 set,recording_filename,time_seek,range_onset,range_offset,raw_filename,format,filter,annotation_filename,imported_at,package_version,error,merged_from
-eaf_2023/ak,77033_5/77033_5.WAV,0,10614464,10682016,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_10614464_10682016.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,14928930,14991430,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_14928930_14991430.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,17240844,17312171,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_17240844_17312171.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,1804637,1918913,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_1804637_1918913.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,18623929,18668843,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_18623929_18668843.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,19080362,19095206,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_19080362_19095206.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,310,72488,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_310_72488.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,3378792,3441447,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_3378792_3441447.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,7034603,7098981,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_7034603_7098981.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,8343248,8406016,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_8343248_8406016.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77033_5/77033_5.WAV,0,899827,962068,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_899827_962068.csv,2023-10-22 15:45:26,0.1.1,,
-eaf_2023/ak,77021_5/V20230127-070014.WAV,0,0,209500,77021_5/V20230127-070014.eaf,eaf,,77021_5/V20230127-070014_0_209500.csv,2023-10-22 16:55:12,0.1.1,,
 alice/output,77033_5/77033_5.WAV,0,0,19100960,77033_5/77033_5.txt,alice,,77033_5/77033_5_0_19100960.csv,2023-10-23 09:15:20,0.1.1,,
 vtc,77033_5/77033_5.WAV,0,0,19100960,77033_5/77033_5.rttm,vtc_rttm,,77033_5/77033_5_0_19100960.csv,2023-10-23 09:16:32,0.1.1,,
 vtc,77021_5/V20230127-070014.WAV,0,0,584705,77021_5/V20230127-070014.rttm,vtc_rttm,,77021_5/V20230127-070014_0_584705.csv,2023-10-23 09:18:21,0.1.1,,
 alice/output,77021_5/V20230127-070014.WAV,0,0,584705,77021_5/V20230127-070014.txt,alice,,77021_5/V20230127-070014_0_584705.csv,2023-10-23 09:18:33,0.1.1,,
 alice,77021_5/V20230127-070014.WAV,0,0,584705,"77021_5/V20230127-070014.rttm,77021_5/V20230127-070014.txt",NA,,77021_5/V20230127-070014_0_584705.csv,2023-10-23 09:20:09,0.1.1,,"vtc,alice/output"
 alice,77033_5/77033_5.WAV,0,0,19100960,"77033_5/77033_5.rttm,77033_5/77033_5.txt",NA,,77033_5/77033_5_0_19100960.csv,2023-10-23 09:20:09,0.1.1,,"vtc,alice/output"
+eaf_2023/ak,77033_5/77033_5.WAV,0,10614464,10682016,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_10614464_10682016.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,14928930,14991430,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_14928930_14991430.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,17240844,17312171,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_17240844_17312171.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,1804637,1918913,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_1804637_1918913.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,18623929,18668843,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_18623929_18668843.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,19080362,19095206,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_19080362_19095206.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,310,72488,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_310_72488.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,3378792,3441447,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_3378792_3441447.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,7034603,7098981,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_7034603_7098981.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,8343248,8406016,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_8343248_8406016.csv,2023-10-24 15:03:34,0.1.1,,
+eaf_2023/ak,77033_5/77033_5.WAV,0,899827,962068,77033_5/77033_5.eaf,eaf,,77033_5/77033_5_899827_962068.csv,2023-10-24 15:03:34,0.1.1,,

+ 0 - 1
scores.txt

@@ -1 +0,0 @@
-.git/annex/objects/v5/5K/MD5E-s456--623782ee688701a2a4a087d4b970fed5.txt/MD5E-s456--623782ee688701a2a4a087d4b970fed5.txt

BIN
scripts/.DS_Store


+ 3 - 0
scripts/.idea/.gitignore

@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml

+ 1 - 0
scripts/.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1 @@
+../../../.git/annex/objects/0X/zk/MD5E-s174--05dbe611f5bdb7a801adb3f064d4bfa9.xml/MD5E-s174--05dbe611f5bdb7a801adb3f064d4bfa9.xml

+ 1 - 0
scripts/.idea/misc.xml

@@ -0,0 +1 @@
+../../.git/annex/objects/WX/fJ/MD5E-s186--b1acd126c19eb652443ec7eb09a2cda3.xml/MD5E-s186--b1acd126c19eb652443ec7eb09a2cda3.xml

+ 1 - 0
scripts/.idea/modules.xml

@@ -0,0 +1 @@
+../../.git/annex/objects/KP/06/MD5E-s266--ee62b3bf219ff21ba297705f558418a5.xml/MD5E-s266--ee62b3bf219ff21ba297705f558418a5.xml

+ 1 - 0
scripts/.idea/scripts.iml

@@ -0,0 +1 @@
+../../.git/annex/objects/mX/7x/MD5E-s284--746712ec47cdba5a5a256e980847b0bb.iml/MD5E-s284--746712ec47cdba5a5a256e980847b0bb.iml

+ 1 - 0
scripts/.idea/vcs.xml

@@ -0,0 +1 @@
+../../.git/annex/objects/15/Jv/MD5E-s183--f990e6d29e03715a04983dd2f51d3595.xml/MD5E-s183--f990e6d29e03715a04983dd2f51d3595.xml

+ 6 - 3
scripts/confusion_matrix2.py

@@ -5,15 +5,18 @@ from ChildProject.metrics import segments_to_grid, conf_matrix
 import numpy as np
 import matplotlib.pyplot as plt
 
-speakers = ['CHI', 'OCH', 'FEM', 'MAL'] #PUT HERE THE LABELS YOU WANT TO INCLUDE
+speakers = ['CHI', 'FEM', 'OCH'] #PUT HERE THE LABELS YOU WANT TO INCLUDE
 project = ChildProject('.')
 am = AnnotationManager(project)
 am.read()
 
 SET_1 = 'eaf_2023/ak' #CHANGE THE FOLDER TO WHERE THE MANUAL ANNOTATIONS ARE
 SET_2 = 'vtc' #CHANGE THE FOLDER TO WHERE VTC GENERATED ANNOTATIONS ARE
-
+recording = ['77021_5/V20230127-070014.WAV']
 intersection = AnnotationManager.intersection(am.annotations, [SET_1, SET_2])
+intersection = intersection[intersection['recording_filename'].isin(recording)]
+
+print(intersection)
 
 segments = am.get_collapsed_segments(intersection)
 segments = segments[segments['speaker_type'].isin(speakers)]
@@ -85,4 +88,4 @@ ax.xaxis.set_label_position("top")
 plt.ylabel(SET_1, fontsize=18)
 plt.xlabel(SET_2, fontsize=18)
 plt.title('Confusion Matrix', fontsize=18)
-plt.savefig('conf_matrix_normalized.png')
+plt.savefig('conf_matrix_normalized.png')

+ 41 - 86
scripts/import_eaf_poland.py

@@ -11,7 +11,6 @@ custom converter to import properly
 import glob
 from pathlib import Path
 
-import numpy as np
 import pandas as pd
 import pympi
 from collections import defaultdict
@@ -49,15 +48,16 @@ XDS_MAPPING = {
         'U':'U',
         }
 
-BP_RECS = ['77033_5/77033_5.WAV', '77021_5/V20230127-070014.WAV']
+BP_EAFS = ['77033_5/77033_5.eaf']
+BP_RECS = ['77033_5/77033_5.WAV']
 def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
 
     eaf = pympi.Elan.Eaf(filename)
     
     segments = {}
     for tier_name in eaf.tiers:
-        print(tier_name)
         annotations = eaf.tiers[tier_name][0]
+    
         if (
             tier_name not in SPEAKER_ID_TO_TYPE
             and len(annotations) > 0
@@ -68,7 +68,18 @@ def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
                 )
             )
             continue
-    
+        ## Do not include tiers that have 5 or fewer annotations
+        if (
+            tier_name in SPEAKER_ID_TO_TYPE
+            and len(annotations) <= 5
+        ):
+            print(
+                "warning: Tier '{}' has too little annotations and it will be ignored in '{}'".format(
+                    tier_name, filename
+                )
+            )
+            continue
+        ##
         for aid in annotations:
             (start_ts, end_ts, value, svg_ref) = annotations[aid]
             (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
@@ -135,100 +146,44 @@ def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
                 
     
     return pd.DataFrame(segments.values())
-BP_REC = ['77033_5.WAV']
-chunk_break = 300000 #here put in miliseconds approximately how long is the shortest break between annotation chunks
+
 if __name__ == '__main__' :
     
     project = ChildProject('.')
     am = AnnotationManager(project)
+    
+
+        #tier_names = [x for x in eaf.tiers.keys() if x in SPEAKER_ID_TO_TYPE and len(eaf.tiers[x][0]) > 0]
 
     files = pd.DataFrame([
         {'raw_filename': f}
-        for f in glob.glob('./annotations/eaf_2023/ak/raw/*/*.eaf') if f.split('/')[-1] in BP_REC
+        for f in glob.glob('annotations/eaf_2023/*/raw/*/*.eaf') if f.split('raw/')[-1] in BP_EAFS
     ])
 
     files['time_seek'] = 0
-    print(files['raw_filename'])
-    files['raw_filename'] = files['raw_filename'].apply(os.path.basename)
-    print(files['raw_filename'])
-    files['recording_filename'] = files['raw_filename'].apply(lambda x: x.split('.')[-2] + '/' + x.split('.')[-2] + '.WAV')
-    # files = files[files['recording_filename'].isin(project.recordings['recording_filename'])]
-    files['set'] = 'eaf_2023/ak'
+    files['recording_filename'] = files['raw_filename'].apply(lambda x: x.split('raw/')[-1].split('.')[0]) + '.WAV'
+    files['set'] = files['raw_filename'].str.extract(r"^annotations/(eaf_2023/[A-Za-z]{2})")
+    files['raw_filename'] = files['raw_filename'].apply(os.path.relpath, start='./annotations/eaf_2023/ak/raw/')
     files['format'] = 'eaf'
-    print(files)
+
     _files = []
 
     for f in files.to_dict(orient='records'):
-        eaf = pympi.Elan.Eaf(Path('./annotations') / 'eaf_2023' / 'ak' / 'raw' / '77033_5/77033_5.eaf')
-        print(f['raw_filename'])
-        
-        df = pd.DataFrame(columns=['range_onset', 'range_offset'])
-        eaf.get_full_time_interval()
-
-        for tier in eaf.get_tier_names():
-            for ann in eaf.get_annotation_data_for_tier(tier):
-                df2 = pd.DataFrame({'range_onset': ann[0], 'range_offset': ann[1]}, index=[0])
-                df = pd.concat([df, df2], ignore_index=True)
-        df = df.sort_values('range_onset').reset_index(drop=True)
-
-        dif_st = np.diff(df['range_onset'].to_numpy())
-        idx_st = [x + 1 for x, val in enumerate(dif_st) if val >= chunk_break]
-        start_times = df['range_onset'].to_numpy()[idx_st]
-        start_times = np.insert(start_times, 0, df['range_onset'].to_numpy()[0])
-
-        df = df.sort_values('range_offset').reset_index(drop=True)
-        dif_end = np.diff(df['range_offset'].to_numpy())
-
-        idx_end = [x for x, val in enumerate(dif_end) if val >= chunk_break]
-        end_times = df['range_offset'].to_numpy()[idx_end]
-        end_times = np.append(end_times, df['range_offset'].to_numpy()[-1])
-
-        final = pd.DataFrame(columns=['range_onset', 'range_offset'])
-        final['range_onset'] = start_times
-        final['range_offset'] = end_times
-        final['time_seek'] = 0
-        final['raw_filename'] = '77033_5/77033_5.eaf'
-        final['recording_filename'] = '77033_5/77033_5.WAV'
-        final['format'] = 'eaf'
-        final['set'] = 'eaf_2023/ak'
-
-
-        #_files.append(pd.DataFrame([f]))
-
-        # for tier_name in ['CHI', 'FEM', 'MAL', 'OCH']:
-        #     portions = eaf.tiers[tier_name][0] #tier names
-        #
-        #     for pid in portions:
-        #         (start_ts, end_ts, value, svg_ref) = portions[pid]
-        #         (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
-        #
-        #         # if value.upper() != 'Y':
-        #         #    continue
-        #         f['tier'] = tier_name
-        #         f['range_onset'] = start_t
-        #         f['range_offset'] = end_t
-        #
-        #         _files.append(pd.DataFrame([f]))
-
-
-    #import_df = pd.concat(_files).reset_index(drop=True)
-    import_df = final.reset_index(drop=True)
-    print(import_df)
-    
-    # import_df = project.recordings[['recording_filename', 'duration']]
-    # import_df = import_df[import_df["recording_filename"].isin(BP_RECS)] #only keep bp recs
-    # import_df.rename(columns={'duration':'range_offset'}, inplace=True)
-    #
-    # #import_df['set'] = 'eaf_2022/an1' # first batch
-    # import_df['set'] = 'eaf_2023/ak' #import bautista's annotations
-    #
-    # import_df['time_seek'] = 0
-    # import_df['range_onset'] = 0
-    # import_df['format'] = 'eaf'
-    #
-    # #import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: RECORDINGS_MAPPING[x])
-    # import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: os.path.basename(x.replace(".WAV",".eaf")))
-
-    #print(import_df)
-    am.import_annotations(import_df, threads=1, import_function=convert, overwrite_existing=True)
+        eaf = pympi.Elan.Eaf(Path('annotations')/ f['set'] / 'raw' / f['raw_filename'])
+        portions = eaf.tiers['chunks'][0]
+
+        for pid in portions:
+            (start_ts, end_ts, value, svg_ref) = portions[pid]
+            (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
+
+            # if value.upper() != 'Y':
+            #    continue
+
+            f['range_onset'] = start_t
+            f['range_offset'] = end_t
+
+            _files.append(pd.DataFrame([f]))
 
+    files = pd.concat(_files)
+    am.remove_set('eaf_2023/ak')
+    am.import_annotations(files, threads=1, import_function = convert)

+ 235 - 0
scripts/import_eaf_poland_safe.py

@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jan  2 14:53:20 2023
+
+@author: lpeurey
+
+Manage the import of the EAF 2022 annotation campaign,
+using a custom converter so that the files are imported properly.
+"""
+import glob
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pympi
+from collections import defaultdict
+import os
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+
+SPEAKER_ID_TO_TYPE = defaultdict(
+        lambda: "NA",
+        {
+            "CHI": "CHI",
+            "FEM": "FEM",
+            "MAL": "MAL",
+            "OCH": "OCH",
+        },
+    )
+
+VCM_MAPPING = {
+        'A':'N',
+        'P':'N',
+        'W':'C',
+        'V':'C',
+        'L':'L',
+        'Y':'Y',
+        'U':'U',
+        }
+XDS_MAPPING = {
+        'T':'T',
+        'C':'C',
+        'B':'A,C',
+        'A':'A',
+        'P':'P',
+        'O':'O',
+        'U':'U',
+        }
+
+BP_RECS = ['77033_5/77033_5.WAV', '77021_5/V20230127-070014.WAV']
+def convert(filename: str, filter=None, **kwargs) -> pd.DataFrame:
+
+    eaf = pympi.Elan.Eaf(filename)
+    
+    segments = {}
+    for tier_name in eaf.tiers:
+        print(tier_name)
+        annotations = eaf.tiers[tier_name][0]
+        if (
+            tier_name not in SPEAKER_ID_TO_TYPE
+            and len(annotations) > 0
+        ):
+            print(
+                "warning: unknown tier '{}' will be ignored in '{}'".format(
+                    tier_name, filename
+                )
+            )
+            continue
+    
+        for aid in annotations:
+            (start_ts, end_ts, value, svg_ref) = annotations[aid]
+            (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
+    
+            segment = {
+                "segment_onset": int(round(start_t)),
+                "segment_offset": int(round(end_t)),
+                "speaker_id": tier_name,
+                "speaker_type": SPEAKER_ID_TO_TYPE[tier_name],
+                "vcm_type": "NA",
+                "vcm_type_precise": "NA",
+                "msc_type": "NA",
+                "xds_type": "NA",
+                "gra_type": "NA",
+                "addressee": "NA",
+            }
+    
+            segments[aid] = segment
+    
+    for tier_name in eaf.tiers:
+        if "@" in tier_name:
+            label, ref = tier_name.split("@")
+        else:
+            label, ref = tier_name, None
+    
+        reference_annotations = eaf.tiers[tier_name][1]
+    
+        if ref not in SPEAKER_ID_TO_TYPE:
+            continue
+    
+        for aid in reference_annotations:
+            (ann, value, prev, svg) = reference_annotations[aid]
+    
+            ann = aid
+            parentTier = eaf.tiers[eaf.annotations[ann]]
+            while (
+                "PARENT_REF" in parentTier[2]
+                and parentTier[2]["PARENT_REF"]
+                and len(parentTier[2]) > 0
+            ):
+                ann = parentTier[1][ann][0]
+                parentTier = eaf.tiers[eaf.annotations[ann]]
+    
+            if ann not in segments:
+                print(
+                    "warning: annotation '{}' not found in segments for '{}'".format(
+                        ann, filename
+                    )
+                )
+                continue
+    
+            segment = segments[ann]
+    
+            if value: #discard segments that have no label (kept NA)
+                if label == "vcm":
+                    segment["vcm_type"] = VCM_MAPPING[value]
+                    segment["vcm_type_precise"] = value
+                elif label == "msc":
+                    segment["msc_type"] = value
+                elif label == "gra":
+                    segment["gra_type"] = value
+                elif label == "xds":
+                    segment["addressee"] = XDS_MAPPING[value]
+                
+    
+    return pd.DataFrame(segments.values())
+BP_REC = ['77033_5.WAV']
+chunk_break = 300000 # approximate duration, in milliseconds, of the shortest break between annotation chunks
+if __name__ == '__main__' :
+    
+    project = ChildProject('.')
+    am = AnnotationManager(project)
+
+    files = pd.DataFrame([
+        {'raw_filename': f}
+        for f in glob.glob('./annotations/eaf_2023/ak/raw/*/*.eaf') if f.split('/')[-1] in BP_REC
+    ])
+
+    files['time_seek'] = 0
+    print(files['raw_filename'])
+    files['raw_filename'] = files['raw_filename'].apply(os.path.basename)
+    print(files['raw_filename'])
+    files['recording_filename'] = files['raw_filename'].apply(lambda x: x.split('.')[-2] + '/' + x.split('.')[-2] + '.WAV')
+    # files = files[files['recording_filename'].isin(project.recordings['recording_filename'])]
+    files['set'] = 'eaf_2023/ak'
+    files['format'] = 'eaf'
+    print(files)
+    _files = []
+
+    for f in files.to_dict(orient='records'):
+        eaf = pympi.Elan.Eaf(Path('./annotations') / 'eaf_2023' / 'ak' / 'raw' / '77033_5/77033_5.eaf')
+        print(f['raw_filename'])
+        
+        df = pd.DataFrame(columns=['range_onset', 'range_offset'])
+        eaf.get_full_time_interval()
+
+        for tier in eaf.get_tier_names():
+            for ann in eaf.get_annotation_data_for_tier(tier):
+                df2 = pd.DataFrame({'range_onset': ann[0], 'range_offset': ann[1]}, index=[0])
+                df = pd.concat([df, df2], ignore_index=True)
+        df = df.sort_values('range_onset').reset_index(drop=True)
+
+        dif_st = np.diff(df['range_onset'].to_numpy())
+        idx_st = [x + 1 for x, val in enumerate(dif_st) if val >= chunk_break]
+        start_times = df['range_onset'].to_numpy()[idx_st]
+        start_times = np.insert(start_times, 0, df['range_onset'].to_numpy()[0])
+
+        df = df.sort_values('range_offset').reset_index(drop=True)
+        dif_end = np.diff(df['range_offset'].to_numpy())
+
+        idx_end = [x for x, val in enumerate(dif_end) if val >= chunk_break]
+        end_times = df['range_offset'].to_numpy()[idx_end]
+        end_times = np.append(end_times, df['range_offset'].to_numpy()[-1])
+
+        final = pd.DataFrame(columns=['range_onset', 'range_offset'])
+        final['range_onset'] = start_times
+        final['range_offset'] = end_times
+        final['time_seek'] = 0
+        final['raw_filename'] = '77033_5/77033_5.eaf'
+        final['recording_filename'] = '77033_5/77033_5.WAV'
+        final['format'] = 'eaf'
+        final['set'] = 'eaf_2023/ak'
+
+
+        #_files.append(pd.DataFrame([f]))
+
+        # for tier_name in ['CHI', 'FEM', 'MAL', 'OCH']:
+        #     portions = eaf.tiers[tier_name][0] #tier names
+        #
+        #     for pid in portions:
+        #         (start_ts, end_ts, value, svg_ref) = portions[pid]
+        #         (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
+        #
+        #         # if value.upper() != 'Y':
+        #         #    continue
+        #         f['tier'] = tier_name
+        #         f['range_onset'] = start_t
+        #         f['range_offset'] = end_t
+        #
+        #         _files.append(pd.DataFrame([f]))
+
+
+    #import_df = pd.concat(_files).reset_index(drop=True)
+    import_df = final.reset_index(drop=True)
+    print(import_df)
+    
+    # import_df = project.recordings[['recording_filename', 'duration']]
+    # import_df = import_df[import_df["recording_filename"].isin(BP_RECS)] #only keep bp recs
+    # import_df.rename(columns={'duration':'range_offset'}, inplace=True)
+    #
+    # #import_df['set'] = 'eaf_2022/an1' # first batch
+    # import_df['set'] = 'eaf_2023/ak' #import bautista's annotations
+    #
+    # import_df['time_seek'] = 0
+    # import_df['range_onset'] = 0
+    # import_df['format'] = 'eaf'
+    #
+    # #import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: RECORDINGS_MAPPING[x])
+    # import_df['raw_filename'] = import_df['recording_filename'].apply(lambda x: os.path.basename(x.replace(".WAV",".eaf")))
+
+    #print(import_df)
+    #am.remove_set('eaf_2023/ak') # if it does not work
+    am.import_annotations(import_df, threads=1, import_function=convert, overwrite_existing=True)
+