Browse Source

modified grid2csv to remove phones and silences

Martin Frébourg 2 years ago
parent
commit
0a4b9730d8

+ 10 - 4
code/grid2csv.py

@@ -52,14 +52,20 @@ for file in files:
     if(audio_onset < corpus_onset): corpus_onset = audio_onset
     if(audio_offset > corpus_offset): corpus_offset = audio_offset
 
+    
     #loop through all tiers
     for tier in grid.get_tiers():
+
+        #skip every tier whose name contains 'phones'
+        if 'phones' in tier.name: continue
+
         for interval in tier.get_all_intervals():
             
+            #skip this interval if its label is empty, "sil" or "sp" (phone tiers are already skipped at tier level)
             if not interval[2]: continue
-            if interval[2] == "sil" : 
-                continue
-
+            if interval[2] == "sil" : continue
+            if interval[2] == "sp" : continue
+           
             #populates dataframe
             temp_dict = {'speaker_id': tier.name, 'segment_onset': (interval[0]*1000 + audio_onset),
                 'segment_offset': (interval[1]*1000 + audio_onset), 'transcription': interval[2]}
@@ -85,5 +91,5 @@ df = pd.concat([df, orig_df_subset])
 df.sort_values(by='segment_onset', inplace= True)
 
 #exports to csv
-df.to_csv("{0}/BN32_010007-aligned.csv.csv".format(output_path), mode = "x", na_rep= "NA", index= False)
+df.to_csv("{0}/BN32_010007-aligned.csv".format(output_path), mode = "x", na_rep= "NA", index= False)
 print("----------------SAVED!-----------------")

+ 1 - 0
outputs/grid2csv_output/BN32_010007-aligned.csv

@@ -0,0 +1 @@
+../../.git/annex/objects/z1/VM/MD5E-s1887017--148c4e202b06f8f1711d195bd0e2953d.csv/MD5E-s1887017--148c4e202b06f8f1711d195bd0e2953d.csv

+ 0 - 1
outputs/grid2csv_output/BN32_010007-aligned.csv.csv

@@ -1 +0,0 @@
-../../.git/annex/objects/xZ/ZQ/MD5E-s7705400--ed44f3c5b73a0f1ba209b780aba62b3d.csv.csv/MD5E-s7705400--ed44f3c5b73a0f1ba209b780aba62b3d.csv.csv