|
@@ -1,6 +1,7 @@
|
|
|
import pympi.Praat as pr
|
|
|
import pandas as pd
|
|
|
from pathlib import Path
|
|
|
+import re
|
|
|
|
|
|
input_dir = "outputs/mfa_align_output"
|
|
|
output_path = "outputs/grid2csv_output"
|
|
@@ -15,6 +16,9 @@ for file in files:
|
|
|
print(file)
|
|
|
grid = pr.TextGrid(file)
|
|
|
|
|
|
+ # onset of this slice within the original recording: field [1] of the file stem split on non-digit runs (presumably ms, since it is added to seconds*1000 below - confirm)
|
|
|
+ audio_onset = int(re.split(r"\D+", file.stem)[1])
|
|
|
+
|
|
|
#initialize header
|
|
|
df = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
|
|
|
"segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
|
|
@@ -22,8 +26,11 @@ for file in files:
|
|
|
#loop through all tiers
|
|
|
for tier in grid.get_tiers():
|
|
|
for interval in tier.get_all_intervals():
|
|
|
- df = df.append(pd.Series({'speaker_id': tier.name, 'segment_onset': interval[0]*1000,
|
|
|
- 'segment_offset': interval[1]*1000, 'transcription': interval[2]}), ignore_index= True)
|
|
|
+
|
|
|
+ #appends one row per interval, with onset/offset converted via *1000 and shifted by audio_onset
|
|
|
+ df = df.append(pd.Series({'speaker_id': tier.name, 'segment_onset': (interval[0]*1000 + audio_onset),
|
|
|
+ 'segment_offset': (interval[1]*1000 + audio_onset), 'transcription': interval[2]}), ignore_index= True)
|
|
|
+
|
|
|
|
|
|
#exports to csv
|
|
|
df.to_csv("{0}/{1}.csv".format(output_path, file.stem), mode = "x", na_rep= "NA", index= False)
|