|
@@ -6,6 +6,11 @@ import re
|
|
|
input_dir = "outputs/mfa_align_output"
|
|
|
output_path = "outputs/grid2csv_output"
|
|
|
|
|
|
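+#loads the original annotations and keeps only the previously dropped entries: rows whose transcription is present and starts with a non-letter character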
|
|
|
+original_df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
|
|
|
+original_df = original_df.dropna(subset = ['transcription'])
|
|
|
+original_df = original_df[original_df['transcription'].str.match(r"[^a-zA-Z]")]
|
|
|
+
|
|
|
speaker_type_dic = {
|
|
|
"CHI - words" : "CHI",
|
|
|
"CHI - phones" : "CHI",
|
|
@@ -27,8 +32,9 @@ for file in files:
|
|
|
print(file)
|
|
|
grid = pr.TextGrid(file)
|
|
|
|
|
|
- #gets original onset of the sliced recording
|
|
|
+ #gets original onset and offset of the sliced recording
|
|
|
audio_onset = int(re.split("\D+", file.stem)[1])
|
|
|
+    audio_offset = int(re.split(r"\D+", file.stem)[2])
|
|
|
|
|
|
#initialize header
|
|
|
df = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
|
|
@@ -44,6 +50,16 @@ for file in files:
|
|
|
|
|
|
#populates speaker_type column
|
|
|
df['speaker_type'] = df['speaker_id'].map(speaker_type_dic)
|
|
|
+
|
|
|
+    #enriches the csv with the previously dropped entries that fall within this recording's timestamps
|
|
|
+ orig_df_subset = original_df[(original_df['segment_onset'] >= audio_onset) &
|
|
|
+ (original_df['segment_offset'] <= audio_offset)]
|
|
|
+
|
|
|
+    df = pd.concat([df, orig_df_subset])  #DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
|
|
|
+
|
|
|
+ #sort values by segment_onset before exporting
|
|
|
+ df.sort_values(by='segment_onset', inplace= True)
|
|
|
+
|
|
|
#exports to csv
|
|
|
df.to_csv("{0}/{1}.csv".format(output_path, file.stem), mode = "x", na_rep= "NA", index= False)
|
|
|
print("----------------SAVED!-----------------")
|