|
@@ -25,6 +25,13 @@ speaker_type_dic = {
|
|
#selects TextGrid files only
|
|
#selects TextGrid files only
|
|
files = Path(input_dir).glob('*.TextGrid')
|
|
files = Path(input_dir).glob('*.TextGrid')
|
|
|
|
|
|
|
|
+#initialize header
|
|
|
|
+df = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
|
|
|
|
+ "segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
|
|
|
|
+
|
|
|
|
+corpus_onset= 0
|
|
|
|
+corpus_offset = 0
|
|
|
|
+
|
|
#Loop through each textgrid file of the folder
|
|
#Loop through each textgrid file of the folder
|
|
for file in files:
|
|
for file in files:
|
|
|
|
|
|
@@ -36,9 +43,8 @@ for file in files:
|
|
audio_onset = int(re.split("\D+", file.stem)[1])
|
|
audio_onset = int(re.split("\D+", file.stem)[1])
|
|
audio_offset = int(re.split("\D+", file.stem)[2])
|
|
audio_offset = int(re.split("\D+", file.stem)[2])
|
|
|
|
|
|
- #initialize header
|
|
|
|
- df = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
|
|
|
|
- "segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
|
|
|
|
|
|
+ if(audio_onset < corpus_onset): corpus_onset = audio_onset
|
|
|
|
+ if(audio_offset > corpus_offset): corpus_offset = audio_offset
|
|
|
|
|
|
#loop through all tiers
|
|
#loop through all tiers
|
|
for tier in grid.get_tiers():
|
|
for tier in grid.get_tiers():
|
|
@@ -47,19 +53,21 @@ for file in files:
|
|
#populates dataframe
|
|
#populates dataframe
|
|
df = df.append(pd.Series({'speaker_id': tier.name, 'segment_onset': (interval[0]*1000 + audio_onset),
|
|
df = df.append(pd.Series({'speaker_id': tier.name, 'segment_onset': (interval[0]*1000 + audio_onset),
|
|
'segment_offset': (interval[1]*1000 + audio_onset), 'transcription': interval[2]}), ignore_index= True)
|
|
'segment_offset': (interval[1]*1000 + audio_onset), 'transcription': interval[2]}), ignore_index= True)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+#populates speaker_type column
|
|
|
|
+df['speaker_type'] = df['speaker_id'].map(speaker_type_dic)
|
|
|
|
|
|
- #populates speaker_type column
|
|
|
|
- df['speaker_type'] = df['speaker_id'].map(speaker_type_dic)
|
|
|
|
|
|
+#enriches the csv with previously dropped entries corresponding to the right timestamps
|
|
|
|
+orig_df_subset = original_df[(original_df['segment_onset'] >= corpus_onset) &
|
|
|
|
+ (original_df['segment_offset'] <= corpus_offset)]
|
|
|
|
|
|
- #enriches the csv with previously dropped entries corresponsing to the right timestamps
|
|
|
|
- orig_df_subset = original_df[(original_df['segment_onset'] >= audio_onset) &
|
|
|
|
- (original_df['segment_offset'] <= audio_offset)]
|
|
|
|
|
|
|
|
- df = df.append(orig_df_subset)
|
|
|
|
|
|
+df = df.append(orig_df_subset)
|
|
|
|
|
|
- #sort values by segment_onset before exporting
|
|
|
|
- df.sort_values(by='segment_onset', inplace= True)
|
|
|
|
|
|
+#sort values by segment_onset before exporting
|
|
|
|
+df.sort_values(by='segment_onset', inplace= True)
|
|
|
|
|
|
- #exports to csv
|
|
|
|
- df.to_csv("{0}/{1}.csv".format(output_path, file.stem), mode = "x", na_rep= "NA", index= False)
|
|
|
|
- print("----------------SAVED!-----------------")
|
|
|
|
|
|
+#exports to csv
|
|
|
|
+df.to_csv("{0}/BN.csv".format(output_path), mode = "x", na_rep= "NA", index= False)
|
|
|
|
+print("----------------SAVED!-----------------")
|