|
@@ -1,3 +1,5 @@
|
|
|
+from numpy import nan as NaN
|
|
|
+from pandas.core.frame import DataFrame
|
|
|
import pympi.Praat as pr
|
|
|
import pandas as pd
|
|
|
from pathlib import Path
|
|
@@ -9,7 +11,8 @@ output_path = "outputs/grid2csv_output"
|
|
|
#creates dataframe with previously dropped entries
|
|
|
original_df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
|
|
|
original_df = original_df.dropna(subset = ['transcription'])
|
|
|
-original_df = original_df[original_df['transcription'].str.match(r"[^a-zA-Z]")]
|
|
|
+#original_df = original_df[original_df['transcription'].str.match(r"[^a-zA-Z]")]
|
|
|
+original_df = (original_df[(pd.isnull(original_df['transcription'])) | (original_df['transcription'].str.match(r"[^a-zA-Z]"))])
|
|
|
|
|
|
speaker_type_dic = {
|
|
|
"CHI - words" : "CHI",
|
|
@@ -26,12 +29,15 @@ speaker_type_dic = {
|
|
|
files = Path(input_dir).glob('*.TextGrid')
|
|
|
|
|
|
#initialize header
|
|
|
-df = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
|
|
|
+df_header = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
|
|
|
"segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
|
|
|
|
|
|
corpus_onset= 0
|
|
|
corpus_offset = 0
|
|
|
|
|
|
+#initialize empty list
|
|
|
+interval_list = []
|
|
|
+
|
|
|
#Loop through each textgrid file of the folder
|
|
|
for file in files:
|
|
|
|
|
@@ -51,19 +57,25 @@ for file in files:
|
|
|
for interval in tier.get_all_intervals():
|
|
|
|
|
|
#populates dataframe
|
|
|
- df = df.append(pd.Series({'speaker_id': tier.name, 'segment_onset': (interval[0]*1000 + audio_onset),
|
|
|
- 'segment_offset': (interval[1]*1000 + audio_onset), 'transcription': interval[2]}), ignore_index= True)
|
|
|
-
|
|
|
-
|
|
|
+ temp_dict = {'speaker_id': tier.name, 'segment_onset': (interval[0]*1000 + audio_onset),
|
|
|
+ 'segment_offset': (interval[1]*1000 + audio_onset), 'transcription': interval[2]}
|
|
|
+
|
|
|
+ #populates list
|
|
|
+ interval_list.append(temp_dict)
|
|
|
+
|
|
|
+#makes dataframe from header and data
|
|
|
+df = DataFrame(interval_list, columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
|
|
|
+ "segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
|
|
|
+
|
|
|
#populates speaker_type column
|
|
|
df['speaker_type'] = df['speaker_id'].map(speaker_type_dic)
|
|
|
-
|
|
|
+
|
|
|
#enriches the csv with previously dropped entries corresponsing to the right timestamps
|
|
|
orig_df_subset = original_df[(original_df['segment_onset'] >= corpus_onset) &
|
|
|
(original_df['segment_offset'] <= corpus_offset)]
|
|
|
|
|
|
|
|
|
-df = df.append(orig_df_subset)
|
|
|
+df = pd.concat([df, orig_df_subset])
|
|
|
|
|
|
#sort values by segment_onset before exporting
|
|
|
df.sort_values(by='segment_onset', inplace= True)
|