2 years ago · 3b285fe745
--- a/code/grid2csv.py
+++ b/code/grid2csv.py
@@ -6,6 +6,11 @@ import re
 
				 input_dir = "outputs/mfa_align_output"
			
 
				 output_path = "outputs/grid2csv_output"
			
 
				 
			
 
				+#creates a dataframe of the previously dropped entries (non-empty transcriptions that do not start with a letter)
			
 
				+original_df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
			
 
				+original_df = original_df.dropna(subset = ['transcription'])
			
 
				+original_df = original_df[original_df['transcription'].str.match(r"[^a-zA-Z]")]
			
 
				+
			
 
				 speaker_type_dic = {
			
 
				         "CHI - words" : "CHI",
			
 
				         "CHI - phones" : "CHI",
			
@@ -27,8 +32,9 @@ for file in files:
 
				     print(file)
			
 
				     grid = pr.TextGrid(file)
			
 
				 
			
 
				-    #gets original onset of the sliced recording
			
 
				+    #gets original onset and offset of the sliced recording
			
 
				     audio_onset = int(re.split("\D+", file.stem)[1])
			
 
				+    audio_offset = int(re.split("\D+", file.stem)[2])
			
 
				 
			
 
				     #initialize header
			
 
				     df = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
			
@@ -44,6 +50,16 @@ for file in files:
 
				     
			
 
				     #populates speaker_type column
			
 
				     df['speaker_type'] = df['speaker_id'].map(speaker_type_dic)
			
 
				+
			
 
				+    #enriches the csv with the previously dropped entries that fall within this slice's onset/offset timestamps
			
 
				+    orig_df_subset = original_df[(original_df['segment_onset'] >= audio_onset) &
			
 
				+                         (original_df['segment_offset'] <= audio_offset)]
			
 
				+
			
 
				+    df = df.append(orig_df_subset)
			
 
				+
			
 
				+    #sort values by segment_onset before exporting
			
 
				+    df.sort_values(by='segment_onset', inplace= True)
			
 
				+
			
 
				     #exports to csv
			
 
				     df.to_csv("{0}/{1}.csv".format(output_path, file.stem), mode = "x", na_rep= "NA", index= False)
			
 
				     print("----------------SAVED!-----------------")
			
--- a/outputs/grid2csv_output/BN-0-727249.csv
+++ b/outputs/grid2csv_output/BN-0-727249.csv
@@ -1 +1 @@
 
				-../../.git/annex/objects/49/28/MD5E-s101199--f0ef1e67969cef1cedc4d534f65b7860.csv/MD5E-s101199--f0ef1e67969cef1cedc4d534f65b7860.csv
			
 
				+../../.git/annex/objects/k1/85/MD5E-s104298--03972dfb2675594715bd6929eaf313e3.csv/MD5E-s104298--03972dfb2675594715bd6929eaf313e3.csv
			
--- a/outputs/grid2csv_output/BN-1297003-1913555.csv
+++ b/outputs/grid2csv_output/BN-1297003-1913555.csv
@@ -1 +1 @@
 
				-../../.git/annex/objects/2J/Vm/MD5E-s118161--e455d4a37c0927f792dcd1f9c6627603.csv/MD5E-s118161--e455d4a37c0927f792dcd1f9c6627603.csv
			
 
				+../../.git/annex/objects/G6/FV/MD5E-s122253--5819007f4d16526ec30ed07891ede1cf.csv/MD5E-s122253--5819007f4d16526ec30ed07891ede1cf.csv
			
--- a/outputs/grid2csv_output/BN-727249-1297003.csv
+++ b/outputs/grid2csv_output/BN-727249-1297003.csv
@@ -1 +1 @@
 
				-../../.git/annex/objects/47/gP/MD5E-s103933--c04d33b0486c0d94a012bdf44edfac4e.csv/MD5E-s103933--c04d33b0486c0d94a012bdf44edfac4e.csv
			
 
				+../../.git/annex/objects/0g/2V/MD5E-s105529--f99080b1a555ede680f060c467ba9dfa.csv/MD5E-s105529--f99080b1a555ede680f060c467ba9dfa.csv