"""Convert aligned TextGrid files into one annotation CSV.

For every ``*.TextGrid`` file in ``input_dir`` the script keeps the labelled
intervals of all tiers whose name does not contain ``phones``, shifts their
times by the onset parsed from the file name, and maps each tier name to a
speaker type.  Rows of the original annotation CSV whose transcription starts
with a non-letter character and that fall inside the time span covered by the
TextGrid files are appended, then everything is sorted by ``segment_onset``
and written to ``output_path``.
"""
import re
from pathlib import Path

import pandas as pd
import pympi.Praat as pr
# NOTE(review): `NaN` is not used below, and the `NaN` alias was removed in
# NumPy 2.0 -- consider dropping this import.
from numpy import NaN
from pandas.core.frame import DataFrame

input_dir = "outputs/mfa_align_output"
output_path = "outputs/grid2csv_output"

# Column layout shared by the (unused) header frame and the output frame.
# NOTE(review): "addresseee" (three e's) looks like a typo for "addressee";
# it is left unchanged because it determines the output CSV header -- confirm
# against the input schema before renaming.
OUTPUT_COLUMNS = [
    "speaker_id",
    "segment_offset",
    "mwu_type",
    "transcription",
    "speaker_type",
    "segment_onset",
    "vcm_type",
    "addresseee",
    "raw_filename",
    "ling_type",
    "lex_type",
]

# Interval labels that carry no transcription and are skipped.
SKIPPED_LABELS = ("sil", "sp")

# Splits a file stem on every run of non-digit characters.
_NON_DIGITS = re.compile(r"\D+")

# Creates dataframe with previously dropped entries.
original_df = pd.read_csv(
    "inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv"
)
original_df = original_df.dropna(subset=["transcription"])
# Keep rows whose transcription starts with a non-letter character.
# NOTE(review): after the dropna above the isnull() branch can never match;
# confirm whether rows with a missing transcription were meant to be kept.
original_df = original_df[
    pd.isnull(original_df["transcription"])
    | original_df["transcription"].str.match(r"[^a-zA-Z]")
]

# Maps a TextGrid tier name to the speaker type written to the CSV.
speaker_type_dic = {
    "CHI - words": "CHI",
    "CHI - phones": "CHI",
    "MOT - words": "FEM",
    "MOT - phones": "FEM",
    "FAT - words": "MAL",
    "FAT - phones": "MAL",
    "SIS - words": "OCH",
    "SIS - phones": "OCH",
}

# Selects TextGrid files only.
files = Path(input_dir).glob("*.TextGrid")

# Initialize header (not used further down; kept as a module-level name).
df_header = pd.DataFrame(columns=OUTPUT_COLUMNS)

# Time span covered by all processed files.  The onset starts at +inf so the
# first file can lower it: starting at 0 meant a non-negative onset could
# never be smaller, so the minimum was never actually tracked.
corpus_onset = float("inf")
corpus_offset = 0

# One dict per retained interval.
interval_list = []

# Loop through each TextGrid file of the folder.
for file in files:
    print(file)
    grid = pr.TextGrid(file)

    # Original onset and offset of the sliced recording: fields 1 and 2 of
    # the stem split on non-digit runs.
    # NOTE(review): presumably milliseconds, since interval times are
    # multiplied by 1000 before being added -- confirm the file naming scheme.
    stem_fields = _NON_DIGITS.split(file.stem)
    audio_onset = int(stem_fields[1])
    audio_offset = int(stem_fields[2])

    corpus_onset = min(corpus_onset, audio_onset)
    corpus_offset = max(corpus_offset, audio_offset)

    for tier in grid.get_tiers():
        # Phone tiers are ignored entirely.
        if "phones" in tier.name:
            continue

        for interval in tier.get_all_intervals():
            label = interval[2]
            # Skip empty labels as well as the "sil" / "sp" markers.
            if not label or label in SKIPPED_LABELS:
                continue

            interval_list.append(
                {
                    "speaker_id": tier.name,
                    "segment_onset": interval[0] * 1000 + audio_onset,
                    "segment_offset": interval[1] * 1000 + audio_onset,
                    "transcription": label,
                }
            )

# Makes dataframe from header and data; columns absent from the dicts stay
# empty.
df = DataFrame(interval_list, columns=OUTPUT_COLUMNS)

# Populates speaker_type column from the tier name.
df["speaker_type"] = df["speaker_id"].map(speaker_type_dic)

# Enriches the CSV with previously dropped entries that lie within the time
# span covered by the processed files.
orig_df_subset = original_df[
    (original_df["segment_onset"] >= corpus_onset)
    & (original_df["segment_offset"] <= corpus_offset)
]
df = pd.concat([df, orig_df_subset])

# Sort values by segment_onset before exporting.
df = df.sort_values(by="segment_onset")

# mode="x" is exclusive creation: the export fails rather than overwriting an
# existing file.
df.to_csv(
    f"{output_path}/BN32_010007-aligned.csv",
    mode="x",
    na_rep="NA",
    index=False,
)
print("----------------SAVED!-----------------")