1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495 |
import re
from pathlib import Path

import pandas as pd
import pympi.Praat as pr
from numpy import NaN
from pandas.core.frame import DataFrame
input_dir = "outputs/mfa_align_output"
output_path = "outputs/grid2csv_output"

# Annotation CSV that supplies the entries missing from the TextGrid files.
ORIGINAL_CSV = "inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv"

# Column layout of the exported CSV.
# NOTE(review): "addresseee" (three e's) is kept exactly as in the original
# script; if the source CSV names this column "addressee", the export will
# contain both columns -- confirm which spelling is intended.
OUTPUT_COLUMNS = ["speaker_id", "segment_offset", "mwu_type", "transcription", "speaker_type",
                  "segment_onset", "vcm_type", "addresseee", "raw_filename", "ling_type", "lex_type"]

# Interval labels that carry no word and are skipped.
SKIPPED_LABELS = ("sil", "sp")

# Maps a TextGrid tier name to the speaker_type written to the CSV.
speaker_type_dic = {
    "CHI - words": "CHI",
    "CHI - phones": "CHI",
    "MOT - words": "FEM",
    "MOT - phones": "FEM",
    "FAT - words": "MAL",
    "FAT - phones": "MAL",
    "SIS - words": "OCH",
    "SIS - phones": "OCH",
}


def load_dropped_entries(csv_path):
    """Return the rows of *csv_path* whose transcription starts with a non-letter.

    Rows with a missing transcription are discarded first, so the regex test
    alone decides which rows are kept (``str.match`` anchors at the start of
    the string).
    """
    original_df = pd.read_csv(csv_path)
    original_df = original_df.dropna(subset=['transcription'])
    return original_df[original_df['transcription'].str.match(r"[^a-zA-Z]")]


def parse_audio_bounds(stem):
    """Return ``(onset, offset)`` parsed from a TextGrid file stem.

    The stem is split on runs of non-digit characters and the fields at
    index 1 and 2 are converted to ``int``. Raises ``IndexError`` or
    ``ValueError`` when the stem does not provide those two numeric fields.
    """
    # Raw string: "\D" in a plain literal is an invalid escape sequence.
    fields = re.split(r"\D+", stem)
    return int(fields[1]), int(fields[2])


def interval_rows(tier_name, intervals, audio_onset):
    """Build one CSV row (as a dict) per usable interval of a tier.

    *intervals* is an iterable of ``(begin, end, text)`` tuples. Intervals
    whose text is empty or is one of ``SKIPPED_LABELS`` are ignored. Begin
    and end are multiplied by 1000 and shifted by *audio_onset*.
    """
    return [
        {
            'speaker_id': tier_name,
            'segment_onset': begin * 1000 + audio_onset,
            'segment_offset': end * 1000 + audio_onset,
            'transcription': text,
        }
        for begin, end, text in intervals
        if text and text not in SKIPPED_LABELS
    ]


def corpus_bounds(audio_bounds):
    """Return ``(earliest onset, latest offset)`` over ``(onset, offset)`` pairs.

    Returns ``(0, 0)`` when *audio_bounds* is empty. The onset is a true
    minimum: seeding it with 0 (as the script used to) can never be lowered
    by a non-negative onset, so it always stayed at 0.
    """
    audio_bounds = list(audio_bounds)
    if not audio_bounds:
        return 0, 0
    earliest = min(onset for onset, _ in audio_bounds)
    latest = max(offset for _, offset in audio_bounds)
    return earliest, latest


def main():
    """Convert every TextGrid in ``input_dir`` into one CSV in ``output_path``.

    Word intervals of all non-phone tiers are collected, enriched with the
    rows from ``ORIGINAL_CSV`` that fall inside the time span covered by the
    TextGrid files, sorted by ``segment_onset`` and written to disk. The
    export uses mode ``"x"``, so it raises ``FileExistsError`` rather than
    overwrite an existing file.
    """
    original_df = load_dropped_entries(ORIGINAL_CSV)

    interval_list = []
    audio_bounds = []
    # Only TextGrid files are processed.
    for file in Path(input_dir).glob('*.TextGrid'):
        print(file)
        grid = pr.TextGrid(file)
        # Onset and offset of this clip, taken from its file name.
        audio_onset, audio_offset = parse_audio_bounds(file.stem)
        audio_bounds.append((audio_onset, audio_offset))

        for tier in grid.get_tiers():
            # Phone tiers are not exported.
            if 'phones' in tier.name:
                continue
            interval_list.extend(
                interval_rows(tier.name, tier.get_all_intervals(), audio_onset)
            )

    corpus_onset, corpus_offset = corpus_bounds(audio_bounds)

    df = pd.DataFrame(interval_list, columns=OUTPUT_COLUMNS)
    df['speaker_type'] = df['speaker_id'].map(speaker_type_dic)

    # Add the original entries that lie within the covered time span.
    orig_df_subset = original_df[(original_df['segment_onset'] >= corpus_onset) &
                                 (original_df['segment_offset'] <= corpus_offset)]
    df = pd.concat([df, orig_df_subset])

    df.sort_values(by='segment_onset', inplace=True)
    df.to_csv("{0}/BN32_010007-aligned.csv".format(output_path), mode="x", na_rep="NA", index=False)
    print("----------------SAVED!-----------------")


if __name__ == "__main__":
    main()
|