# grid2csv.py - converts a folder of TextGrid files into a single CSV of annotations.

import re
from pathlib import Path

import pandas as pd
import pympi.Praat as pr
from numpy import nan as NaN  # the `NaN` alias was removed in NumPy 2.0; `nan` is the supported name
from pandas.core.frame import DataFrame
  7. input_dir = "outputs/mfa_align_output"
  8. output_path = "outputs/grid2csv_output"
  9. #creates dataframe with previously dropped entries
  10. original_df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
  11. original_df = original_df.dropna(subset = ['transcription'])
  12. #original_df = original_df[original_df['transcription'].str.match(r"[^a-zA-Z]")]
  13. original_df = (original_df[(pd.isnull(original_df['transcription'])) | (original_df['transcription'].str.match(r"[^a-zA-Z]"))])
  14. speaker_type_dic = {
  15. "CHI - words" : "CHI",
  16. "CHI - phones" : "CHI",
  17. "MOT - words" : "FEM",
  18. "MOT - phones" : "FEM",
  19. "FAT - words" : "MAL",
  20. "FAT - phones" : "MAL",
  21. "SIS - words" : "OCH",
  22. "SIS - phones" : "OCH"
  23. }
  24. #selects TextGrid files only
  25. files = Path(input_dir).glob('*.TextGrid')
  26. #initialize header
  27. df_header = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
  28. "segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
  29. corpus_onset= 0
  30. corpus_offset = 0
  31. #initialize empty list
  32. interval_list = []
  33. #Loop through each textgrid file of the folder
  34. for file in files:
  35. #open textgrid file
  36. print(file)
  37. grid = pr.TextGrid(file)
  38. #gets original onset and offset of the sliced recording
  39. audio_onset = int(re.split("\D+", file.stem)[1])
  40. audio_offset = int(re.split("\D+", file.stem)[2])
  41. if(audio_onset < corpus_onset): corpus_onset = audio_onset
  42. if(audio_offset > corpus_offset): corpus_offset = audio_offset
  43. #loop through all tiers
  44. for tier in grid.get_tiers():
  45. for interval in tier.get_all_intervals():
  46. if not interval[2]: continue
  47. if interval[2] == "sil" :
  48. continue
  49. #populates dataframe
  50. temp_dict = {'speaker_id': tier.name, 'segment_onset': (interval[0]*1000 + audio_onset),
  51. 'segment_offset': (interval[1]*1000 + audio_onset), 'transcription': interval[2]}
  52. #populates list
  53. interval_list.append(temp_dict)
  54. #makes dataframe from header and data
  55. df = DataFrame(interval_list, columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
  56. "segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
  57. #populates speaker_type column
  58. df['speaker_type'] = df['speaker_id'].map(speaker_type_dic)
  59. #enriches the csv with previously dropped entries corresponsing to the right timestamps
  60. orig_df_subset = original_df[(original_df['segment_onset'] >= corpus_onset) &
  61. (original_df['segment_offset'] <= corpus_offset)]
  62. df = pd.concat([df, orig_df_subset])
  63. #sort values by segment_onset before exporting
  64. df.sort_values(by='segment_onset', inplace= True)
  65. #exports to csv
  66. df.to_csv("{0}/BN32_010007-aligned.csv.csv".format(output_path), mode = "x", na_rep= "NA", index= False)
  67. print("----------------SAVED!-----------------")