# grid2csv.py — converts MFA-aligned TextGrid files into a single CSV.
  1. from numpy import NaN
  2. from pandas.core.frame import DataFrame
  3. import pympi.Praat as pr
  4. import pandas as pd
  5. from pathlib import Path
  6. import re
  7. input_dir = "outputs/mfa_align_output"
  8. output_path = "outputs/grid2csv_output"
  9. #creates dataframe with previously dropped entries
  10. original_df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
  11. original_df = original_df.dropna(subset = ['transcription'])
  12. #original_df = original_df[original_df['transcription'].str.match(r"[^a-zA-Z]")]
  13. original_df = (original_df[(pd.isnull(original_df['transcription'])) | (original_df['transcription'].str.match(r"[^a-zA-Z]"))])
  14. speaker_type_dic = {
  15. "CHI - words" : "CHI",
  16. "CHI - phones" : "CHI",
  17. "MOT - words" : "FEM",
  18. "MOT - phones" : "FEM",
  19. "FAT - words" : "MAL",
  20. "FAT - phones" : "MAL",
  21. "SIS - words" : "OCH",
  22. "SIS - phones" : "OCH"
  23. }
  24. #selects TextGrid files only
  25. files = Path(input_dir).glob('*.TextGrid')
  26. #initialize header
  27. df_header = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
  28. "segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
  29. corpus_onset= 0
  30. corpus_offset = 0
  31. #initialize empty list
  32. interval_list = []
  33. #Loop through each textgrid file of the folder
  34. for file in files:
  35. #open textgrid file
  36. print(file)
  37. grid = pr.TextGrid(file)
  38. #gets original onset and offset of the sliced recording
  39. audio_onset = int(re.split("\D+", file.stem)[1])
  40. audio_offset = int(re.split("\D+", file.stem)[2])
  41. if(audio_onset < corpus_onset): corpus_onset = audio_onset
  42. if(audio_offset > corpus_offset): corpus_offset = audio_offset
  43. #loop through all tiers
  44. for tier in grid.get_tiers():
  45. #remove all phones
  46. if 'phones' in tier.name: continue
  47. for interval in tier.get_all_intervals():
  48. #conditions to skip this iteration: empty, "sp", "sil" or if interval is a phone
  49. if not interval[2]: continue
  50. if interval[2] == "sil" : continue
  51. if interval[2] == "sp" : continue
  52. #populates dataframe
  53. temp_dict = {'speaker_id': tier.name, 'segment_onset': (interval[0]*1000 + audio_onset),
  54. 'segment_offset': (interval[1]*1000 + audio_onset), 'transcription': interval[2]}
  55. #populates list
  56. interval_list.append(temp_dict)
  57. #makes dataframe from header and data
  58. df = DataFrame(interval_list, columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
  59. "segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
  60. #populates speaker_type column
  61. df['speaker_type'] = df['speaker_id'].map(speaker_type_dic)
  62. #enriches the csv with previously dropped entries corresponsing to the right timestamps
  63. orig_df_subset = original_df[(original_df['segment_onset'] >= corpus_onset) &
  64. (original_df['segment_offset'] <= corpus_offset)]
  65. df = pd.concat([df, orig_df_subset])
  66. #sort values by segment_onset before exporting
  67. df.sort_values(by='segment_onset', inplace= True)
  68. #exports to csv
  69. df.to_csv("{0}/BN32_010007-aligned.csv".format(output_path), mode = "x", na_rep= "NA", index= False)
  70. print("----------------SAVED!-----------------")