Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

grid2csv.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. from numpy import NaN
  2. from pandas.core.frame import DataFrame
  3. import pympi.Praat as pr
  4. import pandas as pd
  5. from pathlib import Path
  6. import re
  7. input_dir = "outputs/mfa_align_output"
  8. output_path = "outputs/grid2csv_output"
  9. #creates dataframe with previously dropped entries
  10. original_df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
  11. original_df = original_df.dropna(subset = ['transcription'])
  12. #original_df = original_df[original_df['transcription'].str.match(r"[^a-zA-Z]")]
  13. original_df = (original_df[(pd.isnull(original_df['transcription'])) | (original_df['transcription'].str.match(r"[^a-zA-Z]"))])
  14. speaker_type_dic = {
  15. "CHI - words" : "CHI",
  16. "CHI - phones" : "CHI",
  17. "MOT - words" : "FEM",
  18. "MOT - phones" : "FEM",
  19. "FAT - words" : "MAL",
  20. "FAT - phones" : "MAL",
  21. "SIS - words" : "OCH",
  22. "SIS - phones" : "OCH"
  23. }
  24. #selects TextGrid files only
  25. files = Path(input_dir).glob('*.TextGrid')
  26. #initialize header
  27. df_header = pd.DataFrame(columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
  28. "segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
  29. corpus_onset= 0
  30. corpus_offset = 0
  31. #initialize empty list
  32. interval_list = []
  33. #Loop through each textgrid file of the folder
  34. for file in files:
  35. #open textgrid file
  36. print(file)
  37. grid = pr.TextGrid(file)
  38. #gets original onset and offset of the sliced recording
  39. audio_onset = int(re.split("\D+", file.stem)[1])
  40. audio_offset = int(re.split("\D+", file.stem)[2])
  41. if(audio_onset < corpus_onset): corpus_onset = audio_onset
  42. if(audio_offset > corpus_offset): corpus_offset = audio_offset
  43. #loop through all tiers
  44. for tier in grid.get_tiers():
  45. for interval in tier.get_all_intervals():
  46. #populates dataframe
  47. temp_dict = {'speaker_id': tier.name, 'segment_onset': (interval[0]*1000 + audio_onset),
  48. 'segment_offset': (interval[1]*1000 + audio_onset), 'transcription': interval[2]}
  49. #populates list
  50. interval_list.append(temp_dict)
  51. #makes dataframe from header and data
  52. df = DataFrame(interval_list, columns= ["speaker_id","segment_offset","mwu_type","transcription","speaker_type",
  53. "segment_onset","vcm_type","addresseee","raw_filename","ling_type","lex_type"])
  54. #populates speaker_type column
  55. df['speaker_type'] = df['speaker_id'].map(speaker_type_dic)
  56. #enriches the csv with previously dropped entries corresponsing to the right timestamps
  57. orig_df_subset = original_df[(original_df['segment_onset'] >= corpus_onset) &
  58. (original_df['segment_offset'] <= corpus_offset)]
  59. df = pd.concat([df, orig_df_subset])
  60. #sort values by segment_onset before exporting
  61. df.sort_values(by='segment_onset', inplace= True)
  62. #exports to csv
  63. df.to_csv("{0}/BN.csv".format(output_path), mode = "x", na_rep= "NA", index= False)
  64. print("----------------SAVED!-----------------")