Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

csv2grid_bis.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. import pandas as pd
  2. import pympi.Praat as pr
  3. from pydub import AudioSegment
  4. #open csv as pandas dataframe and clean dataframe
  5. df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
  6. df.dropna(subset = ['transcription'], inplace = True)
  7. df = df[df['transcription'].str.match(r"[a-zA-Z]")]
  8. #open recording
  9. recording = AudioSegment.from_wav("inputs/vandam-data/recordings/converted/standard/BN32_010007.wav")
  10. #initiate
  11. slices_length = 100
  12. audio_onset = 0
  13. for a in range(0, len(df), slices_length):
  14. #creates sliced dataframe with 100 rows
  15. b = min(a + slices_length, len(df)-1)
  16. df_sliced = df[a:b]
  17. print(a,b)
  18. #finds the segment offset of the 100th transcription entry and stores it into var
  19. #in milliseconds
  20. audio_offset = df_sliced['segment_offset'].max()
  21. #finds the segment offset of the 100th transcription entry and stores it into var
  22. #in milliseconds
  23. audio_offset = int(df_sliced.tail(1)['segment_offset'])
  24. #extracts recording at desired length and exports it to new file
  25. recording_sliced = recording[audio_onset:audio_offset]
  26. recording_sliced.export("outputs/BN_{0}-{1}.wav".format(a,b), format='wav')
  27. #create textgrid
  28. grid = pr.TextGrid(xmax = (audio_offset-audio_onset)/1000)
  29. #iterate through each row
  30. for speaker, segments in df_sliced.groupby('speaker_id'):
  31. aTier = grid.add_tier(speaker)
  32. for i in segments.index.values:
  33. print(i)
  34. if not segments.loc[i, 'transcription']:
  35. continue
  36. aTier.add_interval(
  37. (segments.loc[i, 'segment_onset'] - audio_onset)/1000,
  38. (segments.loc[i, 'segment_offset'] - audio_onset)/1000,
  39. segments.loc[i, 'transcription'],
  40. False
  41. )
  42. grid.to_file("outputs/BN_{0}-{1}.TextGrid".format(a,b))
  43. #the end cut for this iteration becomes the starting point for next iteration
  44. audio_onset = audio_offset
  45. #increment row numbers
  46. a += 100
  47. b += 100