# LAAC-LSCP / align-vandam
							import pandas as pd
import pympi.Praat as pr
from pydub import AudioSegment

#open csv as pandas dataframe and clean dataframe
df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
df.dropna(subset = ['transcription'], inplace = True)
df = df[df['transcription'].str.match(r"[a-zA-Z]")]

#open recording
recording = AudioSegment.from_wav("inputs/vandam-data/recordings/converted/standard/BN32_010007.wav")

#initiate 
slices_length = 100
audio_onset = 0

# Export one WAV excerpt and one TextGrid per chunk of `slices_length` rows.
for a in range(0, len(df), slices_length):
    # Rows [a, b) form the current chunk. `b` is an exclusive bound, so it is
    # clamped to len(df) (not len(df) - 1); otherwise the final row is lost
    # and a one-row last chunk would come out empty.
    b = min(a + slices_length, len(df))
    df_sliced = df.iloc[a:b]
    print(a, b)

    # The chunk's audio ends at the segment offset of its last row (in ms).
    # NOTE(review): an earlier row that ends later than the last one would
    # extend past this cut; use .max() here if that can occur in the data.
    audio_offset = int(df_sliced['segment_offset'].iloc[-1])

    # Extract the matching stretch of audio and export it to a new file
    recording_sliced = recording[audio_onset:audio_offset]
    recording_sliced.export("outputs/BN_{0}-{1}.wav".format(a, b), format='wav')

    # Build the TextGrid; times are converted from ms to seconds and made
    # relative to the start of the exported excerpt.
    grid = pr.TextGrid(xmax=(audio_offset - audio_onset) / 1000)
    for speaker, segments in df_sliced.groupby('speaker_id'):
        aTier = grid.add_tier(speaker)

        for i in segments.index.values:
            print(i)
            transcription = segments.loc[i, 'transcription']
            if not transcription:
                continue

            aTier.add_interval(
                (segments.loc[i, 'segment_onset'] - audio_onset) / 1000,
                (segments.loc[i, 'segment_offset'] - audio_onset) / 1000,
                transcription,
                False,
            )

    # Write the grid once, after every speaker tier has been filled
    grid.to_file("outputs/BN_{0}-{1}.TextGrid".format(a, b))

    # The end cut of this chunk becomes the starting point of the next one
    audio_onset = audio_offset