```python
import pandas as pd
import pympi.Praat as pr

# open csv as pandas dataframe
df = pd.read_csv(r"C:\Users\Martin\Desktop\LAAC\alignment-project\inputs\testCHA.csv")
# to make your code reproducible, use relative paths:
df = pd.read_csv(r"inputs/vandam-data/annotations/cha/converted/etc.csv")

# create textgrid
grid = pr.TextGrid(xmax=300)

# specify number of rows
rows = 9000

# iterate through each row
for i in range(rows):
    # load data from dataframe
    speaker = df.iloc[i, 0]  # you could use .loc instead, so you can refer to the column names, which makes your code more transparent
    transcription = df.iloc[i, 3]
    # loads onset and offset and converts from milliseconds to seconds
    onset = df.iloc[i, 5] / 1000
    offset = df.iloc[i, 1] / 1000
    # there may be a way to check if a tier is present without throwing exceptions
    # checks if tier already exists. If not, create a new tier
    try:
        aTier = grid.get_tier(speaker)
    except IndexError:
        aTier = grid.add_tier(speaker)
    # creates interval and adds it to the tier
    pr.Tier.add_interval(aTier, onset, offset, transcription, False)
    # the following should work too:
    aTier.add_interval(onset, offset, transcription, False)

print(list(pr.TextGrid.get_tier_name_num(grid)))
# use relative paths here too
grid.to_file(r"C:\Users\Martin\Desktop\LAAC\alignment-project\outputs\converterTest.txt")
```
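To make the `.loc` suggestion concrete, here is a tiny sketch with made-up column names (the real csv's header may differ):

```python
import pandas as pd

# toy dataframe with hypothetical column names, just to contrast the two styles
df = pd.DataFrame({
    'speaker': ['CHI', 'MOT'],
    'transcription': ['hi', 'hello'],
})

# positional: works, but the reader has to know what column 0 holds
speaker_pos = df.iloc[0, 0]
# label-based: the intent is readable directly from the code
speaker_lab = df.loc[0, 'speaker']
print(speaker_pos, speaker_lab)  # CHI CHI
```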
Thanks for the review!
- I'll look into `.loc`; you're right that using the column names is better.
- I went with the try-except block because it seemed the easiest option: there is no `is_tier` function, for example. There is a function that returns all tiers, but it yields pairs of the tier's number and the tier itself. Since the number is auto-generated we can't look it up in that list, and since there is no `Tier.get_name()` function we end up with tier objects whose names we can't look up.
- I had put down `aTier.add_interval(...)` at first, but VS Code did not recognize it as a function belonging to Praat, which is why I did it the other way around. There is actually no issue when running the code, though, so I'll switch it back.
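On the tier-lookup point, one way to sidestep the try-except entirely is to keep our own dict from speaker name to tier, so the grid is only asked to create a tier the first time a speaker appears. A runnable sketch with a stand-in class (in the real script, `grid` would be the pympi `TextGrid` and the dict values would be its `Tier` objects):

```python
class FakeGrid:
    """Stand-in for pympi's TextGrid, just to make the pattern runnable here."""
    def add_tier(self, name):
        return 'tier:' + name  # a real grid would return a Tier object

grid = FakeGrid()
rows = [('CHI', 'hi'), ('MOT', 'hello'), ('CHI', 'bye')]

tiers = {}  # our own bookkeeping: speaker name -> tier
for speaker, transcription in rows:
    if speaker not in tiers:
        tiers[speaker] = grid.add_tier(speaker)
    aTier = tiers[speaker]
    # ...aTier.add_interval(onset, offset, transcription, False) as before...

print(sorted(tiers))  # ['CHI', 'MOT']: each tier was created exactly once
```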
I've written a few comments (ctrl+F "[REVIEW]" to find them easily)
```python
import pandas as pd
import pympi.Praat as pr
from pydub import AudioSegment

# open csv as pandas dataframe and clean dataframe
df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
df.dropna(subset=['transcription'], inplace=True)
df = df[df['transcription'].str.match(r"[a-zA-Z]")]

# open recording
recording = AudioSegment.from_wav("inputs/vandam-data/recordings/converted/standard/BN32_010007.wav")

# initiate
a = 0
b = 99
audio_onset = 0
end = False

while end == False:
    # check if b is not out of range
    # if it is, place last row # in b and change boolean to True to stop loop
    if b > int(df.count()[0]):  # [REVIEW] here we'd usually do len(df) or df.shape[0] instead
        b = int(df.count()[0])
        end = True
    # creates sliced dataframe with 100 rows
    df_sliced = df[a:b]
    # finds the segment offset of the 100th transcription entry and stores it into var
    # in milliseconds
    audio_offset = int(df_sliced.tail(1)['segment_offset'])
    # [REVIEW] the code above assumes segments are sorted, which is true in the present case, but you might want to do df_sliced['segment_offset'].max() anyway (it's always better when your code relies on fewer/weaker assumptions about the input).
    # extracts recording at desired length and exports it to new file
    recording_sliced = recording[audio_onset:audio_offset]
    recording_sliced.export("outputs/BN_{0}-{1}.wav".format(a, b), format='wav')
    # create textgrid
    grid = pr.TextGrid(xmax=(audio_offset - audio_onset) / 1000)
    # iterate through each row
    for speaker, segments in df_sliced.groupby('speaker_id'):
        aTier = grid.add_tier(speaker)
        for i in segments.index.values:
            print(i)
            if not segments.loc[i, 'transcription']:
                continue
            aTier.add_interval(
                (segments.loc[i, 'segment_onset'] - audio_onset) / 1000,
                (segments.loc[i, 'segment_offset'] - audio_onset) / 1000,
                segments.loc[i, 'transcription'],
                False
            )
    grid.to_file("outputs/BN_{0}-{1}.TextGrid".format(a, b))
    # the end cut for this iteration becomes the starting point for next iteration
    audio_onset = audio_offset
    # increment row numbers
    a += 100
    b += 100
```
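To spell out the two [REVIEW] remarks with a toy dataframe (column names copied from the script, data made up):

```python
import pandas as pd

df = pd.DataFrame({
    'transcription': ['hi', None, 'bye'],
    'segment_offset': [3000, 1000, 2000],
})

# df.count() is per-column and skips NaN, so it is not a row count
print(len(df))                       # 3 rows
print(df.count()['transcription'])   # 2: the NaN row is not counted

# tail(1) depends on the rows being sorted; max() does not
print(df['segment_offset'].iloc[-1])  # 2000 (whatever happens to be last)
print(df['segment_offset'].max())     # 3000 (the actual latest offset)
```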
A more general remark: the slice size here is a bit arbitrary, and we may well decide to change it later to see whether (and by how much) doing so improves the algorithm's performance. The only problem with the code above is that changing the value of this parameter requires three separate edits.
Here is one way around this problem:
```python
import pandas as pd
import pympi.Praat as pr
from pydub import AudioSegment

# open csv as pandas dataframe and clean dataframe
df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
df.dropna(subset=['transcription'], inplace=True)
df = df[df['transcription'].str.match(r"[a-zA-Z]")]

# open recording
recording = AudioSegment.from_wav("inputs/vandam-data/recordings/converted/standard/BN32_010007.wav")

# initiate
slices_length = 100
audio_onset = 0

for a in range(0, len(df), slices_length):
    # creates sliced dataframe with up to slices_length rows
    b = min(a + slices_length, len(df))
    df_sliced = df[a:b]
    print(a, b)
    # finds the latest segment offset within the slice (in milliseconds)
    audio_offset = df_sliced['segment_offset'].max()
    # extracts recording at desired length and exports it to new file
    recording_sliced = recording[audio_onset:audio_offset]
    recording_sliced.export("outputs/BN_{0}-{1}.wav".format(a, b), format='wav')
    # create textgrid
    grid = pr.TextGrid(xmax=(audio_offset - audio_onset) / 1000)
    # iterate through each speaker's segments
    for speaker, segments in df_sliced.groupby('speaker_id'):
        aTier = grid.add_tier(speaker)
        for i in segments.index.values:
            if not segments.loc[i, 'transcription']:
                continue
            aTier.add_interval(
                (segments.loc[i, 'segment_onset'] - audio_onset) / 1000,
                (segments.loc[i, 'segment_offset'] - audio_onset) / 1000,
                segments.loc[i, 'transcription'],
                False
            )
    grid.to_file("outputs/BN_{0}-{1}.TextGrid".format(a, b))
    # the end cut for this iteration becomes the starting point for next iteration
    audio_onset = audio_offset
```
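One detail worth knowing about the slicing itself: Python (and positional pandas) slices clamp an out-of-range end index instead of raising, which is what lets `range(0, len(df), slices_length)` handle the final, shorter slice without a special case. A quick self-contained demonstration:

```python
data = list(range(10))

# an end index past the last element is simply clamped
print(data[8:12])   # [8, 9]

# so stepping through with a fixed stride covers everything,
# including the short final chunk
slices = [data[a:a + 4] for a in range(0, len(data), 4)]
print(slices)       # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```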
Thanks for this feedback! Indeed, I hadn't thought about changing the slice size, nor about the ordering of the transcriptions. It's much cleaner this way, thanks. I'll merge these changes into master!