#1 short review

Closed
Opened 3 years ago by lucasgautheron · 4 comments
```python
import pandas as pd
import pympi.Praat as pr

#open csv as pandas dataframe
df = pd.read_csv(r"C:\Users\Martin\Desktop\LAAC\alignment-project\inputs\testCHA.csv")

# to make your code reproducible, use relative paths:
df = pd.read_csv(r"inputs/vandam-data/annotations/cha/converted/etc.csv")

#create textgrid
grid = pr.TextGrid(xmax = 300)
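# (in pympi, xmax is the TextGrid end time in seconds; hardcoded to 300 s here)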

#specify number of rows (hardcoded; len(df) would be less brittle)
rows = 9000

#iterate through each row
for i in range(rows):

    #load data from dataframe
    speaker = df.iloc[i, 0] # you could use .loc instead, so you can refer to the column names, which makes your code more transparent
    transcription = df.iloc[i, 3]
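    # e.g., with the column names that appear later in this thread:
    # speaker = df.loc[i, 'speaker_id']
    # transcription = df.loc[i, 'transcription']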
   
    #loads onset and offset and converts from milliseconds to seconds
    onset = df.iloc[i,5]/1000
    offset = df.iloc[i, 1]/1000

    # there may be a way to check if a tier is present without throwing exceptions
    #checks if tier already exists. If not, create a new tier
    try:
        aTier = grid.get_tier(speaker)
    except IndexError:
        aTier = grid.add_tier(speaker)

    #creates interval and adds it to the tier
    pr.Tier.add_interval(aTier, onset, offset, transcription, False)

    # the following, more idiomatic call should work too (commented out here so
    # the interval isn't added twice):
    # aTier.add_interval(onset, offset, transcription, False)

print(list(grid.get_tier_name_num()))

# use relative paths here too
grid.to_file(r"C:\Users\Martin\Desktop\LAAC\alignment-project\outputs\converterTest.txt")
```
mfrebo commented 3 years ago
Owner

Thanks for the review!

  • I'll check out .loc; you're right that it's better to use column names.
  • I went with the try-except block because I found it to be the easiest, since there is no is_tier function, for example. There is a function that returns all tiers, but it returns a list of pairs of the tier's number and the tier itself. Since the number is auto-generated, we can't look it up in that list, and there is no Tier.get_name() function, so we have tier objects but can't look up their names. (But see the sketch after this list.)
  • I had put down aTier.add_interval(...) at first, but VS Code did not recognize it as a function belonging to Praat, which is why I did it the other way around. There is actually no issue when running the code, though, so I'll switch it back to that.

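If get_tier_name_num() does expose the tier names in its (number, name) pairs, as the print call in the code above suggests, a lookup without exceptions could look like this (a minimal sketch; the helper name is made up):

```python
def get_or_add_tier(grid, name):
    # collect the names of existing tiers from the (number, name) pairs
    existing = {tier_name for _, tier_name in grid.get_tier_name_num()}
    return grid.get_tier(name) if name in existing else grid.add_tier(name)

aTier = get_or_add_tier(grid, speaker)
```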
Lucas Gautheron commented 3 years ago
Collaborator

I've written a few comments (ctrl+F "[REVIEW]" to find them easily)

```python
import pandas as pd
import pympi.Praat as pr
from pydub import AudioSegment

#open csv as pandas dataframe and clean dataframe
df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
df.dropna(subset = ['transcription'], inplace = True)
df = df[df['transcription'].str.match(r"[a-zA-Z]")]
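# (str.match anchors at the start of the string, so this keeps only rows whose transcription begins with a letter)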

#open recording
recording = AudioSegment.from_wav("inputs/vandam-data/recordings/converted/standard/BN32_010007.wav")

#initiate (b must be a + 100, otherwise every 100th row is skipped between slices)
a = 0
b = 100
audio_onset = 0
end = False

while not end:

    #check if b is not out of range
    #if it is, place last row # in b and change boolean to True to stop loop
    if(b > int(df.count()[0])): # [REVIEW] here we'd usually do len(df) or df.shape[0] instead
        b = int(df.count()[0])
        end = True
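    # i.e., with the [REVIEW] suggestion above:
    # if b > len(df):
    #     b = len(df)
    #     end = True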

    #creates sliced dataframe with 100 rows
    df_sliced = df[a:b]

    #finds the segment offset of the 100th transcription entry and stores it into var 
    #in milliseconds
    audio_offset = int(df_sliced.tail(1)['segment_offset'])
    
    # [REVIEW] the code above assumes segments are sorted, which is true in the present case, but you might want to do df_sliced['segment_offset'].max() anyway (it's always better when your code relies on fewer/weaker assumptions about the input).
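    # i.e.: audio_offset = int(df_sliced['segment_offset'].max())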

    #extracts recording at desired length and exports it to new file
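    #(pydub slices AudioSegment objects in milliseconds, which matches the millisecond units of segment_offset)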
    recording_sliced = recording[audio_onset:audio_offset]
    recording_sliced.export("outputs/BN_{0}-{1}.wav".format(a,b), format='wav')
    
    #create textgrid
    grid = pr.TextGrid(xmax = (audio_offset-audio_onset)/1000)
    #iterate over speakers, then over each row of each speaker
    for speaker, segments in df_sliced.groupby('speaker_id'):
        aTier = grid.add_tier(speaker)

        for i in segments.index.values:
            if not segments.loc[i, 'transcription']: 
                continue

            aTier.add_interval(
                (segments.loc[i, 'segment_onset'] - audio_onset)/1000,
                (segments.loc[i, 'segment_offset'] - audio_onset)/1000,
                segments.loc[i, 'transcription'],
                False
              )

    #write the grid once, after all tiers have been filled
    grid.to_file("outputs/BN_{0}-{1}.TextGrid".format(a,b))
    
    
    #the end cut for this iteration becomes the starting point for next iteration
    audio_onset = audio_offset
    
    #increment row numbers
    a += 100
    b += 100
```
Lucas Gautheron commented 3 years ago
Collaborator

A more general remark: in the present case, the slice size is somewhat arbitrary, and we may well decide to change it to see to what extent it improves the performance of the algorithm or not. The only problem with the code above is that changing the value of that parameter requires three separate edits.

Here is one way around that problem:

```python
import pandas as pd
import pympi.Praat as pr
from pydub import AudioSegment

#open csv as pandas dataframe and clean dataframe
df = pd.read_csv("inputs/vandam-data/annotations/cha/converted/BN32_010007_0_0.csv")
df.dropna(subset = ['transcription'], inplace = True)
df = df[df['transcription'].str.match(r"[a-zA-Z]")]

#open recording
recording = AudioSegment.from_wav("inputs/vandam-data/recordings/converted/standard/BN32_010007.wav")

#initiate 
slices_length = 100
audio_onset = 0

for a in range(0, len(df), slices_length):
    #creates sliced dataframe with up to slices_length rows
    #(the cap must be len(df), not len(df)-1, or the last row is dropped)
    b = min(a + slices_length, len(df))
    df_sliced = df[a:b]

    #finds the largest segment offset in the slice and stores it
    #in milliseconds
    audio_offset = df_sliced['segment_offset'].max()

    #extracts recording at desired length and exports it to new file
    recording_sliced = recording[audio_onset:audio_offset]
    recording_sliced.export("outputs/BN_{0}-{1}.wav".format(a,b), format='wav')
    
    #create textgrid
    grid = pr.TextGrid(xmax = (audio_offset-audio_onset)/1000)
    #iterate over speakers, then over each row of each speaker
    for speaker, segments in df_sliced.groupby('speaker_id'):
        aTier = grid.add_tier(speaker)

        for i in segments.index.values:
            if not segments.loc[i, 'transcription']: 
                continue

            aTier.add_interval(
                (segments.loc[i, 'segment_onset'] - audio_onset)/1000,
                (segments.loc[i, 'segment_offset'] - audio_onset)/1000,
                segments.loc[i, 'transcription'],
                False
              )

    #write the grid once, after all tiers have been filled
    grid.to_file("outputs/BN_{0}-{1}.TextGrid".format(a,b))
    
    #the end cut for this iteration becomes the starting point for next iteration
    audio_onset = audio_offset
```
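One could go a step further and read the slice length from the command line, so that changing it requires no code edit at all. A minimal sketch with argparse (the --slice-length flag is hypothetical):

```python
import argparse

# hypothetical CLI flag; its value replaces the hardcoded slices_length above
parser = argparse.ArgumentParser(description="slice annotations into wav/TextGrid chunks")
parser.add_argument("--slice-length", type=int, default=100,
                    help="number of annotation rows per slice")
args = parser.parse_args()

slices_length = args.slice_length
```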
mfrebo commented 3 years ago
Owner

Thanks for this feedback! Indeed, I hadn't thought about changing the slice size, nor about the ordering of the transcriptions. It's much cleaner this way, thanks. I'll incorporate these changes into master!

lucasgautheron closed this 3 years ago