#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------------
# File: utils_annotations.py (as part of project URUMETRICS)
# Created: 01/06/2022 17:15
# Last Modified: 01/06/2022 17:15
# -----------------------------------------------------------------------------
# Author: William N. Havard
#         Postdoctoral Researcher
#
# Mail: william.havard@ens.fr / william.havard@gmail.com
#
# Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
#
# ------------------------------------------------------------------------------
# Description:
# • Compute acoustic (pitch) annotations for the audio segments of a recording.
# -----------------------------------------------------------------------------
from math import ceil, floor

import librosa
import numpy as np
import pandas as pd

import utils_audio
from utils_audio import get_audio_slice, read_audio


def _compute_file_acoustic_annotation(audio_path, audio_segments, target_sr):
    """
    Compute the acoustic annotations for the segments audio_segments of the file audio_path,
    read at sampling rate target_sr.

    :param audio_path: path to the audio file to be read
    :type audio_path: str
    :param audio_segments: dataframe of segments for which we want to compute annotations
    :type audio_segments: pd.DataFrame
    :param target_sr: target sampling rate of the recording
    :type target_sr: int
    :return: annotations
    :rtype: pd.DataFrame
    """
    audio_time_series, sampling_rate = read_audio(audio_path, target_sr=target_sr)

    # Compute the start and end frame of each segment from its on-/offset given in milliseconds
    audio_segments['frame_onset'] = audio_segments['segment_onset'].apply(
        lambda onset: floor(onset / 1000 * sampling_rate))
    audio_segments['frame_offset'] = audio_segments['segment_offset'].apply(
        lambda offset: ceil(offset / 1000 * sampling_rate))

    # TODO: find a better solution (e.g. concatenating DataFrames) if more acoustic annotations are added in the future
    annotations = _annotation_pitch(audio_segments, audio_time_series, target_sr)

    annotations.drop(columns=['frame_onset',
                              'frame_offset'],
                     inplace=True)

    return annotations
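
# Illustrative example (assumed values, not part of the original module): with a
# sampling rate of 16000 Hz, a segment spanning 1234-2345 ms is mapped above to
# frames floor(1234 / 1000 * 16000) = 19744 and ceil(2345 / 1000 * 16000) = 37520,
# which delimit the audio slice handed to the pitch extraction below.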


def _annotation_pitch(audio_segments, audio_time_series, sampling_rate):
    """
    Extract pitch-related information for the audio segments audio_segments of the time series
    audio_time_series, sampled at sampling_rate.

    :param audio_segments: dataframe of segments for which we want to compute annotations
    :type audio_segments: pd.DataFrame
    :param audio_time_series: audio time series
    :type audio_time_series: np.ndarray
    :param sampling_rate: sampling rate
    :type sampling_rate: int
    :return: pitch annotations
    :rtype: pd.DataFrame
    """
    pitch = pd.DataFrame.from_records(audio_segments.apply(lambda row:
                                                           get_pitch(
                                                               get_audio_slice(
                                                                   audio_time_series, row['frame_onset'],
                                                                   row['frame_offset']
                                                               ),
                                                               sampling_rate, func=utils_audio.f2st
                                                           ), axis=1).tolist())

    # Drop raw pitch values
    pitch.drop(list(pitch.filter(regex='raw_')), axis=1, inplace=True)

    pitch.index = audio_segments.index
    audio_segments = pd.concat([audio_segments, pitch], axis=1)

    return audio_segments
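
# Note (descriptive, derived from the code above): with func=utils_audio.f2st the
# columns added to audio_segments are mean_pitch_f2st, median_pitch_f2st,
# p5_pitch_f2st, p95_pitch_f2st and pitch_range_f2st; the raw_pitch_f2st value
# returned by get_pitch is dropped before concatenation.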


def get_pitch(audio_time_series, sampling_rate, func=None):
    """
    Return pitch-related annotations.

    Regarding pitch range, we use the 5th percentile as the bottom of the range and the 95th percentile as the top
    (see https://www.ibm.com/docs/en/wvs/6.1.1?topic=guide-introduction-pitch-its-use-ssml or
    https://languagelog.ldc.upenn.edu/nll/?p=40788, which use the same methodology).

    :param audio_time_series: real-valued vector
    :type audio_time_series: np.ndarray
    :param sampling_rate: sampling rate
    :type sampling_rate: int
    :param func: transformation function to apply to the fundamental frequency
    :type func: callable
    :return: raw pitch, mean pitch, median pitch, 5th percentile, 95th percentile, pitch range
    :rtype: dict
    """
    f0 = librosa.yin(audio_time_series,
                     fmin=60,
                     fmax=500,
                     sr=sampling_rate)  # pyin does not work, why?
    pitch = func(f0) if callable(func) else f0

    mean_pitch, median_pitch, p5_pitch, p95_pitch = pitch.mean(), np.quantile(pitch, .5), \
                                                    np.percentile(pitch, 5), np.percentile(pitch, 95)

    pitch_type = "f0" if not callable(func) else func.__name__

    return {"raw_pitch_{}".format(pitch_type): f0,
            "mean_pitch_{}".format(pitch_type): mean_pitch,
            "median_pitch_{}".format(pitch_type): median_pitch,
            "p5_pitch_{}".format(pitch_type): p5_pitch,
            "p95_pitch_{}".format(pitch_type): p95_pitch,
            "pitch_range_{}".format(pitch_type): p95_pitch - p5_pitch}