#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------------
# File: utils_annotations.py (as part of project URUMETRICS)
# Created: 01/06/2022 17:15
# Last Modified: 01/06/2022 17:15
# -----------------------------------------------------------------------------
# Author: William N. Havard
#         Postdoctoral Researcher
#
# Mail: william.havard@ens.fr / william.havard@gmail.com
#
# Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
#
# ------------------------------------------------------------------------------
# Description:
# • Compute acoustic (pitch) annotations for the audio segments of a recording.
# -----------------------------------------------------------------------------
from math import ceil, floor

import librosa
import numpy as np
import pandas as pd

import utils_audio
from utils_audio import get_audio_slice, read_audio


def _compute_file_acoustic_annotation(audio_path, audio_segments, target_sr):
    """
    Compute the acoustic annotations for the segments audio_segments of the file audio_path,
    read at sampling rate target_sr.

    :param audio_path: path to the audio file to be read
    :type audio_path: str
    :param audio_segments: dataframe of segments for which we want to compute annotations
    :type audio_segments: pd.DataFrame
    :param target_sr: target sampling rate of the recording
    :type target_sr: int
    :return: annotations
    :rtype: pd.DataFrame
    """
    audio_time_series, sampling_rate = read_audio(audio_path, target_sr=target_sr)

    # Compute the start and end frame of each segment from its on-/offset given in milliseconds
    audio_segments['frame_onset'] = audio_segments['segment_onset'].apply(
        lambda onset: floor(onset / 1000 * sampling_rate))
    audio_segments['frame_offset'] = audio_segments['segment_offset'].apply(
        lambda offset: ceil(offset / 1000 * sampling_rate))

    # TODO: find a better solution (e.g. concatenating DataFrames) if more acoustic annotations are added in the future
    annotations = _annotation_pitch(audio_segments, audio_time_series, target_sr)

    annotations.drop(columns=['frame_onset',
                              'frame_offset'],
                     inplace=True)

    return annotations
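
# Illustrative example (assumed values, not part of the original module): with a
# sampling rate of 16000 Hz, a segment spanning 1234-2345 ms is mapped above to
# frames floor(1234 / 1000 * 16000) = 19744 and ceil(2345 / 1000 * 16000) = 37520,
# which delimit the audio slice handed to the pitch extraction below.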


def _annotation_pitch(audio_segments, audio_time_series, sampling_rate):
    """
    Extract pitch-related information for the audio segments audio_segments of the time series
    audio_time_series, sampled at sampling_rate.

    :param audio_segments: dataframe of segments for which we want to compute annotations
    :type audio_segments: pd.DataFrame
    :param audio_time_series: audio time series
    :type audio_time_series: np.ndarray
    :param sampling_rate: sampling rate
    :type sampling_rate: int
    :return: pitch annotations
    :rtype: pd.DataFrame
    """
    pitch = pd.DataFrame.from_records(audio_segments.apply(lambda row:
                                                           get_pitch(
                                                               get_audio_slice(
                                                                   audio_time_series, row['frame_onset'],
                                                                   row['frame_offset']
                                                               ),
                                                               sampling_rate, func=utils_audio.f2st
                                                           ), axis=1).tolist())

    # Drop raw pitch values
    pitch.drop(list(pitch.filter(regex='raw_')), axis=1, inplace=True)

    pitch.index = audio_segments.index
    audio_segments = pd.concat([audio_segments, pitch], axis=1)

    return audio_segments
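
# Note (descriptive, derived from the code above): with func=utils_audio.f2st the
# columns added to audio_segments are mean_pitch_f2st, median_pitch_f2st,
# p5_pitch_f2st, p95_pitch_f2st and pitch_range_f2st; the raw_pitch_f2st value
# returned by get_pitch is dropped before concatenation.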


def get_pitch(audio_time_series, sampling_rate, func=None):
    """
    Return pitch-related annotations.

    Regarding pitch range, we use the 5th percentile as the bottom of the range and the 95th percentile as the top
    (see https://www.ibm.com/docs/en/wvs/6.1.1?topic=guide-introduction-pitch-its-use-ssml or
    https://languagelog.ldc.upenn.edu/nll/?p=40788, which use the same methodology).

    :param audio_time_series: real-valued vector
    :type audio_time_series: np.ndarray
    :param sampling_rate: sampling rate
    :type sampling_rate: int
    :param func: transformation function to apply to the fundamental frequency
    :type func: callable
    :return: raw pitch, mean pitch, median pitch, 5th percentile, 95th percentile, pitch range
    :rtype: dict
    """
    f0 = librosa.yin(audio_time_series,
                     fmin=60,
                     fmax=500,
                     sr=sampling_rate)  # pyin does not work, why?
    pitch = func(f0) if callable(func) else f0

    mean_pitch, median_pitch, p5_pitch, p95_pitch = pitch.mean(), np.quantile(pitch, .5), \
                                                    np.percentile(pitch, 5), np.percentile(pitch, 95)

    pitch_type = "f0" if not callable(func) else func.__name__

    return {"raw_pitch_{}".format(pitch_type): f0,
            "mean_pitch_{}".format(pitch_type): mean_pitch,
            "median_pitch_{}".format(pitch_type): median_pitch,
            "p5_pitch_{}".format(pitch_type): p5_pitch,
            "p95_pitch_{}".format(pitch_type): p95_pitch,
            "pitch_range_{}".format(pitch_type): p95_pitch - p5_pitch}