1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889 |
- #!/usr/bin/env python
- # -*- coding: utf8 -*-
- # -----------------------------------------------------------------------------
- # File: utils_audio.py (as part of project URUMETRICS)
- # Created: 01/06/2022 17:15
- # Last Modified: 01/06/2022 17:15
- # -----------------------------------------------------------------------------
- # Author: William N. Havard
- # Postdoctoral Researcher
- #
- # Mail : william.havard@ens.fr / william.havard@gmail.com
- #
- # Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
- #
- # ------------------------------------------------------------------------------
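- # Description:
- # • Audio helpers: frequency conversions (Hz to semitones, Hz to ERB),
- #   slicing of audio time series and loading of audio files.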
- # -----------------------------------------------------------------------------
- import librosa
- import numpy as np
- from utils import vectorise
@vectorise
def f2st(f_Hz, base=50):
    """
    Convert a frequency to semitones relative to a baseline frequency.

    Adapted from https://rdrr.io/cran/hqmisc/src/R/hqmisc.R, itself adapted
    from http://ldc.upenn.edu/myl/llog/semitones.R (no longer available).
    See https://www.internationalphoneticassociation.org/icphs-proceedings/ICPhS2003/papers/p15_0771.pdf
    for reference.

    The value is computed as ``(ln(f_Hz) - ln(base)) / ln(2 ** (1 / 12))``,
    i.e. the number of twelfth-of-an-octave steps between ``base`` and
    ``f_Hz``.

    :param f_Hz: frequency to convert (in Hertz)
    :type f_Hz: int
    :param base: baseline frequency relative to which semitones are expressed
    :type base: int
    :return: semitone corresponding to the frequency given as input
    :rtype: float
    """
    # Use a more explicit designation in annotation title. This runs on every
    # call and renames whatever object the module-level name `f2st` is bound
    # to (i.e. the result of the @vectorise decorator).
    f2st.__name__ = 'semitone'

    # Natural log of the frequency ratio spanned by one semitone.
    semitone_log_step = np.log(2 ** (1 / 12))
    # Log-distance between the input frequency and the baseline.
    log_distance = np.log(f_Hz) - np.log(base)
    return log_distance / semitone_log_step
@vectorise
def f2erb(f_Hz):
    """
    Convert a frequency to its ERB value.

    The input is first scaled from Hertz to kilohertz, then the result is
    computed as ``24.7 * (4.37 * f_kHz + 1)``.
    NOTE(review): this looks like the Glasberg & Moore equivalent rectangular
    bandwidth formula -- confirm against the intended reference.

    :param f_Hz: frequency to convert (in Hertz)
    :type f_Hz: int
    :return: ERB value of the frequency given as input
    :rtype: float
    """
    # Runs on every call: gives the object bound to the module-level name
    # `f2erb` (the result of @vectorise) a more explicit name.
    f2erb.__name__ = 'erb'

    # Hz -> kHz, as the formula below expects kilohertz.
    kilohertz = f_Hz * 1e-3
    scaled = 4.37 * kilohertz + 1
    return 24.7 * scaled
def get_audio_slice(audio_time_series, begin, end):
    """
    Extract the portion of a time series located between two frames.

    The series is sliced along its first axis only, with the usual
    half-open convention: ``begin`` is included, ``end`` is excluded.

    :param audio_time_series: audio time series
    :type audio_time_series: Numpy 2D matrix [time, dim]
    :param begin: start frame (included)
    :type begin: int
    :param end: end frame (excluded)
    :type end: int
    :return: sliced time series of size (end-begin, dim)
    :rtype: Numpy 2D Matrix
    """
    frame_window = slice(begin, end)
    return audio_time_series[frame_window]
class _SamplingRateMismatchError(ValueError, AssertionError):
    """
    Raised when a file's sampling rate differs from the requested one.

    Derives from ValueError (the semantically correct type) and from
    AssertionError (what the former ``assert``-based check raised), so that
    existing ``except AssertionError`` handlers keep working.
    """


def read_audio(file_path, target_sr=16000):
    """
    Read an audio file and return the audio time series and its sampling rate.

    The sampling rate stored in the file is checked against ``target_sr``
    before loading; the file is then loaded as mono at ``target_sr``.

    :param file_path: path to an audio file
    :type file_path: str
    :param target_sr: sampling rate the file is expected to have
    :type target_sr: int
    :return: (audio time series, sampling rate), as returned by librosa.load
    :rtype: tuple
    :raises ValueError: if the file's sampling rate differs from ``target_sr``
        (the exception raised is also an AssertionError, for backward
        compatibility)
    """
    file_sr = librosa.get_samplerate(file_path)

    # Explicit raise instead of `assert`: assertions are removed when Python
    # runs with -O, which would let a mismatching file through unnoticed.
    if file_sr != target_sr:
        raise _SamplingRateMismatchError(
            "Mismatch between file's true sampling rate ({}) and "
            "target sampling rate ({})!".format(file_sr, target_sr))

    return librosa.load(file_path, mono=True, sr=target_sr)
|