utils_audio.py 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. #!/usr/bin/env python
  2. # -*- coding: utf8 -*-
  3. # -----------------------------------------------------------------------------
  4. # File: utils_audio.py (as part of project URUMETRICS)
  5. # Created: 01/06/2022 17:15
  6. # Last Modified: 01/06/2022 17:15
  7. # -----------------------------------------------------------------------------
  8. # Author: William N. Havard
  9. # Postdoctoral Researcher
  10. #
  11. # Mail : william.havard@ens.fr / william.havard@gmail.com
  12. #
  13. # Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
  14. #
  15. # ------------------------------------------------------------------------------
  16. # Description:
  17. # •
  18. # -----------------------------------------------------------------------------
  19. import librosa
  20. import numpy as np
  21. from utils import vectorise
  22. @vectorise
  23. def f2st(f_Hz, base=50):
  24. """
  25. Returns the semitone of the frequency given as input adapted from https://rdrr.io/cran/hqmisc/src/R/hqmisc.R
  26. itself adapted from http://ldc.upenn.edu/myl/llog/semitones.R (no longer available).
  27. See https://www.internationalphoneticassociation.org/icphs-proceedings/ICPhS2003/papers/p15_0771.pdf for reference
  28. :param f_Hz: frequency to convert (in Herz)
  29. :type f_Hz: int
  30. :param base: baseline frequency relative to which semitones are expressed
  31. :type base: int
  32. :return: semitone corresponding to the frequency given as input
  33. :rtype: float
  34. """
  35. # Use a more explicit designation in annotation title
  36. f2st.__name__ = 'semitone'
  37. semi = np.log(2 ** (1 / 12))
  38. return (np.log(f_Hz) - np.log(base)) / semi
  39. @vectorise
  40. def f2erb(f_Hz):
  41. """
  42. Return the ERB value of the frequency given as input
  43. :param f_Hz: frequency to convert (in Herz)
  44. :type f_Hz: int
  45. :return: ERB value of the frequency given as input
  46. :rtype: float
  47. """
  48. f2erb.__name__ = 'erb'
  49. f_kHz = f_Hz * 1e-3
  50. return 24.7 * (4.37 * f_kHz + 1)
  51. def get_audio_slice(audio_time_series, begin, end):
  52. """
  53. Returns the acoustic vector between begin and end
  54. :param audio_time_series: audio time series
  55. :type audio_time_series: Numpy 2D matrix [time, dim]
  56. :param begin: start frame
  57. :type begin: int
  58. :param end: end frame
  59. :type end: int
  60. :return: sliced time series of size (end-begin, dim)
  61. :rtype: Numpy 2D Matrix
  62. """
  63. return audio_time_series[begin:end]
  64. def read_audio(file_path, target_sr=16000):
  65. """
  66. Read an audio file and returns the audio time series and its sampling rate
  67. :param file_path: path to an audio file
  68. :type file_path: str
  69. :return: (audio time series, sampling rate)
  70. :rtype: np.array
  71. """
  72. file_sr = librosa.get_samplerate(file_path)
  73. assert file_sr == target_sr, ValueError("Mismatch between file's true sampling rate ({}) and "
  74. "target sampling rate ({})!".format(file_sr, target_sr))
  75. return librosa.load(file_path, mono=True, sr=target_sr)