Browse Source

First commit.

William N. Havard 1 year ago
commit
6fa72f247e

+ 168 - 0
.gitignore

@@ -0,0 +1,168 @@
+# Project related items #
+.idea
+
+# Byte-compiled / optimized / DLL files #
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions #
+*.so
+
+# Distribution / packaging #
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller #
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs #
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports #
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations #
+*.mo
+*.pot
+
+# Django stuff #
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff #
+instance/
+.webassets-cache
+
+# Scrapy stuff #
+.scrapy
+
+# Sphinx documentation #
+doc/_build/
+
+# PyBuilder #
+target/
+
+# Jupyter Notebook #
+.ipynb_checkpoints
+
+# IPython #
+profile_default/
+ipython_config.py
+
+# pyenv #
+.python-version
+
+# pipenv #
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow #
+__pypackages__/
+
+# Celery stuff #
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files #
+*.sage.py
+
+# Environments #
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings #
+.spyderproject
+.spyproject
+
+# Rope project settings #
+.ropeproject
+
+# mkdocs documentation #
+/site
+
+# mypy #
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker #
+.pyre/
+
+
+# Compiled source #
+*.com
+*.class
+*.dll
+*.exe
+*.o
+*.so
+
+# Packages #
+# it's better to unpack these files and commit the raw source
+# git has its own built in compression methods
+*.7z
+*.dmg
+*.gz
+*.iso
+*.jar
+*.rar
+*.tar
+*.zip
+
+# Logs and databases #
+*.log
+*.sql
+*.sqlite
+
+# OS generated files #
+######################
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db

+ 21 - 0
LICENSE

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Language acquisition across cultures
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

+ 106 - 0
README.md

@@ -0,0 +1,106 @@
+# URUMETRICS
+
+Uruguayan Chatbot Project
+
+## Description
+
+This repository contains the code to extract the metrics defined in the Uruguayan Chatbot Project.
+
+## Installation
+
+Clone this repository:
+
+```bash
+git clone git@github.com:LAAC-LSCP/URUMETRICS.git
+```
+
+If you wish to install the dependencies directly you can run:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Repository structure 
+
+
+* `src` contains the code that is necessary to handle the data
+    * `src/acoustic_annotations` contains the code that compute the acoustic annotations
+    * `src/import_data` contains code that allows to convert the data to the ChildProject format
+    * `src/compute_metrics` contains the code that extract the metrics
+    * `src/generate_messages` is the code that reads the metrics file and generates the messages sent to the families
+* `egs` contains example files that allow you to test your installation
+* `tst` contains the code that allows you to test your installation
+* `dat` is the directory where the input data should be stored
+    * `dat/data_set` is the directory where the input files should be stored (if the directory does not exist, see step 1 of [How to use?](#how-to-use))
+    * `dat/out` is the directory that contains the CSV files of the messages to send to the families
+    * `dat/utility` contains utility files (such as the definition of the messages sent to the families)
+
+## Data requirements
+
+### Recording file names
+
+Recording file names should be **WAV files** that follow the following **naming convention**
+
+```
+CHILD-ID_[info1_info2_..._infoX]_YYYYMMDD_HHMMSS.wav
+```
+where 
+* `YYYYMMDD` corresponds to the date formatted according to the ISO 8601 format (i.e. `YYYY` for the year, `MM` for the month (from 01 to 12), and `DD` for the day (from 01 to 31)) and 
+* `HHMMSS` date formatted according to the ISO 8601 format (`HH` for hours (from 00 to 23, 24-hour clock system), `MM` for minutes (from 00 to 59), and `SS` for seconds (from 00 to 59)).
+* `[info1_info2_..._infoX]` corresponds to **optional** information (without `[` and `]`) separated by underscores `_`
+* `CHILD-ID` may use **any character except the underscore character (`_`)**.
+
+Additional information will be stored in the metadata file `metadata/recordings.csv` in the column `experiment_stage`.
+
+## How to use?
+
+1. Set up the `dat` directory by running `python -um src.import_data.prepare_data_set`
+
+
+This command only needs to be run once; however, it won't break anything if it is run several times. This command creates a ChildProject `data_set` directory with several subdirectories:
+* `recordings`: stores the recordings for which we need to run the pipeline
+* `annotations`: contains the annotations pertaining to the recordings
+* `metadata`: stores the metadata of the whole data set (children, recordings, annotations, etc.)
+* `extra`: used to store extra item (used to store `metrics.csv`)
+
+2. Place the recordings in `dat/in/recordings/raw` and run `python -um src.import_data.import_recordings`
+
+This command will look at the new recordings found in the `raw` directory and add them to the metadata file `metadata/recordings.csv`. If some of the recordings belong to previously unknown children, they will be added to the metadata file `metadata/children.csv`.
+
+Note that the recording file names **should comply with the file naming convention described above**!
+
+4. Compute the acoustic annotations using the following command `python -um src.acoustic_annotations.compute_acoustic_annotations --path-vtc /path/to/vtc/file.rttm --path-recordings /path/to/recordings/raw --save-path /path/where/to/save/the/annotations`.
+
+This command will compute acoustic annotations (mean pitch, pitch range) given *raw* VTC annotations. The output file will be named according to the following pattern `acoustic_annotations_YYYYMMDD_HHMMSS.csv`.
+
+5. Place the annotations in their respective folder in `dat/in/annotations/{acoustic|vtc|vcm|alice}/raw`
+
+Note that the annotation files should have **unique names** (e.g. like the acoustic annotations) and **should by no means overwrite the files already present** in the aforementioned directories.
+
+6. Run the following command `python -um src.import_data.import_annotations` to convert the annotations to the ChildProject format.
+
+7. Run the following command `python -um src.compute_metrics.metrics` to compute ACLEW metrics as well as additional metrics defined in `compute_metrics/metrics_functions.py`
+
+This command will generate a file `metrics.csv` which will be stored in `dat/data_set/extra`. If the file already exists, new lines will be added at the end.
+
+Note that the metrics are only computed for newly imported recordings and not for all the files. If no annotations are linked to the new files (e.g. you forgot to import them) the columns will be empty.
+
+8. Generate the message using the following command `python -um src.generate_messages.messages [--date YYYYMMDD]`
+
+This command will create a file in `dat/out` with the following name pattern `messages_YYYYMMDD.csv`
+
+The file will contain the message that corresponds to each new audio file. The date parameter is used to specify the date for which to generate messages. If the date is before the current date, only recordings available at the specified date will be considered to generate the messages. This allows past messages to be re-generated if needed. If no date is specified, the current date is used.
+
+## Return codes
+
+Every command returns either a `0` (i.e. no problem) or `1` (i.e. problem) return code. They might print information, warning and error messages to STDERR.
+
+## Test
+
+TO DO!
+
+## Version Requirements
+
+* VTC: [66f87c2a8cef25c80c9d9b91f4023ab4757413da](https://github.com/MarvinLvn/voice-type-classifier/tree/66f87c2a8cef25c80c9d9b91f4023ab4757413da)
+* VCM: [37e27e75c613ef78f375ff43f1d69940b02d0713](https://github.com/LAAC-LSCP/vcm/tree/37e27e75c613ef78f375ff43f1d69940b02d0713)
+* Alice: [f7962f46615a6a433f0da5398f61282d9961c101](https://github.com/orasanen/ALICE/tree/f7962f46615a6a433f0da5398f61282d9961c101)

+ 2 - 0
__init__.py

@@ -0,0 +1,2 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-

+ 5 - 0
requirements.txt

@@ -0,0 +1,5 @@
+pandas~=1.4.2
+ChildProject~=0.0.4
+numpy~=1.21.5
+PyYAML~=6.0
+librosa~=0.9.1

+ 2 - 0
src/__init__.py

@@ -0,0 +1,2 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-

+ 19 - 0
src/acoustic_annotations/__init__.py

@@ -0,0 +1,19 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: __init__.py (as part of project URUMETRICS)
+#   Created: 01/06/2022 16:34
+#   Last Modified: 01/06/2022 16:34
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------

+ 130 - 0
src/acoustic_annotations/compute_acoustic_annotations.py

@@ -0,0 +1,130 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: compute_acoustic_annotations.py (as part of project URUMETRICS)
+#   Created: 01/06/2022 15:25
+#   Last Modified: 01/06/2022 15:25
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • This files computes acoustic annotations for each segment identified
+#         by the VTC.
+# -----------------------------------------------------------------------------
+
+import logging
+import os
+from datetime import datetime
+from math import ceil, floor
+
+import pandas as pd
+
+import utils_audio
+from utils import list_audio_files, read_vtc
+from utils_annotations import get_pitch
+from utils_audio import get_audio_slice, read_audio
+
+logger = logging.getLogger(__name__)
+
def _annotation_pitch(audio_segments, audio_time_series, sampling_rate):
    """
    Compute pitch statistics for every segment of one audio file and append
    them as new columns to the segment DataFrame.

    :param audio_segments: DataFrame with `frame_onset`/`frame_offset` sample indices
    :param audio_time_series: waveform of the whole audio file
    :param sampling_rate: sampling rate of `audio_time_series`
    :return: `audio_segments` with one extra column per pitch statistic
    """
    def _segment_pitch(row):
        samples = get_audio_slice(audio_time_series, row['frame_onset'], row['frame_offset'])
        return get_pitch(samples, sampling_rate, func=utils_audio.f2st)

    pitch = pd.DataFrame.from_records(
        audio_segments.apply(_segment_pitch, axis=1).tolist())

    # Keep only the summary statistics; the per-frame ("raw_") pitch values are dropped.
    pitch = pitch.drop(list(pitch.filter(regex='raw_')), axis=1)

    # Align indices so each row of statistics joins onto its source segment.
    pitch.index = audio_segments.index
    return pd.concat([audio_segments, pitch], axis=1)
+
def _compute_file_acoustic_annotation(audio_path, audio_segments, target_sr):
    """
    Compute the acoustic annotations for one audio file.

    :param audio_path: path to the audio file
    :param audio_segments: VTC segments (millisecond onsets/offsets) for this file
    :param target_sr: sampling rate the audio is expected to have
    :return: DataFrame of segments augmented with acoustic annotations
    """
    audio_time_series, sampling_rate = read_audio(audio_path, target_sr=target_sr)

    # Millisecond timestamps -> sample indices; floor/ceil so the slice fully
    # covers the annotated segment.
    audio_segments['frame_onset'] = [
        floor(onset / 1000 * sampling_rate) for onset in audio_segments['segment_onset']]
    audio_segments['frame_offset'] = [
        ceil(offset / 1000 * sampling_rate) for offset in audio_segments['segment_offset']]

    # Find better solution if more acoustic annotations are added in the future (concat dfs)
    annotations = _annotation_pitch(audio_segments, audio_time_series, target_sr)

    # The sample indices were only needed to slice the waveform.
    return annotations.drop(columns=['frame_onset', 'frame_offset'])
+
+
def compute_acoustic_annotations(path_vtc, path_recordings, target_sr=16_000):
    """
    Compute acoustic annotations for every VTC segment whose audio file is found
    under `path_recordings`.

    :param path_vtc: path to the VTC (.rttm) file
    :param path_recordings: directory containing the recordings
    :param target_sr: expected sampling rate of the recordings
    :return: DataFrame of annotations (empty if no VTC entry matched an audio file)
    """
    vtc_data = read_vtc(path_vtc, drop_na=True)
    audio_file_list = list_audio_files(path_recordings)

    annotations = []
    for audio_file_name, audio_segments in vtc_data.groupby(by='file'):
        # Skip VTC entries whose audio file was not found on disk.
        if audio_file_name not in audio_file_list:
            continue
        file_anns = _compute_file_acoustic_annotation(
            audio_file_list[audio_file_name], audio_segments, target_sr)
        annotations.append(file_anns)

    # Bug fix: pd.concat raises ValueError on an empty list; return an empty
    # DataFrame instead when no recording matched the VTC file.
    if not annotations:
        return pd.DataFrame()
    return pd.concat(annotations, axis=0)
+
+def _parse_args(argv):
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Compute acoustic annotations.')
+    parser.add_argument('--path-vtc', required=True,
+                        help='Path to the VTC files for which acoustic annotations be computed.')
+    parser.add_argument('--path-recordings', required=True,
+                        help='Path to the recordings corresponding to the recording filenames contained '
+                             'in the VTC file.')
+    parser.add_argument('--save-path', required=True,
+                        help='Path were the annotations should be saved.')
+    args = parser.parse_args(argv)
+
+    return vars(args)
+
+
def main(**kwargs):
    """
    Compute acoustic annotations and save them to a timestamped CSV file.

    :param kwargs: `save_path` (output directory) plus the keyword arguments of
        `compute_acoustic_annotations` (`path_vtc`, `path_recordings`)
    :raises IOError: if the output directory does not exist
    """
    save_path = kwargs.pop('save_path')
    # Bug fix: the original asserted `not os.path.exists(save_path)` (with a message
    # saying the path "does not exist"), which rejected every valid output directory.
    # We also raise explicitly instead of using `assert`, which is stripped under -O.
    if not os.path.exists(save_path):
        raise IOError('Path {} does not exist!'.format(save_path))

    annotations = compute_acoustic_annotations(**kwargs)

    date_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    full_save_path = os.path.join(save_path, 'acoustic_annotations_{}.csv'.format(date_time))
    # Append to an existing file of the same name (only possible when two runs
    # happen within the same second).
    if os.path.exists(full_save_path):
        existing_annotations = pd.read_csv(full_save_path)
        annotations = pd.concat([existing_annotations, annotations])

    annotations.to_csv(full_save_path, index=False)
    logger.info('Saved to {}.'.format(full_save_path))
+
+
if __name__ == '__main__':
    import sys

    pgrm_name, argv = sys.argv[0], sys.argv[1:]
    args = _parse_args(argv)

    logging.basicConfig(level=logging.INFO)

    try:
        main(**args)
        sys.exit(0)
    except Exception:
        # Bug fix: the original bare `except:` also caught the SystemExit raised by
        # `sys.exit(0)`, so the script ALWAYS exited with code 1 — and it swallowed
        # every error silently. Catch Exception only and log the traceback.
        logger.exception('Acoustic annotation computation failed.')
        sys.exit(1)

+ 114 - 0
src/acoustic_annotations/utils.py

@@ -0,0 +1,114 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: utils.py (as part of project URUMETRICS)
+#   Created: 01/06/2022 16:36
+#   Last Modified: 01/06/2022 16:36
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------
+
+import os
+from functools import wraps
+
+import numpy as np
+import pandas as pd
+from ChildProject.converters import VtcConverter
+
+
def vectorise(func):
    """
    Decorator that applies `func` element-wise over array-like input via
    `numpy.vectorize`.

    :param func: scalar function to vectorise
    :return: wrapped function accepting scalars or array-likes
    """
    # Build the vectorised function once at decoration time instead of on
    # every call (the original re-ran np.vectorize(func) per invocation).
    vectorised = np.vectorize(func)

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Bug fix: the original signature was `wrapper(args, **kwargs)`, which
        # silently packed everything into a single positional argument; `*args`
        # forwards any number of positional arguments correctly.
        return vectorised(*args, **kwargs)

    return wrapper
+
+
def list_audio_files(recordings_path, ext='wav'):
    """
    Map bare file names (no directory, no extension) to their full paths for
    every audio file found under `recordings_path`.
    """
    return {
        get_raw_filename(path): path
        for path in walk_dir(recordings_path, ext=ext, return_full_path=True)
    }
+
+
def read_vtc(path_vtc, drop_na=False):
    """
    Read a VTC (.rttm) file and return a DataFrame (code borrowed from
    ChildProject.converters.VtcConverter).

    :param path_vtc: path to the VTC file to be read
    :param drop_na: if True, drop segments whose speaker type is 'NA'
    :return: DataFrame with columns file, segment_onset, segment_offset, speaker_type
    :raises IOError: if `path_vtc` does not exist
    """
    # Bug fix: the original used `assert ..., IOError(...)` — the IOError instance
    # was only the assertion *message*, never raised, and asserts are stripped
    # entirely under `python -O`. Raise explicitly instead.
    if not os.path.exists(path_vtc):
        raise IOError('Path to VTC file {} does not exist!'.format(path_vtc))

    rttm_columns = [
        "type", "file", "chnl", "tbeg", "tdur",
        "ortho", "stype", "name", "conf", "unk",
    ]
    df = pd.read_csv(path_vtc, sep=" ", names=rttm_columns)

    # Convert onsets/offsets from seconds to integer milliseconds.
    df["segment_onset"] = df["tbeg"].mul(1000).round().astype(int)
    df["segment_offset"] = (df["tbeg"] + df["tdur"]).mul(1000).round().astype(int)
    # Map raw VTC speaker labels to ChildProject speaker types.
    df["speaker_type"] = df["name"].map(VtcConverter.SPEAKER_TYPE_TRANSLATION)

    # Keep only the file name plus the derived columns.
    df = df.drop([column for column in rttm_columns if column != "file"], axis=1)

    if drop_na:
        df = df[~df['speaker_type'].isin(['NA'])]

    return df
+
+
def get_raw_filename(fp):
    """Return the file name of `fp` stripped of its directory and extension."""
    stem, _extension = os.path.splitext(fp)
    return os.path.basename(stem)
+
+
def walk_dir(path, ext=None, return_full_path=True):
    """
    Recursively list the files under `path`, optionally filtered by extension.

    :param path: root directory to walk
    :param ext: extension (without the dot) or list of extensions to keep;
        None or an empty list keeps every file
    :param return_full_path: if True, prepend `path` to the returned paths;
        otherwise paths are relative to `path`
    :return: sorted list of file paths
    """
    # Bug fix: the original used the mutable default `ext=[]` (shared across
    # calls) and `type(ext) == str` instead of isinstance.
    if ext is None:
        ext = []
    elif isinstance(ext, str):
        ext = [ext]

    files = []
    for current_dir, _dirs, dir_files in os.walk(path):
        for file_name in dir_files:
            _, file_extension = os.path.splitext(file_name)
            if len(ext) and file_extension[1:] not in ext:
                continue
            # Path of `current_dir` relative to the walk root.
            path_suffix = current_dir.replace(os.path.commonprefix([path, current_dir]), '').lstrip(os.sep)
            if return_full_path:
                files.append(os.path.join(path, path_suffix, file_name))
            else:
                files.append(os.path.join(path_suffix, file_name))
    return sorted(files)

+ 55 - 0
src/acoustic_annotations/utils_annotations.py

@@ -0,0 +1,55 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: utils_annotations.py (as part of project URUMETRICS)
+#   Created: 01/06/2022 17:15
+#   Last Modified: 01/06/2022 17:15
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------
+
+import librosa
+import numpy as np
+
+
def get_pitch(audio_time_series, sampling_rate, func=None):
    """
    Return pitch-related annotations for an audio segment.
    Regarding pitch range, we use the 5-th percentile as the bottom of the range, and the 95-th percentile as the top.
    (see https://www.ibm.com/docs/en/wvs/6.1.1?topic=guide-introduction-pitch-its-use-ssml or
    https://languagelog.ldc.upenn.edu/nll/?p=40788 who also use the same methodology)
    :param audio_time_series: real-valued vector
    :type audio_time_series: np.array
    :param sampling_rate: sampling rate
    :type sampling_rate: int
    :param func: transformation function to apply to the fundamental frequency
    :type func: callable
    :return: raw pitch, mean pitch, median pitch, 5-th percentile, 95-th percentile, pitch range
    :rtype: dict
    """
    f0 = librosa.yin(audio_time_series,
                     fmin=60,
                     fmax=500,
                     sr=sampling_rate)  # pyin does not work, why?

    # Optionally convert f0 (Hz) to another scale (e.g. semitones).
    pitch = func(f0) if callable(func) else f0

    mean_pitch = pitch.mean()
    median_pitch = np.quantile(pitch, .5)
    p5_pitch = np.percentile(pitch, 5)
    p95_pitch = np.percentile(pitch, 95)

    # Keys are suffixed with the scale in use: 'f0' or the converter's name.
    pitch_type = func.__name__ if callable(func) else "f0"

    return {
        "raw_pitch_{}".format(pitch_type): f0,
        "mean_pitch_{}".format(pitch_type): mean_pitch,
        "median_pitch_{}".format(pitch_type): median_pitch,
        "p5_pitch_{}".format(pitch_type): p5_pitch,
        "p95_pitch_{}".format(pitch_type): p95_pitch,
        "pitch_range_{}".format(pitch_type): p95_pitch - p5_pitch,
    }

+ 78 - 0
src/acoustic_annotations/utils_audio.py

@@ -0,0 +1,78 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: utils_audio.py (as part of project URUMETRICS)
+#   Created: 01/06/2022 17:15
+#   Last Modified: 01/06/2022 17:15
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------
+
+import librosa
+import numpy as np
+
+from utils import vectorise
+
+
@vectorise
def f2st(f_Hz, base=50):
    """
    Returns the semitone of the frequency given as input adapted from https://rdrr.io/cran/hqmisc/src/R/hqmisc.R
    itself adapted from http://ldc.upenn.edu/myl/llog/semitones.R (no longer available).
    See https://www.internationalphoneticassociation.org/icphs-proceedings/ICPhS2003/papers/p15_0771.pdf for reference
    :param f_Hz: frequency to convert (in Herz)
    :type f_Hz: int
    :param base: baseline frequency relative to which semitones are expressed
    :type base: int
    :return: semitone corresponding to the frequency given as input
    :rtype: float
    """

    # Use a more explicit designation in annotation title
    # NOTE(review): because of @vectorise, the module-level name `f2st` is the
    # wrapper, so this rebinds the wrapper's __name__ (read by get_pitch via
    # func.__name__) on every call — it does not rename this inner function.
    f2st.__name__ = 'semitone'

    # One semitone is a factor of 2**(1/12) in frequency; divide the log-ratio
    # to `base` by log(2**(1/12)) to express it in semitones.
    semi = np.log(2 ** (1 / 12))
    return (np.log(f_Hz) - np.log(base)) / semi
+
+
@vectorise
def f2erb(f_Hz):
    """
    Return the ERB (equivalent rectangular bandwidth) value of the frequency
    given as input.
    :param f_Hz: frequency to convert (in Herz)
    :type f_Hz: int
    :return: ERB value of the frequency given as input
    :rtype: float
    """
    # Mirror f2st: rename the vectorised wrapper so annotation titles read 'erb'.
    f2erb.__name__ = 'erb'

    # ERB formula operates on kHz.
    f_kHz = f_Hz * 1e-3
    return 24.7 * (4.37 * f_kHz + 1)
+
+
def get_audio_slice(audio_time_series, begin, end):
    """Return the samples of `audio_time_series` in the half-open range [begin, end)."""
    window = slice(begin, end)
    return audio_time_series[window]
+
+
def read_audio(file_path, target_sr=16000):
    """
    Read an audio file and return the audio time series and its sampling rate.

    :param file_path: path to an audio file
    :type file_path: str
    :param target_sr: sampling rate the file is expected to have
    :type target_sr: int
    :return: (audio time series, sampling rate)
    :rtype: np.array
    :raises ValueError: if the file's native sampling rate differs from `target_sr`
    """
    file_sr = librosa.get_samplerate(file_path)
    # Refuse to resample implicitly: a mismatch likely means the wrong file was
    # supplied. Bug fix: raise explicitly instead of `assert ..., ValueError(...)`,
    # which only used the exception as a message and is stripped under `python -O`.
    if file_sr != target_sr:
        raise ValueError("Mismatch between file's true sampling rate ({}) and "
                         "target sampling rate ({})!".format(file_sr, target_sr))
    return librosa.load(file_path, mono=True, sr=target_sr)

+ 19 - 0
src/compute_metrics/__init__.py

@@ -0,0 +1,19 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: __init__.py (as part of project URUMETRICS)
+#   Created: 01/06/2022 16:35
+#   Last Modified: 01/06/2022 16:35
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------

+ 178 - 0
src/compute_metrics/metrics.py

@@ -0,0 +1,178 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+import logging
+import os
+# -----------------------------------------------------------------------------
+#   File: metrics.py (as part of project URUMETRICS)
+#   Created: 28/07/2022 13:58
+#   Last Modified: 28/07/2022 13:58
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description:
+#       •
+# -----------------------------------------------------------------------------
+import sys
+from typing import List, Union
+
+import numpy as np
+import pandas as pd
+from ChildProject.annotations import AnnotationManager
+from ChildProject.pipelines.metrics import Metrics
+from ChildProject.projects import ChildProject
+
+from .metrics_functions import avg_pr_pm_speaker, avg_sr_pm_speaker, avg_wr_pm_speaker, \
+    mean_mean_pitch_speaker, mean_pitch_range_speaker, mlup_speaker, mlus_speaker, mluw_speaker
+
+logger = logging.getLogger(__name__)
+
class UruguayMetrics(Metrics):
    """
    ChildProject `Metrics` pipeline tailored to the Uruguayan Chatbot project.

    Builds the list of metrics to extract from the VTC, ALICE, VCM and acoustic
    annotation sets, (re)creating a merged `alice_vtc` set on the fly so that
    ALICE phoneme/syllable/word counts can be attributed to VTC speaker types.
    Sets missing from the index are skipped with a warning.
    """

    SUBCOMMAND = "uruguay"

    def __init__(
            self,
            project: ChildProject,
            vtc: str = "vtc",
            alice: str = "alice",
            vcm: str = "vcm",
            acoustic: str = "acoustic",
            recordings: Union[str, List[str], pd.DataFrame] = None,
            from_time: str = None,
            to_time: str = None,
            rec_cols: str = None,
            child_cols: str = None,
            period: str = None,
            by: str = "recording_filename",
            threads: int = 1,
    ):

        self.vtc = vtc
        self.alice = alice
        self.vcm = vcm
        self.acoustic = acoustic
        self.alice_vtc = 'alice_vtc'

        # (Re)create the merged alice_vtc set: drop any stale copy, then merge
        # ALICE unit counts onto the VTC speaker segments.
        am = AnnotationManager(project)
        am.read()
        if self.alice_vtc in set(am.annotations['set']):
            am.remove_set(self.alice_vtc)

        am.merge_sets(
            left_set="vtc",
            right_set="alice",
            left_columns=["speaker_type"],
            right_columns=["phonemes", "syllables", "words"],
            output_set=self.alice_vtc,
        )

        # Metrics always available from the raw VTC set.
        METRICS = np.array(
            [["voc_speaker_ph", self.vtc, 'FEM'],
             ["voc_speaker_ph", self.vtc, 'CHI'],
             ["voc_dur_speaker_ph", self.vtc, 'FEM'],
             ["voc_dur_speaker_ph", self.vtc, 'CHI'],
             ["avg_voc_dur_speaker", self.vtc, 'FEM'],
             ["avg_voc_dur_speaker", self.vtc, 'CHI'],
             ])

        if self.alice not in am.annotations["set"].values:
            # Use the module logger (stderr) rather than print(): the README
            # documents that diagnostics go to STDERR.
            logger.warning(f"The ALICE set ('{self.alice}') was not found in the index.")
        else:
            # ALICE-only metrics are currently disabled; the ALICE counts are
            # consumed through the merged alice_vtc set below instead.
            pass

        if self.vcm not in am.annotations["set"].values:
            logger.warning(f"The vcm set ('{self.vcm}') was not found in the index.")
        else:
            METRICS = np.concatenate((METRICS, np.array(
                [["cry_voc_speaker_ph", self.vcm, 'CHI'],
                 ["cry_voc_dur_speaker_ph", self.vcm, 'CHI'],
                 ["avg_cry_voc_dur_speaker", self.vcm, 'CHI'],
                 ["can_voc_speaker_ph", self.vcm, 'CHI'],
                 ["can_voc_dur_speaker_ph", self.vcm, 'CHI'],
                 ["avg_can_voc_dur_speaker", self.vcm, 'CHI'],
                 ["non_can_voc_speaker_ph", self.vcm, 'CHI'],
                 ["non_can_voc_dur_speaker_ph", self.vcm, 'CHI'],
                 ["avg_non_can_voc_dur_speaker", self.vcm, 'CHI'],
                 ["lp_n", self.vcm, pd.NA],
                 ["lp_dur", self.vcm, pd.NA],
                 ["cp_n", self.vcm, pd.NA],
                 ["cp_dur", self.vcm, pd.NA],
                 ])))

        if self.alice_vtc not in am.annotations['set'].values:
            logger.warning(f"The alice_vtc set ('{self.alice_vtc}') was not found in the index.")
        else:
            METRICS = np.concatenate((METRICS, np.array(
                [
                    [avg_pr_pm_speaker, self.alice_vtc, 'FEM'],
                    [avg_sr_pm_speaker, self.alice_vtc, 'FEM'],
                    [avg_wr_pm_speaker, self.alice_vtc, 'FEM'],
                    ["wc_speaker_ph", self.alice_vtc, 'FEM'],
                    ["sc_speaker_ph", self.alice_vtc, 'FEM'],
                    ["pc_speaker_ph", self.alice_vtc, 'FEM'],
                    [mluw_speaker, self.alice_vtc, 'FEM'],
                    [mlus_speaker, self.alice_vtc, 'FEM'],
                    [mlup_speaker, self.alice_vtc, 'FEM'],
                 ])))

        if self.acoustic not in am.annotations['set'].values:
            # Bug fix: the original message said "The alice_vtc set" here,
            # a copy-paste error — this branch concerns the acoustic set.
            logger.warning(f"The acoustic set ('{self.acoustic}') was not found in the index.")
        else:
            METRICS = np.concatenate((METRICS, np.array(
                [
                    [mean_mean_pitch_speaker, self.acoustic, 'FEM'],
                    [mean_mean_pitch_speaker, self.acoustic, 'CHI'],
                    [mean_pitch_range_speaker, self.acoustic, 'FEM'],
                    [mean_pitch_range_speaker, self.acoustic, 'CHI'],
                 ])))

        METRICS = pd.DataFrame(METRICS, columns=["callable", "set", "speaker"])

        super().__init__(project, METRICS, by=by, recordings=recordings,
                         period=period, from_time=from_time, to_time=to_time,
                         rec_cols=rec_cols, child_cols=child_cols, threads=threads)
+
def main():
    """
    Compute metrics for newly imported recordings and append them to
    `dat/data_set/extra/metrics.csv` (created on first run).
    """
    project_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..', 'dat', 'data_set'))
    metrics_file = os.path.join(os.path.dirname(__file__), '../..', 'dat', 'data_set', 'extra', 'metrics.csv')

    project = ChildProject(project_path)
    project.read()

    all_recordings = set(project.recordings['recording_filename'])
    if os.path.exists(metrics_file):
        existing_metrics = pd.read_csv(metrics_file)
        # Only recordings not already present in metrics.csv are processed.
        new_recordings = all_recordings - set(existing_metrics['recording_filename'])
    else:
        existing_metrics = None
        new_recordings = all_recordings

    if not new_recordings:
        return

    metrics = UruguayMetrics(project=project, recordings=list(new_recordings)).extract()

    # Append the new rows after any previously computed metrics.
    if isinstance(existing_metrics, pd.DataFrame):
        metrics = pd.concat([existing_metrics, metrics])

    metrics.to_csv(metrics_file, index=False)
+
+
if __name__ == '__main__':
    try:
        main()
        sys.exit(0)
    except Exception as e:
        # Bug fix: logger.error(str(e)) discarded the traceback, making pipeline
        # failures hard to diagnose; logger.exception logs the message AND the stack.
        logger.exception(str(e))
        sys.exit(1)

+ 86 - 0
src/compute_metrics/metrics_functions.py

@@ -0,0 +1,86 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: metrics_functions.py (as part of project URUMETRICS)
+#   Created: 03/06/2022 17:13
+#   Last Modified: 03/06/2022 17:13
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description:
+#       •
+# -----------------------------------------------------------------------------
+
+import pandas as pd
+
+pd.options.display.max_rows = 500
+pd.options.display.min_rows = 500
+
+from ChildProject.pipelines.metricsFunctions import metricFunction
+
@metricFunction({"speaker"}, {"speaker_type", "phonemes", "duration"})
def avg_pr_pm_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
    """
    Average phoneme rate (pr) per minute by speaker
    """
    speaker_rows = annotations[annotations["speaker_type"] == kwargs["speaker"]]
    # Segment durations are stored in milliseconds; convert to minutes.
    minutes = speaker_rows["duration"] / 1000 / 60
    return (speaker_rows["phonemes"] / minutes).mean()
+
+
@metricFunction({"speaker"}, {"speaker_type", "words", "duration"})
def avg_wr_pm_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
    """
    Average word rate (wr) per minute by speaker
    """
    # Segment 'duration' is in milliseconds; / 1000 / 60 converts it to minutes.
    unit_count = annotations[annotations["speaker_type"] == kwargs["speaker"]]["words"]
    segment_duration = annotations[annotations["speaker_type"] == kwargs["speaker"]]["duration"] / 1000 / 60
    return (unit_count/segment_duration).mean()
+
+
@metricFunction({"speaker"}, {"speaker_type", "syllables", "duration"})
def avg_sr_pm_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
    """
    Average syllable rate (sr) per minute by speaker
    """
    # Segment 'duration' is in milliseconds; / 1000 / 60 converts it to minutes.
    unit_count = annotations[annotations["speaker_type"] == kwargs["speaker"]]["syllables"]
    segment_duration = annotations[annotations["speaker_type"] == kwargs["speaker"]]["duration"] / 1000 / 60
    return (unit_count/segment_duration).mean()
+
+
@metricFunction({"speaker"}, {"speaker_type", "mean_pitch_semitone"})
def mean_mean_pitch_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
    """
    Mean of the per-segment mean pitch (in semitones) for a given speaker
    """
    return (annotations[annotations["speaker_type"] == kwargs["speaker"]]["mean_pitch_semitone"]).mean()
+
+
@metricFunction({"speaker"}, {"speaker_type", "pitch_range_semitone"})
def mean_pitch_range_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
    """
    Mean of the per-segment pitch range (in semitones) for a given speaker
    """
    return (annotations[annotations["speaker_type"] == kwargs["speaker"]]["pitch_range_semitone"]).mean()
+
+
@metricFunction({"speaker"}, {"speaker_type", "words"})
def mluw_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
    """Mean length of utterance in words for a given speaker
    """
    is_speaker = annotations["speaker_type"] == kwargs["speaker"]
    return annotations.loc[is_speaker, "words"].mean()
+
+
@metricFunction({"speaker"}, {"speaker_type", "syllables"})
def mlus_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
    """Mean length of utterance in syllables for a given speaker
    """
    is_speaker = annotations["speaker_type"] == kwargs["speaker"]
    return annotations.loc[is_speaker, "syllables"].mean()
+
+
@metricFunction({"speaker"}, {"speaker_type", "phonemes"})
def mlup_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
    """Mean length of utterance in phonemes for a given speaker
    """
    is_speaker = annotations["speaker_type"] == kwargs["speaker"]
    return annotations.loc[is_speaker, "phonemes"].mean()

+ 19 - 0
src/generate_messages/__init__.py

@@ -0,0 +1,19 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: __init__.py (as part of project URUMETRICS)
+#   Created: 29/07/2022 15:35
+#   Last Modified: 29/07/2022 15:35
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------

+ 175 - 0
src/generate_messages/messages.py

@@ -0,0 +1,175 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: messages.py (as part of project URUMETRICS)
+#   Created: 29/07/2022 15:35
+#   Last Modified: 29/07/2022 15:35
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------
+import logging
+import os
+import sys
+from datetime import datetime
+from pprint import pprint
+
+import pandas as pd
+import yaml
+from ChildProject.annotations import AnnotationManager
+from ChildProject.projects import ChildProject
+
+logger = logging.getLogger(__name__)
+
def _read_yaml(yaml_path):
    """Load and return the content of a YAML file.

    :param yaml_path: path to the YAML file to read
    :type yaml_path: str
    :return: parsed YAML content (typically a dict for the messages file)
    :rtype: object
    """
    with open(yaml_path, 'r') as in_yaml:
        # safe_load only builds standard Python types; unlike FullLoader it
        # cannot instantiate arbitrary Python objects from the YAML stream.
        data = yaml.safe_load(in_yaml)
    return data
+
+
def get_metrics(project_path, metrics_file):
    """Load the metrics file and join it with the project's recording metadata.

    :param project_path: path to the ChildProject project directory
    :type project_path: str
    :param metrics_file: path to the metrics CSV file (one row per recording)
    :type metrics_file: str
    :return: metrics merged with recording metadata, and the list of metric column names
    :rtype: tuple of (pandas.DataFrame, list of str)
    """
    project = ChildProject(project_path)
    am = AnnotationManager(project)
    am.read()

    # Read metrics file and get metrics columns
    metrics = pd.read_csv(metrics_file)
    metrics_columns = list(set(metrics.columns) - set(['recording_filename', 'child_id']))

    # Merge with recordings to get date_iso
    metrics_recordings = pd.merge(metrics, project.recordings, on='recording_filename', suffixes=('', '_drop'))
    # Discard the duplicated columns brought in by the merge (suffixed with '_drop').
    metrics_recordings.drop([col for col in metrics_recordings.columns if 'drop' in col], axis=1, inplace=True)

    # Handle file with the same child_id that have the same date -> keep the longest one
    metrics_recordings = (metrics_recordings.groupby(['child_id', 'date_iso'], as_index=False)
                          # Keep only the longest recording per (child, day)
                          .apply(lambda rows: (rows.sort_values(by='duration', ascending=False) # NOTE(review): original had 'take last instead' — confirm policy
                                               .head(n=1))))

    return metrics_recordings, metrics_columns
+
+
def fill_template(template_key, messages, metrics_evolution):
    """Instantiate a message template with metric-specific wording.

    The template is chosen by the metrics' directions of change (e.g. key
    ``_True_False``). Positional placeholders ``#1``, ``#2``, ... are replaced
    by the up/down wording of each metric; named placeholders ``#foo`` are
    replaced by the ``_foo`` entries of the messages mapping.

    :param template_key: direction of change (bool) of each metric, in order
    :param messages: messages mapping loaded from the YAML file
    :param metrics_evolution: list of (metric name, delta, went_up) tuples
    :return: the filled-in message
    :rtype: str
    """
    filled = messages['_{}_{}'.format(*template_key)]

    # Positional placeholders: #1, #2, ... in template order.
    position = 1
    for metric_name, _, went_up in metrics_evolution:
        filled = filled.replace('#{}'.format(position), messages[metric_name][went_up])
        position += 1

    # Named placeholders: every '_foo' entry provides the text for '#foo'.
    for variable_key in [key for key in messages if key.startswith('_')]:
        variable_name = variable_key[1:]
        filled = filled.replace('#{}'.format(variable_name),
                                messages['_{}'.format(variable_name)])
    return filled
+
+
def build_messages(metrics_recordings, metrics_columns, message_file_path, date):
    """Build one feedback message per family whose latest recording is on `date`.

    :param metrics_recordings: metrics merged with recording metadata (see get_metrics)
    :type metrics_recordings: pandas.DataFrame
    :param metrics_columns: names of the metric columns
    :type metrics_columns: list of str
    :param message_file_path: path to the YAML file with message templates
    :type message_file_path: str
    :param date: target day, formatted as YYYYMMDD
    :type date: str
    :return: dataframe with columns 'recording_filename' and 'message'
    :rtype: pandas.DataFrame
    """
    # Normalise to the ISO format used by the 'date_iso' column.
    date = datetime.strptime(date, "%Y%m%d").strftime("%Y-%m-%d")

    # Get metrics of interest and messages
    metric_messages = _read_yaml(message_file_path)
    # Keys starting with '_' are templates/variables, not metrics.
    metrics_of_interest = [item for item in list(metric_messages.keys()) if not item.startswith('_')]

    # Keep only rows for which the date is below or equal to the one we want
    metrics_recordings = metrics_recordings[metrics_recordings['date_iso'] <= date]

    # Generate messages
    output_messages = []
    metrics_grouped = metrics_recordings.groupby('child_id', as_index=False)
    for _, metrics_grouped_item in metrics_grouped:
        # Most recent recording first.
        sorted_metrics_grouped_items = metrics_grouped_item.sort_values(by=['date_iso', 'imported_at'],
                                                                        ascending=False)

        # If the first row is not the desired date, skip as no message was/will be generated for this family as
        # this recording is too old
        if sorted_metrics_grouped_items.iloc[0]['date_iso'] != date:
            continue

        # Only one audio (first week), generated default message
        if len(metrics_grouped_item) == 1:
            recording_filename = metrics_grouped_item.iloc[0]['recording_filename']
            message = metric_messages['_default']
        # More than one audio file: generate a message
        else:
            todays_row = sorted_metrics_grouped_items.iloc[0]
            previous_row = sorted_metrics_grouped_items.iloc[1]

            # Compute the difference between the two sets of metrics
            diff_metrics = (todays_row[metrics_columns] - previous_row[metrics_columns])[metrics_of_interest]
            diff_metrics = diff_metrics.to_dict()

            # (metric name, delta, improved?) for each metric of interest.
            metrics_evolution = [(metric, diff_metrics[metric], diff_metrics[metric] > 0) for metric in metrics_of_interest]

            # Message sorting: smallest absolute change first, declines before improvements.
            metrics_evolution = sorted(metrics_evolution, key=lambda tup: (abs(tup[1]), tup[2]))
            template_key = [item[2] for item in metrics_evolution]

            recording_filename =  metrics_grouped_item.iloc[0]['recording_filename']
            message = fill_template(template_key, metric_messages, metrics_evolution)

        output_messages.append({'recording_filename': recording_filename,
                               'message': message})

    df_out = pd.DataFrame(output_messages)
    return df_out
+
+
def generate_messages(date):
    """Generate the feedback-message CSV for all recordings of a given day.

    :param date: day for which to generate messages, formatted as YYYYMMDD
    :type date: str
    :return: None
    :rtype: None
    :raises IOError: if an input file is missing or the output file already exists
    """
    project_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..', 'dat', 'data_set'))
    metrics_file_path = os.path.join(os.path.dirname(__file__), '../..', 'dat', 'data_set', 'extra', 'metrics.csv')
    message_file_path = os.path.join(os.path.dirname(__file__), '../..', 'dat', 'utility', 'metrics_messages.yaml')
    message_out_path = os.path.join(os.path.dirname(__file__), '../..', 'dat', 'out', 'messages_{}.csv'.format(date))

    # Make sure we have all the files we need. Raise explicitly instead of
    # using `assert`, which is silently stripped when Python runs with -O.
    if not os.path.exists(metrics_file_path):
        raise IOError('Metrics file ({}) not found!'.format(metrics_file_path))
    if not os.path.exists(message_file_path):
        raise IOError('Message file ({}) not found!'.format(message_file_path))

    # Never overwrite messages that were already generated for this date.
    if os.path.exists(message_out_path):
        raise IOError('File {} already exists!'.format(message_out_path))

    metrics_recordings, metrics_columns = get_metrics(project_path, metrics_file_path)
    messages = build_messages(metrics_recordings, metrics_columns, message_file_path, date)

    if len(messages):
        messages.to_csv(message_out_path, index=False)
    else:
        logger.warning('No message needs to be generated for date {}.'.format(date))
+
+
def main(**kwargs):
    """Entry point: forward keyword arguments to generate_messages."""
    generate_messages(**kwargs)
+
+
+def _parse_args(argv):
+    import argparse
+
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('--date', type=str, default=datetime.now().strftime("%Y%m%d"),
+                        help='Date for which to generate messages.')
+    args = parser.parse_args(argv)
+
+    return vars(args)
+
+
if __name__ == '__main__':
    import sys
    # Drop the program name; keep only the actual arguments.
    pgrm_name, argv = sys.argv[0], sys.argv[1:]
    args = _parse_args(argv)

    try:
        generate_messages(**args)
        sys.exit(0)
    except Exception as e:
        # Top-level boundary: log the failure and signal it via a non-zero exit code.
        logger.error(e)
        sys.exit(1)

+ 19 - 0
src/import_data/__init__.py

@@ -0,0 +1,19 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: __init__.py (as part of project URUMETRICS)
+#   Created: 01/06/2022 16:35
+#   Last Modified: 01/06/2022 16:35
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------

+ 51 - 0
src/import_data/acoustic_converter.py

@@ -0,0 +1,51 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: acoustic_converter.py (as part of project URUMETRICS)
+#   Created: 28/07/2022 11:10
+#   Last Modified: 28/07/2022 11:10
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------
+
+import pandas as pd
+from ChildProject.annotations import AnnotationConverter
+
class AcousticConverter(AnnotationConverter):
    """Converter for 'acoustic' CSV annotation files (one row per segment)."""

    FORMAT = "acoustic"

    @staticmethod
    def convert(filename: str, source_file: str = "", **kwargs) -> pd.DataFrame:
        """Read an acoustic-annotation CSV and return its segments.

        :param filename: path to the CSV file containing the annotations
        :param source_file: if given, only keep rows whose 'file' column matches it
        :return: the annotation segments (the 'file' column is dropped)
        """
        df = pd.read_csv(
            filename,
            sep=r",",
            header=0,
            engine="python",
        )

        n_recordings = len(df["file"].unique())
        if n_recordings > 1 and not source_file:
            print(
                # Bug fix: the f-string had lost its {filename} placeholder, so
                # the warning never named the offending file.
                f"""WARNING: {filename} contains annotations from {n_recordings} different audio files, """
                """but no filter was specified which means all of these annotations will be imported.\n"""
                """as if they belonged to the same recording. Please make sure this is the intended behavior """
                """(it probably isn't)."""
            )

        if source_file:
            df = df[df["file"].str.contains(source_file)]

        df.drop(columns=["file"], inplace=True)

        return df

+ 34 - 0
src/import_data/consts.py

@@ -0,0 +1,34 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: consts.py (as part of project URUMETRICS)
+#   Created: 20/05/2022 16:16
+#   Last Modified: 20/05/2022 16:16
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------
+
+from enum import Enum
+
+CHILDREN_DEFAULT_DOB = '1000-01-01'
+
+
class ANNOTATION_TYPES(Enum):
    """Supported annotation sets, mapped to (ChildProject format, aliases/extension)."""
    VTC = ('vtc_rttm', 'vtc', 'rttm',)
    VCM = ('vcm_rttm', 'vcm', 'rttm',)
    ALICE = ('alice', 'txt',)
    ACOUSTIC = ('acoustic', 'csv',)

    @classmethod
    def asdict(cls):
        """Return a mapping from member name to its value tuple."""
        return {member.name: member.value for member in cls}

+ 229 - 0
src/import_data/import_annotations.py

@@ -0,0 +1,229 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: import_annotations.py (as part of project URUMETRICS)
+#   Created: 23/05/2022 11:28
+#   Last Modified: 23/05/2022 11:28
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------
+
+# !usr/bin/env python
+# -*- coding: utf8 -*-
+
+#
+# Author: William N. Havard (base on various files by Lucas Gautheron)
+#
+
+import logging
+import os
+import sys
+
+import pandas as pd
+from ChildProject.annotations import AnnotationManager
+from ChildProject.projects import ChildProject
+
+from .consts import ANNOTATION_TYPES
+from .utils import get_raw_filename
+from .acoustic_converter import AcousticConverter # /!\ Do not remove
+
+logger = logging.getLogger(__name__)
+pd.set_option('mode.chained_assignment', None)  # Silences pandas' complaints
+
+
+def _filter_missing_annotation_files(path_project, input):
+    """
+    Checks whether some annotation files are missing or not and returns only the rows for which the annotation
+    file could be found.
+    :param path_project: path to ChildProject project directory
+    :type path_project: str
+    :param input: dataframe containing the list of recordings and their corresponding annotation file
+    :type input: pandas.DataFrame
+    :return: dataframe containing only row for which an annotation file was found
+    :rtype: pandas.DataFrame
+    """
+    annotation_path = lambda row: os.path.join(path_project, 'annotations', row['set'], 'raw', row['raw_filename'])
+    input['exists'] = input.apply(lambda row: os.path.exists(annotation_path(row)), axis=1)
+
+    missing_annotations = input[input['exists'] == False]
+    existing_annotations = input[input['exists'] == True]
+
+    if len(missing_annotations):
+        missing_annotations['expected_path'] = missing_annotations.apply(lambda row: annotation_path(row), axis=1)
+        missing_annotations_path = sorted(missing_annotations['expected_path'].tolist())
+        missing_message = "Some annotations you expected to have are missing.\n" \
+                          "Check whether these annotations exist and if so, if their expected path " \
+                          "reflect their true path.\n\t - {}".format('\n\t - '.join(missing_annotations_path))
+        logger.warning(missing_message)
+    return existing_annotations
+
+
+def _check_importation(am, imported, expected_ann_number):
+    """
+    Checks whether the importation was carried out successfully or not. Returns the number of imported annotations files
+    imported so far and the number of segments they contain.
+    :param am: ChildProject annotation manager object
+    :type am: ChildProject.AnnotationManager
+    :param imported: DataFrame object containing the annotation files that were successfully imported
+    :type imported: pandas.DataFrame
+    :param expected_ann_number: containing the expected number of annotation files that should have been imported
+    :type expected_ann_number: int
+    :return: number of imported annotations, number of imported segments
+    :rtype: tuple of int
+    """
+    if len(imported) != expected_ann_number:
+        logger.warning('Expected to import {} annotations, only found {}!'.format(len(imported), expected_ann_number))
+
+    annotations_segments = am.get_segments(imported)
+    if len(annotations_segments) == 0:
+        logger.warning('Annotations were imported, but they either contain no segments '
+                       'or the segments were not imported properly!')
+
+    return len(imported), len(annotations_segments)
+
+
+def _get_recordings(project, annotation_set, annotation_format):
+    """
+    Returns a DataFrame of recordings already imported for the project `project` and prepare the data frame indicating
+    the format and the set of annotations that will be imported.
+    :param project: path to ChildProject project directory
+    :type project: str
+    :param annotation_set: set of annotation to import
+    :type annotation_set: str
+    :param annotation_format: format of the annotations that will be imported
+    :type annotation_format: str
+    :return: dataframe containing the recordings found for the project
+    :rtype: pandas.DataFrame
+    """
+    input = project.recordings[['recording_filename', 'duration', 'child_id']]
+    input.dropna(inplace=True)
+    input = input[input['recording_filename'] != 'NA']
+    input['set'] = annotation_set
+    input['format'] = annotation_format
+    input['time_seek'] = 0
+    input['range_onset'] = 0
+    input['range_offset'] = input['duration']
+    input.drop(['duration'], axis=1, inplace=True)
+
+    return input
+
+
def _build_raw_filename(input, annotation_format, filename='', extension=''):
    """
    Build the expected annotation filename path containing the annotation corresponding to a given recording
    :param input: DataFrame containing a list of recordings
    :type input: pandas.DataFrame
    :param annotation_format: format of the annotation that will be imported
    :type annotation_format: str
    :param filename: filename of the annotation file where to look for the annotations for a specific recording
    :type filename: str
    :param extension: file extension of the annotation file
    :type extension: str
    :return: dataframe containing the expected name the annotation file should have
    :rtype: pandas.DataFrame
    """
    # Set up 'raw_filename' and 'filter' depending on the annotation set to import
    # NOTE: str.removesuffix requires Python 3.9+.
    annotation_format = annotation_format.removesuffix('_rttm')
    annotation_format_extension = ANNOTATION_TYPES.asdict()[annotation_format.upper()][0] \
        if not extension else extension
    if annotation_format in ['vtc', 'vcm', 'alice', 'acoustic']:
        # Annotations have the same name as the recording filename. Update if it's not the case for you.
        # NOTE(review): the conditional applies to the whole expression — when `filename`
        # is given, the entire column is set to that single (scalar) filename; confirm intended.
        input['raw_filename'] = input['recording_filename'].apply(
            lambda f: '{}.{}'.format(get_raw_filename(f), annotation_format_extension)) if not filename else filename
        # We only keep lines for which the 'file' column is equal to filter
        input['filter'] = input['recording_filename'].apply(lambda f: os.path.basename(get_raw_filename(f)))
    elif annotation_format in ['cha', 'its', 'eaf']:
        # CHA/ITS/EAF files do not need filtering as they only contain annotations for the file they are linked to
        input['raw_filename'] = input['recording_filename'].apply(
            lambda f: '{}.{}'.format(get_raw_filename(f), annotation_format_extension))
    else:
        raise ValueError('Unknown annotation format `{}`!'.format(annotation_format))

    return input
+
+
def import_annotations(data_path):
    """
    Imports all the new annotations files that are found in the `annotations` directory
    (one level deep: annotations/<type>/raw).
    :param data_path: Path to a ChildProject project directory
    :type data_path: str
    :return: None
    :rtype: None
    """

    annotation_path = os.path.join(data_path, 'annotations')
    annotation_metadata_path = os.path.join(data_path, 'metadata', 'annotations.csv')

    # Load project
    project = ChildProject(data_path)
    am = AnnotationManager(project)
    am.read()

    for annotation_type in os.listdir(annotation_path):
        annotation_raw_files = os.path.join(annotation_path, annotation_type, 'raw')
        # Skip directories that do not correspond to a known annotation type.
        if annotation_type.upper() not in ANNOTATION_TYPES.asdict().keys(): continue

        for annotation_raw_file in os.listdir(annotation_raw_files):
            # NOTE(review): splitext keeps the leading dot in `extension`; downstream
            # _build_raw_filename inserts another dot — moot here since `annotation_raw_file`
            # overrides the built name, but confirm if the extension path is ever used.
            _, extension = os.path.splitext(annotation_raw_file)

            annotation_set = annotation_type.lower()
            annotation_format = ANNOTATION_TYPES.asdict()[annotation_set.upper()][0]

            # Get recordings and set up df
            input = _get_recordings(project, annotation_set, annotation_format)

            # Build raw file names
            input = _build_raw_filename(input, annotation_format, annotation_raw_file, extension)

            # Filter out rows for which we do not find the matching annotation file
            input = _filter_missing_annotation_files(data_path, input)

            # We make sure we remove annotation files from the DataFrame if they were already previously imported
            if os.path.exists(annotation_metadata_path):
                already_imported_metadata = pd.read_csv(annotation_metadata_path)
                already_imported_metadata = already_imported_metadata[['recording_filename', 'set', 'raw_filename']]

                # Drop annotation files that were already imported (anti-join via merge indicator)
                input = (input.merge(already_imported_metadata, how='left', indicator=True,
                                     on=['recording_filename', 'set', 'raw_filename'])
                         .loc[lambda x: x['_merge'] == 'left_only']
                         .drop(columns="_merge"))

            # Do importation
            if len(input) > 0:
                imported = am.import_annotations(input)

                expected_ann_number = len(input)
                len_ann, len_seg = _check_importation(am, imported, expected_ann_number)
                logger.info('Imported {} new annotation files resulting '
                            'in {} new {} segments!'.format(len_ann, len_seg, annotation_type))

            else:
                logger.warning('Nothing to import for annotation type {}!'.format(annotation_format))
+
+
def main():
    """Entry point: import annotations for the project stored under dat/data_set."""
    data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..', 'dat', 'data_set'))
    import_annotations(data_path)
+
+
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    try:
        main()
        sys.exit(0)
    except Exception as e:
        # Top-level boundary: report the failure and signal it via a non-zero exit code.
        print(e)
        sys.exit(1)

+ 206 - 0
src/import_data/import_recordings.py

@@ -0,0 +1,206 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: import_recordings.py (as part of project URUMETRICS)
+#   Created: 20/05/2022 16:25
+#   Last Modified: 20/05/2022 16:25
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • 
+# -----------------------------------------------------------------------------
+
+import csv
+import logging
+import os
+import sys
+from datetime import datetime
+
+import pandas as pd
+from ChildProject.utils import get_audio_duration
+
+from .consts import CHILDREN_DEFAULT_DOB
+from .utils import walk_dir
+
+logger = logging.getLogger(__name__)
+
+
+def _get_recordings(recordings_path):
+    """
+    Returns a DataFrame of all the recordings already imported or a empty DataFrame if `recordings.csv` does not
+    exist
+    :param recordings_path: Path to the `recordings.csv` metadata file
+    :type recordings_path: str
+    :return: dataframe of already imported recordings or empty dataframe
+    :rtype: pandas.DataFrame
+    """
+    try:
+        data = pd.read_csv(recordings_path)
+    except:
+        columns = ['experiment', 'experiment_stage', 'child_id', 'date_iso', 'start_time',
+                   'recording_device_type', 'recording_filename', 'session_id']
+        data = pd.DataFrame(columns=columns)
+    return data
+
+
+def _get_children(children_path):
+    """
+    Returns a DataFrame of all the children already imported or a empty DataFrame if `children.csv` does not
+    exist
+    :param recordings_path: Path to the `children.csv` metadata file
+    :type children_path: str
+    :return: dataframe of already imported children or empty dataframe
+    :rtype: pandas.DataFrame
+    """
+    try:
+        data = pd.read_csv(children_path)
+    except:
+        columns = ['experiment', 'child_id', 'child_dob']
+        data = pd.DataFrame(columns=columns)
+    return data
+
+
def _build_recording_metadata(recordings_path, recording, experiment, recording_device_type):
    """
    Return the metadata corresponding to a given file (date, time, duration, etc.)
    :param recordings_path: path to the directory storing the WAV files
    :type recordings_path: str
    :param recording: name of the WAV file
    :type recording: str
    :param experiment: name of the experiment the recording belongs to
    :type experiment: str
    :param recording_device_type: type of recording device used
    :type recording_device_type: str
    :return: metadata for the given file (or False when the filename cannot be parsed)
    :rtype: dict or bool
    """
    raw_filename, _ = os.path.splitext(recording)
    try:
        # Expected filename pattern: <child_id>_<experiment_stage…>_<YYYYMMDD>_<HHMMSS>.wav
        child_id, *experiment_stage, date_iso_, start_time_ = raw_filename.split('_')
        date_iso = datetime.strptime(date_iso_, '%Y%m%d').strftime('%Y-%m-%d')
        start_time = datetime.strptime(start_time_, '%H%M%S').strftime('%H:%M:%S')
        session_id = '{}_{}'.format(child_id, date_iso_)
        # Duration is stored in milliseconds.
        duration = int(get_audio_duration(os.path.join(recordings_path, recording)) * 1000)

        return {'experiment': experiment,
                'experiment_stage': '_'.join(experiment_stage),
                'child_id': child_id,
                'date_iso': date_iso,
                'start_time': start_time,
                'recording_device_type': recording_device_type,
                'recording_filename': recording,
                'session_id': session_id,
                'duration': duration,
                'imported_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                }
    except Exception as e:
        # Best-effort import: a malformed filename or unreadable audio file is
        # logged and skipped rather than aborting the whole importation.
        logger.error(f'{raw_filename} raised an error. This file will be discarded. (Exception: {str(e)})')
        return False
+
+
def import_recordings(data_path, experiment, recording_device_type):
    """
    This function creates or update the metadata file `recordings.csv`
    :param data_path: Path to `dat/data_set` directory:
    :type data_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type  recording_device_type: str
    :return: None
    :rtype: None
    """
    recordings_metadata_path = os.path.join(data_path, 'metadata', 'recordings.csv')
    recordings = _get_recordings(recordings_metadata_path)
    recordings_count = len(recordings)

    recordings_path = os.path.join(data_path, 'recordings', 'raw')
    recording_file_list = walk_dir(recordings_path, ext='wav', return_full_path=False)

    for recording_file in recording_file_list:
        # Skip recordings that were already imported on a previous run.
        if recording_file in recordings['recording_filename'].values: continue

        recording_metadata = _build_recording_metadata(recordings_path, recording_file,
                                                       experiment, recording_device_type)
        # Add new recordings only
        if not recording_metadata:
            # Metadata could not be built (unparsable filename): file is skipped.
            continue
        else:
            recordings = pd.concat([recordings, pd.DataFrame.from_dict([recording_metadata])], ignore_index=True)

    recordings.to_csv(recordings_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
    logger.info('{} new recordings imported ({} recordings altogether).'.format(len(recordings) - recordings_count,
                                                                                len(recordings)))
+
+
def import_children(data_path, experiment):
    """
    This function creates or update the metadata file `children.csv`
    :param data_path: Path to `dat/data_set` directory
    :type data_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :return: None
    :rtype: None
    """
    recordings_metadata_path = os.path.join(data_path, 'metadata', 'recordings.csv')
    children_metadata_path = os.path.join(data_path, 'metadata', 'children.csv')

    recordings = _get_recordings(recordings_metadata_path)
    children = _get_children(children_metadata_path)
    n_children_before = len(children)

    # Children referenced by recordings but absent from children.csv.
    unknown_children = set(recordings['child_id']) - set(children['child_id'])

    for child_id in unknown_children:
        new_row = {'experiment': experiment,
                   'child_id': child_id,
                   # Real date of birth is unknown at import time.
                   'child_dob': CHILDREN_DEFAULT_DOB}
        children = pd.concat([children, pd.DataFrame.from_dict([new_row])], ignore_index=True)

    children.to_csv(children_metadata_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
    logger.info('{} new children imported ({} children altogether).'.format(len(children) - n_children_before,
                                                                            len(children)))
+
+
def data_importation(data_path, experiment, recording_device_type):
    """
    This function imports new recordings and updates `recordings.csv`, then updates `children.csv` if necessary.
    :param data_path: Path to `dat/data_set` directory
    :type data_path: str
    :param experiment: name of the experiment
    :type experiment: str
    :param recording_device_type: type of device used to record the data
    :type recording_device_type: str
    :return: None
    :rtype: None
    """
    # Order matters: children are derived from the freshly updated recordings metadata.
    import_recordings(data_path, experiment, recording_device_type)
    import_children(data_path, experiment)
+
+
def main(experiment='URU22', recording_device_type='unknown'):
    """Entry point: import recordings/children for the project stored under dat/data_set."""
    data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..', 'dat', 'data_set'))
    data_importation(data_path, experiment, recording_device_type)
+
+
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    try:
        main()
        sys.exit(0)
    except Exception as e:
        # Top-level boundary: report the failure and signal it via a non-zero exit code.
        print(e)
        sys.exit(1)

+ 73 - 0
src/import_data/prepare_data_set.py

@@ -0,0 +1,73 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: prepare_data_set.py (as part of project URUMETRICS)
+#   Created: 20/05/2022 15:48
+#   Last Modified: 20/05/2022 15:48
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • This script creates all the necessary directories to drop and
+#         process the annotation files.
+# -----------------------------------------------------------------------------
+import logging
+import os
+import sys
+
+from .consts import ANNOTATION_TYPES
+
+logger = logging.getLogger(__name__)
+
def create_child_project_directories(data_path):
    """
    Create all the directories required by ChildProject, plus one raw-annotation
    drop directory per annotation type. Warns (without failing) about
    directories that already exist.
    :param data_path: path where the directories should be created
    :type data_path: str
    :return: None
    :rtype: None
    """
    child_project_dirs = [os.path.join(data_path, 'out'),
                          os.path.join(data_path, 'data_set', 'extra'),
                          os.path.join(data_path, 'data_set', 'metadata'),
                          os.path.join(data_path, 'data_set', 'annotations'),
                          os.path.join(data_path, 'data_set', 'recordings'), ]
    # One `<annotation type>/raw` directory per annotation type.
    annotations_dir = [os.path.join(data_path, 'data_set', 'annotations', ann_type.name.lower(), 'raw')
                       for ann_type in ANNOTATION_TYPES]
    all_dirs = child_project_dirs + annotations_dir

    for p in all_dirs:
        # EAFP: attempting the creation and catching FileExistsError avoids
        # the race between the previous exists() check and makedirs().
        try:
            os.makedirs(p)
        except FileExistsError:
            logger.warning('{} already exists.'.format(p))
+
+
def check_data_directory():
    """
    Set up a ChildProject project layout under the `dat` directory
    (resolved relative to this script's location).
    :return: None
    :rtype: None
    """
    dat_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../..', 'dat'))
    create_child_project_directories(dat_path)
+
def main():
    """Entry point: ensure the expected data-set directory layout exists."""
    check_data_directory()
+
if __name__ == '__main__':
    # Configure logging so the "already exists" warnings are actually emitted
    # (matches the entry-point setup of the import script in this project).
    logging.basicConfig(level=logging.INFO)

    try:
        main()
        sys.exit(0)
    except Exception as e:
        # Log the full traceback instead of printing only the message.
        logger.exception(e)
        sys.exit(1)

+ 59 - 0
src/import_data/utils.py

@@ -0,0 +1,59 @@
+#!usr/bin/env python
+# -*- coding: utf8 -*-
+
+# -----------------------------------------------------------------------------
+#   File: utils.py (as part of project URUMETRICS)
+#   Created: 04/05/2022 15:47
+#   Last Modified: 04/05/2022 15:47
+# -----------------------------------------------------------------------------
+#   Author: William N. Havard
+#           Postdoctoral Researcher
+#
+#   Mail  : william.havard@ens.fr / william.havard@gmail.com
+#  
+#   Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
+#
+# ------------------------------------------------------------------------------
+#   Description: 
+#       • Small path utilities: raw-filename extraction and recursive
+#         directory listing filtered by extension.
+# -----------------------------------------------------------------------------
+
+import os
+
def get_raw_filename(fp):
    """
    Return the bare name of a file: directory components and the final
    extension are both stripped.
    :param fp: file path
    :type fp: str
    :return: raw filename
    :rtype: str
    """
    # splitext only ever splits on the last component, so taking the basename
    # first is equivalent to the reverse order.
    base = os.path.basename(fp)
    return os.path.splitext(base)[0]
+
+
def walk_dir(path, ext=None, return_full_path=True):
    """
    Recursively explore a directory and return (sorted) every file whose
    extension is in `ext`. If no extension is specified, all files are returned.
    :param path: path to the directory to explore
    :type path: str
    :param ext: extension(s) to keep, given without the leading dot;
                None or an empty list keeps every file
    :type ext: list of str or str or None
    :param return_full_path: should the full path to the files be returned
    :type return_full_path: bool
    :return: sorted list of files
    :rtype: list of str
    """
    # `None` sentinel instead of a mutable `[]` default (a shared mutable
    # default persists across calls). isinstance() instead of type() == str.
    if ext is None:
        ext = []
    elif isinstance(ext, str):
        ext = [ext]

    files = []
    for dir_path, _, file_names in os.walk(path):
        # Directory of the current walk step, relative to the walk root
        # ('' for the root itself). os.path.relpath replaces the fragile
        # commonprefix()-based string surgery of the original implementation.
        rel_dir = os.path.relpath(dir_path, path)
        if rel_dir == os.curdir:
            rel_dir = ''
        for file_name in file_names:
            _, file_extension = os.path.splitext(file_name)
            # splitext keeps the leading dot, hence the [1:] slice.
            if not ext or file_extension[1:] in ext:
                if return_full_path:
                    files.append(os.path.join(path, rel_dir, file_name))
                else:
                    files.append(os.path.join(rel_dir, file_name))
    return sorted(files)