William-N-Havard
/
tsimane-glottal-public


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
#!/usr/bin/env python
# -*- coding: utf8 -*-

import os
import re
import pandas as pd
from pprint import pprint
from pympi.Praat import TextGrid

from utils import text_read, text_dump, walk_dir, extract_source_sentence_file, SENT_ID_PATTERN


def _bracket_key_syllables(word, key_syllables, word_id):
    """Return *word* with its key syllables wrapped in square brackets.

    Args:
        word: Raw target word.
        key_syllables: Comma-separated items; each item holds '/'-separated
            syllables, one per word of the pair.
        word_id: 1-based position (string or int) of *word* within the pair,
            used to pick the matching syllable from every item.

    Returns:
        The word with every occurrence of each selected syllable replaced by
        ``[syllable]``.

    Raises:
        IndexError: If an item has fewer '/'-separated parts than *word_id*.
        ValueError: If *word_id* is not an integer.
    """
    position = int(word_id) - 1
    annotated = word
    for item in key_syllables.split(','):
        syllable = item.strip().split('/')[position]
        if not syllable:
            # str.replace with an empty pattern would insert brackets between
            # every character of the word.
            continue
        annotated = annotated.replace(syllable, '[{}]'.format(syllable))
    return annotated


def annotate_textgrids(path):
    """Pre-annotate, in place, every ``.TextGrid`` file found under *path*.

    For each file whose name carries a sentence id matching
    ``SENT_ID_PATTERN``, the function:

    * looks up the expected sentence, raw target word and key syllables in
      ``./extra/sentences/summary/<source>.csv``;
    * removes every tier except ``transcription`` (when more than one tier
      is present);
    * replaces the transcription text when it differs from the expected
      sentence;
    * adds a ``key-word`` tier holding the target word with its key
      syllables in square brackets, plus empty ``key-syll-segment`` and
      ``key-syll-segment-sound`` tiers;
    * writes the TextGrid back to the same file.

    Files whose name does not contain a recognisable sentence id are skipped.

    Args:
        path: Directory that is searched for ``.TextGrid`` files.

    Raises:
        ValueError: If the summary table does not contain exactly one row for
            the pair, or if the transcription tier does not contain exactly
            one interval. Files processed before the failure have already
            been rewritten.
    """
    # Many TextGrids share one summary table; read each CSV only once per call.
    summary_cache = {}

    for textgrid_file in walk_dir(path, ext='.TextGrid'):
        tg_path, tg_filename = os.path.split(textgrid_file)
        dir_name = tg_path.split(os.sep)[-1]

        # Get metadata (apply twice if respeak session)
        target_sentence_file = extract_source_sentence_file(dir_name)
        if 'respeak' in textgrid_file:
            target_sentence_file = extract_source_sentence_file(target_sentence_file)

        # Get sentid, wordid, senttype (the sentence id is the second
        # '_'-separated field of the file name)
        tg_raw_filename = tg_filename.replace('.TextGrid', '')
        name_parts = tg_raw_filename.split('_')
        if len(name_parts) < 2:
            # No sentence-id field: skip, like any other non-matching name.
            continue
        sentid_matched = re.search(SENT_ID_PATTERN, name_parts[1])
        if not sentid_matched:
            continue
        pair_id, word_id, sent_type = sentid_matched.group(1, 2, 3)

        # Read summary sentences
        if target_sentence_file not in summary_cache:
            summary_cache[target_sentence_file] = pd.read_csv(
                './extra/sentences/summary/{}.csv'.format(target_sentence_file))
        target_summary_all = summary_cache[target_sentence_file]

        pair_label = 'pair{}'.format(pair_id)
        row_idx = target_summary_all[target_summary_all['pair_index'] == pair_label].index
        if len(row_idx) != 1:
            # Explicit raise instead of assert: asserts vanish under ``python -O``.
            raise ValueError(
                "Expected exactly one row for '{}' in summary '{}', found {} ({})".format(
                    pair_label, target_sentence_file, len(row_idx), textgrid_file))
        row_idx = int(row_idx[0])

        # Get sentence, target word, and target syllable
        target_summary_sent = target_summary_all.loc[row_idx, 'word{}.{}'.format(word_id, sent_type)]
        target_summary_word = target_summary_all.loc[row_idx, 'word{}.{}'.format(word_id, 'raw')]
        target_summary_syl = target_summary_all.loc[row_idx, "key_syllables"]

        # Read TextGrid files
        textgrid = TextGrid(textgrid_file)
        tier_list = [name for _, name in textgrid.get_tier_name_num()]

        # Remove extra tiers
        if len(tier_list) > 1:
            for tier in tier_list:
                if tier != 'transcription':
                    textgrid.remove_tier(tier)

        # Read transcription interval
        trans_tier = textgrid.get_tier('transcription')
        tg_intervals = trans_tier.get_all_intervals()
        if len(tg_intervals) != 1:
            raise ValueError(
                "Expected exactly one transcription interval in '{}', found {}".format(
                    textgrid_file, len(tg_intervals)))
        tg_start, tg_end, tg_trans = tg_intervals[0]

        # Mark the key syllables of this word inside the target word
        word_annotated = _bracket_key_syllables(target_summary_word, target_summary_syl, word_id)

        # Add information to TextGrid
        if tg_trans != target_summary_sent:
            # Report the mismatch, then overwrite with the expected sentence.
            print(tg_trans, target_summary_sent)
            trans_tier.clear_intervals()
            trans_tier.add_interval(tg_start, tg_end, target_summary_sent)

        kw_tier = textgrid.add_tier('key-word')
        kw_tier.add_interval(textgrid.xmin, textgrid.xmax, word_annotated)

        textgrid.add_tier('key-syll-segment')
        textgrid.add_tier('key-syll-segment-sound')

        ordered_tiers = [textgrid.get_tier('transcription'),
                         textgrid.get_tier('key-word'),
                         textgrid.get_tier('key-syll-segment'),
                         textgrid.get_tier('key-syll-segment-sound')]
        # Tiers are stored in the reverse of the listing above.
        # NOTE(review): kept as in the original; confirm this is the intended order.
        textgrid.tiers = list(reversed(ordered_tiers))
        textgrid.to_file(textgrid_file)


def main(**kwargs):
    """Entry point: forward all keyword arguments to ``annotate_textgrids``."""
    annotate_textgrids(**kwargs)


def _parse_args(argv):
    """Parse the command-line arguments of this script.

    Args:
        argv: List of argument strings, without the program name.

    Returns:
        A dict mapping option names to their parsed values; the only option
        is the required ``path``.

    Raises:
        SystemExit: Raised by argparse when ``--path`` is missing or an
            unknown option is given.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Pre-annotate blank generated TextGrid files by adding key-word, '
                    'key-syll-segment, and key-syll-segment-sound tiers and by enclosing '
                    'the key syllables of the target word in square brackets.')
    parser.add_argument('--path', required=True,
                        help='Path to the directory whose files will be annotated.')

    return vars(parser.parse_args(argv))


if __name__ == '__main__':
    import logging
    import sys

    # Parse everything after the program name.
    cli_options = _parse_args(sys.argv[1:])

    try:
        main(**cli_options)
    except Exception as err:
        # Log the traceback and signal failure through the exit status.
        logging.exception(err)
        sys.exit(1)
    else:
        sys.exit(0)