123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 |
- #!/usr/bin/env python
- # -*- coding: utf8 -*-
- import os
- import re
- import pandas as pd
- from pprint import pprint
- from pympi.Praat import TextGrid
- from utils import text_read, text_dump, walk_dir, extract_source_sentence_file, SENT_ID_PATTERN
def _key_syllables_for_word(key_syllables, word_id):
    """Pick the syllable belonging to word number *word_id* from each item.

    *key_syllables* is split on ``,``; each item is stripped, split on ``/``
    and indexed with ``int(word_id) - 1``.

    :raises IndexError: if an item has fewer ``/``-separated parts than
        *word_id*.
    """
    position = int(word_id) - 1
    return [item.strip().split('/')[position] for item in key_syllables.split(',')]


def _bracket_syllables(word, syllables):
    """Return *word* with every occurrence of each syllable wrapped in ``[...]``."""
    annotated = word
    for syl in syllables:
        if not syl:
            # str.replace('', ...) would insert brackets between every character.
            continue
        annotated = annotated.replace(syl, '[{}]'.format(syl))
    return annotated


def annotate_textgrids(path):
    """Pre-annotate every TextGrid found under *path*, overwriting each file.

    For each file returned by ``walk_dir(path, ext='.TextGrid')``:

    * the sentence id is taken from the second ``_``-separated field of the
      file name and matched against ``SENT_ID_PATTERN``; files that do not
      match (or have no such field) are skipped;
    * the matching row of ``./extra/sentences/summary/<source>.csv`` supplies
      the sentence, the raw target word and the key syllables;
    * every tier except ``transcription`` is removed, the transcription text
      is replaced by the summary sentence when they differ, and the tiers
      ``key-word`` (target word with key syllables in square brackets),
      ``key-syll-segment`` and ``key-syll-segment-sound`` are added.

    :param path: directory handed to ``walk_dir``.
    :raises ValueError: if the summary does not contain exactly one row for
        the pair id, or the transcription tier does not hold exactly one
        interval.
    """
    # Many TextGrids share one summary file; read each CSV only once per run.
    summaries = {}

    for textgrid_file in walk_dir(path, ext='.TextGrid'):
        tg_path, tg_filename = os.path.split(textgrid_file)
        dir_name = os.path.basename(tg_path)

        # Get metadata (apply twice if respeak session)
        target_sentence_file = extract_source_sentence_file(dir_name)
        if 'respeak' in textgrid_file:
            target_sentence_file = extract_source_sentence_file(target_sentence_file)

        # Get sentid, wordid, senttype
        name_parts = tg_filename.replace('.TextGrid', '').split('_')
        if len(name_parts) < 2:
            # No sentence-id field at all: skip, like any other unmatched name.
            continue
        sentid_matched = re.search(SENT_ID_PATTERN, name_parts[1])
        if not sentid_matched:
            continue
        pair_id, word_id, sent_type = sentid_matched.group(1, 2, 3)

        # Read summary sentences
        if target_sentence_file not in summaries:
            summaries[target_sentence_file] = pd.read_csv(
                './extra/sentences/summary/{}.csv'.format(target_sentence_file))
        summary = summaries[target_sentence_file]

        pair_label = 'pair{}'.format(pair_id)
        matches = summary[summary['pair_index'] == pair_label].index
        if len(matches) != 1:
            # Explicit raise: an assert would vanish under ``python -O``.
            raise ValueError(
                "Expected exactly one row for {!r} in summary {!r}, found {} ({})".format(
                    pair_label, target_sentence_file, len(matches), textgrid_file))
        row_idx = int(matches[0])

        # Get sentence, target word, and target syllable
        target_summary_sent = summary.loc[row_idx, 'word{}.{}'.format(word_id, sent_type)]
        target_summary_word = summary.loc[row_idx, 'word{}.{}'.format(word_id, 'raw')]
        target_summary_syl = summary.loc[row_idx, "key_syllables"]

        # Read TextGrid file and drop every tier but the transcription
        textgrid = TextGrid(textgrid_file)
        tier_names = [name for _, name in textgrid.get_tier_name_num()]
        if len(tier_names) > 1:
            for name in tier_names:
                if name != 'transcription':
                    textgrid.remove_tier(name)

        # Read transcription interval
        trans_tier = textgrid.get_tier('transcription')
        tg_intervals = trans_tier.get_all_intervals()
        if len(tg_intervals) != 1:
            raise ValueError(
                "Expected exactly one transcription interval in {!r}, found {}".format(
                    textgrid_file, len(tg_intervals)))
        tg_start, tg_end, tg_trans = tg_intervals[0]

        # Enclose the key syllables of the target word in square brackets
        word_annotated = _bracket_syllables(
            target_summary_word,
            _key_syllables_for_word(target_summary_syl, word_id))

        # Add information to TextGrid
        if tg_trans != target_summary_sent:
            # Report the mismatch, then make the summary sentence authoritative.
            print(tg_trans, target_summary_sent)
            trans_tier.clear_intervals()
            trans_tier.add_interval(tg_start, tg_end, target_summary_sent)

        kw_tier = textgrid.add_tier('key-word')
        kw_tier.add_interval(textgrid.xmin, textgrid.xmax, word_annotated)
        textgrid.add_tier('key-syll-segment')
        textgrid.add_tier('key-syll-segment-sound')

        # The tier list is stored in the reverse of the order listed here.
        ordered_tiers = [textgrid.get_tier(name) for name in (
            'transcription', 'key-word', 'key-syll-segment', 'key-syll-segment-sound')]
        textgrid.tiers = list(reversed(ordered_tiers))
        textgrid.to_file(textgrid_file)
def main(**options):
    """Run the annotation, forwarding all keyword options to annotate_textgrids."""
    annotate_textgrids(**options)
def _parse_args(argv):
    """Parse the command-line arguments into a plain dict.

    :param argv: list of argument strings to parse.
    :return: dict of option name to value; the only option is the required
        ``--path``.
    """
    from argparse import ArgumentParser

    summary = ('Pre-annotate blank generated TextGrid by adding key-word, key-syll, '
               'and key-syll-segment tiers and enclose target word in square brackets.')
    cli = ArgumentParser(description=summary)
    cli.add_argument('--path', required=True,
                     help='Path to the directory whose file will be annotated.')
    namespace = cli.parse_args(argv)
    return vars(namespace)
if __name__ == '__main__':
    # Script entry point: parse the CLI, run main(), and turn the outcome
    # into a process exit status (0 on success, 1 on any Exception).
    import logging
    import sys

    pgrm_name = sys.argv[0]
    cli_options = _parse_args(sys.argv[1:])

    try:
        main(**cli_options)
    except Exception as err:
        # Log the full traceback before signalling failure.
        logging.exception(err)
        exit_code = 1
    else:
        exit_code = 0
    sys.exit(exit_code)
|