# -*- coding: utf-8 -*-
"""
Created on Mon Jul 15 12:01:39 2024
@author: test
"""
''' modified from the original included in the library
::vowel:: = a|ä|e|i|o|ö|u|ü
::consonant:: = b|c|ch|ck|d|dt|f|g|h|j|k|l|m|n|p|pf|r|s|sch|t|tsch|tz|tzsch|v|w|z|ʀ
% -tion for Latin loanwords
t -> <t> / _ ion
% Retraction of initial <s> before plosives
s -> <s> / # _ (p|t)
% Initial voicing of <s>
s -> <zed> / # _ (::vowel::)
% Initial voicing of <c>
c -> <c> / # _ (::vowel::)
% Final obstruent devoicing
b -> <b> / _ #|(::consonant::)(::vowel::)
d -> <d> / _ #|(::consonant::)(::vowel::)
g -> <g> / _ #|(::consonant::)(::vowel::)
% Handling of r
er -> ɐ / _ #
r -> <r1> / [äeioöuü]h? _ #|(::consonant::)
r -> 0 / a _ #|(::consonant::)
r -> <r3> / (::vowel::)h? _ (::vowel::)
% Final schwa
e -> ə / _ #
% Open syllable lengthening
i -> ie / _ #|(::consonant::)(::vowel::)
e -> ee / [^ei] _ #|(::consonant::)(::vowel::)
ü -> üh / _ #|(::consonant::)(::vowel::)
ö -> öo / _ #|(::consonant::)(::vowel::)
u -> uh / _ #|(::consonant::)(::vowel::)
o -> oo / [^oö] _ #|(::consonant::)(::vowel::)
a -> aa / [^a] _ #|(::consonant::)(::vowel::)
'''
'''
reference
David R. Mortensen, Siddharth Dalmia, and Patrick Littell. 2018.
Epitran: Precision G2P for Many Languages.
In Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018), Miyazaki, Japan. European Language Resources Association (ELRA).
'''
# Map each Epitran output "letter code" — either a tag produced by the custom
# rewrite rules above (e.g. '<t>', '<zed>') or a multi-character grapheme
# (e.g. 'ie', 'qu') — back to the orthographic letter(s) it came from.
# Values are lists because one code may span several letters (e.g. 'qu').
custom_lettercode_2_letter = {
    '<t>': ['t'],
    '<s>': ['s'],
    '<zed>': ['s'],
    '<c>': ['c'],
    '<b>': ['b'],
    '<d>': ['d'],
    '<g>': ['g'],
    'ɐ': ['e', 'r'],
    '<r1>': ['r'],
    '<r3>': ['r'],
    'ə': ['e'],
    'ie': ['i'],
    'ee': ['e'],
    'üh': ['ü'],
    'öo': ['ö'],
    'uh': ['u'],
    'oo': ['o'],
    'aa': ['a'],
    'qu': ['q', 'u'],  # qu added by Jack
}
# %%
import epitran
import epitran.vector
import pandas as pd
from tqdm import tqdm
import unicodedata
import numpy as np
import scipy as sp
import os.path as op
from string import ascii_lowercase
# from matplotlib import pyplot as plt

# Copy the custom Epitran mapping/preprocessor files (the modified German
# rules documented at the top of this file) into the installed epitran
# package's data directories, so Epitran can load them by name below.
import shutil
epitran_loc = op.dirname(epitran.__file__)
shutil.copyfile(op.join('custom_epitran_files', 'deu-Latn-nar_custom.csv'), op.join(epitran_loc, 'data', 'map', 'deu-Latn-nar_custom.csv'))
shutil.copyfile(op.join('custom_epitran_files', 'deu-Latn-nar_custom.txt'), op.join(epitran_loc, 'data', 'pre', 'deu-Latn-nar_custom.txt'))
epi = epitran.Epitran('deu-Latn-nar_custom')
# SUBTLEX-DE word-frequency table; the code below reads the columns
# 'Word' and 'WFfreqcount' — TODO confirm against the CSV's schema
freq_df = pd.read_csv('SUBTLEX-DE_cleaned_with_Google00_frequencies.csv')
chars = [*ascii_lowercase, 'ä', 'ö', 'ü', 'ß']  # this order must match the RDM order of indices
# ensure NFC encoding
chars = [unicodedata.normalize('NFC', c) for c in chars]
# %%
# get letter frequency
# Flatten the corpus into one array of characters; each character's weight is
# the word-frequency count of the word it came from.
subtlex_chars = np.array([L for w in freq_df.Word for L in w])
subtlex_char_wts = np.array([r.WFfreqcount for i, r in tqdm(freq_df.iterrows(), desc='Getting letter frequencies', total=len(freq_df)) for j in range(len(r.Word))])
# # check irrelevant characters
# x = np.unique(subtlex_chars)
# x[~np.isin(x, chars)]
# total frequency-weighted count per character of interest (same order as chars)
subtlex_char_counts = np.array([subtlex_char_wts[subtlex_chars==c].sum() for c in chars])
# RDM of absolute count differences; broadcasting replaces the original
# O(n^2) Python double loop. Cast to float to keep the dtype np.zeros gave.
freq_rdm = np.abs(subtlex_char_counts[:, None] - subtlex_char_counts[None, :]).astype(float)
# plt.figure()
# plt.bar(chars, subtlex_char_counts)
# plt.ylabel('Count')
# plt.figure()
# plt.imshow(freq_rdm, interpolation='none')
# plt.colorbar(label='Letter Frequency Distance (Count)')
# plt.xticks(range(len(chars)), labels=chars)
# plt.yticks(range(len(chars)), labels=chars)
# %%
# get frequencies of letter-phoneme mapping, and store the vectors for each phoneme
g2p_counts = {c: {} for c in chars}  # per letter: {phoneme_code: frequency-weighted count}
phon_vecs = {}  # will contain the vectors for each phoneme
for i, r in tqdm(freq_df.iterrows(), total=len(freq_df), desc='Counting letter-phoneme mappings'):

    f = r.WFfreqcount  # word frequency, used as the count weight
    w = r.Word.lower()

    # get segments and phonemes for this word
    # word_to_tuples returns per-segment tuples; x[2] is the orthographic
    # segment, x[4] a list of (phoneme, feature-vector) pairs
    epi_ph = epi.word_to_tuples(w)
    w_segs = [[unicodedata.normalize('NFC', x[2])] for x in epi_ph]  # ensure NFC encoding
    w_seg_phons = [[y[0] for y in x[4]] for x in epi_ph]  # this accounts for IPA phonemes like the voiceless alveolar sibilant affricate t͡s, which is technically 3 characters
    ph_vecs = [x[4] for x in epi_ph]
    # for each pair of segments of letters, and associated phonemes
    for seg_j, ph_j, vecs_j in zip(w_segs, w_seg_phons, ph_vecs):
        ph_code_j = '_'.join(ph_j)  # an underscore-separated ID for the phoneme(s)
        # store the phoneme vectors, if not done already, for each phoneme associated with this segment
        # (dealing with one-to-many letter-to-phoneme mappings is done later)
        for ph_vec_k in vecs_j:
            if ph_vec_k[0] not in phon_vecs.keys():
                phon_vecs[ph_vec_k[0]] = np.array(ph_vec_k[1])
            else:
                # if already in the dictionary, just double check that the vector matches that from other iterations
                assert np.all(phon_vecs[ph_vec_k[0]] == np.array(ph_vec_k[1]))

        # for each lettercode in the segment
        for k, seg_lettercode_k in enumerate(seg_j):
            # convert the letter code to the letter(s)
            if seg_lettercode_k in custom_lettercode_2_letter.keys():
                # if a special case, look up the associated character
                seg_letter_k = custom_lettercode_2_letter[seg_lettercode_k]
            else:
                # otherwise, the lettercode *is* the letter
                seg_letter_k = [seg_lettercode_k]
            # if there is just one letter...
            if len(seg_letter_k)==1:
                # ...and it is in the list of letters we're interested in...
                if seg_letter_k[0] in g2p_counts.keys():
                    # ...then count the associated phonemes
                    if ph_code_j in g2p_counts[seg_letter_k[0]].keys():  # seg_letter_k[0], to extract the one letter from its list, as a string
                        g2p_counts[seg_letter_k[0]][ph_code_j] += f  # weight the count by word frequency
                    else:
                        g2p_counts[seg_letter_k[0]][ph_code_j] = f
            # THIS NEXT BIT MAKES SOME ASSUMPTIONS, BUT OTHERWISE UNSURE HOW TO DEAL WITH "qu", AND WE WOULD HAVE NO PHONEMES FOR "q"
            # Specifically, I assume that, e.g., in segment "qu", comprising the phonemes, ['k', 'v'], "q" is mapped to 'k', and "u" is mapped to "v"
            # if there are multiple letters, but the number of letters matches the number of phonemes
            elif len(seg_letter_k)==len(ph_j):
                # then map each letter to the phoneme at the same position:
                for letter_kl, ph_kl in zip(seg_letter_k, ph_j):
                    # if this letter is in the list of letters we're interested in...
                    if letter_kl in g2p_counts.keys():
                        # ...then count the associated phonemes
                        if ph_kl in g2p_counts[letter_kl].keys():
                            g2p_counts[letter_kl][ph_kl] += f  # weight the count by word frequency
                        else:
                            g2p_counts[letter_kl][ph_kl] = f
# %%
# put together a dictionary that looks up the vector associated with the
# dominant (most frequent) pronunciation of each letter
letter_to_phoneme_vec = {}
for L, ph_counts in g2p_counts.items():
    # find the most common phoneme code in the dictionary for this letter.
    # BUGFIX: the original used np.argmax(ph_counts.values()); a dict view
    # becomes a 0-d object array, so argmax always returned 0 and the *first*
    # phoneme ever observed was selected instead of the most frequent one.
    # (Raises ValueError if a letter was never observed, as the original
    # would also have failed for an empty dict.)
    max_count_phoneme_code = max(ph_counts, key=ph_counts.get)
    max_count_phonemes = max_count_phoneme_code.split('_')  # above, we use underscores to signify discrete phonemes
    # look up the vector associated with the dominant phoneme
    if len(max_count_phonemes) > 1:
        # handle the cases where one letter maps to multiple phonemes (e.g., letter x mapping to phonemes k, s)
        # average the vectors for the phonemes associated with this letter
        max_count_vec = np.stack([phon_vecs[ph] for ph in max_count_phonemes]).mean(axis=0)
    else:
        # if there's only one phoneme, then just index it
        max_count_vec = phon_vecs[max_count_phonemes[0]]
    # store the vector in the dictionary
    letter_to_phoneme_vec[L] = max_count_vec
# %%
# get RDM matrix for phonological representations of the dominant phonemes associated with each letter

def euc_dist(a, b):
    """Euclidean (L2) distance between feature vectors a and b."""
    diff = a - b
    return np.linalg.norm(diff)

# cosine distance, taken directly from scipy
cos_dist = sp.spatial.distance.cosine

def cor_dist(a, b):
    """Correlation distance: 1 minus the Pearson correlation of a and b."""
    # np.corrcoef accepts the two vectors directly; [0, 1] is their r value
    return 1.0 - np.corrcoef(a, b)[0, 1]
# set which distance function to use to calculate distances between vectors
# (cor_dist / cos_dist / euc_dist all take two 1-D vectors and return a scalar)
dist_fun = cor_dist
# pairwise distance matrix between each letter's dominant-phoneme vector;
# symmetric with a zero diagonal, ordered like `chars`
phon_rdm = np.zeros((len(chars), len(chars)))
for i in range(phon_rdm.shape[0]):
    for j in range(phon_rdm.shape[1]):
        phon_rdm[i, j] = dist_fun(
            letter_to_phoneme_vec[chars[i]],
            letter_to_phoneme_vec[chars[j]]
        )
# plt.figure()
# plt.imshow(phon_rdm, interpolation='none')
# plt.colorbar(label='Phonological Distance between Dominant Phonemes')
# plt.xticks(range(len(chars)), labels=chars)
# plt.yticks(range(len(chars)), labels=chars)
# %%
# get RDM matrix for letter name transcriptions
import ast  # stdlib; used to parse the stored vector literals safely
letter_names_df = pd.read_csv('transcribed_phonological_letter_name_vectors.csv', index_col=0)
# dictionary for looking up a letter name's vectors.
# The 'vectors' column holds Python literals (lists of (phoneme, vector)
# pairs); ast.literal_eval parses these without executing arbitrary code,
# unlike the eval() used previously.
letter_name_vecs = {r.letter: [x[1] for x in ast.literal_eval(r.vectors)] for _, r in letter_names_df.iterrows()}
# dictionary for corresponding average vectors (mean over the name's phonemes)
letter_name_avg_vecs = {k: np.mean(v, axis=0) for k, v in letter_name_vecs.items()}
# pairwise distance matrix, same layout and dist_fun as phon_rdm above
name_phon_rdm = np.zeros((len(chars), len(chars)))
for i in range(name_phon_rdm.shape[0]):
    for j in range(name_phon_rdm.shape[1]):
        name_phon_rdm[i, j] = dist_fun(
            letter_name_avg_vecs[chars[i]],
            letter_name_avg_vecs[chars[j]]
        )
# plt.figure()
# plt.imshow(name_phon_rdm, interpolation='none')
# plt.colorbar(label='Phonological Distance between Letter Name Phonemes')
# plt.xticks(range(len(chars)), labels=chars)
# plt.yticks(range(len(chars)), labels=chars)
# %%
# save RDMs
# get indices of the RDM's lower triangle (k=-1 excludes the diagonal)
tril_idx = np.tril_indices(len(chars), k=-1)
# NOTE(review): tril_idx_long appears unused below — confirm before removing
tril_idx_long = np.array(tril_idx).T
# get corresponding character IDs for each lower-triangle cell
char1_ids = np.array(chars)[tril_idx[0]]
char2_ids = np.array(chars)[tril_idx[1]]
# save the frequency RDM (full matrix as .npy, lower triangle as long CSV)
file_path_freq = op.join('stim_sim', 'frequency')
np.save(op.join(file_path_freq, 'frequency.npy'), freq_rdm)
letter_freq_vec = freq_rdm[tril_idx]
letter_freq_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'freq_dist': letter_freq_vec})
letter_freq_df.to_csv(op.join(file_path_freq, 'frequency.csv'), index=False, encoding='utf-8')
# save the frequency features (raw per-letter counts)
freq_features_df = pd.DataFrame( {'char': chars, 'freq': subtlex_char_counts} )
freq_features_df.to_csv(op.join(file_path_freq, 'frequency_features.csv'), index=False, encoding='utf-8')
# save the dominant phoneme phonological RDM
file_path_phon = op.join('stim_sim', 'phonology')
np.save(op.join(file_path_phon, 'dominant_phonemes.npy'), phon_rdm)
phon_vec = phon_rdm[tril_idx]
phon_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'phon_dist': phon_vec})
phon_df.to_csv(op.join(file_path_phon, 'dominant_phonemes.csv'), index=False, encoding='utf-8')
# save the dominant phoneme features (one column per feature dimension)
phon_features_df = pd.DataFrame( {'char': chars} )
n_phon_feats = len(letter_to_phoneme_vec['a'])
for i in range(n_phon_feats):
    phon_features_df[f'feat_{i}'] = [letter_to_phoneme_vec[c][i] for c in chars]
phon_features_df.to_csv(op.join(file_path_phon, 'dominant_phonemes_features.csv'), index=False, encoding='utf-8')
# save the letter name phonological RDM
np.save(op.join(file_path_phon, 'letter_names.npy'), name_phon_rdm)
name_phon_vec = name_phon_rdm[tril_idx]
name_phon_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'name_phon_dist': name_phon_vec})
name_phon_df.to_csv(op.join(file_path_phon, 'letter_names.csv'), index=False, encoding='utf-8')
# save the letter name features (one column per feature dimension)
name_phon_features_df = pd.DataFrame( {'char': chars} )
n_phon_feats = len(letter_name_avg_vecs['a'])
for i in range(n_phon_feats):
    name_phon_features_df[f'feat_{i}'] = [letter_name_avg_vecs[c][i] for c in chars]
name_phon_features_df.to_csv(op.join(file_path_phon, 'letter_names_features.csv'), index=False, encoding='utf-8')