# -*- coding: utf-8 -*- """ Created on Mon Jul 15 12:01:39 2024 @author: test """ ''' modified from the original included in the library ::vowel:: = a|ä|e|i|o|ö|u|ü ::consonant:: = b|c|ch|ck|d|dt|f|g|h|j|k|l|m|n|p|pf|r|s|sch|t|tsch|tz|tzsch|v|w|z|ʀ % -tion for Latin loanwords t -> / _ ion % Retraction of initial before plosives s -> / # _ (p|t) % Initial voicing of s -> / # _ (::vowel::) % Initial voicing of c -> / # _ (::vowel::) % Final obstruent devoicing b -> / _ #|(::consonant::)(::vowel::) d -> / _ #|(::consonant::)(::vowel::) g -> / _ #|(::consonant::)(::vowel::) % Handling of r er -> ɐ / _ # r -> / [äeioöuü]h? _ #|(::consonant::) r -> 0 / a _ #|(::consonant::) r -> / (::vowel::)h? _ (::vowel::) % Final schwa e -> ə / _ # % Open syllable lengthening i -> ie / _ #|(::consonant::)(::vowel::) e -> ee / [^ei] _ #|(::consonant::)(::vowel::) ü -> üh / _ #|(::consonant::)(::vowel::) ö -> öo / _ #|(::consonant::)(::vowel::) u -> uh / _ #|(::consonant::)(::vowel::) o -> oo / [^oö] _ #|(::consonant::)(::vowel::) a -> aa / [^a] _ #|(::consonant::)(::vowel::) ''' ''' reference David R. Mortensen, Siddharth Dalmia, and Patrick Littell. 2018. Epitran: Precision G2P for Many Languages. In Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018), Miyazaki, Japan. European Language Resources Association (ELRA). 
''' custom_lettercode_2_letter = {'': ['t'], '': ['s'], '': ['s'], '': ['c'], '': ['b'], '': ['d'], '': ['g'], 'ɐ': ['e', 'r'], '': ['r'], '': ['r'], 'ə': ['e'], 'ie': ['i'], 'ee': ['e'], 'üh': ['ü'], 'öo': ['ö'], 'uh': ['u'], 'oo': ['o'], 'aa': ['a'], 'qu': ['q', 'u']} # qu added by Jack # %% import epitran import epitran.vector import pandas as pd from tqdm import tqdm import unicodedata import numpy as np import scipy as sp import os.path as op from string import ascii_lowercase # from matplotlib import pyplot as plt # copy over the custom epitran mapping files import shutil epitran_loc = op.dirname(epitran.__file__) shutil.copyfile(op.join('custom_epitran_files', 'deu-Latn-nar_custom.csv'), op.join(epitran_loc, 'data', 'map', 'deu-Latn-nar_custom.csv')) shutil.copyfile(op.join('custom_epitran_files', 'deu-Latn-nar_custom.txt'), op.join(epitran_loc, 'data', 'pre', 'deu-Latn-nar_custom.txt')) epi = epitran.Epitran('deu-Latn-nar_custom') freq_df = pd.read_csv('SUBTLEX-DE_cleaned_with_Google00_frequencies.csv') chars = [*ascii_lowercase, 'ä', 'ö', 'ü', 'ß'] # this order must match the RDM order of indices # ensure NFC encoding chars = [unicodedata.normalize('NFC', c) for c in chars] # %% # get letter frequency subtlex_chars = np.array([L for w in freq_df.Word for L in w]) subtlex_char_wts = np.array([r.WFfreqcount for i, r in tqdm(freq_df.iterrows(), desc='Getting letter frequencies', total=len(freq_df)) for j in range(len(r.Word))]) # # check irrelevant characters # x = np.unique(subtlex_chars) # x[~np.isin(x, chars)] subtlex_char_counts = np.array([subtlex_char_wts[subtlex_chars==c].sum() for c in chars]) freq_rdm = np.zeros((len(chars), len(chars))) for i in range(freq_rdm.shape[0]): for j in range(freq_rdm.shape[1]): freq_rdm[i, j] = np.abs(subtlex_char_counts[i]-subtlex_char_counts[j]) # plt.figure() # plt.bar(chars, subtlex_char_counts) # plt.ylabel('Count') # plt.figure() # plt.imshow(freq_rdm, interpolation='none') # plt.colorbar(label='Letter Frequency 
Distance (Count)') # plt.xticks(range(len(chars)), labels=chars) # plt.yticks(range(len(chars)), labels=chars) # %% # get frequencies of letter-phoneme mapping, and store the vectors for each phoneme g2p_counts = {c: {} for c in chars} phon_vecs = {} # will contain the vectors for each phoneme for i, r in tqdm(freq_df.iterrows(), total=len(freq_df), desc='Counting letter-phoneme mappings'): f = r.WFfreqcount w = r.Word.lower() # get segments and phonemes for this word epi_ph = epi.word_to_tuples(w) w_segs = [[unicodedata.normalize('NFC', x[2])] for x in epi_ph] # ensure NFC encoding w_seg_phons = [[y[0] for y in x[4]] for x in epi_ph] # this accounts for IPA phonemes like the voiceless alveolar sibilant affricate t͡s, which is technically 3 characters ph_vecs = [x[4] for x in epi_ph] # for each pair of segments of letters, and associated phonemes for seg_j, ph_j, vecs_j in zip(w_segs, w_seg_phons, ph_vecs): ph_code_j = '_'.join(ph_j) # an underscore-separated ID for the phoneme(s) # store the phoneme vectors, if not done already, for each phoneme associated with this segment # (dealing with one-to-many letter-to-phoneme mappings is done later) for ph_vec_k in vecs_j: if ph_vec_k[0] not in phon_vecs.keys(): phon_vecs[ph_vec_k[0]] = np.array(ph_vec_k[1]) else: # if already in the dictionary, just double check that the vector matches that from other iterations assert np.all(phon_vecs[ph_vec_k[0]] == np.array(ph_vec_k[1])) # for each lettercode in the segment for k, seg_lettercode_k in enumerate(seg_j): # convert the letter code to the letter(s) if seg_lettercode_k in custom_lettercode_2_letter.keys(): # if a special case, look up the associated character seg_letter_k = custom_lettercode_2_letter[seg_lettercode_k] else: # otherwise, the lettercode *is* the letter seg_letter_k = [seg_lettercode_k] # if there is just one letter... if len(seg_letter_k)==1: # ...and it is in the list of letters we're interested in... 
if seg_letter_k[0] in g2p_counts.keys(): # ...then count the associated phonemes if ph_code_j in g2p_counts[seg_letter_k[0]].keys(): # seg_letter_k[0], to extract the one letter from its list, as a string g2p_counts[seg_letter_k[0]][ph_code_j] += f # weight the count by word frequency else: g2p_counts[seg_letter_k[0]][ph_code_j] = f # THIS NEXT BIT MAKES SOME ASSUMPTIONS, BUT OTHERWISE UNSURE HOW TO DEAL WITH "qu", AND WE WOULD HAVE NO PHONEMES FOR "q" # Specifically, I assume that, e.g., in segment "qu", comprising the phonemes, ['k', 'v'], "q" is mapped to 'k', and "u" is mapped to "v" # if there are multiple letters, but the number of letters matches the number of phonemes elif len(seg_letter_k)==len(ph_j): # then map each letter to the phoneme at the same position: for letter_kl, ph_kl in zip(seg_letter_k, ph_j): # if this letter is in the list of letters we're interested in... if letter_kl in g2p_counts.keys(): # ...then count the associated phonemes if ph_kl in g2p_counts[letter_kl].keys(): g2p_counts[letter_kl][ph_kl] += f # weight the count by word frequency else: g2p_counts[letter_kl][ph_kl] = f # %% # put together a dictionary that looks up the vector associated with the dominant pronunciation of each letter letter_to_phoneme_vec = {} for L, ph_counts in g2p_counts.items(): # find the most common phoneme, in the dictionary for this letter max_count_idx = np.argmax(ph_counts.values()) max_count_phoneme_code = list(ph_counts.keys())[max_count_idx] max_count_phonemes = max_count_phoneme_code.split('_') # above, we use underscores to signify discrete phonemes # look up the vector associated with the dominant phoneme if len(max_count_phonemes) > 1: # handle the cases where one letter maps to multiple phonemes (e.g., letter x mapping to phonemes k, s) # average the vectors for the phonemes associated with this letter max_count_vec = np.stack([phon_vecs[ph] for ph in max_count_phonemes]).mean(axis=0) else: # if there's only one phoneme, then just index it 
max_count_vec = phon_vecs[max_count_phonemes[0]] # store the vector in the dictionary letter_to_phoneme_vec[L] = max_count_vec # %% # get RDM matrix for phonological representations of the dominant phonemes associated with each letter def euc_dist(a, b): return np.linalg.norm(a-b) cos_dist = sp.spatial.distance.cosine def cor_dist(a, b): return 1 - np.corrcoef(np.stack([a, b], axis=0))[0, 1] # set which distance function to use to calculate distances between vectors dist_fun = cor_dist phon_rdm = np.zeros((len(chars), len(chars))) for i in range(phon_rdm.shape[0]): for j in range(phon_rdm.shape[1]): phon_rdm[i, j] = dist_fun( letter_to_phoneme_vec[chars[i]], letter_to_phoneme_vec[chars[j]] ) # plt.figure() # plt.imshow(phon_rdm, interpolation='none') # plt.colorbar(label='Phonological Distance between Dominant Phonemes') # plt.xticks(range(len(chars)), labels=chars) # plt.yticks(range(len(chars)), labels=chars) # %% # get RDM matrix for letter name transcriptions letter_names_df = pd.read_csv('transcribed_phonological_letter_name_vectors.csv', index_col=0) # dictionary for looking up a letter name's vectors letter_name_vecs = {r.letter: [x[1] for x in eval(r.vectors)] for _, r in letter_names_df.iterrows()} # dictionary for corresponding average vectors letter_name_avg_vecs = {k: np.mean(v, axis=0) for k, v in letter_name_vecs.items()} name_phon_rdm = np.zeros((len(chars), len(chars))) for i in range(name_phon_rdm.shape[0]): for j in range(name_phon_rdm.shape[1]): name_phon_rdm[i, j] = dist_fun( letter_name_avg_vecs[chars[i]], letter_name_avg_vecs[chars[j]] ) # plt.figure() # plt.imshow(name_phon_rdm, interpolation='none') # plt.colorbar(label='Phonological Distance between Letter Name Phonemes') # plt.xticks(range(len(chars)), labels=chars) # plt.yticks(range(len(chars)), labels=chars) # %% # save RDMs # get indices of the RDM's lower triangle tril_idx = np.tril_indices(len(chars), k=-1) tril_idx_long = np.array(tril_idx).T # get corresponding character IDs 
import os  # needed only below, to ensure output directories exist

# character labels for each cell of the RDM lower triangle
char1_ids = np.array(chars)[tril_idx[0]]
char2_ids = np.array(chars)[tril_idx[1]]


def _save_rdm(dir_path, stem, rdm, dist_col):
    """Save one RDM: full matrix as <stem>.npy, lower triangle as a
    long-format <stem>.csv with columns char1, char2, <dist_col>."""
    np.save(op.join(dir_path, stem + '.npy'), rdm)
    pd.DataFrame({'char1': char1_ids,
                  'char2': char2_ids,
                  dist_col: rdm[tril_idx]}
                 ).to_csv(op.join(dir_path, stem + '.csv'),
                          index=False, encoding='utf-8')


def _save_feature_vectors(dir_path, stem, vec_lookup):
    """Save per-letter feature vectors as <stem>_features.csv:
    one row per character, one feat_<i> column per vector dimension."""
    feats_df = pd.DataFrame({'char': chars})
    # all vectors have the same length; use 'a' as the reference
    for i in range(len(vec_lookup['a'])):
        feats_df[f'feat_{i}'] = [vec_lookup[c][i] for c in chars]
    feats_df.to_csv(op.join(dir_path, stem + '_features.csv'),
                    index=False, encoding='utf-8')


# save the frequency RDM and features
file_path_freq = op.join('stim_sim', 'frequency')
os.makedirs(file_path_freq, exist_ok=True)  # fix: saving failed if dir was missing
_save_rdm(file_path_freq, 'frequency', freq_rdm, 'freq_dist')
pd.DataFrame({'char': chars, 'freq': subtlex_char_counts}
             ).to_csv(op.join(file_path_freq, 'frequency_features.csv'),
                      index=False, encoding='utf-8')

# save the dominant phoneme phonological RDM and features
file_path_phon = op.join('stim_sim', 'phonology')
os.makedirs(file_path_phon, exist_ok=True)  # fix: saving failed if dir was missing
_save_rdm(file_path_phon, 'dominant_phonemes', phon_rdm, 'phon_dist')
_save_feature_vectors(file_path_phon, 'dominant_phonemes', letter_to_phoneme_vec)

# save the letter name phonological RDM and features
_save_rdm(file_path_phon, 'letter_names', name_phon_rdm, 'name_phon_dist')
_save_feature_vectors(file_path_phon, 'letter_names', letter_name_avg_vecs)