# JackEdTaylor / lettersim-ot-rsa
							# -*- coding: utf-8 -*-

"""
Created on Mon Jul 15 12:01:39 2024

@author: test
"""

''' modified from the original included in the library 

::vowel:: = a|ä|e|i|o|ö|u|ü
::consonant:: = b|c|ch|ck|d|dt|f|g|h|j|k|l|m|n|p|pf|r|s|sch|t|tsch|tz|tzsch|v|w|z|ʀ

% -tion for Latin loanwords
t -> <t> / _ ion

% Retraction of initial <s> before plosives
s -> <s> / # _ (p|t)

% Initial voicing of <s>
s -> <zed> / # _ (::vowel::)

% Initial voicing of <c>
c -> <c> / # _ (::vowel::)


% Final obstruent devoicing
b -> <b> / _ #|(::consonant::)(::vowel::)
d -> <d> / _ #|(::consonant::)(::vowel::)
g -> <g> / _ #|(::consonant::)(::vowel::)

% Handling of r
er -> ɐ / _ #
r -> <r1> / [äeioöuü]h? _ #|(::consonant::)
r -> 0 / a _ #|(::consonant::)
r -> <r3> / (::vowel::)h? _ (::vowel::)

% Final schwa
e -> ə / _ #

% Open syllable lengthening
i -> ie /  _ #|(::consonant::)(::vowel::)
e -> ee / [^ei] _ #|(::consonant::)(::vowel::)
ü -> üh /  _ #|(::consonant::)(::vowel::)
ö -> öo /  _ #|(::consonant::)(::vowel::)
u -> uh /  _ #|(::consonant::)(::vowel::)
o -> oo / [^oö] _ #|(::consonant::)(::vowel::)
a -> aa / [^a] _ #|(::consonant::)(::vowel::)

'''
'''
reference 

David R. Mortensen, Siddharth Dalmia, and Patrick Littell. 2018. 
Epitran: Precision G2P for Many Languages. 
In Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018), Miyazaki, Japan. European Language Resources Association (ELRA).
'''

# Lookup from the letter codes produced by the custom preprocessing rules
# (listed at the top of this file) back to the plain letter(s) they stand for.
# Every value is a list of letters, so that multi-letter segments can be
# handled in the same way as single letters.
custom_lettercode_2_letter = {
    # -tion for Latin loanwords
    '<t>': ['t'],
    # retraction / initial voicing of <s>
    '<s>': ['s'],
    '<zed>': ['s'],
    # initial voicing of <c>
    '<c>': ['c'],
    # final obstruent devoicing
    '<b>': ['b'],
    '<d>': ['d'],
    '<g>': ['g'],
    # handling of r
    'ɐ': ['e', 'r'],
    '<r1>': ['r'],
    '<r3>': ['r'],
    # final schwa
    'ə': ['e'],
    # open syllable lengthening
    'ie': ['i'],
    'ee': ['e'],
    'üh': ['ü'],
    'öo': ['ö'],
    'uh': ['u'],
    'oo': ['o'],
    'aa': ['a'],
    # qu added by Jack
    'qu': ['q', 'u'],
}

# %%
import epitran
import epitran.vector
import pandas as pd
from tqdm import tqdm
import unicodedata
import numpy as np
import scipy as sp
import os.path as op
from string import ascii_lowercase
# from matplotlib import pyplot as plt

# copy over the custom epitran mapping files
# (epitran is given the mapping by name below, so the custom .csv map and .txt
# preprocessing rules are copied into the installed package's data folders)
import shutil

epitran_loc = op.dirname(epitran.__file__)

for custom_file, data_subdir in (
    ('deu-Latn-nar_custom.csv', 'map'),
    ('deu-Latn-nar_custom.txt', 'pre'),
):
    shutil.copyfile(
        op.join('custom_epitran_files', custom_file),
        op.join(epitran_loc, 'data', data_subdir, custom_file),
    )

# transliterator that uses the custom mapping copied above
epi = epitran.Epitran('deu-Latn-nar_custom')

# word list; the `Word` and `WFfreqcount` columns are used below
freq_df = pd.read_csv('SUBTLEX-DE_cleaned_with_Google00_frequencies.csv')

# the letters of interest: a-z followed by the umlauts and eszett
# (this order must match the RDM order of indices)
chars = [
    unicodedata.normalize('NFC', letter)  # ensure NFC encoding
    for letter in list(ascii_lowercase) + ['ä', 'ö', 'ü', 'ß']
]

# %%

# get letter frequency

# Lowercase and NFC-normalise every word before splitting it into letters, so
# that the letter tokens are comparable with `chars` (lowercase, NFC) and with
# the lowercased words used when counting letter-phoneme mappings. Without
# this, any capitalised letter or decomposed umlaut in the word list never
# matches an entry of `chars` and is silently left out of the counts.
subtlex_words = [unicodedata.normalize('NFC', w.lower()) for w in freq_df.Word]

# one entry per letter token across all words...
subtlex_chars = np.array([L for w in subtlex_words for L in w])

# ...and, aligned with it, the frequency count of the word each token came from
# (each word's count is repeated once per letter of the normalised word)
subtlex_char_wts = np.repeat(
    freq_df.WFfreqcount.to_numpy(),
    np.array([len(w) for w in subtlex_words], dtype=int),
)

# # check irrelevant characters
# x = np.unique(subtlex_chars)
# x[~np.isin(x, chars)]

# frequency-weighted count of each letter of interest, in the order of `chars`
subtlex_char_counts = np.array([subtlex_char_wts[subtlex_chars==c].sum() for c in chars])

# pairwise absolute difference in letter count, computed by broadcasting
# (cast to float to match an RDM allocated with np.zeros)
freq_rdm = np.abs(subtlex_char_counts[:, None] - subtlex_char_counts[None, :]).astype(float)

# plt.figure()
# plt.bar(chars, subtlex_char_counts)
# plt.ylabel('Count')
# plt.figure()
# plt.imshow(freq_rdm, interpolation='none')
# plt.colorbar(label='Letter Frequency Distance (Count)')
# plt.xticks(range(len(chars)), labels=chars)
# plt.yticks(range(len(chars)), labels=chars)

# %%

# get frequencies of letter-phoneme mapping, and store the vectors for each phoneme

g2p_counts = {c: {} for c in chars}  # letter -> {phoneme code: frequency-weighted count}
phon_vecs = {}  # will contain the vectors for each phoneme

for _, word_row in tqdm(freq_df.iterrows(), total=len(freq_df), desc='Counting letter-phoneme mappings'):

    word_freq = word_row.WFfreqcount
    word = word_row.Word.lower()

    # one tuple per segment of the word: element 2 is read as the segment's
    # letter code, element 4 as its list of (phoneme, vector) pairs
    for seg_tuple in epi.word_to_tuples(word):

        seg_lettercode = unicodedata.normalize('NFC', seg_tuple[2])  # ensure NFC encoding
        seg_phon_vecs = seg_tuple[4]

        # taking the first element of each pair accounts for IPA phonemes like the
        # voiceless alveolar sibilant affricate t͡s, which is technically 3 characters
        seg_phons = [ph_vec[0] for ph_vec in seg_phon_vecs]
        seg_ph_code = '_'.join(seg_phons)  # an underscore-separated ID for the phoneme(s)

        # store the vector of each phoneme in this segment the first time it is seen
        # (dealing with one-to-many letter-to-phoneme mappings is done later)
        for ph_vec in seg_phon_vecs:
            this_vec = np.array(ph_vec[1])
            if ph_vec[0] in phon_vecs:
                # already stored, so just double check the vector matches that from other iterations
                assert np.all(phon_vecs[ph_vec[0]] == this_vec)
            else:
                phon_vecs[ph_vec[0]] = this_vec

        # convert the letter code to the letter(s): special cases are looked up,
        # otherwise the lettercode *is* the letter
        seg_letters = custom_lettercode_2_letter.get(seg_lettercode, [seg_lettercode])

        if len(seg_letters) == 1:
            # a single letter is credited with the whole phoneme code of the segment
            letter_phon_pairs = [(seg_letters[0], seg_ph_code)]
        elif len(seg_letters) == len(seg_phons):
            # THIS MAKES SOME ASSUMPTIONS, BUT OTHERWISE UNSURE HOW TO DEAL WITH "qu", AND WE WOULD HAVE NO PHONEMES FOR "q"
            # Specifically, assume that, e.g., in segment "qu", comprising the phonemes ['k', 'v'],
            # "q" is mapped to 'k' and "u" is mapped to 'v': each letter gets the phoneme at the same position
            letter_phon_pairs = list(zip(seg_letters, seg_phons))
        else:
            # multiple letters that cannot be aligned one-to-one with the phonemes are not counted
            letter_phon_pairs = []

        for letter, ph_code in letter_phon_pairs:
            # only count letters that are in the list of letters we're interested in
            if letter in g2p_counts:
                letter_counts = g2p_counts[letter]
                letter_counts[ph_code] = letter_counts.get(ph_code, 0) + word_freq  # weight the count by word frequency

# %%

# put together a dictionary that looks up the vector associated with the dominant pronunciation of each letter

letter_to_phoneme_vec = {}

for L, ph_counts in g2p_counts.items():
    # a letter that never received a phoneme count has no dominant pronunciation;
    # fail with a clear message rather than an opaque IndexError further down
    if not ph_counts:
        raise ValueError(f'No phonemes were counted for letter {L!r}, so no dominant phoneme can be selected')

    # find the most common phoneme, in the dictionary for this letter
    # (the counts must be passed as a list: np.argmax treats a dict_values view as a
    # single object and would return 0, i.e. the first phoneme seen, whatever the counts)
    max_count_idx = int(np.argmax(list(ph_counts.values())))
    max_count_phoneme_code = list(ph_counts.keys())[max_count_idx]
    max_count_phonemes = max_count_phoneme_code.split('_')  # above, we use underscores to signify discrete phonemes

    # look up the vector associated with the dominant phoneme

    if len(max_count_phonemes) > 1:
        # handle the cases where one letter maps to multiple phonemes (e.g., letter x mapping to phonemes k, s)
        # average the vectors for the phonemes associated with this letter
        max_count_vec = np.stack([phon_vecs[ph] for ph in max_count_phonemes]).mean(axis=0)
    else:
        # if there's only one phoneme, then just index it
        max_count_vec = phon_vecs[max_count_phonemes[0]]

    # store the vector in the dictionary
    letter_to_phoneme_vec[L] = max_count_vec

# %%

# get RDM matrix for phonological representations of the dominant phonemes associated with each letter

def euc_dist(a, b):
    """Return the Euclidean distance between vectors `a` and `b`."""
    difference = a - b
    return np.linalg.norm(difference)


# cosine distance, taken directly from SciPy
cos_dist = sp.spatial.distance.cosine


def cor_dist(a, b):
    """Return the correlation distance (1 - correlation coefficient) between `a` and `b`."""
    paired = np.stack([a, b], axis=0)  # one vector per row
    correlation = np.corrcoef(paired)[0, 1]  # off-diagonal entry of the 2x2 correlation matrix
    return 1 - correlation


# set which distance function to use to calculate distances between vectors
dist_fun = cor_dist

# distance between the dominant-phoneme vectors of every pair of letters,
# with rows and columns in the order of `chars`
n_chars = len(chars)
phon_rdm = np.zeros((n_chars, n_chars))

for row_idx, row_char in enumerate(chars):
    row_vec = letter_to_phoneme_vec[row_char]
    for col_idx, col_char in enumerate(chars):
        phon_rdm[row_idx, col_idx] = dist_fun(row_vec, letter_to_phoneme_vec[col_char])

# plt.figure()
# plt.imshow(phon_rdm, interpolation='none')
# plt.colorbar(label='Phonological Distance between Dominant Phonemes')
# plt.xticks(range(len(chars)), labels=chars)
# plt.yticks(range(len(chars)), labels=chars)

# %%

# get RDM matrix for letter name transcriptions

letter_names_df = pd.read_csv('transcribed_phonological_letter_name_vectors.csv', index_col=0)

# dictionary for looking up a letter name's vectors
letter_name_vecs = {}
for _, name_row in letter_names_df.iterrows():
    # NOTE(review): eval() runs whatever expression is stored in the `vectors` cell, so this
    # file must be trusted; consider ast.literal_eval if the cells are plain literals - TODO confirm
    name_segments = eval(name_row.vectors)
    # keep the second element (the vector) of each entry in the transcription
    letter_name_vecs[name_row.letter] = [segment[1] for segment in name_segments]

# dictionary for corresponding average vectors
letter_name_avg_vecs = {}
for letter, vecs in letter_name_vecs.items():
    letter_name_avg_vecs[letter] = np.mean(vecs, axis=0)

# distance between the average letter-name vectors of every pair of letters,
# with rows and columns in the order of `chars`
name_phon_rdm = np.zeros((len(chars), len(chars)))

for row_idx, row_char in enumerate(chars):
    row_vec = letter_name_avg_vecs[row_char]
    for col_idx, col_char in enumerate(chars):
        name_phon_rdm[row_idx, col_idx] = dist_fun(row_vec, letter_name_avg_vecs[col_char])

# plt.figure()
# plt.imshow(name_phon_rdm, interpolation='none')
# plt.colorbar(label='Phonological Distance between Letter Name Phonemes')
# plt.xticks(range(len(chars)), labels=chars)
# plt.yticks(range(len(chars)), labels=chars)

# %%

# save RDMs

# get indices of the RDM's lower triangle (k=-1 leaves out the diagonal)
n_rdm_chars = len(chars)
tril_idx = np.tril_indices(n_rdm_chars, k=-1)
tril_idx_long = np.asarray(tril_idx).transpose()  # one (row, column) pair per line

# get corresponding character IDs
chars_arr = np.array(chars)
tril_rows, tril_cols = tril_idx
char1_ids = chars_arr[tril_rows]
char2_ids = chars_arr[tril_cols]

# save the frequency RDM, as the full matrix...
file_path_freq = op.join('stim_sim', 'frequency')
freq_npy_path = op.join(file_path_freq, 'frequency.npy')
np.save(freq_npy_path, freq_rdm)

# ...and in long format, with one row per letter pair in the lower triangle
letter_freq_vec = freq_rdm[tril_idx]
letter_freq_cols = {
    'char1': char1_ids,
    'char2': char2_ids,
    'freq_dist': letter_freq_vec,
}
letter_freq_df = pd.DataFrame(letter_freq_cols)

freq_csv_path = op.join(file_path_freq, 'frequency.csv')
letter_freq_df.to_csv(freq_csv_path, index=False, encoding='utf-8')

# save the frequency features (the count for each letter)
freq_feature_cols = {'char': chars, 'freq': subtlex_char_counts}
freq_features_df = pd.DataFrame(freq_feature_cols)

freq_features_csv_path = op.join(file_path_freq, 'frequency_features.csv')
freq_features_df.to_csv(freq_features_csv_path, index=False, encoding='utf-8')

# save the dominant phoneme phonological RDM, as the full matrix...
file_path_phon = op.join('stim_sim', 'phonology')
np.save(op.join(file_path_phon, 'dominant_phonemes.npy'), phon_rdm)

# ...and in long format, with one row per letter pair in the lower triangle
phon_vec = phon_rdm[tril_idx]
phon_cols = {
    'char1': char1_ids,
    'char2': char2_ids,
    'phon_dist': phon_vec,
}
phon_df = pd.DataFrame(phon_cols)

phon_df.to_csv(op.join(file_path_phon, 'dominant_phonemes.csv'), index=False, encoding='utf-8')

# save the dominant phoneme features: one row per letter, one column per vector element
# (the vector length is read from the entry for 'a')
n_phon_feats = len(letter_to_phoneme_vec['a'])

phon_feature_cols = {'char': chars}
for feat_idx in range(n_phon_feats):
    phon_feature_cols[f'feat_{feat_idx}'] = [letter_to_phoneme_vec[c][feat_idx] for c in chars]

phon_features_df = pd.DataFrame(phon_feature_cols)

phon_features_df.to_csv(op.join(file_path_phon, 'dominant_phonemes_features.csv'), index=False, encoding='utf-8')

# save the letter name phonological RDM, as the full matrix...
np.save(op.join(file_path_phon, 'letter_names.npy'), name_phon_rdm)

# ...and in long format, with one row per letter pair in the lower triangle
name_phon_vec = name_phon_rdm[tril_idx]
name_phon_cols = {
    'char1': char1_ids,
    'char2': char2_ids,
    'name_phon_dist': name_phon_vec,
}
name_phon_df = pd.DataFrame(name_phon_cols)

name_phon_df.to_csv(op.join(file_path_phon, 'letter_names.csv'), index=False, encoding='utf-8')

# save the letter name features: one row per letter, one column per vector element
# (the vector length is read from the entry for 'a')
n_phon_feats = len(letter_name_avg_vecs['a'])

name_phon_feature_cols = {'char': chars}
for feat_idx in range(n_phon_feats):
    name_phon_feature_cols[f'feat_{feat_idx}'] = [letter_name_avg_vecs[c][feat_idx] for c in chars]

name_phon_features_df = pd.DataFrame(name_phon_feature_cols)

name_phon_features_df.to_csv(op.join(file_path_phon, 'letter_names_features.csv'), index=False, encoding='utf-8')