01_get_corpus_model_rdms.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 15 12:01:39 2024
@author: test
"""
''' modified from the original included in the library
::vowel:: = a|ä|e|i|o|ö|u|ü
::consonant:: = b|c|ch|ck|d|dt|f|g|h|j|k|l|m|n|p|pf|r|s|sch|t|tsch|tz|tzsch|v|w|z|ʀ
% -tion for Latin loanwords
t -> <t> / _ ion
% Retraction of initial <s> before plosives
s -> <s> / # _ (p|t)
% Initial voicing of <s>
s -> <zed> / # _ (::vowel::)
% Initial voicing of <c>
c -> <c> / # _ (::vowel::)
% Final obstruent devoicing
b -> <b> / _ #|(::consonant::)(::vowel::)
d -> <d> / _ #|(::consonant::)(::vowel::)
g -> <g> / _ #|(::consonant::)(::vowel::)
% Handling of r
er -> ɐ / _ #
r -> <r1> / [äeioöuü]h? _ #|(::consonant::)
r -> 0 / a _ #|(::consonant::)
r -> <r3> / (::vowel::)h? _ (::vowel::)
% Final schwa
e -> ə / _ #
% Open syllable lengthening
i -> ie / _ #|(::consonant::)(::vowel::)
e -> ee / [^ei] _ #|(::consonant::)(::vowel::)
ü -> üh / _ #|(::consonant::)(::vowel::)
ö -> öo / _ #|(::consonant::)(::vowel::)
u -> uh / _ #|(::consonant::)(::vowel::)
o -> oo / [^oö] _ #|(::consonant::)(::vowel::)
a -> aa / [^a] _ #|(::consonant::)(::vowel::)
'''
'''
reference
David R. Mortensen, Siddharth Dalmia, and Patrick Littell. 2018.
Epitran: Precision G2P for Many Languages.
In Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018), Miyazaki, Japan. European Language Resources Association (ELRA).
'''
  43. custom_lettercode_2_letter = {'<t>': ['t'],
  44. '<s>': ['s'],
  45. '<zed>': ['s'],
  46. '<c>': ['c'],
  47. '<b>': ['b'],
  48. '<d>': ['d'],
  49. '<g>': ['g'],
  50. 'ɐ': ['e', 'r'],
  51. '<r1>': ['r'],
  52. '<r3>': ['r'],
  53. 'ə': ['e'],
  54. 'ie': ['i'],
  55. 'ee': ['e'],
  56. 'üh': ['ü'],
  57. 'öo': ['ö'],
  58. 'uh': ['u'],
  59. 'oo': ['o'],
  60. 'aa': ['a'],
  61. 'qu': ['q', 'u']} # qu added by Jack
# %%
import epitran
import epitran.vector
import pandas as pd
from tqdm import tqdm
import unicodedata
import numpy as np
import scipy as sp
import os.path as op
from string import ascii_lowercase
# from matplotlib import pyplot as plt
# copy over the custom epitran mapping files
import shutil
# Locate epitran's install directory so the custom German rule files can be
# dropped into its own data folders (map/ and pre/), where Epitran() looks
# them up by name.
epitran_loc = op.dirname(epitran.__file__)
shutil.copyfile(op.join('custom_epitran_files', 'deu-Latn-nar_custom.csv'), op.join(epitran_loc, 'data', 'map', 'deu-Latn-nar_custom.csv'))
shutil.copyfile(op.join('custom_epitran_files', 'deu-Latn-nar_custom.txt'), op.join(epitran_loc, 'data', 'pre', 'deu-Latn-nar_custom.txt'))
# G2P transliterator using the custom narrow German rules installed above
epi = epitran.Epitran('deu-Latn-nar_custom')
# Word-frequency corpus; code below reads columns 'Word' and 'WFfreqcount'
freq_df = pd.read_csv('SUBTLEX-DE_cleaned_with_Google00_frequencies.csv')
  80. chars = [*ascii_lowercase, 'ä', 'ö', 'ü', 'ß'] # this order must match the RDM order of indices
  81. # ensure NFC encoding
  82. chars = [unicodedata.normalize('NFC', c) for c in chars]
  83. # %%
  84. # get letter frequency
  85. subtlex_chars = np.array([L for w in freq_df.Word for L in w])
  86. subtlex_char_wts = np.array([r.WFfreqcount for i, r in tqdm(freq_df.iterrows(), desc='Getting letter frequencies', total=len(freq_df)) for j in range(len(r.Word))])
  87. # # check irrelevant characters
  88. # x = np.unique(subtlex_chars)
  89. # x[~np.isin(x, chars)]
  90. subtlex_char_counts = np.array([subtlex_char_wts[subtlex_chars==c].sum() for c in chars])
  91. freq_rdm = np.zeros((len(chars), len(chars)))
  92. for i in range(freq_rdm.shape[0]):
  93. for j in range(freq_rdm.shape[1]):
  94. freq_rdm[i, j] = np.abs(subtlex_char_counts[i]-subtlex_char_counts[j])
  95. # plt.figure()
  96. # plt.bar(chars, subtlex_char_counts)
  97. # plt.ylabel('Count')
  98. # plt.figure()
  99. # plt.imshow(freq_rdm, interpolation='none')
  100. # plt.colorbar(label='Letter Frequency Distance (Count)')
  101. # plt.xticks(range(len(chars)), labels=chars)
  102. # plt.yticks(range(len(chars)), labels=chars)
# %%
# get frequencies of letter-phoneme mapping, and store the vectors for each phoneme
# g2p_counts[letter][phoneme_code] accumulates frequency-weighted counts of
# each letter -> phoneme mapping observed in the corpus.
g2p_counts = {c: {} for c in chars}
phon_vecs = {} # will contain the vectors for each phoneme
for i, r in tqdm(freq_df.iterrows(), total=len(freq_df), desc='Counting letter-phoneme mappings'):
    f = r.WFfreqcount  # word frequency, used as the count weight
    w = r.Word.lower()
    # get segments and phonemes for this word
    # epi.word_to_tuples returns one tuple per segment; index 2 is the
    # orthographic segment, index 4 is a list of (phoneme, vector) pairs.
    epi_ph = epi.word_to_tuples(w)
    w_segs = [[unicodedata.normalize('NFC', x[2])] for x in epi_ph] # ensure NFC encoding
    w_seg_phons = [[y[0] for y in x[4]] for x in epi_ph] # this accounts for IPA phonemes like the voiceless alveolar sibilant affricate t͡s, which is technically 3 characters
    ph_vecs = [x[4] for x in epi_ph]
    # for each pair of segments of letters, and associated phonemes
    for seg_j, ph_j, vecs_j in zip(w_segs, w_seg_phons, ph_vecs):
        ph_code_j = '_'.join(ph_j) # an underscore-separated ID for the phoneme(s)
        # store the phoneme vectors, if not done already, for each phoneme associated with this segment
        # (dealing with one-to-many letter-to-phoneme mappings is done later)
        for ph_vec_k in vecs_j:
            if ph_vec_k[0] not in phon_vecs.keys():
                phon_vecs[ph_vec_k[0]] = np.array(ph_vec_k[1])
            else:
                # if already in the dictionary, just double check that the vector matches that from other iterations
                assert np.all(phon_vecs[ph_vec_k[0]] == np.array(ph_vec_k[1]))
        # for each lettercode in the segment
        for k, seg_lettercode_k in enumerate(seg_j):
            # convert the letter code to the letter(s)
            if seg_lettercode_k in custom_lettercode_2_letter.keys():
                # if a special case, look up the associated character
                seg_letter_k = custom_lettercode_2_letter[seg_lettercode_k]
            else:
                # otherwise, the lettercode *is* the letter
                seg_letter_k = [seg_lettercode_k]
            # if there is just one letter...
            if len(seg_letter_k)==1:
                # ...and it is in the list of letters we're interested in...
                if seg_letter_k[0] in g2p_counts.keys():
                    # ...then count the associated phonemes
                    if ph_code_j in g2p_counts[seg_letter_k[0]].keys(): # seg_letter_k[0], to extract the one letter from its list, as a string
                        g2p_counts[seg_letter_k[0]][ph_code_j] += f # weight the count by word frequency
                    else:
                        g2p_counts[seg_letter_k[0]][ph_code_j] = f
            # THIS NEXT BIT MAKES SOME ASSUMPTIONS, BUT OTHERWISE UNSURE HOW TO DEAL WITH "qu", AND WE WOULD HAVE NO PHONEMES FOR "q"
            # Specifically, I assume that, e.g., in segment "qu", comprising the phonemes, ['k', 'v'], "q" is mapped to 'k', and "u" is mapped to "v"
            # if there are multiple letters, but the number of letters matches the number of phonemes
            elif len(seg_letter_k)==len(ph_j):
                # then map each letter to the phoneme at the same position:
                for letter_kl, ph_kl in zip(seg_letter_k, ph_j):
                    # if this letter is in the list of letters we're interested in...
                    if letter_kl in g2p_counts.keys():
                        # ...then count the associated phonemes
                        # NOTE: here the key is the single phoneme ph_kl, which
                        # equals its own '_'-joined code, so keys stay consistent
                        # with the single-letter branch above.
                        if ph_kl in g2p_counts[letter_kl].keys():
                            g2p_counts[letter_kl][ph_kl] += f # weight the count by word frequency
                        else:
                            g2p_counts[letter_kl][ph_kl] = f
  157. # %%
  158. # put together a dictionary that looks up the vector associated with the dominant pronunciation of each letter
  159. letter_to_phoneme_vec = {}
  160. for L, ph_counts in g2p_counts.items():
  161. # find the most common phoneme, in the dictionary for this letter
  162. max_count_idx = np.argmax(ph_counts.values())
  163. max_count_phoneme_code = list(ph_counts.keys())[max_count_idx]
  164. max_count_phonemes = max_count_phoneme_code.split('_') # above, we use underscores to signify discrete phonemes
  165. # look up the vector associated with the dominant phoneme
  166. if len(max_count_phonemes) > 1:
  167. # handle the cases where one letter maps to multiple phonemes (e.g., letter x mapping to phonemes k, s)
  168. # average the vectors for the phonemes associated with this letter
  169. max_count_vec = np.stack([phon_vecs[ph] for ph in max_count_phonemes]).mean(axis=0)
  170. else:
  171. # if there's only one phoneme, then just index it
  172. max_count_vec = phon_vecs[max_count_phonemes[0]]
  173. # store the vector in the dictionary
  174. letter_to_phoneme_vec[L] = max_count_vec
  175. # %%
  176. # get RDM matrix for phonological representations of the dominant phonemes associated with each letter
  177. def euc_dist(a, b):
  178. return np.linalg.norm(a-b)
  179. cos_dist = sp.spatial.distance.cosine
  180. def cor_dist(a, b):
  181. return 1 - np.corrcoef(np.stack([a, b], axis=0))[0, 1]
  182. # set which distance function to use to calculate distances between vectors
  183. dist_fun = cor_dist
  184. phon_rdm = np.zeros((len(chars), len(chars)))
  185. for i in range(phon_rdm.shape[0]):
  186. for j in range(phon_rdm.shape[1]):
  187. phon_rdm[i, j] = dist_fun(
  188. letter_to_phoneme_vec[chars[i]],
  189. letter_to_phoneme_vec[chars[j]]
  190. )
  191. # plt.figure()
  192. # plt.imshow(phon_rdm, interpolation='none')
  193. # plt.colorbar(label='Phonological Distance between Dominant Phonemes')
  194. # plt.xticks(range(len(chars)), labels=chars)
  195. # plt.yticks(range(len(chars)), labels=chars)
  196. # %%
  197. # get RDM matrix for letter name transcriptions
  198. letter_names_df = pd.read_csv('transcribed_phonological_letter_name_vectors.csv', index_col=0)
  199. # dictionary for looking up a letter name's vectors
  200. letter_name_vecs = {r.letter: [x[1] for x in eval(r.vectors)] for _, r in letter_names_df.iterrows()}
  201. # dictionary for corresponding average vectors
  202. letter_name_avg_vecs = {k: np.mean(v, axis=0) for k, v in letter_name_vecs.items()}
  203. name_phon_rdm = np.zeros((len(chars), len(chars)))
  204. for i in range(name_phon_rdm.shape[0]):
  205. for j in range(name_phon_rdm.shape[1]):
  206. name_phon_rdm[i, j] = dist_fun(
  207. letter_name_avg_vecs[chars[i]],
  208. letter_name_avg_vecs[chars[j]]
  209. )
  210. # plt.figure()
  211. # plt.imshow(name_phon_rdm, interpolation='none')
  212. # plt.colorbar(label='Phonological Distance between Letter Name Phonemes')
  213. # plt.xticks(range(len(chars)), labels=chars)
  214. # plt.yticks(range(len(chars)), labels=chars)
  215. # %%
  216. # save RDMs
  217. # get indices of the RDM's lower triangle
  218. tril_idx = np.tril_indices(len(chars), k=-1)
  219. tril_idx_long = np.array(tril_idx).T
  220. # get corresponding character IDs
  221. char1_ids = np.array(chars)[tril_idx[0]]
  222. char2_ids = np.array(chars)[tril_idx[1]]
  223. # save the frequency RDM
  224. file_path_freq = op.join('stim_sim', 'frequency')
  225. np.save(op.join(file_path_freq, 'frequency.npy'), freq_rdm)
  226. letter_freq_vec = freq_rdm[tril_idx]
  227. letter_freq_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'freq_dist': letter_freq_vec})
  228. letter_freq_df.to_csv(op.join(file_path_freq, 'frequency.csv'), index=False, encoding='utf-8')
  229. # save the frequency features
  230. freq_features_df = pd.DataFrame( {'char': chars, 'freq': subtlex_char_counts} )
  231. freq_features_df.to_csv(op.join(file_path_freq, 'frequency_features.csv'), index=False, encoding='utf-8')
  232. # save the dominant phoneme phonological RDM
  233. file_path_phon = op.join('stim_sim', 'phonology')
  234. np.save(op.join(file_path_phon, 'dominant_phonemes.npy'), phon_rdm)
  235. phon_vec = phon_rdm[tril_idx]
  236. phon_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'phon_dist': phon_vec})
  237. phon_df.to_csv(op.join(file_path_phon, 'dominant_phonemes.csv'), index=False, encoding='utf-8')
  238. # save the dominant phoneme features
  239. phon_features_df = pd.DataFrame( {'char': chars} )
  240. n_phon_feats = len(letter_to_phoneme_vec['a'])
  241. for i in range(n_phon_feats):
  242. phon_features_df[f'feat_{i}'] = [letter_to_phoneme_vec[c][i] for c in chars]
  243. phon_features_df.to_csv(op.join(file_path_phon, 'dominant_phonemes_features.csv'), index=False, encoding='utf-8')
  244. # save the letter name phonological RDM
  245. np.save(op.join(file_path_phon, 'letter_names.npy'), name_phon_rdm)
  246. name_phon_vec = name_phon_rdm[tril_idx]
  247. name_phon_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'name_phon_dist': name_phon_vec})
  248. name_phon_df.to_csv(op.join(file_path_phon, 'letter_names.csv'), index=False, encoding='utf-8')
  249. # save the letter name features
  250. name_phon_features_df = pd.DataFrame( {'char': chars} )
  251. n_phon_feats = len(letter_name_avg_vecs['a'])
  252. for i in range(n_phon_feats):
  253. name_phon_features_df[f'feat_{i}'] = [letter_name_avg_vecs[c][i] for c in chars]
  254. name_phon_features_df.to_csv(op.join(file_path_phon, 'letter_names_features.csv'), index=False, encoding='utf-8')