# 00_get_orth_model_rdms.py
  1. # -*- coding: utf-8 -*-
  2. # %% import modules ======================
  3. import os
  4. n_threads = str(1)
  5. os.environ["OMP_NUM_THREADS"] = n_threads
  6. os.environ["OPENBLAS_NUM_THREADS"] = n_threads
  7. os.environ["MKL_NUM_THREADS"] = n_threads
  8. os.environ["VECLIB_MAXIMUM_THREADS"] = n_threads
  9. os.environ["NUMEXPR_NUM_THREADS"] = n_threads
  10. import scold
  11. from scold import draw, text_arr_sim, arr_sim, text_arr_sim_wasserstein
  12. import os.path as op
  13. import pandas as pd
  14. import numpy as np
  15. from tqdm import tqdm
  16. from string import ascii_lowercase
  17. from joblib import Parallel, delayed
  18. n_jobs = -1
  19. # %matplotlib qt
  20. # %% build character arrays
  21. chars = [*ascii_lowercase, 'ä', 'ö', 'ü', 'ß']
  22. font = 'Arial-Lgt.ttf'
  23. font_size = 250
  24. font_size_expl_geom = 75 # font size for the exploratory analyses examining geometric invariance (can be set to be smaller for feasibility)
  25. round_method = None
  26. char_arrs = [draw.text_array(c, font=font, size=font_size, method=round_method) for c in chars]
  27. char_arrs_smaller = [draw.text_array(c, font=font, size=font_size_expl_geom, method=round_method) for c in chars]
  28. out_path = 'stim_sim'
  29. res_path_ot = op.join(out_path, 'ot_variants')
  30. res_path_jacc_geom = op.join(out_path, 'jacc_geom')
  31. res_path_ot_geom = op.join(out_path, 'ot_geom')
  32. res_path_prereg = op.join(out_path, 'preregistered')
  33. # %% prepare for only processing one triangle
  34. tril_idx = np.tril_indices(len(chars), k=-1)
  35. tril_idx_long = np.array(tril_idx).T
  36. # %% complexity (size) distances (done serially as it's simple & fast)
  37. # char_ols = [draw.outline_shape(ca) for ca in char_arrs]
  38. # comp = [np.sum(co) for co in char_ols]
  39. comp = [np.sum(c) for c in char_arrs]
  40. comp_dists = [np.abs(c_i - c_j) for c_i in comp for c_j in comp]
  41. comp_dists_mat = np.array(comp_dists).reshape((len(chars), len(chars)))
  42. file_path = op.join('stim_sim', 'complexity')
  43. np.save(op.join(file_path, 'complexity.npy'), comp_dists_mat)
  44. # save to csv
  45. char1_ids = [c_i for c_i in chars for c_j in chars]
  46. char2_ids = [c_j for c_i in chars for c_j in chars]
  47. comp_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'comp_dist': comp_dists})
  48. comp_df.to_csv(op.join(file_path, 'complexity.csv'), index=False, encoding='utf-8')
  49. comp_features_df = pd.DataFrame( {'char': chars, 'pixel_sum': comp} )
  50. comp_features_df.to_csv(op.join(file_path, 'complexity_features.csv'), index=False, encoding='utf-8')
  51. # %% preregistered comparison: 1) Wasserstein Distance
  52. # cross-correlation-aligned, normalised Wasserstein distance
  53. desc = f'Preregistered Wasserstein Distance Measure'
  54. # calculate the similarities
  55. pw_res = Parallel(n_jobs=n_jobs)(
  56. delayed(arr_sim.partial_wasserstein_trans)(a_arr=char_arrs[i], b_arr=char_arrs[j], scale_mass=True, scale_mass_method='proportion', mass_normalise=False, distance_normalise=False, nb_dummies=2, del_weight=0.0, ins_weight=0.0, distance_metric='euclidean', translation='crosscor') for i, j in tqdm(tril_idx_long, desc=desc))
  57. # coerce into matrix
  58. pw_mat = np.zeros((len(chars), len(chars)))
  59. pw_mat[tril_idx] = np.array([pw_i['metric'] for pw_i in pw_res])
  60. pw_mat[tril_idx[1], tril_idx[0]] = pw_mat[tril_idx].T
  61. # check no ties in ranking for the preregistered analysis
  62. assert(len(np.unique(pw_mat[tril_idx])) == len(pw_mat[tril_idx]))
  63. # save to file
  64. file_name = f'ot'
  65. file_path = op.join(res_path_prereg, f'{file_name}.npy')
  66. np.save(file_path, pw_mat)
  67. # save to csv
  68. ot_vec = pw_mat[tril_idx]
  69. char1_ids = np.array(chars)[tril_idx[0]]
  70. char2_ids = np.array(chars)[tril_idx[1]]
  71. ot_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'ot': ot_vec})
  72. file_path = op.join(res_path_prereg, f'{file_name}.csv')
  73. ot_df.to_csv(file_path, index=False, encoding='utf-8')
  74. # %% preregistered comparison: 2) Jaccard Distance
  75. # cross-correlation-aligned Jaccard Distance
  76. desc = f'Preregistered Jaccard Distance Measure'
  77. # calculate the similarities
  78. j_res = Parallel(n_jobs=n_jobs)(
  79. delayed(text_arr_sim.opt_text_arr_sim)(
  80. chars[i], b_arr=char_arrs[j],
  81. font_a=font, size=font_size, method=round_method,
  82. measure='jaccard',
  83. translate=True,
  84. scale=False, fliplr=False, flipud=False, rotate=False)
  85. for i, j in tqdm(tril_idx_long, desc=desc))
  86. # coerce into matrix
  87. j_mat = np.ones((len(chars), len(chars)))
  88. j_mat[tril_idx] = np.array([j_i['jaccard'] for j_i in j_res])
  89. j_mat[tril_idx[1], tril_idx[0]] = j_mat[tril_idx].T
  90. # check no ties in ranking for the preregistered analysis
  91. assert(len(np.unique(j_mat[tril_idx])) == len(j_mat[tril_idx]))
  92. # invert to get distances (includes diagonals because np.ones was used)
  93. j_mat = 1 - j_mat
  94. # save to file
  95. file_name = f'jacc'
  96. file_path = op.join(res_path_prereg, f'{file_name}.npy')
  97. np.save(file_path, j_mat)
  98. # save to csv
  99. j_vec = j_mat[tril_idx]
  100. char1_ids = np.array(chars)[tril_idx[0]]
  101. char2_ids = np.array(chars)[tril_idx[1]]
  102. j_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'jacc': j_vec})
  103. file_path = op.join(res_path_prereg, f'{file_name}.csv')
  104. j_df.to_csv(file_path, index=False, encoding='utf-8')
  105. # %%
  106. # for exploratory analysis using Gromov-Wasserstein distance (geometric invariance)
  107. desc = f'Optimal Transport Gromov-Wasserstein Distance'
  108. pgw_res = Parallel(n_jobs=n_jobs)(
  109. delayed(arr_sim.partial_gromov_wasserstein)(a_arr=char_arrs_smaller[i], b_arr=char_arrs_smaller[j], scale_mass=True, scale_mass_method='proportion', mass_normalise=False, nb_dummies=2, del_weight=0.0, ins_weight=0.0) for i, j in tqdm(tril_idx_long, desc=desc))
  110. # coerce into array format
  111. pgw_mat = np.zeros((len(chars), len(chars)))
  112. pgw_mat[tril_idx] = pgw_res
  113. pgw_mat[tril_idx[1], tril_idx[0]] = pgw_mat[tril_idx].T
  114. # save to file
  115. file_name = 'ot_pgw'
  116. file_path = op.join(res_path_ot_geom, f'{file_name}.npy')
  117. np.save(file_path, pgw_mat)
  118. # save to csv
  119. ot_vec = pgw_mat[tril_idx]
  120. char1_ids = np.array(chars)[tril_idx[0]]
  121. char2_ids = np.array(chars)[tril_idx[1]]
  122. ot_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'ot': ot_vec})
  123. file_path = op.join(res_path_ot_geom, f'{file_name}.csv')
  124. ot_df.to_csv(file_path, index=False, encoding='utf-8')
  125. # %%
  126. # exploratory variants of Wasserstein distance
  127. for translation_opt in ('crosscor', 'opt'):
  128. for sq_euclidean in (False, True):
  129. desc = f'Optimal Transport sq_euclidean={int(sq_euclidean)} translation={translation_opt}'
  130. distance_metric = 'sqeuclidean' if sq_euclidean else 'euclidean'
  131. # calculate the similarities
  132. pw_res = Parallel(n_jobs=n_jobs)(
  133. delayed(arr_sim.partial_wasserstein_trans)(a_arr=char_arrs_smaller[i], b_arr=char_arrs_smaller[j], scale_mass=True, scale_mass_method='proportion', mass_normalise=False, distance_normalise=False, nb_dummies=2, del_weight=0.0, ins_weight=0.0, distance_metric=distance_metric, translation=translation_opt, n_startvals=5) for i, j in tqdm(tril_idx_long, desc=desc))
  134. # coerce into array format
  135. pw_mat = np.zeros((len(chars), len(chars)))
  136. pw_mat[tril_idx] = np.array([pw_i['metric'] for pw_i in pw_res])
  137. pw_mat[tril_idx[1], tril_idx[0]] = pw_mat[tril_idx].T
  138. # save to file
  139. file_name = f'ot_sq{int(sq_euclidean)}_t{translation_opt}'
  140. file_path = op.join(res_path_ot, f'{file_name}.npy')
  141. np.save(file_path, pw_mat)
  142. # save to csv
  143. ot_vec = pw_mat[tril_idx]
  144. char1_ids = np.array(chars)[tril_idx[0]]
  145. char2_ids = np.array(chars)[tril_idx[1]]
  146. ot_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, 'ot': ot_vec})
  147. file_path = op.join(res_path_ot, f'{file_name}.csv')
  148. ot_df.to_csv(file_path, index=False, encoding='utf-8')
  149. # %% exploratory variants of OT and Jaccard distance with geometric invariances
  150. for measure in ('jaccard', 'partial_wasserstein'):
  151. for translate in (False, True):
  152. for scale in (False, True):
  153. for rotate in (False, True):
  154. # for fliplr in (False, True):
  155. # for flipud in (False, True):
  156. fliplr = False
  157. flipud = False
  158. desc_lab = 'Jaccard' if measure=='jaccard' else 'Optimal Transport'
  159. desc = f'{desc_lab} T={int(translate)} S={int(scale)} R={int(rotate)}, Flr={int(fliplr)} Fud={int(flipud)}'
  160. # calculate the similarities
  161. if measure=='jaccard':
  162. opt_res = Parallel(n_jobs=n_jobs)(
  163. delayed(text_arr_sim.opt_text_arr_sim)(
  164. chars[i], b_arr=char_arrs_smaller[j],
  165. font_a=font, size=font_size_expl_geom, method=round_method,
  166. measure=measure,
  167. translate=translate,
  168. scale=scale, scale_eval_n=5, max_scale_change_factor=2.0,
  169. fliplr=fliplr, flipud=flipud,
  170. rotate=rotate, rotation_eval_n=5, rotation_bounds=(-np.inf, np.inf)
  171. ) for i, j in tqdm(tril_idx_long, desc=desc))
  172. elif measure=='partial_wasserstein':
  173. opt_res = Parallel(n_jobs=n_jobs)(
  174. delayed(text_arr_sim_wasserstein.opt_text_arr_sim)(
  175. chars[i], b_arr=char_arrs_smaller[j],
  176. font_a=font, size=font_size_expl_geom, method=round_method,
  177. measure=measure,
  178. translate=translate, translation_eval_n=5, max_translation_factor=0.99,
  179. scale=scale, scale_eval_n=5, max_scale_change_factor=2.0,
  180. fliplr=fliplr, flipud=flipud,
  181. rotate=rotate, rotation_eval_n=5, rotation_bounds=(-np.inf, np.inf),
  182. partial_wasserstein_kwargs={'scale_mass':True, 'scale_mass_method':'proportion', 'mass_normalise':False, 'distance_normalise':False, 'ins_weight':0.0, 'del_weight':0.0}
  183. ) for i, j in tqdm(tril_idx_long, desc=desc))
  184. # coerce into matrix
  185. opt_mat = np.ones((len(chars), len(chars)))
  186. opt_mat[tril_idx] = np.array([x_i[measure] for x_i in opt_res])
  187. opt_mat[tril_idx[1], tril_idx[0]] = opt_mat[tril_idx].T
  188. # invert to get distances (includes diagonals because np.ones was used)
  189. if measure == 'jaccard':
  190. opt_mat = 1 - opt_mat
  191. # save to file
  192. if measure == 'jaccard':
  193. file_name = f'jacc_T{int(translate)}_S{int(scale)}_R{int(rotate)}_Flr{int(fliplr)}_Fud{int(flipud)}'
  194. file_path = op.join(res_path_jacc_geom, f'{file_name}.npy')
  195. file_path_df = op.join(res_path_jacc_geom, f'{file_name}.csv')
  196. else:
  197. file_name = f'ot_T{int(translate)}_S{int(scale)}_R{int(rotate)}_Flr{int(fliplr)}_Fud{int(flipud)}'
  198. file_path = op.join(res_path_ot_geom, f'{file_name}.npy')
  199. file_path_df = op.join(res_path_ot_geom, f'{file_name}.csv')
  200. np.save(file_path, opt_mat)
  201. # save to csv
  202. char1_ids = np.array(chars)[tril_idx[0]]
  203. char2_ids = np.array(chars)[tril_idx[1]]
  204. opt_vec = opt_mat[tril_idx]
  205. if measure == 'jaccard':
  206. measure_lab = 'jacc'
  207. elif measure == 'partial_wasserstein':
  208. measure_lab = 'ot'
  209. opt_df = pd.DataFrame({'char1': char1_ids, 'char2': char2_ids, measure_lab: opt_vec})
  210. opt_df.to_csv(file_path_df, index=False, encoding='utf-8')