"""Convenience functions for working with Rumelhart-Siple representations of characters""" import numpy as np import matplotlib.pyplot as plt from scipy.signal import correlate # binary values for rumelhart characters - each character has a 14-bit representation rumelhart_vals = { 'a': np.array([1,1,1,1,1,0,1,0,1,0,0,0,0,0], dtype=bool), 'b': np.array([0,0,1,1,1,1,0,1,1,1,0,0,0,0], dtype=bool), 'c': np.array([1,1,1,0,0,1,0,0,0,0,0,0,0,0], dtype=bool), 'd': np.array([0,0,1,1,1,1,0,1,0,1,0,0,0,0], dtype=bool), 'e': np.array([1,1,1,0,0,1,1,0,0,0,0,0,0,0], dtype=bool), 'f': np.array([1,1,1,0,0,0,1,0,0,0,0,0,0,0], dtype=bool), 'g': np.array([1,1,1,0,1,1,0,0,1,0,0,0,0,0], dtype=bool), 'h': np.array([1,1,0,1,1,0,1,0,1,0,0,0,0,0], dtype=bool), 'i': np.array([0,0,1,0,0,1,0,1,0,1,0,0,0,0], dtype=bool), 'j': np.array([1,0,0,1,1,1,0,0,0,0,0,0,0,0], dtype=bool), 'k': np.array([1,1,0,0,0,0,1,0,0,0,0,1,1,0], dtype=bool), 'l': np.array([1,1,0,0,0,1,0,0,0,0,0,0,0,0], dtype=bool), 'm': np.array([1,1,0,1,1,0,0,0,0,0,1,1,0,0], dtype=bool), 'n': np.array([1,1,0,1,1,0,0,0,0,0,1,0,1,0], dtype=bool), 'o': np.array([1,1,1,1,1,1,0,0,0,0,0,0,0,0], dtype=bool), 'p': np.array([1,1,1,1,0,0,1,0,1,0,0,0,0,0], dtype=bool), 'q': np.array([1,1,1,1,1,1,0,0,0,0,0,0,1,0], dtype=bool), 'r': np.array([1,1,1,1,0,0,1,0,1,0,0,0,1,0], dtype=bool), 's': np.array([0,1,1,0,1,1,1,0,1,0,0,0,0,0], dtype=bool), 't': np.array([0,0,1,0,0,0,0,1,0,1,0,0,0,0], dtype=bool), 'u': np.array([1,1,0,1,1,1,0,0,0,0,0,0,0,0], dtype=bool), 'v': np.array([1,1,0,0,0,0,0,0,0,0,0,1,0,1], dtype=bool), 'w': np.array([1,1,0,1,1,0,0,0,0,0,0,0,1,1], dtype=bool), 'x': np.array([0,0,0,0,0,0,0,0,0,0,1,1,1,1], dtype=bool), 'y': np.array([0,0,0,0,0,0,0,0,0,1,1,1,0,0], dtype=bool), 'z': np.array([0,0,1,0,0,1,0,0,0,0,0,1,0,1], dtype=bool)} def plot_rumelhart(char=None, arr=None, w=12, h=18): """A simple function for plotting 14-bit Rumelhart representations Parameters ---------- char : str, optional The character to look up and plot. If absent, `arr` must be defined. If both `arr` and `char` are defined, `char` will take priority. arr : ndarray, optional A 1-D array of the 14 bits to translate into an image. If absent, `char` must be defined. w : int Width of the image, in pixels - should be even. h : int Height of the image, in pixels - should be even. """ c = np.zeros((h, w)) w_hlf = int(w/2) h_hlf = int(h/2) if char is not None: arr = rumelhart_vals[char] if arr[0] == 1: c[h_hlf:h, 0] = 1 if arr[1] == 1: c[0:h_hlf, 0] = 1 if arr[2] == 1: c[0, :] = 1 if arr[3] == 1: c[0:h_hlf, w-1] = 1 if arr[4] == 1: c[h_hlf:h, w-1] = 1 if arr[5] == 1: c[h-1, :] = 1 if arr[6] == 1: c[h_hlf-1, 0:w_hlf] = 1 if arr[7] == 1: c[0:h_hlf, w_hlf-1] = 1 if arr[8] == 1: c[h_hlf-1, w_hlf:w] = 1 if arr[9] == 1: c[h_hlf:h, w_hlf-1] = 1 if arr[10] == 1: c[np.round(np.arange(0, h_hlf, 1)).astype('int'), np.round(np.arange(0, w_hlf, w/h)).astype('int')] = 1 if arr[11] == 1: c[np.round(np.arange(0, h_hlf, 1)).astype('int'), np.flip(np.round(np.arange(w_hlf, w, w/h)).astype('int'))] = 1 if arr[12] == 1: c[np.round(np.arange(h_hlf, h, 1)).astype('int'), np.round(np.arange(w_hlf, w, w/h)).astype('int')] = 1 if arr[13] == 1: c[np.round(np.arange(h_hlf, h, 1)).astype('int'), np.flip(np.round(np.arange(0, w_hlf, w/h)).astype('int'))] = 1 plt.imshow(c, cmap='Greys', interpolation='none') # function for calculating bit-wise full word form similarity of words built from Rumelhart-Siple characters def scold_fullword_rumelhart_slidingwindow(a, b): if len(a)>len(b): i = a.lower() j = b.lower() else: i = b.lower() j = a.lower() i_arr = np.array([rumelhart_vals[c] for c in i], dtype=int) j_arr = np.array([rumelhart_vals[c] for c in j], dtype=int) # simple sliding window to find max overlap str_len_diff = len(i)-len(j) if str_len_diff>0: i_arr = np.pad(i_arr, pad_width=((str_len_diff-1, str_len_diff-1), (0, 0))) j_shifts_l = np.arange(0, i_arr.shape[0]-1, step=1) j_shifts_r = np.flip(j_shifts_l) sh_res = np.zeros(len(j_shifts_l)) for sh in range(len(j_shifts_l)): j_pad = np.pad(j_arr, pad_width=((j_shifts_l[sh], j_shifts_r[sh]), (0, 0)), constant_values=0) sh_res[sh] = np.sum(np.abs(i_arr ^ j_pad)) return(np.min(sh_res)) # a version of the function above that uses cross-correlation def scold_fullword_rumelhart_cc(a, b): if len(a)>len(b): i = a.lower() j = b.lower() else: i = b.lower() j = a.lower() i_arr = np.array([rumelhart_vals[c] for c in i], dtype=int) j_arr = np.array([rumelhart_vals[c] for c in j], dtype=int) # ensure same size str_len_diff = len(i)-len(j) if str_len_diff>0: j_arr = np.pad(j_arr, pad_width=((str_len_diff, 0), (0, 0)), constant_values=0) # ensure even indices if j_arr.shape[0] % 2 != 0: j_arr = np.pad(j_arr, ((0, 1), (0, 0))) i_arr = np.pad(i_arr, ((0, 1), (0, 0))) # pad to get all possible overlaps i_arr_pad = np.pad(i_arr, pad_width=((len(i_arr), len(i_arr)), (0, 0))) j_arr_pad = np.pad(j_arr, pad_width=((len(j_arr), len(j_arr)), (0, 0))) # cross correlate cc = correlate(i_arr_pad, j_arr_pad, mode='same') max_idx = np.argmax(cc[:, 7]) shift = np.negative(np.round((j_arr_pad.shape[0])/2 - max_idx)).astype(int) if shift>0: pad = (shift, 0) elif shift<0: pad = (0, np.abs(shift)) else: pad = (0, 0) i_res = np.pad(i_arr, pad_width=(np.flip(pad), (0, 0)), constant_values=0) j_res = np.pad(j_arr, pad_width=(pad, (0, 0)), constant_values=0) return(np.sum(np.abs(i_res ^ j_res))) # a version of the function above that tries to find the overlap in parallel def scold_fullword_rumelhart_mat(a, b): if len(a)>len(b): i = a.lower() j = b.lower() else: i = b.lower() j = a.lower() i_arr = np.array([rumelhart_vals[c] for c in i]) j_arr = np.array([rumelhart_vals[c] for c in j]) str_len_diff = len(i)-len(j) if str_len_diff>0: i_arr = np.pad(i_arr, pad_width=((str_len_diff-1, str_len_diff-1), (0, 0))) j_shifts_l = np.arange(0, i_arr.shape[0]-1, step=1) j_shifts_r = np.flip(j_shifts_l) possible_js = np.array([np.pad(j_arr, pad_width=((j_shifts_l[sh], j_shifts_r[sh]), (0, 0)), constant_values=0) for sh in range(len(j_shifts_l))]) diffs = np.sum(np.logical_xor(i_arr, possible_js), axis=(1, 2)) return(np.min(diffs)) # SCOLD20 for words in Rumelhart characters def scold20_fullword_rumelhart(w, words, n=20): if w in words: words.remove(w) all_dists = [scold_fullword_rumelhart_mat(w, x) for x in words] all_dists = np.asarray(all_dists, dtype=np.int64) all_dists = np.sort(all_dists) return(np.mean(all_dists[0:n]))