123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186 |
- """Convenience functions for working with Rumelhart-Siple representations of characters"""
- import numpy as np
- import matplotlib.pyplot as plt
- from scipy.signal import correlate
# Rumelhart-Siple segment codes: one 14-bit pattern per lowercase letter.
# Each string lists bits 0..13 in order ('1' = bit set); it is expanded into
# a 1-D boolean array of length 14.
rumelhart_vals = {
    letter: np.array([bit == '1' for bit in bits], dtype=bool)
    for letter, bits in {
        'a': '11111010100000',
        'b': '00111101110000',
        'c': '11100100000000',
        'd': '00111101010000',
        'e': '11100110000000',
        'f': '11100010000000',
        'g': '11101100100000',
        'h': '11011010100000',
        'i': '00100101010000',
        'j': '10011100000000',
        'k': '11000010000110',
        'l': '11000100000000',
        'm': '11011000001100',
        'n': '11011000001010',
        'o': '11111100000000',
        'p': '11110010100000',
        'q': '11111100000010',
        'r': '11110010100010',
        's': '01101110100000',
        't': '00100001010000',
        'u': '11011100000000',
        'v': '11000000000101',
        'w': '11011000000011',
        'x': '00000000001111',
        'y': '00000000011100',
        'z': '00100100000101',
    }.items()
}
def plot_rumelhart(char=None, arr=None, w=12, h=18):
    """Draw a 14-bit Rumelhart representation as a binary image.

    Each of the 14 bits switches on one line segment of an ``h`` x ``w``
    pixel grid; the resulting grid is shown with ``plt.imshow``.

    Parameters
    ----------
    char : str, optional
        The character to look up in `rumelhart_vals` and plot. The lookup is
        case-insensitive. If absent, `arr` must be given. If both `arr` and
        `char` are given, `char` takes priority.
    arr : ndarray, optional
        A 1-D array of the 14 bits to translate into an image. If absent,
        `char` must be given.
    w : int
        Width of the image, in pixels - should be even.
    h : int
        Height of the image, in pixels - should be even.

    Raises
    ------
    ValueError
        If neither `char` nor `arr` is given.
    KeyError
        If `char` has no entry in `rumelhart_vals`.
    """
    if char is None and arr is None:
        raise ValueError("either `char` or `arr` must be provided")
    if char is not None:
        # lower-case first, matching how the distance functions look letters up
        arr = rumelhart_vals[char.lower()]

    c = np.zeros((h, w))
    w_hlf = w // 2
    h_hlf = h // 2

    top, bottom = slice(0, h_hlf), slice(h_hlf, h)
    left, right = slice(0, w_hlf), slice(w_hlf, w)
    full = slice(None)

    # Pixel coordinates for the diagonals: one pixel per row, with the column
    # advancing by w/h per row so a diagonal spans half the width over half
    # the height.
    top_rows = np.arange(0, h_hlf)
    bottom_rows = np.arange(h_hlf, h)
    left_cols = np.round(np.arange(0, w_hlf, w / h)).astype(int)
    right_cols = np.round(np.arange(w_hlf, w, w / h)).astype(int)

    # (rows, cols) painted by each bit, indexed by bit position
    segments = (
        (bottom, 0),                        # 0: left edge, lower half
        (top, 0),                           # 1: left edge, upper half
        (0, full),                          # 2: top edge
        (top, w - 1),                       # 3: right edge, upper half
        (bottom, w - 1),                    # 4: right edge, lower half
        (h - 1, full),                      # 5: bottom edge
        (h_hlf - 1, left),                  # 6: middle bar, left half
        (top, w_hlf - 1),                   # 7: centre column, upper half
        (h_hlf - 1, right),                 # 8: middle bar, right half
        (bottom, w_hlf - 1),                # 9: centre column, lower half
        (top_rows, left_cols),              # 10: top-left corner to centre
        (top_rows, np.flip(right_cols)),    # 11: top-right corner to centre
        (bottom_rows, right_cols),          # 12: centre to bottom-right corner
        (bottom_rows, np.flip(left_cols)),  # 13: centre to bottom-left corner
    )
    for bit, (rows, cols) in enumerate(segments):
        if arr[bit] == 1:
            c[rows, cols] = 1

    plt.imshow(c, cmap='Greys', interpolation='none')
# function for calculating bit-wise full word form similarity of words built from Rumelhart-Siple characters
def scold_fullword_rumelhart_slidingwindow(a, b):
    """Return the smallest bit-wise distance between two words over all alignments.

    Both words are lower-cased and converted, letter by letter, to rows of
    bits taken from `rumelhart_vals`. The shorter word is then slid along the
    longer one; at each offset the number of differing bits (XOR) is counted,
    with positions not covered by a letter treated as all-zero rows. The
    minimum count over all offsets is returned.

    When the lengths differ by ``d > 0`` the longer word is framed with
    ``d - 1`` zero rows on each side, so the shorter word may overhang either
    end by up to ``d - 1`` letters. Words of equal length are compared at a
    single offset.

    Parameters
    ----------
    a, b : str
        The words to compare. Every character (after lower-casing) must be a
        key of `rumelhart_vals`.

    Returns
    -------
    numpy.float64
        The minimum number of differing bits.

    Raises
    ------
    KeyError
        If a character has no entry in `rumelhart_vals`.
    """
    if len(a) > len(b):
        longer, shorter = a.lower(), b.lower()
    else:
        longer, shorter = b.lower(), a.lower()
    i_arr = np.array([rumelhart_vals[c] for c in longer], dtype=int)
    j_arr = np.array([rumelhart_vals[c] for c in shorter], dtype=int)

    # simple sliding window to find max overlap
    str_len_diff = len(longer) - len(shorter)
    if str_len_diff > 0:
        frame_pad = str_len_diff - 1
        i_arr = np.pad(i_arr, pad_width=((frame_pad, frame_pad), (0, 0)))

    # Number of offsets at which the shorter word fits inside the frame.
    # Padding by (sh, n_shifts - 1 - sh) always yields exactly as many rows as
    # the frame, whatever the length of the shorter word.
    n_shifts = i_arr.shape[0] - j_arr.shape[0] + 1
    sh_res = np.zeros(n_shifts)
    for sh in range(n_shifts):
        j_pad = np.pad(j_arr, pad_width=((sh, n_shifts - 1 - sh), (0, 0)), constant_values=0)
        sh_res[sh] = np.sum(i_arr ^ j_pad)
    return np.min(sh_res)
# a version of the function above that uses cross-correlation
def scold_fullword_rumelhart_cc(a, b):
    """Return the bit-wise distance between two words at their best-correlated alignment.

    Both words are lower-cased and converted to rows of bits from
    `rumelhart_vals`. The alignment is chosen by cross-correlating the two
    bit arrays and taking the row lag with the highest correlation; the
    words are then shifted by that lag and the number of differing bits
    (XOR) is returned.

    Parameters
    ----------
    a, b : str
        The words to compare. Every character (after lower-casing) must be a
        key of `rumelhart_vals`.

    Returns
    -------
    numpy integer
        The number of differing bits at the selected alignment.
    """
    longer, shorter = (a.lower(), b.lower()) if len(a) > len(b) else (b.lower(), a.lower())
    long_bits = np.array([rumelhart_vals[ch] for ch in longer], dtype=int)
    short_bits = np.array([rumelhart_vals[ch] for ch in shorter], dtype=int)

    # bring the shorter word up to the same number of rows (zeros in front)
    n_missing = len(longer) - len(shorter)
    if n_missing > 0:
        short_bits = np.pad(short_bits, pad_width=((n_missing, 0), (0, 0)), constant_values=0)

    # make the row count even by appending one zero row to both
    if short_bits.shape[0] % 2:
        one_trailing_row = ((0, 1), (0, 0))
        short_bits = np.pad(short_bits, one_trailing_row)
        long_bits = np.pad(long_bits, one_trailing_row)

    # surround each array with its own length in zeros so every overlap is covered
    long_wide = np.pad(long_bits, pad_width=((len(long_bits), len(long_bits)), (0, 0)))
    short_wide = np.pad(short_bits, pad_width=((len(short_bits), len(short_bits)), (0, 0)))

    # cross-correlate; column 7 of the 14-wide 'same' output is the one where
    # the bit columns of both words line up without sideways displacement
    xcorr = correlate(long_wide, short_wide, mode='same')
    peak_row = np.argmax(xcorr[:, 7])
    shift = np.negative(np.round(short_wide.shape[0] / 2 - peak_row)).astype(int)

    # positive shift: zeros go in front of the shorter word and behind the
    # longer one; negative shift: the other way round
    front, back = (shift, 0) if shift > 0 else (0, -shift)
    long_shifted = np.pad(long_bits, pad_width=((back, front), (0, 0)), constant_values=0)
    short_shifted = np.pad(short_bits, pad_width=((front, back), (0, 0)), constant_values=0)
    return np.abs(long_shifted ^ short_shifted).sum()
# a version of the function above that tries to find the overlap in parallel
def scold_fullword_rumelhart_mat(a, b):
    """Return the smallest bit-wise distance between two words over all alignments.

    Same alignment scheme as the sliding-window variant, but every shifted
    copy of the shorter word is built up front and compared with the longer
    word in a single broadcast XOR.

    Both words are lower-cased and converted to rows of bits from
    `rumelhart_vals`. When the lengths differ by ``d > 0`` the longer word is
    framed with ``d - 1`` zero rows on each side; the shorter word is placed
    at every offset at which it fits inside that frame, with uncovered
    positions treated as all-zero rows.

    Parameters
    ----------
    a, b : str
        The words to compare. Every character (after lower-casing) must be a
        key of `rumelhart_vals`.

    Returns
    -------
    numpy integer
        The minimum number of differing bits over all offsets.

    Raises
    ------
    KeyError
        If a character has no entry in `rumelhart_vals`.
    """
    if len(a) > len(b):
        longer, shorter = a.lower(), b.lower()
    else:
        longer, shorter = b.lower(), a.lower()
    i_arr = np.array([rumelhart_vals[c] for c in longer])
    j_arr = np.array([rumelhart_vals[c] for c in shorter])

    str_len_diff = len(longer) - len(shorter)
    if str_len_diff > 0:
        frame_pad = str_len_diff - 1
        i_arr = np.pad(i_arr, pad_width=((frame_pad, frame_pad), (0, 0)))

    # Number of offsets at which the shorter word fits inside the frame.
    # Padding by (sh, n_shifts - 1 - sh) gives each shifted copy exactly as
    # many rows as the frame, so the XOR below broadcasts over the shift axis
    # only.
    n_shifts = i_arr.shape[0] - j_arr.shape[0] + 1
    possible_js = np.array([
        np.pad(j_arr, pad_width=((sh, n_shifts - 1 - sh), (0, 0)), constant_values=0)
        for sh in range(n_shifts)
    ])
    diffs = np.sum(np.logical_xor(i_arr, possible_js), axis=(1, 2))
    return np.min(diffs)
# SCOLD20 for words in Rumelhart characters
def scold20_fullword_rumelhart(w, words, n=20):
    """Return the mean distance from `w` to its `n` closest words.

    Distances are computed with `scold_fullword_rumelhart_mat`. One
    occurrence of `w` itself, if present in `words`, is left out of the
    comparison. The caller's `words` collection is not modified.

    Parameters
    ----------
    w : str
        The target word.
    words : iterable of str
        The candidate words to compare against.
    n : int
        How many of the smallest distances to average. If fewer than `n`
        candidates are available, all of them are averaged.

    Returns
    -------
    numpy.float64
        The mean of the `n` smallest distances. NaN (with a NumPy
        RuntimeWarning) if there are no candidates.
    """
    # work on a copy so the caller's collection is left untouched
    candidates = list(words)
    if w in candidates:
        candidates.remove(w)

    all_dists = np.asarray(
        [scold_fullword_rumelhart_mat(w, x) for x in candidates],
        dtype=np.int64,
    )
    all_dists = np.sort(all_dists)
    return np.mean(all_dists[0:n])
|