"""Convenience functions for working with Rumelhart-Siple representations of characters.

Source repository: JackEdTaylor/lettersim-ot-rsa
"""

import numpy as np
import matplotlib.pyplot as plt

from scipy.signal import correlate

# Rumelhart-Siple letters: every lower-case letter is described by 14 on/off
# flags, written here as a 14-character bit string (index 0 is the leftmost
# character) and expanded into a 1-D boolean array per letter.
rumelhart_vals = {
    letter: np.array([flag == '1' for flag in bits], dtype=bool)
    for letter, bits in (
        ('a', '11111010100000'),
        ('b', '00111101110000'),
        ('c', '11100100000000'),
        ('d', '00111101010000'),
        ('e', '11100110000000'),
        ('f', '11100010000000'),
        ('g', '11101100100000'),
        ('h', '11011010100000'),
        ('i', '00100101010000'),
        ('j', '10011100000000'),
        ('k', '11000010000110'),
        ('l', '11000100000000'),
        ('m', '11011000001100'),
        ('n', '11011000001010'),
        ('o', '11111100000000'),
        ('p', '11110010100000'),
        ('q', '11111100000010'),
        ('r', '11110010100010'),
        ('s', '01101110100000'),
        ('t', '00100001010000'),
        ('u', '11011100000000'),
        ('v', '11000000000101'),
        ('w', '11011000000011'),
        ('x', '00000000001111'),
        ('y', '00000000011100'),
        ('z', '00100100000101'),
    )
}

def plot_rumelhart(char=None, arr=None, w=12, h=18):
    """A simple function for plotting 14-bit Rumelhart representations

    The 14 bits are rasterised into an ``(h, w)`` image which is then shown
    with ``plt.imshow``. Nothing is returned.

    Parameters
    ----------
    char : str, optional
        The character to look up in `rumelhart_vals` and plot. If absent, `arr` must be defined. If both `arr` and `char` are defined, `char` will take priority.
    arr : ndarray, optional
        A 1-D array of the 14 bits to translate into an image. If absent, `char` must be defined.
    w : int
        Width of the image, in pixels - should be even.
    h : int
        Height of the image, in pixels - should be even.

    Raises
    ------
    ValueError
        If neither `char` nor `arr` is given.
    KeyError
        If `char` is not a key of `rumelhart_vals`.

    Notes
    -----
    Bit layout, in image-array coordinates (row 0 / column 0 first):

    * 0, 1   - first column, second half / first half of the rows
    * 2, 5   - first row / last row
    * 3, 4   - last column, first half / second half of the rows
    * 6, 8   - middle row, first half / second half of the columns
    * 7, 9   - middle column, first half / second half of the rows
    * 10, 11 - diagonals from the two first-row corners towards the centre
    * 12, 13 - diagonals from the centre towards the two last-row corners
    """
    if char is None and arr is None:
        # fail early with a clear message instead of an opaque
        # "'NoneType' object is not subscriptable" further down
        raise ValueError("either `char` or `arr` must be provided")

    c = np.zeros((h, w))
    w_hlf = int(w/2)
    h_hlf = int(h/2)

    if char is not None:
        arr = rumelhart_vals[char]

    # outer frame
    if arr[0] == 1:
        c[h_hlf:h, 0] = 1
    if arr[1] == 1:
        c[0:h_hlf, 0] = 1
    if arr[2] == 1:
        c[0, :] = 1
    if arr[3] == 1:
        c[0:h_hlf, w-1] = 1
    if arr[4] == 1:
        c[h_hlf:h, w-1] = 1
    if arr[5] == 1:
        c[h-1, :] = 1

    # inner cross
    if arr[6] == 1:
        c[h_hlf-1, 0:w_hlf] = 1
    if arr[7] == 1:
        c[0:h_hlf, w_hlf-1] = 1
    if arr[8] == 1:
        c[h_hlf-1, w_hlf:w] = 1
    if arr[9] == 1:
        c[h_hlf:h, w_hlf-1] = 1

    # diagonals: one pixel per row; the column advances by w/h per row so
    # that half the width is covered over half the height
    upper_rows = np.round(np.arange(0, h_hlf, 1)).astype('int')
    lower_rows = np.round(np.arange(h_hlf, h, 1)).astype('int')
    left_cols = np.round(np.arange(0, w_hlf, w/h)).astype('int')
    right_cols = np.round(np.arange(w_hlf, w, w/h)).astype('int')

    if arr[10] == 1:
        c[upper_rows, left_cols] = 1
    if arr[11] == 1:
        c[upper_rows, np.flip(right_cols)] = 1
    if arr[12] == 1:
        c[lower_rows, right_cols] = 1
    if arr[13] == 1:
        c[lower_rows, np.flip(left_cols)] = 1

    plt.imshow(c, cmap='Greys',  interpolation='none')

def scold_fullword_rumelhart_slidingwindow(a, b):
    """Bit-wise full-word distance between two words in Rumelhart-Siple characters.

    Both words are lower-cased and turned into ``(n_letters, n_bits)`` integer
    arrays via `rumelhart_vals`. The shorter word is then placed, zero-padded,
    at every possible offset along the longer word, and the number of
    differing bits (XOR) is counted at each offset. The smallest count is
    returned.

    Parameters
    ----------
    a, b : str
        The two words to compare. Every character (after lower-casing) must
        be a key of `rumelhart_vals`. Empty strings are not supported.

    Returns
    -------
    numpy.float64
        The minimum number of mismatching bits over all tested alignments.

    Raises
    ------
    KeyError
        If a character has no entry in `rumelhart_vals`.

    Notes
    -----
    When the word lengths differ by ``d > 0``, the longer word is zero-padded
    with ``d - 1`` rows on each side before sliding, so the shorter word may
    overhang either end by up to ``d - 1`` letters.
    """
    # the longer word is the fixed reference; on a tie `b` is the reference
    if len(a) > len(b):
        i, j = a.lower(), b.lower()
    else:
        i, j = b.lower(), a.lower()

    i_arr = np.array([rumelhart_vals[c] for c in i], dtype=int)
    j_arr = np.array([rumelhart_vals[c] for c in j], dtype=int)

    str_len_diff = len(i) - len(j)

    if str_len_diff > 0:
        i_arr = np.pad(i_arr, pad_width=((str_len_diff-1, str_len_diff-1), (0, 0)))

    # Number of offsets at which the shorter word fits inside the reference
    # array. It must depend on the shorter word's length: a fixed `rows - 1`
    # only yields matching shapes when the shorter word has two letters.
    n_shifts = i_arr.shape[0] - j_arr.shape[0] + 1

    sh_res = np.zeros(n_shifts)

    for sh in range(n_shifts):
        # `sh` zero rows before and the remainder after, so that j_pad always
        # has exactly as many rows as i_arr
        j_pad = np.pad(j_arr, pad_width=((sh, n_shifts - 1 - sh), (0, 0)), constant_values=0)
        sh_res[sh] = np.sum(i_arr ^ j_pad)

    return np.min(sh_res)

def scold_fullword_rumelhart_cc(a, b):
    """Bit-wise full-word distance using a single cross-correlation alignment.

    Both words are lower-cased and converted to integer bit arrays via
    `rumelhart_vals`. Instead of trying every offset, the two arrays are
    cross-correlated, the row lag with the largest correlation is taken as
    the alignment, and the number of differing bits (XOR) at that alignment
    is returned.

    Parameters
    ----------
    a, b : str
        The two words to compare. Every character (after lower-casing) must
        be a key of `rumelhart_vals`.

    Returns
    -------
    numpy integer
        Number of mismatching bits at the chosen alignment.
    """
    # the longer word comes first; on a tie `b` is treated as the longer one
    longer, shorter = (a, b) if len(a) > len(b) else (b, a)
    longer = longer.lower()
    shorter = shorter.lower()

    long_bits = np.array([rumelhart_vals[ch] for ch in longer], dtype=int)
    short_bits = np.array([rumelhart_vals[ch] for ch in shorter], dtype=int)

    # bring both arrays to the same number of rows by zero-padding the
    # shorter word at the front
    len_gap = len(longer) - len(shorter)
    if len_gap > 0:
        short_bits = np.pad(short_bits, pad_width=((len_gap, 0), (0, 0)), constant_values=0)

    # make the row count even by appending one blank row to both arrays
    # (presumably so that the centre of the 'same'-mode output falls on an
    # integer row index - confirm)
    if short_bits.shape[0] % 2 != 0:
        short_bits = np.pad(short_bits, ((0, 1), (0, 0)))
        long_bits = np.pad(long_bits, ((0, 1), (0, 0)))

    # surround each array with its own length in blank rows so that every
    # relative offset can be represented
    long_rows = len(long_bits)
    short_rows = len(short_bits)
    long_wide = np.pad(long_bits, pad_width=((long_rows, long_rows), (0, 0)))
    short_wide = np.pad(short_bits, pad_width=((short_rows, short_rows), (0, 0)))

    # 2-D cross-correlation; only column 7 of the result is inspected
    # (NOTE(review): for 14-column inputs this looks like the zero
    # column-offset column of the 'same' output - confirm)
    xcorr = correlate(long_wide, short_wide, mode='same')
    peak_row = np.argmax(xcorr[:, 7])
    # signed row offset of the peak relative to the middle of the padded array
    shift = np.negative(np.round((short_wide.shape[0])/2 - peak_row)).astype(int)

    # rows to add before / after the shorter word; the longer word receives
    # the mirrored padding so both end up with the same number of rows
    if shift > 0:
        lead, trail = shift, 0
    elif shift < 0:
        lead, trail = 0, np.abs(shift)
    else:
        lead, trail = 0, 0

    long_aligned = np.pad(long_bits, pad_width=((trail, lead), (0, 0)), constant_values=0)
    short_aligned = np.pad(short_bits, pad_width=((lead, trail), (0, 0)), constant_values=0)

    mismatches = np.abs(long_aligned ^ short_aligned)
    return np.sum(mismatches)

def scold_fullword_rumelhart_mat(a, b):
    """Bit-wise full-word distance, evaluating all alignments in one array operation.

    Both words are lower-cased and turned into ``(n_letters, n_bits)`` boolean
    arrays via `rumelhart_vals`. Every zero-padded placement of the shorter
    word along the longer word is stacked into one 3-D array, XOR-ed against
    the longer word in a single broadcasted operation, and the smallest
    number of differing bits is returned.

    Parameters
    ----------
    a, b : str
        The two words to compare. Every character (after lower-casing) must
        be a key of `rumelhart_vals`. Empty strings are not supported.

    Returns
    -------
    numpy integer
        The minimum number of mismatching bits over all tested alignments.

    Raises
    ------
    KeyError
        If a character has no entry in `rumelhart_vals`.

    Notes
    -----
    When the word lengths differ by ``d > 0``, the longer word is zero-padded
    with ``d - 1`` rows on each side before sliding, so the shorter word may
    overhang either end by up to ``d - 1`` letters.
    """
    # the longer word is the fixed reference; on a tie `b` is the reference
    if len(a) > len(b):
        i, j = a.lower(), b.lower()
    else:
        i, j = b.lower(), a.lower()

    i_arr = np.array([rumelhart_vals[c] for c in i])
    j_arr = np.array([rumelhart_vals[c] for c in j])

    str_len_diff = len(i) - len(j)

    if str_len_diff > 0:
        i_arr = np.pad(i_arr, pad_width=((str_len_diff-1, str_len_diff-1), (0, 0)))

    # Number of offsets at which the shorter word fits inside the reference
    # array. It must depend on the shorter word's length: a fixed `rows - 1`
    # only yields matching shapes when the shorter word has two letters.
    n_shifts = i_arr.shape[0] - j_arr.shape[0] + 1

    # shape (n_shifts, rows, n_bits): one padded copy of the shorter word per
    # offset, each with exactly as many rows as i_arr
    possible_js = np.array([
        np.pad(j_arr, pad_width=((sh, n_shifts - 1 - sh), (0, 0)), constant_values=0)
        for sh in range(n_shifts)
    ])

    # i_arr broadcasts against every offset; sum over letters and bits
    diffs = np.sum(np.logical_xor(i_arr, possible_js), axis=(1, 2))

    return np.min(diffs)


def scold20_fullword_rumelhart(w, words, n=20):
    """SCOLD20 for words in Rumelhart characters.

    Computes the distance from `w` to every other word with
    `scold_fullword_rumelhart_mat` and returns the mean of the `n` smallest
    distances.

    Parameters
    ----------
    w : str
        The target word.
    words : iterable of str
        The comparison words. If `w` occurs in it, its first occurrence is
        left out of the comparison. The argument itself is not modified.
    n : int
        How many of the nearest words to average over.

    Returns
    -------
    float
        Mean distance to the `n` nearest words, or NaN when there is nothing
        to average (no comparison words left, or an empty selection).
    """
    # work on a copy so the caller's collection is left untouched
    candidates = list(words)
    if w in candidates:
        candidates.remove(w)

    all_dists = np.asarray(
        [scold_fullword_rumelhart_mat(w, x) for x in candidates],
        dtype=np.int64,
    )
    nearest = np.sort(all_dists)[0:n]

    if nearest.size == 0:
        # same value np.mean would give for an empty slice, without the warning
        return np.nan

    return np.mean(nearest)