Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

rumelhart.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. """Convenience functions for working with Rumelhart-Siple representations of characters"""
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. from scipy.signal import correlate
  5. # binary values for rumelhart characters - each character has a 14-bit representation
  6. rumelhart_vals = {
  7. 'a': np.array([1,1,1,1,1,0,1,0,1,0,0,0,0,0], dtype=bool),
  8. 'b': np.array([0,0,1,1,1,1,0,1,1,1,0,0,0,0], dtype=bool),
  9. 'c': np.array([1,1,1,0,0,1,0,0,0,0,0,0,0,0], dtype=bool),
  10. 'd': np.array([0,0,1,1,1,1,0,1,0,1,0,0,0,0], dtype=bool),
  11. 'e': np.array([1,1,1,0,0,1,1,0,0,0,0,0,0,0], dtype=bool),
  12. 'f': np.array([1,1,1,0,0,0,1,0,0,0,0,0,0,0], dtype=bool),
  13. 'g': np.array([1,1,1,0,1,1,0,0,1,0,0,0,0,0], dtype=bool),
  14. 'h': np.array([1,1,0,1,1,0,1,0,1,0,0,0,0,0], dtype=bool),
  15. 'i': np.array([0,0,1,0,0,1,0,1,0,1,0,0,0,0], dtype=bool),
  16. 'j': np.array([1,0,0,1,1,1,0,0,0,0,0,0,0,0], dtype=bool),
  17. 'k': np.array([1,1,0,0,0,0,1,0,0,0,0,1,1,0], dtype=bool),
  18. 'l': np.array([1,1,0,0,0,1,0,0,0,0,0,0,0,0], dtype=bool),
  19. 'm': np.array([1,1,0,1,1,0,0,0,0,0,1,1,0,0], dtype=bool),
  20. 'n': np.array([1,1,0,1,1,0,0,0,0,0,1,0,1,0], dtype=bool),
  21. 'o': np.array([1,1,1,1,1,1,0,0,0,0,0,0,0,0], dtype=bool),
  22. 'p': np.array([1,1,1,1,0,0,1,0,1,0,0,0,0,0], dtype=bool),
  23. 'q': np.array([1,1,1,1,1,1,0,0,0,0,0,0,1,0], dtype=bool),
  24. 'r': np.array([1,1,1,1,0,0,1,0,1,0,0,0,1,0], dtype=bool),
  25. 's': np.array([0,1,1,0,1,1,1,0,1,0,0,0,0,0], dtype=bool),
  26. 't': np.array([0,0,1,0,0,0,0,1,0,1,0,0,0,0], dtype=bool),
  27. 'u': np.array([1,1,0,1,1,1,0,0,0,0,0,0,0,0], dtype=bool),
  28. 'v': np.array([1,1,0,0,0,0,0,0,0,0,0,1,0,1], dtype=bool),
  29. 'w': np.array([1,1,0,1,1,0,0,0,0,0,0,0,1,1], dtype=bool),
  30. 'x': np.array([0,0,0,0,0,0,0,0,0,0,1,1,1,1], dtype=bool),
  31. 'y': np.array([0,0,0,0,0,0,0,0,0,1,1,1,0,0], dtype=bool),
  32. 'z': np.array([0,0,1,0,0,1,0,0,0,0,0,1,0,1], dtype=bool)}
  33. def plot_rumelhart(char=None, arr=None, w=12, h=18):
  34. """A simple function for plotting 14-bit Rumelhart representations
  35. Parameters
  36. ----------
  37. char : str, optional
  38. The character to look up and plot. If absent, `arr` must be defined. If both `arr` and `char` are defined, `char` will take priority.
  39. arr : ndarray, optional
  40. A 1-D array of the 14 bits to translate into an image. If absent, `char` must be defined.
  41. w : int
  42. Width of the image, in pixels - should be even.
  43. h : int
  44. Height of the image, in pixels - should be even.
  45. """
  46. c = np.zeros((h, w))
  47. w_hlf = int(w/2)
  48. h_hlf = int(h/2)
  49. if char is not None:
  50. arr = rumelhart_vals[char]
  51. if arr[0] == 1: c[h_hlf:h, 0] = 1
  52. if arr[1] == 1: c[0:h_hlf, 0] = 1
  53. if arr[2] == 1: c[0, :] = 1
  54. if arr[3] == 1: c[0:h_hlf, w-1] = 1
  55. if arr[4] == 1: c[h_hlf:h, w-1] = 1
  56. if arr[5] == 1: c[h-1, :] = 1
  57. if arr[6] == 1: c[h_hlf-1, 0:w_hlf] = 1
  58. if arr[7] == 1: c[0:h_hlf, w_hlf-1] = 1
  59. if arr[8] == 1: c[h_hlf-1, w_hlf:w] = 1
  60. if arr[9] == 1: c[h_hlf:h, w_hlf-1] = 1
  61. if arr[10] == 1: c[np.round(np.arange(0, h_hlf, 1)).astype('int'), np.round(np.arange(0, w_hlf, w/h)).astype('int')] = 1
  62. if arr[11] == 1: c[np.round(np.arange(0, h_hlf, 1)).astype('int'), np.flip(np.round(np.arange(w_hlf, w, w/h)).astype('int'))] = 1
  63. if arr[12] == 1: c[np.round(np.arange(h_hlf, h, 1)).astype('int'), np.round(np.arange(w_hlf, w, w/h)).astype('int')] = 1
  64. if arr[13] == 1: c[np.round(np.arange(h_hlf, h, 1)).astype('int'), np.flip(np.round(np.arange(0, w_hlf, w/h)).astype('int'))] = 1
  65. plt.imshow(c, cmap='Greys', interpolation='none')
  66. # function for calculating bit-wise full word form similarity of words built from Rumelhart-Siple characters
  67. def scold_fullword_rumelhart_slidingwindow(a, b):
  68. if len(a)>len(b):
  69. i = a.lower()
  70. j = b.lower()
  71. else:
  72. i = b.lower()
  73. j = a.lower()
  74. i_arr = np.array([rumelhart_vals[c] for c in i], dtype=int)
  75. j_arr = np.array([rumelhart_vals[c] for c in j], dtype=int)
  76. # simple sliding window to find max overlap
  77. str_len_diff = len(i)-len(j)
  78. if str_len_diff>0:
  79. i_arr = np.pad(i_arr, pad_width=((str_len_diff-1, str_len_diff-1), (0, 0)))
  80. j_shifts_l = np.arange(0, i_arr.shape[0]-1, step=1)
  81. j_shifts_r = np.flip(j_shifts_l)
  82. sh_res = np.zeros(len(j_shifts_l))
  83. for sh in range(len(j_shifts_l)):
  84. j_pad = np.pad(j_arr, pad_width=((j_shifts_l[sh], j_shifts_r[sh]), (0, 0)), constant_values=0)
  85. sh_res[sh] = np.sum(np.abs(i_arr ^ j_pad))
  86. return(np.min(sh_res))
  87. # a version of the function above that uses cross-correlation
  88. def scold_fullword_rumelhart_cc(a, b):
  89. if len(a)>len(b):
  90. i = a.lower()
  91. j = b.lower()
  92. else:
  93. i = b.lower()
  94. j = a.lower()
  95. i_arr = np.array([rumelhart_vals[c] for c in i], dtype=int)
  96. j_arr = np.array([rumelhart_vals[c] for c in j], dtype=int)
  97. # ensure same size
  98. str_len_diff = len(i)-len(j)
  99. if str_len_diff>0:
  100. j_arr = np.pad(j_arr, pad_width=((str_len_diff, 0), (0, 0)), constant_values=0)
  101. # ensure even indices
  102. if j_arr.shape[0] % 2 != 0:
  103. j_arr = np.pad(j_arr, ((0, 1), (0, 0)))
  104. i_arr = np.pad(i_arr, ((0, 1), (0, 0)))
  105. # pad to get all possible overlaps
  106. i_arr_pad = np.pad(i_arr, pad_width=((len(i_arr), len(i_arr)), (0, 0)))
  107. j_arr_pad = np.pad(j_arr, pad_width=((len(j_arr), len(j_arr)), (0, 0)))
  108. # cross correlate
  109. cc = correlate(i_arr_pad, j_arr_pad, mode='same')
  110. max_idx = np.argmax(cc[:, 7])
  111. shift = np.negative(np.round((j_arr_pad.shape[0])/2 - max_idx)).astype(int)
  112. if shift>0:
  113. pad = (shift, 0)
  114. elif shift<0:
  115. pad = (0, np.abs(shift))
  116. else:
  117. pad = (0, 0)
  118. i_res = np.pad(i_arr, pad_width=(np.flip(pad), (0, 0)), constant_values=0)
  119. j_res = np.pad(j_arr, pad_width=(pad, (0, 0)), constant_values=0)
  120. return(np.sum(np.abs(i_res ^ j_res)))
  121. # a version of the function above that tries to find the overlap in parallel
  122. def scold_fullword_rumelhart_mat(a, b):
  123. if len(a)>len(b):
  124. i = a.lower()
  125. j = b.lower()
  126. else:
  127. i = b.lower()
  128. j = a.lower()
  129. i_arr = np.array([rumelhart_vals[c] for c in i])
  130. j_arr = np.array([rumelhart_vals[c] for c in j])
  131. str_len_diff = len(i)-len(j)
  132. if str_len_diff>0:
  133. i_arr = np.pad(i_arr, pad_width=((str_len_diff-1, str_len_diff-1), (0, 0)))
  134. j_shifts_l = np.arange(0, i_arr.shape[0]-1, step=1)
  135. j_shifts_r = np.flip(j_shifts_l)
  136. possible_js = np.array([np.pad(j_arr, pad_width=((j_shifts_l[sh], j_shifts_r[sh]), (0, 0)), constant_values=0) for sh in range(len(j_shifts_l))])
  137. diffs = np.sum(np.logical_xor(i_arr, possible_js), axis=(1, 2))
  138. return(np.min(diffs))
  139. # SCOLD20 for words in Rumelhart characters
  140. def scold20_fullword_rumelhart(w, words, n=20):
  141. if w in words:
  142. words.remove(w)
  143. all_dists = [scold_fullword_rumelhart_mat(w, x) for x in words]
  144. all_dists = np.asarray(all_dists, dtype=np.int64)
  145. all_dists = np.sort(all_dists)
  146. return(np.mean(all_dists[0:n]))