string_sim.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. """Functions for string similarity metrics"""
  2. from string import ascii_letters
  3. import numpy as np
  4. from scold import draw
  5. from scold import arr_sim
  6. def old(a, b, wt_del=1.0, wt_ins=1.0, wt_sub=1.0):
  7. """Calculate Levenshtein distance between two strings.
  8. Parameters
  9. ----------
  10. a : str
  11. b : str
  12. wt_del: float
  13. the cost assigned to deletion operations
  14. wt_ins: float
  15. the cost assigned to insertion operations
  16. wt_sub: float
  17. the cost assigned to substitution operations
  18. Returns
  19. -------
  20. float
  21. The Levenshtein distance between `a` and `b`
  22. """
  23. lena = len(a)
  24. lenb = len(b)
  25. m = np.empty((lena+1, lenb+1))
  26. # m[:] = np.nan
  27. m[0, 0] = 0
  28. m[:, 0] = np.arange(lena+1)
  29. m[0, :] = np.arange(lenb+1)
  30. for i in range(1,lena+1):
  31. for j in range(1,lenb+1):
  32. if a[i-1] == b[j-1]:
  33. cost = 0
  34. else:
  35. cost = wt_sub
  36. m[i, j] = min(
  37. m[i-1, j] + wt_del, # deletion
  38. m[i, j-1] + wt_ins, # insertion
  39. m[i-1, j-1] + cost, # substitution
  40. )
  41. return(m[lena, lenb])
  42. def dld(a, b, wt_del=1.0, wt_ins=1.0, wt_sub=1.0, wt_trans=1.0):
  43. """Calculate Damerau-Levenshtein distance between two strings.
  44. Parameters
  45. ----------
  46. a : str
  47. b : str
  48. wt_del: float
  49. the cost assigned to deletion operations
  50. wt_ins: float
  51. the cost assigned to insertion operations
  52. wt_sub: float
  53. the cost assigned to substitution operations
  54. wt_del: float
  55. the cost assigned to transposition operations
  56. Returns
  57. -------
  58. float
  59. The Damerau-Levenshtein distance between `a` and `b`
  60. """
  61. lena = len(a)
  62. lenb = len(b)
  63. m = np.empty((lena+1, lenb+1))
  64. # m[:] = np.nan
  65. m[0, 0] = 0
  66. m[:, 0] = np.arange(lena+1)
  67. m[0, :] = np.arange(lenb+1)
  68. for i in range(1,lena+1):
  69. for j in range(1,lenb+1):
  70. if a[i-1] == b[j-1]:
  71. cost = 0
  72. else:
  73. cost = wt_sub
  74. m[i, j] = min(
  75. m[i-1, j] + wt_del, # deletion
  76. m[i, j-1] + wt_ins, # insertion
  77. m[i-1, j-1] + cost, # substitution
  78. )
  79. if i and j and a[i-1] == b[j-2] and a[i-2] == b[j-1] and a[i-1] != b[j-1]:
  80. m[i,j] = min(m[i, j], m[i-2, j-2] + wt_trans) # transposition
  81. return(m[lena, lenb])
  82. def old_wt_mat(a, b, del_vec, ins_vec, sub_mat, chars=list(ascii_letters)):
  83. """Calculate Levenshtein distance between two strings using position-specific weights.
  84. Parameters
  85. ----------
  86. a : str
  87. b : str
  88. del_vec: ndarray
  89. A 1D array of the deletion costs for all characters in `chars`. `len(del_vec)` should equal `len(chars)`
  90. ins_vec: ndarray
  91. A 1D array of the insertion costs for all characters in `chars`. `len(del_vec)` should equal `len(chars)`
  92. sub_mat: ndarray
  93. A 2d array of the substitution costs for all characters in `chars`. Dimension 0 refers to characters in `a`, while dimension one refers to characters in `b`, such that `sub_mat[10, 11]` should be the cost of substituting `'k'` in `a` with `'l'` in `b`, assuming `chars=string.ascii_letters`. `sub_mat.shape` should equal `(len(chars), len(chars))`.
  94. chars: list
  95. A list of all characters. Positions should correspond to indices in `del_vec`, `ins_vec`, and `sub_mat`
  96. Returns
  97. -------
  98. float
  99. The Damerau-Levenshtein distance between `a` and `b`
  100. """
  101. lena = len(a)
  102. lenb = len(b)
  103. m = np.empty((lena+1, lenb+1))
  104. # m[:] = np.nan
  105. m[0, 0] = 0
  106. # set costs for starting characters (cumsum because the non-weighted equivalent would be (1,2,3,4) for four characters)
  107. m[1:(lena+1), 0] = np.cumsum([ins_vec[chars.index(x)] for x in a])
  108. m[0, 1:(lenb+1)] = np.cumsum([ins_vec[chars.index(x)] for x in b])
  109. # get possible operation combination costs
  110. for i in range(1,lena+1):
  111. del_cost = del_vec[chars.index(a[i-1])]
  112. for j in range(1,lenb+1):
  113. ins_cost = ins_vec[chars.index(b[j-1])]
  114. if a[i-1] == b[j-1]:
  115. sub_cost = 0
  116. else:
  117. sub_cost = sub_mat[chars.index(a[i-1]), chars.index(b[j-1])]
  118. m[i, j] = min(
  119. m[i-1, j] + del_cost, # deletion
  120. m[i, j-1] + ins_cost, # insertion
  121. m[i-1, j-1] + sub_cost, # substitution
  122. )
  123. return(m[lena, lenb])
  124. def old20(w, words, n=20, **kwargs):
  125. """Calculate Orthographic Levenshtein Distance 20 (OLD20) between a string, `w`, and its neighbourhood, `words`, with optional change to `n` (i.e., OLDn), and optional change of function to calculate variants of string similarity.
  126. Parameters
  127. ----------
  128. w : str
  129. The string to calculate OLD20 for.
  130. words : list
  131. The list of strings to calculate the distance from `w` for.
  132. n : int
  133. The number of closest neighbours to use when calculating the average distance. Default of `20` reflects OLD20.
  134. fun : function
  135. The function used to calculate string similarities. Default of `old` reflects OLD20. Alternatives include `dld` for Damerau-Levenshtein distance, `old_wt_mat` for position-specific weights, and `text_arr_sim.string_px_dist` for pixel difference between entire aligned strings. Note, however, that in the last case it would be faster to use `px_dist20()`.
  136. **kwargs
  137. These arguments will be passed to `fun`.
  138. Returns
  139. -------
  140. float
  141. The mean Orthographic Levenshtein Distance between `w` and its `n` closest neighbours in `words`
  142. """
  143. if w in words:
  144. words.remove(w)
  145. all_dists = [fun(w, x, **kwargs) for x in words]
  146. all_dists = np.asarray(all_dists, dtype=np.int64)
  147. all_dists = np.sort(all_dists)
  148. return(np.mean(all_dists[0:n]))
  149. def px_dist20(w, words, n=20, **kwargs):
  150. """Calculate pixel distance neighourhood for a string (or character), `w`, and it's neigbourhood, `words`, with optional change to `n` (i.e., OLDn). This function is equivalent to providing `px_dist()` to `string_sim.old20()`, but is faster as all arrays are prerendered.
  151. Parameters
  152. ----------
  153. w : str
  154. The string to calculate OLD20 for.
  155. words : list
  156. The list of strings to calculate the distance from `w` for.
  157. n : int
  158. The number of closest neighbours to use when calculating the average distance. Default of `20` reflects OLD20.
  159. **kwargs
  160. Arguments to pass to `draw.text_array()` when drawing all words' images.
  161. Returns
  162. -------
  163. float
  164. The mean pixel distance between `w` and its `n` closest neighbours in `words`.
  165. Examples
  166. --------
  167. >>> # siplified example, calculating the average distance of the 20 closest alphabetic characters to 'a'
  168. >>> from string import ascii_letters
  169. >>> px_dist20('a', list(ascii_letters), size=100, font='arial.ttf')
  170. 889.3
  171. """
  172. if w in words:
  173. words.remove(w)
  174. w_arr = draw.text_array(w, **kwargs)
  175. words_arrs = [draw.text_array(x, **kwargs) for x in words]
  176. all_dists = np.array([arr_sim.px_dist(w_arr, x) for x in words_arrs])
  177. all_dists = np.sort(all_dists)
  178. return(np.mean(all_dists[0:n]))