"""Functions for string similarity metrics""" from string import ascii_letters import numpy as np from scold import draw from scold import arr_sim def old(a, b, wt_del=1.0, wt_ins=1.0, wt_sub=1.0): """Calculate Levenshtein distance between two strings. Parameters ---------- a : str b : str wt_del: float the cost assigned to deletion operations wt_ins: float the cost assigned to insertion operations wt_sub: float the cost assigned to substitution operations Returns ------- float The Levenshtein distance between `a` and `b` """ lena = len(a) lenb = len(b) m = np.empty((lena+1, lenb+1)) # m[:] = np.nan m[0, 0] = 0 m[:, 0] = np.arange(lena+1) m[0, :] = np.arange(lenb+1) for i in range(1,lena+1): for j in range(1,lenb+1): if a[i-1] == b[j-1]: cost = 0 else: cost = wt_sub m[i, j] = min( m[i-1, j] + wt_del, # deletion m[i, j-1] + wt_ins, # insertion m[i-1, j-1] + cost, # substitution ) return(m[lena, lenb]) def dld(a, b, wt_del=1.0, wt_ins=1.0, wt_sub=1.0, wt_trans=1.0): """Calculate Damerau-Levenshtein distance between two strings. Parameters ---------- a : str b : str wt_del: float the cost assigned to deletion operations wt_ins: float the cost assigned to insertion operations wt_sub: float the cost assigned to substitution operations wt_del: float the cost assigned to transposition operations Returns ------- float The Damerau-Levenshtein distance between `a` and `b` """ lena = len(a) lenb = len(b) m = np.empty((lena+1, lenb+1)) # m[:] = np.nan m[0, 0] = 0 m[:, 0] = np.arange(lena+1) m[0, :] = np.arange(lenb+1) for i in range(1,lena+1): for j in range(1,lenb+1): if a[i-1] == b[j-1]: cost = 0 else: cost = wt_sub m[i, j] = min( m[i-1, j] + wt_del, # deletion m[i, j-1] + wt_ins, # insertion m[i-1, j-1] + cost, # substitution ) if i and j and a[i-1] == b[j-2] and a[i-2] == b[j-1] and a[i-1] != b[j-1]: m[i,j] = min(m[i, j], m[i-2, j-2] + wt_trans) # transposition return(m[lena, lenb]) def old_wt_mat(a, b, del_vec, ins_vec, sub_mat, chars=list(ascii_letters)): """Calculate Levenshtein distance between two strings using position-specific weights. Parameters ---------- a : str b : str del_vec: ndarray A 1D array of the deletion costs for all characters in `chars`. `len(del_vec)` should equal `len(chars)` ins_vec: ndarray A 1D array of the insertion costs for all characters in `chars`. `len(del_vec)` should equal `len(chars)` sub_mat: ndarray A 2d array of the substitution costs for all characters in `chars`. Dimension 0 refers to characters in `a`, while dimension one refers to characters in `b`, such that `sub_mat[10, 11]` should be the cost of substituting `'k'` in `a` with `'l'` in `b`, assuming `chars=string.ascii_letters`. `sub_mat.shape` should equal `(len(chars), len(chars))`. chars: list A list of all characters. Positions should correspond to indices in `del_vec`, `ins_vec`, and `sub_mat` Returns ------- float The Damerau-Levenshtein distance between `a` and `b` """ lena = len(a) lenb = len(b) m = np.empty((lena+1, lenb+1)) # m[:] = np.nan m[0, 0] = 0 # set costs for starting characters (cumsum because the non-weighted equivalent would be (1,2,3,4) for four characters) m[1:(lena+1), 0] = np.cumsum([ins_vec[chars.index(x)] for x in a]) m[0, 1:(lenb+1)] = np.cumsum([ins_vec[chars.index(x)] for x in b]) # get possible operation combination costs for i in range(1,lena+1): del_cost = del_vec[chars.index(a[i-1])] for j in range(1,lenb+1): ins_cost = ins_vec[chars.index(b[j-1])] if a[i-1] == b[j-1]: sub_cost = 0 else: sub_cost = sub_mat[chars.index(a[i-1]), chars.index(b[j-1])] m[i, j] = min( m[i-1, j] + del_cost, # deletion m[i, j-1] + ins_cost, # insertion m[i-1, j-1] + sub_cost, # substitution ) return(m[lena, lenb]) def old20(w, words, n=20, **kwargs): """Calculate Orthographic Levenshtein Distance 20 (OLD20) between a string, `w`, and its neighbourhood, `words`, with optional change to `n` (i.e., OLDn), and optional change of function to calculate variants of string similarity. Parameters ---------- w : str The string to calculate OLD20 for. words : list The list of strings to calculate the distance from `w` for. n : int The number of closest neighbours to use when calculating the average distance. Default of `20` reflects OLD20. fun : function The function used to calculate string similarities. Default of `old` reflects OLD20. Alternatives include `dld` for Damerau-Levenshtein distance, `old_wt_mat` for position-specific weights, and `text_arr_sim.string_px_dist` for pixel difference between entire aligned strings. Note, however, that in the last case it would be faster to use `px_dist20()`. **kwargs These arguments will be passed to `fun`. Returns ------- float The mean Orthographic Levenshtein Distance between `w` and its `n` closest neighbours in `words` """ if w in words: words.remove(w) all_dists = [fun(w, x, **kwargs) for x in words] all_dists = np.asarray(all_dists, dtype=np.int64) all_dists = np.sort(all_dists) return(np.mean(all_dists[0:n])) def px_dist20(w, words, n=20, **kwargs): """Calculate pixel distance neighourhood for a string (or character), `w`, and it's neigbourhood, `words`, with optional change to `n` (i.e., OLDn). This function is equivalent to providing `px_dist()` to `string_sim.old20()`, but is faster as all arrays are prerendered. Parameters ---------- w : str The string to calculate OLD20 for. words : list The list of strings to calculate the distance from `w` for. n : int The number of closest neighbours to use when calculating the average distance. Default of `20` reflects OLD20. **kwargs Arguments to pass to `draw.text_array()` when drawing all words' images. Returns ------- float The mean pixel distance between `w` and its `n` closest neighbours in `words`. Examples -------- >>> # siplified example, calculating the average distance of the 20 closest alphabetic characters to 'a' >>> from string import ascii_letters >>> px_dist20('a', list(ascii_letters), size=100, font='arial.ttf') 889.3 """ if w in words: words.remove(w) w_arr = draw.text_array(w, **kwargs) words_arrs = [draw.text_array(x, **kwargs) for x in words] all_dists = np.array([arr_sim.px_dist(w_arr, x) for x in words_arrs]) all_dists = np.sort(all_dists) return(np.mean(all_dists[0:n]))