123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216 |
- """Functions for string similarity metrics"""
- from string import ascii_letters
- import numpy as np
- from scold import draw
- from scold import arr_sim
- def old(a, b, wt_del=1.0, wt_ins=1.0, wt_sub=1.0):
- """Calculate Levenshtein distance between two strings.
-
- Parameters
- ----------
- a : str
- b : str
- wt_del: float
- the cost assigned to deletion operations
- wt_ins: float
- the cost assigned to insertion operations
- wt_sub: float
- the cost assigned to substitution operations
-
- Returns
- -------
- float
- The Levenshtein distance between `a` and `b`
- """
- lena = len(a)
- lenb = len(b)
-
- m = np.empty((lena+1, lenb+1))
- # m[:] = np.nan
- m[0, 0] = 0
- m[:, 0] = np.arange(lena+1)
- m[0, :] = np.arange(lenb+1)
-
- for i in range(1,lena+1):
- for j in range(1,lenb+1):
- if a[i-1] == b[j-1]:
- cost = 0
- else:
- cost = wt_sub
-
- m[i, j] = min(
- m[i-1, j] + wt_del, # deletion
- m[i, j-1] + wt_ins, # insertion
- m[i-1, j-1] + cost, # substitution
- )
-
- return(m[lena, lenb])
- def dld(a, b, wt_del=1.0, wt_ins=1.0, wt_sub=1.0, wt_trans=1.0):
- """Calculate Damerau-Levenshtein distance between two strings.
-
- Parameters
- ----------
- a : str
- b : str
- wt_del: float
- the cost assigned to deletion operations
- wt_ins: float
- the cost assigned to insertion operations
- wt_sub: float
- the cost assigned to substitution operations
- wt_del: float
- the cost assigned to transposition operations
-
- Returns
- -------
- float
- The Damerau-Levenshtein distance between `a` and `b`
- """
- lena = len(a)
- lenb = len(b)
-
- m = np.empty((lena+1, lenb+1))
- # m[:] = np.nan
- m[0, 0] = 0
- m[:, 0] = np.arange(lena+1)
- m[0, :] = np.arange(lenb+1)
-
- for i in range(1,lena+1):
- for j in range(1,lenb+1):
- if a[i-1] == b[j-1]:
- cost = 0
- else:
- cost = wt_sub
-
- m[i, j] = min(
- m[i-1, j] + wt_del, # deletion
- m[i, j-1] + wt_ins, # insertion
- m[i-1, j-1] + cost, # substitution
- )
-
- if i and j and a[i-1] == b[j-2] and a[i-2] == b[j-1] and a[i-1] != b[j-1]:
- m[i,j] = min(m[i, j], m[i-2, j-2] + wt_trans) # transposition
-
- return(m[lena, lenb])
- def old_wt_mat(a, b, del_vec, ins_vec, sub_mat, chars=list(ascii_letters)):
- """Calculate Levenshtein distance between two strings using position-specific weights.
-
- Parameters
- ----------
- a : str
- b : str
- del_vec: ndarray
- A 1D array of the deletion costs for all characters in `chars`. `len(del_vec)` should equal `len(chars)`
- ins_vec: ndarray
- A 1D array of the insertion costs for all characters in `chars`. `len(del_vec)` should equal `len(chars)`
- sub_mat: ndarray
- A 2d array of the substitution costs for all characters in `chars`. Dimension 0 refers to characters in `a`, while dimension one refers to characters in `b`, such that `sub_mat[10, 11]` should be the cost of substituting `'k'` in `a` with `'l'` in `b`, assuming `chars=string.ascii_letters`. `sub_mat.shape` should equal `(len(chars), len(chars))`.
- chars: list
- A list of all characters. Positions should correspond to indices in `del_vec`, `ins_vec`, and `sub_mat`
-
- Returns
- -------
- float
- The Damerau-Levenshtein distance between `a` and `b`
- """
-
- lena = len(a)
- lenb = len(b)
-
- m = np.empty((lena+1, lenb+1))
- # m[:] = np.nan
- m[0, 0] = 0
-
- # set costs for starting characters (cumsum because the non-weighted equivalent would be (1,2,3,4) for four characters)
- m[1:(lena+1), 0] = np.cumsum([ins_vec[chars.index(x)] for x in a])
- m[0, 1:(lenb+1)] = np.cumsum([ins_vec[chars.index(x)] for x in b])
-
- # get possible operation combination costs
- for i in range(1,lena+1):
- del_cost = del_vec[chars.index(a[i-1])]
-
- for j in range(1,lenb+1):
- ins_cost = ins_vec[chars.index(b[j-1])]
-
- if a[i-1] == b[j-1]:
- sub_cost = 0
- else:
- sub_cost = sub_mat[chars.index(a[i-1]), chars.index(b[j-1])]
-
- m[i, j] = min(
- m[i-1, j] + del_cost, # deletion
- m[i, j-1] + ins_cost, # insertion
- m[i-1, j-1] + sub_cost, # substitution
- )
-
- return(m[lena, lenb])
- def old20(w, words, n=20, **kwargs):
- """Calculate Orthographic Levenshtein Distance 20 (OLD20) between a string, `w`, and its neighbourhood, `words`, with optional change to `n` (i.e., OLDn), and optional change of function to calculate variants of string similarity.
-
- Parameters
- ----------
- w : str
- The string to calculate OLD20 for.
- words : list
- The list of strings to calculate the distance from `w` for.
- n : int
- The number of closest neighbours to use when calculating the average distance. Default of `20` reflects OLD20.
- fun : function
- The function used to calculate string similarities. Default of `old` reflects OLD20. Alternatives include `dld` for Damerau-Levenshtein distance, `old_wt_mat` for position-specific weights, and `text_arr_sim.string_px_dist` for pixel difference between entire aligned strings. Note, however, that in the last case it would be faster to use `px_dist20()`.
- **kwargs
- These arguments will be passed to `fun`.
-
- Returns
- -------
- float
- The mean Orthographic Levenshtein Distance between `w` and its `n` closest neighbours in `words`
- """
- if w in words:
- words.remove(w)
-
- all_dists = [fun(w, x, **kwargs) for x in words]
- all_dists = np.asarray(all_dists, dtype=np.int64)
- all_dists = np.sort(all_dists)
- return(np.mean(all_dists[0:n]))
- def px_dist20(w, words, n=20, **kwargs):
- """Calculate pixel distance neighourhood for a string (or character), `w`, and it's neigbourhood, `words`, with optional change to `n` (i.e., OLDn). This function is equivalent to providing `px_dist()` to `string_sim.old20()`, but is faster as all arrays are prerendered.
-
- Parameters
- ----------
- w : str
- The string to calculate OLD20 for.
- words : list
- The list of strings to calculate the distance from `w` for.
- n : int
- The number of closest neighbours to use when calculating the average distance. Default of `20` reflects OLD20.
- **kwargs
- Arguments to pass to `draw.text_array()` when drawing all words' images.
-
- Returns
- -------
- float
- The mean pixel distance between `w` and its `n` closest neighbours in `words`.
- Examples
- --------
- >>> # siplified example, calculating the average distance of the 20 closest alphabetic characters to 'a'
- >>> from string import ascii_letters
- >>> px_dist20('a', list(ascii_letters), size=100, font='arial.ttf')
- 889.3
- """
- if w in words:
- words.remove(w)
- w_arr = draw.text_array(w, **kwargs)
- words_arrs = [draw.text_array(x, **kwargs) for x in words]
- all_dists = np.array([arr_sim.px_dist(w_arr, x) for x in words_arrs])
- all_dists = np.sort(all_dists)
- return(np.mean(all_dists[0:n]))
|