JackEdTaylor
/
lettersim-ot-rsa


			
			
				
					
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
							"""Functions for string similarity metrics"""

from string import ascii_letters
import numpy as np

from scold import draw
from scold import arr_sim

def old(a, b, wt_del=1.0, wt_ins=1.0, wt_sub=1.0):
    """Calculate Levenshtein distance between two strings.
    
    Parameters
    ----------
    a : str
    b : str
    wt_del: float
        the cost assigned to deletion operations
    wt_ins: float
        the cost assigned to insertion operations
    wt_sub: float
        the cost assigned to substitution operations
    
    Returns
    -------
    float
        The Levenshtein distance between `a` and `b`
    """
    lena = len(a)
    lenb = len(b)
    
    m = np.empty((lena+1, lenb+1))
    # m[:] = np.nan
    m[0, 0] = 0
    m[:, 0] = np.arange(lena+1)
    m[0, :] = np.arange(lenb+1)
    
    for i in range(1,lena+1):
        for j in range(1,lenb+1):
            if a[i-1] == b[j-1]:
                cost = 0
            else:
                cost = wt_sub
            
            m[i, j] = min(
                           m[i-1, j] + wt_del, # deletion
                           m[i, j-1] + wt_ins, # insertion
                           m[i-1, j-1] + cost, # substitution
                          )
    
    return(m[lena, lenb])

def dld(a, b, wt_del=1.0, wt_ins=1.0, wt_sub=1.0, wt_trans=1.0):
    """Calculate Damerau-Levenshtein distance between two strings.
    
    Parameters
    ----------
    a : str
    b : str
    wt_del: float
        the cost assigned to deletion operations
    wt_ins: float
        the cost assigned to insertion operations
    wt_sub: float
        the cost assigned to substitution operations
    wt_del: float
        the cost assigned to transposition operations
    
    Returns
    -------
    float
        The Damerau-Levenshtein distance between `a` and `b`
    """
    lena = len(a)
    lenb = len(b)
    
    m = np.empty((lena+1, lenb+1))
    # m[:] = np.nan
    m[0, 0] = 0
    m[:, 0] = np.arange(lena+1)
    m[0, :] = np.arange(lenb+1)
    
    for i in range(1,lena+1):
        for j in range(1,lenb+1):
            if a[i-1] == b[j-1]:
                cost = 0
            else:
                cost = wt_sub
            
            m[i, j] = min(
                           m[i-1, j] + wt_del, # deletion
                           m[i, j-1] + wt_ins, # insertion
                           m[i-1, j-1] + cost, # substitution
                          )
            
            if i and j and a[i-1] == b[j-2] and a[i-2] == b[j-1] and a[i-1] != b[j-1]:
                m[i,j] = min(m[i, j], m[i-2, j-2] + wt_trans) # transposition
    
    return(m[lena, lenb])

def old_wt_mat(a, b, del_vec, ins_vec, sub_mat, chars=list(ascii_letters)):
    """Calculate Levenshtein distance between two strings using position-specific weights.
    
    Parameters
    ----------
    a : str
    b : str
    del_vec: ndarray
        A 1D array of the deletion costs for all characters in `chars`. `len(del_vec)` should equal `len(chars)`
    ins_vec: ndarray
        A 1D array of the insertion costs for all characters in `chars`. `len(del_vec)` should equal `len(chars)`
    sub_mat: ndarray
        A 2d array of the substitution costs for all characters in `chars`. Dimension 0 refers to characters in `a`, while dimension one refers to characters in `b`, such that `sub_mat[10, 11]` should be the cost of substituting `'k'` in `a` with `'l'` in `b`, assuming `chars=string.ascii_letters`. `sub_mat.shape` should equal `(len(chars), len(chars))`.
    chars: list
        A list of all characters. Positions should correspond to indices in `del_vec`, `ins_vec`, and `sub_mat`
    
    Returns
    -------
    float
        The Damerau-Levenshtein distance between `a` and `b`
    """
    
    lena = len(a)
    lenb = len(b)
    
    m = np.empty((lena+1, lenb+1))
    # m[:] = np.nan
    m[0, 0] = 0
    
    # set costs for starting characters (cumsum because the non-weighted equivalent would be (1,2,3,4) for four characters)
    m[1:(lena+1), 0] = np.cumsum([ins_vec[chars.index(x)] for x in a])
    m[0, 1:(lenb+1)] = np.cumsum([ins_vec[chars.index(x)] for x in b])
    
    # get possible operation combination costs
    for i in range(1,lena+1):
        del_cost = del_vec[chars.index(a[i-1])]
        
        for j in range(1,lenb+1):
            ins_cost = ins_vec[chars.index(b[j-1])]
            
            if a[i-1] == b[j-1]:
                sub_cost = 0
            else:
                sub_cost = sub_mat[chars.index(a[i-1]), chars.index(b[j-1])]
            
            m[i, j] = min(
                           m[i-1, j] + del_cost, # deletion
                           m[i, j-1] + ins_cost, # insertion
                           m[i-1, j-1] + sub_cost, # substitution
                          )
    
    return(m[lena, lenb])

def old20(w, words, n=20, **kwargs):
    """Calculate Orthographic Levenshtein Distance 20 (OLD20) between a string, `w`, and its neighbourhood, `words`, with optional change to `n` (i.e., OLDn), and optional change of function to calculate variants of string similarity.
    
    Parameters
    ----------
    w : str
        The string to calculate OLD20 for.
    words : list
        The list of strings to calculate the distance from `w` for.
    n : int
        The number of closest neighbours to use when calculating the average distance. Default of `20` reflects OLD20.
    fun : function
        The function used to calculate string similarities. Default of `old` reflects OLD20. Alternatives include `dld` for Damerau-Levenshtein distance, `old_wt_mat` for position-specific weights, and `text_arr_sim.string_px_dist` for pixel difference between entire aligned strings. Note, however, that in the last case it would be faster to use `px_dist20()`.
    **kwargs
        These arguments will be passed to `fun`.
    
    Returns
    -------
    float
        The mean Orthographic Levenshtein Distance between `w` and its `n` closest neighbours in `words`
    """
    if w in words:
        words.remove(w)
    
    all_dists = [fun(w, x, **kwargs) for x in words]
    all_dists = np.asarray(all_dists, dtype=np.int64)
    all_dists = np.sort(all_dists)
    return(np.mean(all_dists[0:n]))

def px_dist20(w, words, n=20, **kwargs):
    """Calculate pixel distance neighourhood for a string (or character), `w`, and it's neigbourhood, `words`, with optional change to `n` (i.e., OLDn). This function is equivalent to providing `px_dist()` to `string_sim.old20()`, but is faster as all arrays are prerendered.
    
    Parameters
    ----------
    w : str
        The string to calculate OLD20 for.
    words : list
        The list of strings to calculate the distance from `w` for.
    n : int
        The number of closest neighbours to use when calculating the average distance. Default of `20` reflects OLD20.
    **kwargs
        Arguments to pass to `draw.text_array()` when drawing all words' images.
    
    Returns
    -------
    float
        The mean pixel distance between `w` and its `n` closest neighbours in `words`.

    Examples
    --------
    >>> # siplified example, calculating the average distance of the 20 closest alphabetic characters to 'a'
    >>> from string import ascii_letters
    >>> px_dist20('a', list(ascii_letters), size=100, font='arial.ttf')
    889.3
    """
    if w in words:
        words.remove(w)

    w_arr = draw.text_array(w, **kwargs)
    words_arrs = [draw.text_array(x, **kwargs) for x in words]

    all_dists = np.array([arr_sim.px_dist(w_arr, x) for x in words_arrs])
    all_dists = np.sort(all_dists)
    return(np.mean(all_dists[0:n]))