Source code for shorttext.metrics.dynprog.jaccard


from itertools import product

from .dldist import damerau_levenshtein
from .lcp import longest_common_prefix


def similarity(word1: str, word2: str) -> float:
    """Calculate similarity between two words.

    The score is the larger of two length-normalized measures:

    - ``1 - Damerau-Levenshtein distance / max length``
    - ``longest common prefix length / max length``

    where ``max length`` is the length of the longer word.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        Similarity score between 0 and 1. Two empty words are treated as
        identical and score 1.0.

    Reference:
        Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen,
        "Computer-Based Coding of Occupation Codes for Epidemiological
        Analyses," IEEE CBMS 2014, pp. 347-350.
        http://ieeexplore.ieee.org/abstract/document/6881904/
    """
    maxlen = max(len(word1), len(word2))
    if maxlen == 0:
        # Both words are empty: the normalizations below would divide by
        # zero, and two empty strings are trivially identical.
        return 1.0

    editdistance = damerau_levenshtein(word1, word2)
    lcp = longest_common_prefix(word1, word2)
    return max(1. - float(editdistance) / maxlen, float(lcp) / maxlen)


def soft_intersection_list(
    tokens1: list[str], tokens2: list[str]
) -> set[tuple[tuple[str, str], float]]:
    """Compute the soft intersection between two token lists.

    Every pair ``(token1, token2)`` from the Cartesian product of the two
    lists is scored with ``similarity``. Pairs are then accepted greedily
    from the highest score downwards, skipping any pair whose left or right
    token has already been used by an accepted pair.

    Tokens are tracked by value, so repeated occurrences of the same token
    string within a list count as a single token for matching purposes.

    Args:
        tokens1: First list of tokens.
        tokens2: Second list of tokens.

    Returns:
        Set of ``((token1, token2), similarity)`` tuples representing the
        accepted matches. Empty if either input is empty.
    """
    # ``sorted`` is stable, so pairs with equal scores keep the order in
    # which ``product`` generated them.
    scored_pairs = sorted(
        (
            ((token1, token2), similarity(token1, token2))
            for token1, token2 in product(tokens1, tokens2)
        ),
        key=lambda item: item[1],
        reverse=True,
    )

    included_list = set()
    used_tokens1 = set()
    used_tokens2 = set()
    for (token1, token2), sim in scored_pairs:
        if token1 not in used_tokens1 and token2 not in used_tokens2:
            included_list.add(((token1, token2), sim))
            used_tokens1.add(token1)
            used_tokens2.add(token2)

    return included_list


def soft_jaccard_score(tokens1: list[str], tokens2: list[str]) -> float:
    """Compute the soft Jaccard score between two token lists.

    The "intersection" size is the sum of the similarity scores of the
    matched token pairs returned by ``soft_intersection_list``; the "union"
    size is ``len(tokens1) + len(tokens2)`` minus that intersection size.
    The score is the ratio of the two.

    Args:
        tokens1: First list of tokens.
        tokens2: Second list of tokens.

    Returns:
        Soft Jaccard score between 0 and 1. Two empty token lists are
        treated as identical and score 1.0.

    Reference:
        Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen,
        "Computer-Based Coding of Occupation Codes for Epidemiological
        Analyses," IEEE CBMS 2014, pp. 347-350.
        http://ieeexplore.ieee.org/abstract/document/6881904/
    """
    total_tokens = len(tokens1) + len(tokens2)
    if total_tokens == 0:
        # Nothing to compare: the union would be empty and the ratio below
        # would divide by zero. Two empty inputs are considered identical.
        return 1.0

    intersection_list = soft_intersection_list(tokens1, tokens2)
    num_intersections = sum(sim for _, sim in intersection_list)
    num_unions = total_tokens - num_intersections
    return num_intersections / num_unions