Source code for shorttext.metrics.embedfuzzy.jaccard


from itertools import product
from typing import Optional

import numpy as np
from gensim.models.keyedvectors import KeyedVectors

from ...utils import tokenize
from ...utils.compute import cosine_similarity


def jaccardscore_sents(
        sent1: str,
        sent2: str,
        wvmodel: KeyedVectors,
        sim_words: Optional[callable] = None
) -> float:
    """Compute a fuzzy Jaccard score between two sentences using embeddings.

    Both sentences are tokenized and tokens that are not in ``wvmodel`` are
    dropped. Every remaining token of the first sentence is compared with
    every remaining token of the second one using ``sim_words``. Pairs are
    then matched greedily from the most to the least similar, each token
    taking part in at most one pair. The similarities of the matched pairs
    add up to the fuzzy intersection; the fuzzy union is
    ``len(tokens1) + len(tokens2) - intersection``.

    Args:
        sent1: First sentence.
        sent2: Second sentence.
        wvmodel: Word embedding model. It is used for membership tests
            (``token in wvmodel``) and vector lookup (``wvmodel[token]``).
        sim_words: Similarity function taking two float64 word vectors.
            Default: ``cosine_similarity``.

    Returns:
        ``intersection / union`` when the union is positive. When the union
        is not positive, 1.0 if the intersection is zero (for example when
        neither sentence has a token in the model) and ``np.inf`` otherwise.
        If ``sim_words`` only returns values in [0, 1], the score lies in
        [0, 1]; similarities outside that range can push it outside.
    """
    if sim_words is None:
        sim_words = cosine_similarity

    # Look up and convert each embedding once per token instead of once per
    # token pair; the pairwise stage below then only calls ``sim_words``.
    vectors1 = [
        wvmodel[token].astype(np.float64)
        for token in tokenize(sent1) if token in wvmodel
    ]
    vectors2 = [
        wvmodel[token].astype(np.float64)
        for token in tokenize(sent2) if token in wvmodel
    ]

    # All pairwise similarities, generated with the first sentence's index
    # varying slowest.
    scored_pairs = [
        (sim_words(vec1, vec2), i, j)
        for (i, vec1), (j, vec2) in product(enumerate(vectors1), enumerate(vectors2))
    ]
    # The sort is stable and keyed on the similarity only, so equally similar
    # pairs keep their generation order.
    scored_pairs.sort(key=lambda pair: pair[0], reverse=True)

    available1 = [True] * len(vectors1)
    available2 = [True] * len(vectors2)
    unmatched = min(len(vectors1), len(vectors2))

    intersection = 0.0
    for sim, i, j in scored_pairs:
        if unmatched == 0:
            # Every token of the shorter sentence is paired already, so no
            # remaining pair can be accepted.
            break
        if available1[i] and available2[j]:
            intersection += sim
            available1[i] = False
            available2[j] = False
            unmatched -= 1

    union = len(vectors1) + len(vectors2) - intersection

    if union > 0:
        return intersection / union
    if intersection == 0:
        return 1.
    return np.inf