Source code for shorttext.metrics.wasserstein.wordmoverdist


from itertools import product
from typing import Optional
import warnings

import numpy as np
from scipy.spatial.distance import euclidean
from scipy.sparse import csr_matrix
from scipy.optimize import linprog, OptimizeResult
from gensim.models.keyedvectors import KeyedVectors

from ...utils.gensim_corpora import tokens_to_fracdict


[docs] def word_mover_distance_linprog( first_sent_tokens: list[str], second_sent_tokens: list[str], wvmodel: KeyedVectors, distancefunc: Optional[callable] = None ) -> OptimizeResult: """Compute Word Mover's distance via linear programming. Uses scipy.optimize.linprog to compute the transport problem for the Word Mover's Distance. Args: first_sent_tokens: First list of tokens. second_sent_tokens: Second list of tokens. wvmodel: Word embedding model. distancefunc: Distance function for word vectors. Default: Euclidean. Returns: scipy.optimize.OptimizeResult containing the optimization result. Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," ICML 2015. """ if distancefunc is None: distancefunc = euclidean nb_tokens_first_sent = len(first_sent_tokens) nb_tokens_second_sent = len(second_sent_tokens) all_tokens = list(set(first_sent_tokens+second_sent_tokens)) wordvecs = {token: wvmodel[token].astype(np.float64) for token in all_tokens} first_sent_buckets = tokens_to_fracdict(first_sent_tokens) second_sent_buckets = tokens_to_fracdict(second_sent_tokens) collapsed_idx_func = lambda i, j: i*nb_tokens_second_sent + j # assigning T T = np.zeros(nb_tokens_first_sent*nb_tokens_second_sent) for i, j in product(range(nb_tokens_first_sent), range(nb_tokens_second_sent)): T[collapsed_idx_func(i, j)] = distancefunc(wordvecs[first_sent_tokens[i]], wordvecs[second_sent_tokens[j]]) # assigning Aeq and beq Aeq = csr_matrix( (nb_tokens_first_sent+nb_tokens_second_sent, nb_tokens_first_sent*nb_tokens_second_sent) ) beq = np.zeros(nb_tokens_first_sent+nb_tokens_second_sent) for i in range(nb_tokens_first_sent): for j in range(nb_tokens_second_sent): Aeq[i, collapsed_idx_func(i, j)] = 1. beq[i] = first_sent_buckets[first_sent_tokens[i]] for j in range(nb_tokens_second_sent): for i in range(nb_tokens_first_sent): Aeq[j+nb_tokens_first_sent, collapsed_idx_func(i, j)] = 1. beq[j+nb_tokens_first_sent] = second_sent_buckets[second_sent_tokens[j]] return linprog(T, A_eq=Aeq, b_eq=beq)
[docs] def word_mover_distance( first_sent_tokens: list[str], second_sent_tokens: list[str], wvmodel: KeyedVectors, distancefunc: Optional[callable] = None ) -> float: """Compute Word Mover's distance between token lists. Uses word embeddings to compute the minimum transport cost between words in two sentences. Args: first_sent_tokens: First list of tokens. second_sent_tokens: Second list of tokens. wvmodel: Word embedding model. distancefunc: Distance function for word vectors. Default: Euclidean. Returns: The Word Mover's distance (lower is more similar). Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," ICML 2015. """ if distancefunc is None: distancefunc = euclidean linprog_result = word_mover_distance_linprog( first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=distancefunc ) return linprog_result['fun']