Source code for shorttext.utils.gensim_corpora


from collections import Counter
from collections.abc import Callable
from typing import Optional

import gensim
from deprecation import deprecated

from .textpreprocessing import tokenize


def generate_gensim_corpora(
    classdict: dict[str, list[str]],
    preprocess_and_tokenize: Optional[Callable[[str], list[str]]] = None,
) -> tuple[gensim.corpora.Dictionary, list[list[tuple[int, int]]], list[str]]:
    """Generate a gensim dictionary and bag-of-words corpus from training data.

    All texts that share a class label are joined with single spaces into one
    document, so the corpus contains exactly one entry per class label, in the
    same order as the returned (sorted) class labels.

    Args:
        classdict: Training data with class labels as keys and lists of texts
            as values.
        preprocess_and_tokenize: Function that turns one text string into a
            list of tokens. Default: ``tokenize``.

    Returns:
        Tuple of (dictionary, corpus, class_labels), where ``class_labels`` is
        the sorted list of keys of ``classdict`` and ``corpus[i]`` is the
        ``doc2bow`` output for the class ``class_labels[i]``.
    """
    if preprocess_and_tokenize is None:
        preprocess_and_tokenize = tokenize

    # Sorting fixes the class order so corpus rows line up with the labels.
    classlabels = sorted(classdict)
    class_documents = [
        preprocess_and_tokenize(' '.join(classdict[label]))
        for label in classlabels
    ]
    dictionary = gensim.corpora.Dictionary(class_documents)
    corpus = [dictionary.doc2bow(tokens) for tokens in class_documents]
    return dictionary, corpus, classlabels
@deprecated(deprecated_in="4.0.0", removed_in="5.0.0")
def save_corpus(
    dictionary: gensim.corpora.Dictionary,
    corpus: list[list[tuple[int, int]]],
    prefix: str,
) -> None:
    """Save gensim corpus and dictionary to files.

    The dictionary is written to ``<prefix>_dictionary.dict`` and the corpus
    is serialized in Matrix Market format to ``<prefix>_corpus.mm``.

    Args:
        dictionary: Dictionary to save.
        corpus: Corpus to save.
        prefix: Prefix for output files.

    Note:
        Deprecated since 4.0.0, will be removed in 5.0.0 (the versions
        declared by the ``@deprecated`` decorator on this function).
    """
    dictionary_path = prefix + '_dictionary.dict'
    corpus_path = prefix + '_corpus.mm'
    dictionary.save(dictionary_path)
    gensim.corpora.MmCorpus.serialize(corpus_path, corpus)
@deprecated(deprecated_in="4.0.0", removed_in="5.0.0")
def load_corpus(prefix: str) -> tuple[gensim.corpora.MmCorpus, gensim.corpora.Dictionary]:
    """Load gensim corpus and dictionary from files.

    The corpus is read from ``<prefix>_corpus.mm`` and the dictionary from
    ``<prefix>_dictionary.dict``.

    Args:
        prefix: Prefix of files to load.

    Returns:
        Tuple of (corpus, dictionary). Note the order: corpus first.

    Note:
        Deprecated since 4.0.0, will be removed in 5.0.0 (the versions
        declared by the ``@deprecated`` decorator on this function).
    """
    corpus_path = prefix + '_corpus.mm'
    dictionary_path = prefix + '_dictionary.dict'
    corpus = gensim.corpora.MmCorpus(corpus_path)
    dictionary = gensim.corpora.Dictionary.load(dictionary_path)
    return corpus, dictionary
def update_corpus_labels(
    dictionary: gensim.corpora.Dictionary,
    corpus: list[list[tuple[int, int]]],
    newclassdict: dict[str, list[str]],
    preprocess_and_tokenize: Optional[Callable[[str], list[str]]] = None,
) -> tuple[list[list[tuple[int, int]]], list[list[tuple[int, int]]]]:
    """Update corpus with additional training data.

    The texts of each new class label are joined with single spaces into one
    document, converted with the existing ``dictionary``, and appended to
    ``corpus`` in sorted class-label order.

    Args:
        dictionary: Existing dictionary. It is only used for ``doc2bow``
            lookups here; this function does not add entries to it.
        corpus: Existing corpus. It is extended in place with ``+=``, so the
            caller's list is modified.
        newclassdict: Additional training data with class labels as keys and
            lists of texts as values.
        preprocess_and_tokenize: Function that turns one text string into a
            list of tokens. Default: ``tokenize``.

    Returns:
        Tuple of (updated_corpus, new_corpus). ``new_corpus`` holds only the
        entries created from ``newclassdict``.
    """
    if preprocess_and_tokenize is None:
        preprocess_and_tokenize = tokenize

    new_documents = [
        preprocess_and_tokenize(' '.join(newclassdict[label]))
        for label in sorted(newclassdict)
    ]
    # NOTE(review): doc2bow is called without allow_update, so tokens missing
    # from `dictionary` are presumably dropped -- confirm against gensim docs.
    newcorpus = [dictionary.doc2bow(tokens) for tokens in new_documents]
    # Kept as an in-place extension: callers may rely on `corpus` being mutated.
    corpus += newcorpus
    return corpus, newcorpus
def tokens_to_fracdict(tokens: list[str]) -> dict[str, float]:
    """Map each distinct token to its share of the total token count.

    Args:
        tokens: List of tokens.

    Returns:
        Dictionary with tokens as keys and their relative frequencies
        (occurrences divided by the total number of tokens) as values.
        An empty input yields an empty dictionary.
    """
    occurrences = Counter(tokens)
    n_tokens = sum(occurrences.values())

    fractions = {}
    for token, count in occurrences.items():
        fractions[token] = count / n_tokens
    return fractions