Source code for shorttext.utils.gensim_corpora


from collections import Counter
from typing import Optional

import gensim
from deprecation import deprecated

from .textpreprocessing import tokenize



[docs]
def generate_gensim_corpora(
        classdict: dict[str, list[str]],
        preprocess_and_tokenize: Optional[callable] = None
) -> tuple[gensim.corpora.Dictionary, list[list[tuple[int, int]]], list[str]]:
    """Build a gensim dictionary and bag-of-words corpus, one document per class.

    All texts belonging to a class label are joined with single spaces and
    tokenized as one document. Class labels are processed in sorted order,
    so ``corpus[i]`` corresponds to ``class_labels[i]``.

    Args:
        classdict: Training data with class labels as keys and lists of texts as values.
        preprocess_and_tokenize: Callable turning a text string into a list of
            tokens. Default: tokenize.

    Returns:
        Tuple of (dictionary, corpus, class_labels), where class_labels is the
        sorted list of keys of ``classdict``.
    """
    tokenizer = tokenize if preprocess_and_tokenize is None else preprocess_and_tokenize

    classlabels = sorted(classdict)

    # One tokenized document per class: concatenate that class's texts first.
    tokenized_docs = []
    for label in classlabels:
        merged_text = ' '.join(classdict[label])
        tokenized_docs.append(tokenizer(merged_text))

    dictionary = gensim.corpora.Dictionary(tokenized_docs)
    corpus = list(map(dictionary.doc2bow, tokenized_docs))
    return dictionary, corpus, classlabels




[docs]
@deprecated(deprecated_in="4.0.0", removed_in="5.0.0")
def save_corpus(
        dictionary: gensim.corpora.Dictionary,
        corpus: list[list[tuple[int, int]]],
        prefix: str
) -> None:
    """Save gensim corpus and dictionary to files.

    Two files are written:

    * ``<prefix>_dictionary.dict`` -- the dictionary, via ``dictionary.save``.
    * ``<prefix>_corpus.mm`` -- the corpus, via ``gensim.corpora.MmCorpus.serialize``.

    Args:
        dictionary: Dictionary to save.
        corpus: Corpus to save.
        prefix: Prefix for output files.

    Note:
        Deprecated since 4.0.0, will be removed in 5.0.0 (the versions
        declared in the ``@deprecated`` decorator on this function).
    """
    dictionary.save(prefix+'_dictionary.dict')
    gensim.corpora.MmCorpus.serialize(prefix+'_corpus.mm', corpus)




[docs]
@deprecated(deprecated_in="4.0.0", removed_in="5.0.0")
def load_corpus(prefix: str) -> tuple[gensim.corpora.MmCorpus, gensim.corpora.Dictionary]:
    """Load gensim corpus and dictionary from files.

    Reads ``<prefix>_corpus.mm`` with ``gensim.corpora.MmCorpus`` and
    ``<prefix>_dictionary.dict`` with ``gensim.corpora.Dictionary.load``.

    Args:
        prefix: Prefix of files to load.

    Returns:
        Tuple of (corpus, dictionary). Note the order: corpus first.

    Note:
        Deprecated since 4.0.0, will be removed in 5.0.0 (the versions
        declared in the ``@deprecated`` decorator on this function).
    """
    corpus = gensim.corpora.MmCorpus(prefix+'_corpus.mm')
    dictionary = gensim.corpora.Dictionary.load(prefix+'_dictionary.dict')
    return corpus, dictionary




[docs]
def update_corpus_labels(
        dictionary: gensim.corpora.Dictionary,
        corpus: list[list[tuple[int, int]]],
        newclassdict: dict[str, list[str]],
        preprocess_and_tokenize: Optional[callable] = None
) -> tuple[list[list[tuple[int, int]]], list[list[tuple[int, int]]]]:
    """Append bag-of-words documents for additional classes to a corpus.

    For each class label in ``newclassdict`` (in sorted order), the texts are
    joined with single spaces, tokenized, and converted with
    ``dictionary.doc2bow``. The resulting documents are appended to ``corpus``
    with ``+=``, so a list passed as ``corpus`` is modified in place.

    Args:
        dictionary: Existing dictionary used to convert tokens to ids.
        corpus: Existing corpus; extended with the new documents.
        newclassdict: Additional training data, class labels as keys and
            lists of texts as values.
        preprocess_and_tokenize: Callable turning a text string into a list of
            tokens. Default: tokenize.

    Returns:
        Tuple of (updated_corpus, new_corpus), where new_corpus holds only the
        documents built from ``newclassdict``.
    """
    tokenizer = tokenize if preprocess_and_tokenize is None else preprocess_and_tokenize

    # Build the new bag-of-words documents one class at a time, in label order.
    newcorpus = []
    for label in sorted(newclassdict):
        merged_text = ' '.join(newclassdict[label])
        newcorpus.append(dictionary.doc2bow(tokenizer(merged_text)))

    # Augmented assignment on purpose: extends the caller's list in place.
    corpus += newcorpus
    return corpus, newcorpus




[docs]
def tokens_to_fracdict(tokens: list[str]) -> dict[str, float]:
    """Map each distinct token to its share of the total token count.

    Args:
        tokens: List of tokens.

    Returns:
        Dictionary with tokens as keys and, as values, the number of
        occurrences of that token divided by the total number of tokens.
        An empty input yields an empty dictionary.
    """
    occurrences = Counter(tokens)
    n_tokens = sum(occurrences.values())

    fractions = {}
    for word, times_seen in occurrences.items():
        fractions[word] = times_seen / n_tokens
    return fractions