Source code for shorttext.smartload


from typing import Optional
from os import PathLike

import gensim

from .utils import standard_text_preprocessor_1
from .utils import compactmodel_io as cio
from .utils import classification_exceptions as e
from .classifiers import VarNNEmbeddedVecClassifier, SumEmbeddedVecClassifier
from .generators import GensimTopicModeler
from .generators.bow.AutoEncodingTopicModeling import AutoencodingTopicModeler
from .generators import CharBasedSeq2SeqGenerator, Seq2SeqWithKeras
from .classifiers import TopicVectorSkLearnClassifier
from .classifiers.bow.maxent.MaxEntClassification import MaxEntClassifier
from .utils.dtm import NumpyDocumentTermMatrix



[docs]
def smartload_compact_model(
        filename: str | PathLike,
        wvmodel: Optional[gensim.models.keyedvectors.KeyedVectors],
        preprocessor: Optional[callable] = None,
        vecsize: Optional[int] = None
):
    """Restore whichever model a compact file holds, without the caller naming its type.

    The classifier name stored in the compact file is read first and used to
    pick the matching loader. Recognised names are ``ldatopic``, ``lsitopic``,
    ``rptopic``, ``kerasautoencoder``, ``topic_sklearn``, ``nnlibvec``,
    ``sumvec``, ``maxent``, ``kerasseq2seq``, ``charbases2s`` and ``npdtm``.
    For ``topic_sklearn`` files the ``topicmodel`` config field is consulted
    as well, to decide which underlying topic modeler the classifier wraps.

    Args:
        filename: Location of the compact model file.
        wvmodel: Word-embedding model. It is only forwarded to the loaders
            for ``nnlibvec`` and ``sumvec``; pass None for every other type.
        preprocessor: Text preprocessing callable handed to the topic-model
            based loaders. When None, ``standard_text_preprocessor_1()`` is
            used.
        vecsize: Embedding vector size, forwarded unchanged to the
            ``nnlibvec`` and ``sumvec`` loaders. Default: None.

    Returns:
        The object produced by the loader selected for the stored model type.

    Raises:
        AlgorithmNotExistException: If the stored classifier name is not
            recognised, or if a ``topic_sklearn`` file names a ``topicmodel``
            that is not recognised.
    """
    # The default is resolved before the file is inspected, for every model type.
    preprocessor = standard_text_preprocessor_1() if preprocessor is None else preprocessor

    # Name groups shared by the top-level dispatch and the topic_sklearn sub-dispatch.
    gensim_topic_kinds = ('ldatopic', 'lsitopic', 'rptopic')
    autoencoder_kinds = ('kerasautoencoder',)

    model_kind = cio.get_model_classifier_name(filename)

    # --- topic modelers -----------------------------------------------------
    if model_kind in gensim_topic_kinds:
        return GensimTopicModeler.from_pretrained(filename, preprocessor=preprocessor, compact=True)

    if model_kind in autoencoder_kinds:
        return AutoencodingTopicModeler.from_pretrained(filename, preprocessor=preprocessor, compact=True)

    # --- scikit-learn classifier stacked on a topic modeler -----------------
    if model_kind == 'topic_sklearn':
        # The wrapped topic modeler decides which constructor applies.
        wrapped_kind = cio.get_model_config_field(filename, 'topicmodel')
        if wrapped_kind in gensim_topic_kinds:
            return TopicVectorSkLearnClassifier.from_pretrained_gensimtopic_sklearnclassifier(
                filename, preprocessor=preprocessor, compact=True
            )
        if wrapped_kind in autoencoder_kinds:
            return TopicVectorSkLearnClassifier.from_pretrained_autoencoder_sklearnclassifier(
                filename, preprocessor=preprocessor, compact=True
            )
        raise e.AlgorithmNotExistException(wrapped_kind)

    # --- classifiers that take the word-embedding model ---------------------
    if model_kind == 'nnlibvec':
        return VarNNEmbeddedVecClassifier.from_pretrained(wvmodel, filename, compact=True, vecsize=vecsize)

    if model_kind == 'sumvec':
        return SumEmbeddedVecClassifier.from_pretrained(wvmodel, filename, compact=True, vecsize=vecsize)

    # --- loaders that take only the file ------------------------------------
    if model_kind == 'maxent':
        return MaxEntClassifier.from_pretrained(filename, compact=True)

    if model_kind == 'kerasseq2seq':
        return Seq2SeqWithKeras.from_pretrained(filename, compact=True)

    if model_kind == 'charbases2s':
        return CharBasedSeq2SeqGenerator.from_pretrained(filename, compact=True)

    if model_kind == "npdtm":
        return NumpyDocumentTermMatrix.from_npdict_file(filename)

    # Nothing matched: report the unrecognised classifier name.
    raise e.AlgorithmNotExistException(model_kind)