Source code for shorttext.smartload


from collections.abc import Callable
from os import PathLike
from typing import Optional

import gensim

from .utils import standard_text_preprocessor_1
from .utils import compactmodel_io as cio
from .utils import classification_exceptions as e
from .classifiers import VarNNEmbeddedVecClassifier, SumEmbeddedVecClassifier
from .generators import GensimTopicModeler
from .generators.bow.AutoEncodingTopicModeling import AutoencodingTopicModeler
from .generators import CharBasedSeq2SeqGenerator, Seq2SeqWithKeras
from .classifiers import TopicVectorSkLearnClassifier
from .classifiers.bow.maxent.MaxEntClassification import MaxEntClassifier
from .utils.dtm import NumpyDocumentTermMatrix


[docs] def smartload_compact_model( filename: str | PathLike, wvmodel: Optional[gensim.models.keyedvectors.KeyedVectors], preprocessor: Optional[callable] = None, vecsize: Optional[int] = None ): """Load a classifier or model from a compact file. Automatically detects the model type and loads the appropriate classifier. Set wvmodel to None if no word embedding model is needed. Args: filename: Path to the compact model file. wvmodel: Word embedding model. Can be None for non-embedding models. preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. vecsize: Vector size. Default: None (extracted from model). Returns: Appropriate classifier or model instance. Raises: AlgorithmNotExistException: If model type is unknown. """ if preprocessor is None: preprocessor = standard_text_preprocessor_1() classifier_name = cio.get_model_classifier_name(filename) match classifier_name: case 'ldatopic' | 'lsitopic' | 'rptopic': return GensimTopicModeler.from_pretrained(filename, preprocessor=preprocessor, compact=True) case 'kerasautoencoder': return AutoencodingTopicModeler.from_pretrained(filename, preprocessor=preprocessor, compact=True) case 'topic_sklearn': topicmodel = cio.get_model_config_field(filename, 'topicmodel') if topicmodel in ['ldatopic', 'lsitopic', 'rptopic']: return TopicVectorSkLearnClassifier.from_pretrained_gensimtopic_sklearnclassifier( filename, preprocessor=preprocessor, compact=True ) elif topicmodel in ['kerasautoencoder']: return TopicVectorSkLearnClassifier.from_pretrained_autoencoder_sklearnclassifier( filename, preprocessor=preprocessor, compact=True ) else: raise e.AlgorithmNotExistException(topicmodel) case 'nnlibvec': return VarNNEmbeddedVecClassifier.from_pretrained(wvmodel, filename, compact=True, vecsize=vecsize) case 'sumvec': return SumEmbeddedVecClassifier.from_pretrained(wvmodel, filename, compact=True, vecsize=vecsize) case 'maxent': return MaxEntClassifier.from_pretrained(filename, compact=True) case 
'kerasseq2seq': return Seq2SeqWithKeras.from_pretrained(filename, compact=True) case 'charbases2s': return CharBasedSeq2SeqGenerator.from_pretrained(filename, compact=True) case "npdtm": return NumpyDocumentTermMatrix.from_npdict_file(filename) case _: raise e.AlgorithmNotExistException(classifier_name)