Source code for shorttext.classifiers.bow.topic.TopicVectorDistanceClassification


from typing import Optional, Literal, Self

from deprecation import deprecated

from ....generators import LatentTopicModeler, GensimTopicModeler, AutoencodingTopicModeler
from ...base import AbstractScorer


class TopicVecCosineDistanceClassifier(AbstractScorer):
    """Classifier using cosine similarity with topic vectors.

    Classifies short text based on cosine similarity between topic vectors
    of the input and class centroids. Topic vectors are generated by a
    LatentTopicModeler, and every method of this class delegates to that
    modeler.
    """

    def __init__(self, topicmodeler: LatentTopicModeler):
        """Initialize the classifier.

        Args:
            topicmodeler: A LatentTopicModeler instance.
        """
        self.topicmodeler = topicmodeler

    def score(self, shorttext: str) -> dict[str, float]:
        """Calculate cosine similarity to all class topic vectors.

        Args:
            shorttext: Input text.

        Returns:
            Dictionary mapping class labels to similarity scores, as
            produced by the modeler's ``get_batch_cos_similarities``.
        """
        return self.topicmodeler.get_batch_cos_similarities(shorttext)

    def loadmodel(self, nameprefix: str) -> None:
        """Load the topic model.

        Args:
            nameprefix: Prefix for input files.
        """
        self.topicmodeler.loadmodel(nameprefix)

    def savemodel(self, nameprefix: str) -> None:
        """Save the topic model.

        Args:
            nameprefix: Prefix for output files.

        Raises:
            ModelNotTrainedException: If model not trained (propagated
                from the underlying topic modeler).
        """
        self.topicmodeler.savemodel(nameprefix)

    def load_compact_model(self, name: str) -> None:
        """Load compact model.

        Args:
            name: Name of the compact model file.
        """
        self.topicmodeler.load_compact_model(name)

    def save_compact_model(self, name: str) -> None:
        """Save compact model.

        Args:
            name: Name of the compact model file.
        """
        self.topicmodeler.save_compact_model(name)

    @classmethod
    def from_pretrained_gensimtopic(
            cls,
            name: str,
            preprocessor: Optional[callable] = None,
            tokenizer: Optional[callable] = None,
            compact: bool = True
    ) -> Self:
        """Load a gensim topic model and return a cosine classifier.

        Args:
            name: Model name (compact) or file prefix (non-compact).
            preprocessor: Text preprocessing function, forwarded to
                ``GensimTopicModeler.from_pretrained``.
                Default: standard_text_preprocessor_1.
            tokenizer: Tokenizing function, forwarded to
                ``GensimTopicModeler.from_pretrained``.
            compact: Whether to load compact model. Default: True.

        Returns:
            An instance of the class this method is called on.
        """
        topicmodeler = GensimTopicModeler.from_pretrained(
            name,
            preprocessor=preprocessor,
            tokenizer=tokenizer,
            compact=compact
        )
        # Build through ``cls`` so a subclass receives an instance of its
        # own type, which is what the ``Self`` return annotation promises.
        return cls(topicmodeler)

    @classmethod
    def from_pretrained_autoencoder(
            cls,
            name: str,
            preprocessor: Optional[callable] = None,
            tokenizer: Optional[callable] = None,
            compact: bool = True
    ) -> Self:
        """Load an autoencoder topic model and return a cosine classifier.

        Args:
            name: Name or prefix identifying the stored model, forwarded to
                ``AutoencodingTopicModeler.from_pretrained``.
            preprocessor: Text preprocessing function, forwarded to the
                modeler loader.
            tokenizer: Tokenizing function, forwarded to the modeler loader.
            compact: Whether to load compact model. Default: True.

        Returns:
            An instance of the class this method is called on.
        """
        autoencoder = AutoencodingTopicModeler.from_pretrained(
            name,
            preprocessor=preprocessor,
            tokenizer=tokenizer,
            compact=compact
        )
        # Same reasoning as from_pretrained_gensimtopic: honour ``Self``.
        return cls(autoencoder)
def train_gensimtopicvec_cosineClassifier(
        classdict: dict[str, list[str]],
        nb_topics: int,
        preprocessor: Optional[callable] = None,
        tokenizer: Optional[callable] = None,
        algorithm: Literal["lda", "lsi", "rp"] = "lda",
        toweigh: bool = True,
        normalize: bool = True,
        *args,
        **kwargs
) -> TopicVecCosineDistanceClassifier:
    """Train a gensim topic model and wrap it in a cosine classifier.

    Args:
        classdict: Training data, mapping each class label to its texts.
        nb_topics: Number of latent topics.
        preprocessor: Text preprocessing function handed to
            GensimTopicModeler. Default: standard_text_preprocessor_1.
        tokenizer: Tokenizing function handed to GensimTopicModeler.
        algorithm: Topic modeling algorithm. Options: lda, lsi, rp.
            Default: lda.
        toweigh: Whether to apply tf-idf weighting. Default: True.
        normalize: Whether to normalize topic vectors. Default: True.
        *args: Extra positional arguments forwarded to the modeler's
            ``train`` call.
        **kwargs: Extra keyword arguments forwarded to the modeler's
            ``train`` call.

    Returns:
        TopicVecCosineDistanceClassifier wrapping the trained modeler.
    """
    # Collect the modeler configuration in one place before construction.
    modeler_options = {
        "preprocessor": preprocessor,
        "tokenizer": tokenizer,
        "algorithm": algorithm,
        "toweigh": toweigh,
        "normalize": normalize,
    }
    gensim_modeler = GensimTopicModeler(**modeler_options)

    # Fit the topic model on the labelled training data.
    gensim_modeler.train(classdict, nb_topics, *args, **kwargs)

    return TopicVecCosineDistanceClassifier(gensim_modeler)
@deprecated(deprecated_in="4.0.1", removed_in="5.0.0")
def load_gensimtopicvec_cosineClassifier(
        name: str,
        preprocessor: Optional[callable] = None,
        tokenizer: Optional[callable] = None,
        compact: bool = True
) -> TopicVecCosineDistanceClassifier:
    """Deprecated loader for a gensim-backed cosine classifier.

    Use `~TopicVecCosineDistanceClassifier.from_pretrained_gensimtopic`
    instead; this function only forwards its arguments to it.

    Args:
        name: Forwarded unchanged to ``from_pretrained_gensimtopic``.
        preprocessor: Forwarded unchanged as a keyword argument.
        tokenizer: Forwarded unchanged as a keyword argument.
        compact: Forwarded unchanged as a keyword argument.

    Returns:
        Whatever ``from_pretrained_gensimtopic`` returns.
    """
    loader_options = {
        "preprocessor": preprocessor,
        "tokenizer": tokenizer,
        "compact": compact,
    }
    return TopicVecCosineDistanceClassifier.from_pretrained_gensimtopic(
        name, **loader_options
    )
def train_autoencoder_cosineClassifier(
        classdict: dict[str, list[str]],
        nb_topics: int,
        preprocessor: Optional[callable] = None,
        tokenizer: Optional[callable] = None,
        normalize: bool = True,
        *args,
        **kwargs
) -> TopicVecCosineDistanceClassifier:
    """Train an autoencoder topic model and wrap it in a cosine classifier.

    Args:
        classdict: Training data, mapping each class label to its texts.
        nb_topics: Number of topics (encoding dimensions).
        preprocessor: Text preprocessing function handed to
            AutoencodingTopicModeler. Default: standard_text_preprocessor_1.
        tokenizer: Tokenizing function handed to AutoencodingTopicModeler.
        normalize: Whether to normalize topic vectors. Default: True.
        *args: Extra positional arguments forwarded to the modeler's
            ``train`` call (documented upstream as Keras model fitting
            arguments).
        **kwargs: Extra keyword arguments forwarded to the modeler's
            ``train`` call (documented upstream as Keras model fitting
            arguments).

    Returns:
        TopicVecCosineDistanceClassifier wrapping the trained autoencoder.
    """
    # Collect the modeler configuration in one place before construction.
    modeler_options = {
        "preprocessor": preprocessor,
        "tokenizer": tokenizer,
        "normalize": normalize,
    }
    encoder_modeler = AutoencodingTopicModeler(**modeler_options)

    # Fit the autoencoder on the labelled training data.
    encoder_modeler.train(classdict, nb_topics, *args, **kwargs)

    return TopicVecCosineDistanceClassifier(encoder_modeler)
# NOTE(review): version numbers mirror load_gensimtopicvec_cosineClassifier,
# the sibling deprecated loader in this module - confirm they apply here too.
@deprecated(deprecated_in="4.0.1", removed_in="5.0.0")
def load_autoencoder_cosineClassifier(
        name: str,
        preprocessor: Optional[callable] = None,
        tokenizer: Optional[callable] = None,
        compact: bool = True
) -> TopicVecCosineDistanceClassifier:
    """Deprecated loader for an autoencoder-backed cosine classifier.

    Use `~TopicVecCosineDistanceClassifier.from_pretrained_autoencoder`
    instead; this function only forwards its arguments to it.

    Args:
        name: Forwarded unchanged to ``from_pretrained_autoencoder``.
        preprocessor: Forwarded unchanged as a keyword argument.
        tokenizer: Forwarded unchanged as a keyword argument.
        compact: Forwarded unchanged as a keyword argument.

    Returns:
        Whatever ``from_pretrained_autoencoder`` returns.
    """
    return TopicVecCosineDistanceClassifier.from_pretrained_autoencoder(
        name,
        preprocessor=preprocessor,
        tokenizer=tokenizer,
        compact=compact
    )