from typing import Optional, Literal, Self
from deprecation import deprecated
from ....generators import LatentTopicModeler, GensimTopicModeler, AutoencodingTopicModeler
from ...base import AbstractScorer
[docs]
class TopicVecCosineDistanceClassifier(AbstractScorer):
    """Classifier using cosine similarity with topic vectors.

    Classifies short text based on cosine similarity between topic vectors
    of the input and class centroids. Topic vectors are generated by a
    LatentTopicModeler.

    This class is a thin wrapper: scoring and all persistence calls are
    delegated to the wrapped ``topicmodeler``.
    """

    def __init__(self, topicmodeler: LatentTopicModeler):
        """Initialize the classifier.

        Args:
            topicmodeler: A LatentTopicModeler instance. It is stored as-is
                (not copied) on ``self.topicmodeler``.
        """
        self.topicmodeler = topicmodeler

    def score(self, shorttext: str) -> dict[str, float]:
        """Calculate cosine similarity to all class topic vectors.

        Delegates to ``topicmodeler.get_batch_cos_similarities``.

        Args:
            shorttext: Input text.

        Returns:
            Dictionary mapping class labels to similarity scores.
        """
        return self.topicmodeler.get_batch_cos_similarities(shorttext)

    def loadmodel(self, nameprefix: str) -> None:
        """Load the topic model.

        Args:
            nameprefix: Prefix for input files.
        """
        self.topicmodeler.loadmodel(nameprefix)

    def savemodel(self, nameprefix: str) -> None:
        """Save the topic model.

        Args:
            nameprefix: Prefix for output files.

        Raises:
            ModelNotTrainedException: If model not trained (raised by the
                wrapped topic modeler, not by this method itself).
        """
        self.topicmodeler.savemodel(nameprefix)

    def load_compact_model(self, name: str) -> None:
        """Load compact model.

        Args:
            name: Name of the compact model file.
        """
        self.topicmodeler.load_compact_model(name)

    def save_compact_model(self, name: str) -> None:
        """Save compact model.

        Args:
            name: Name of the compact model file.
        """
        self.topicmodeler.save_compact_model(name)

    @classmethod
    def from_pretrained_gensimtopic(
            cls,
            name: str,
            preprocessor: Optional[callable] = None,
            tokenizer: Optional[callable] = None,
            compact: bool = True
    ) -> Self:
        """Load a gensim topic model and return a cosine classifier.

        Args:
            name: Model name (compact) or file prefix (non-compact).
            preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1.
            tokenizer: Tokenizing function, forwarded unchanged to
                ``GensimTopicModeler.from_pretrained``.
            compact: Whether to load compact model. Default: True.

        Returns:
            An instance of ``cls`` wrapping the loaded topic modeler.
        """
        topicmodeler = GensimTopicModeler.from_pretrained(
            name, preprocessor=preprocessor, tokenizer=tokenizer, compact=compact
        )
        # Build through ``cls`` so that subclasses get an instance of their
        # own type, as the ``Self`` return annotation promises.
        return cls(topicmodeler)

    @classmethod
    def from_pretrained_autoencoder(
            cls,
            name: str,
            preprocessor: Optional[callable] = None,
            tokenizer: Optional[callable] = None,
            compact: bool = True
    ) -> Self:
        """Load an autoencoder topic model and return a cosine classifier.

        All arguments are forwarded unchanged to
        ``AutoencodingTopicModeler.from_pretrained``.

        Args:
            name: Identifier of the saved model (presumably a compact model
                name or a file prefix, as for the gensim variant).
            preprocessor: Text preprocessing function.
            tokenizer: Tokenizing function.
            compact: Whether to load compact model. Default: True.

        Returns:
            An instance of ``cls`` wrapping the loaded autoencoder modeler.
        """
        autoencoder = AutoencodingTopicModeler.from_pretrained(
            name, preprocessor=preprocessor, tokenizer=tokenizer, compact=compact
        )
        # Build through ``cls`` so that subclasses get an instance of their
        # own type, as the ``Self`` return annotation promises.
        return cls(autoencoder)
[docs]
def train_gensimtopicvec_cosineClassifier(
        classdict: dict[str, list[str]],
        nb_topics: int,
        preprocessor: Optional[callable] = None,
        tokenizer: Optional[callable] = None,
        algorithm: Literal["lda", "lsi", "rp"] = "lda",
        toweigh: bool = True,
        normalize: bool = True,
        *args, **kwargs
) -> TopicVecCosineDistanceClassifier:
    """Fit a gensim topic model and wrap it in a cosine classifier.

    Args:
        classdict: Training data.
        nb_topics: Number of latent topics.
        preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1.
        tokenizer: Tokenizing function, handed to ``GensimTopicModeler``.
        algorithm: Topic modeling algorithm. Options: lda, lsi, rp. Default: lda.
        toweigh: Whether to apply tf-idf weighting. Default: True.
        normalize: Whether to normalize topic vectors. Default: True.
        *args: Additional arguments for gensim topic model.
        **kwargs: Additional keyword arguments for gensim topic model.

    Returns:
        TopicVecCosineDistanceClassifier instance.
    """
    # Collect the modeler configuration in one place before constructing it.
    modeler_options = {
        "preprocessor": preprocessor,
        "tokenizer": tokenizer,
        "algorithm": algorithm,
        "toweigh": toweigh,
        "normalize": normalize,
    }
    gensim_modeler = GensimTopicModeler(**modeler_options)

    # Extra positional/keyword arguments go to the training call only.
    gensim_modeler.train(classdict, nb_topics, *args, **kwargs)

    return TopicVecCosineDistanceClassifier(gensim_modeler)
[docs]
@deprecated(deprecated_in="4.0.1", removed_in="5.0.0")
def load_gensimtopicvec_cosineClassifier(
        name: str,
        preprocessor: Optional[callable] = None,
        tokenizer: Optional[callable] = None,
        compact: bool = True
) -> TopicVecCosineDistanceClassifier:
    """Deprecated. Use `~TopicVecCosineDistanceClassifier.from_pretrained_gensimtopic`.

    Kept as a backward-compatible shim: every argument is forwarded
    unchanged to the classmethod named above.
    """
    factory = TopicVecCosineDistanceClassifier.from_pretrained_gensimtopic
    return factory(
        name,
        preprocessor=preprocessor,
        tokenizer=tokenizer,
        compact=compact,
    )
[docs]
def train_autoencoder_cosineClassifier(
        classdict: dict[str, list[str]],
        nb_topics: int,
        preprocessor: Optional[callable] = None,
        tokenizer: Optional[callable] = None,
        normalize: bool = True,
        *args, **kwargs
) -> TopicVecCosineDistanceClassifier:
    """Fit an autoencoder topic model and wrap it in a cosine classifier.

    Args:
        classdict: Training data.
        nb_topics: Number of topics (encoding dimensions).
        preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1.
        tokenizer: Tokenizing function, handed to ``AutoencodingTopicModeler``.
        normalize: Whether to normalize topic vectors. Default: True.
        *args: Additional arguments for Keras model fitting.
        **kwargs: Additional keyword arguments for Keras model fitting.

    Returns:
        TopicVecCosineDistanceClassifier instance.
    """
    # Collect the modeler configuration in one place before constructing it.
    modeler_options = {
        "preprocessor": preprocessor,
        "tokenizer": tokenizer,
        "normalize": normalize,
    }
    encoder_modeler = AutoencodingTopicModeler(**modeler_options)

    # Extra positional/keyword arguments go to the training call only.
    encoder_modeler.train(classdict, nb_topics, *args, **kwargs)

    return TopicVecCosineDistanceClassifier(encoder_modeler)
[docs]
@deprecated(deprecated_in="4.0.1", removed_in="5.0.0")
def load_autoencoder_cosineClassifier(
        name: str,
        preprocessor: Optional[callable] = None,
        tokenizer: Optional[callable] = None,
        compact: bool = True
) -> TopicVecCosineDistanceClassifier:
    """Deprecated. Use `~TopicVecCosineDistanceClassifier.from_pretrained_autoencoder`.

    Kept as a backward-compatible shim: every argument is forwarded
    unchanged to the classmethod named above. The ``@deprecated`` marker
    uses the same versions as ``load_gensimtopicvec_cosineClassifier`` so
    both legacy loaders are flagged consistently.

    Args:
        name: Identifier of the saved model, forwarded as-is.
        preprocessor: Text preprocessing function, forwarded as-is.
        tokenizer: Tokenizing function, forwarded as-is.
        compact: Whether to load compact model. Default: True.

    Returns:
        TopicVecCosineDistanceClassifier instance.
    """
    return TopicVecCosineDistanceClassifier.from_pretrained_autoencoder(
        name, preprocessor=preprocessor, tokenizer=tokenizer, compact=compact
    )