Source code for shorttext.generators.charbase.char2vec


from functools import partial
from os import PathLike
from typing import Self, Optional

import numpy as np
import numpy.typing as npt
from scipy.sparse import csc_matrix
from gensim.corpora import Dictionary
from sklearn.preprocessing import OneHotEncoder
from deprecation import deprecated

from ...utils.misc import textfile_generator


[docs] class SentenceToCharVecEncoder: """One-hot encoder for character-level text representations. Converts sentences into one-hot encoded vectors at the character level. Useful for character-level sequence models. Reference: General architecture inspired by char-RNN and related models. """
[docs] def __init__(self, dictionary: Dictionary, signalchar: str='\n'): """Initialize the character vector encoder. Args: dictionary: Gensim Dictionary mapping characters to indices. signalchar: Signal character for sequence markers. Default: '\\n'. """ self.dictionary = dictionary self.signalchar = signalchar numchars = len(self.dictionary) self.onehot_encoder = OneHotEncoder() self.onehot_encoder.fit(np.arange(numchars).reshape((numchars, 1)))
[docs] def calculate_prelim_vec(self, sent: str) -> npt.NDArray[np.float64]: """Convert sentence to one-hot character vectors. Args: sent: Input sentence. Returns: One-hot encoded sparse matrix where each row represents a character's encoding. """ return self.onehot_encoder.transform( np.array([self.dictionary.token2id[c] for c in sent]).reshape((len(sent), 1)) ).astype(np.float64)
[docs] def encode_sentence( self, sent: str, maxlen: int, startsig: bool = False, endsig=False ) -> csc_matrix: """Encode a sentence to a sparse character vector matrix. Args: sent: Input sentence to encode. maxlen: Maximum length of the encoded sequence. startsig: Whether to prepend signal character. Default: False. endsig: Whether to append signal character. Default: False. Returns: Sparse matrix representing the sentence with shape (maxlen + startsig + endsig, num_chars). """ cor_sent = (self.signalchar if startsig else '') + sent[:min(maxlen, len(sent))] + (self.signalchar if endsig else '') sent_vec = self.calculate_prelim_vec(cor_sent).tocsc() if sent_vec.shape[0] == maxlen + startsig + endsig: return sent_vec else: return csc_matrix((sent_vec.data, sent_vec.indices, sent_vec.indptr), shape=(maxlen + startsig + endsig, sent_vec.shape[1]), dtype=np.float64)
[docs] def encode_sentences( self, sentences: list[str], maxlen: int, sparse: bool = True, startsig: bool = False, endsig: bool = False ) -> list[npt.NDArray[np.float64]] | npt.NDArray[np.float64]: """Encode multiple sentences into character vectors. Args: sentences: List of sentences to encode. maxlen: Maximum length for each encoded sentence. sparse: Whether to return sparse matrices. Default: True. startsig: Whether to prepend signal character. Default: False. endsig: Whether to append signal character. Default: False. Returns: If sparse=True: list of sparse matrices. If sparse=False: numpy array of shape (n_sentences, maxlen, num_chars). """ encode_sent_func = partial(self.encode_sentence, startsig=startsig, endsig=endsig, maxlen=maxlen) list_encoded_sentences_map = map(encode_sent_func, sentences) if sparse: return list(list_encoded_sentences_map) else: return np.array([sparsevec.toarray() for sparsevec in list_encoded_sentences_map])
[docs] def __len__(self) -> int: """Return the number of unique characters in the dictionary.""" return len(self.dictionary)
[docs] @classmethod def from_pretrained( cls, textfile: str | PathLike, encoding: Optional[bool] = None ) -> Self: """Create a SentenceToCharVecEncoder from a text file. Builds a character dictionary from the given text file and returns an encoder instance. Args: textfile: Path to the text file for building the character dictionary. encoding: Encoding of the text file. Default: None. Returns: A SentenceToCharVecEncoder instance. """ dictionary = Dictionary( map( lambda line: [c for c in line], textfile_generator(textfile, encoding=encoding) ) ) return SentenceToCharVecEncoder(dictionary)
[docs] def initialize_SentenceToCharVecEncoder( textfile: str | PathLike, encoding: Optional[bool] = None ) -> SentenceToCharVecEncoder: """ Deprecated. Use `~SentenceToCharVecEncoder.from_pretrained`. """ return SentenceToCharVecEncoder.from_pretrained( textfile, encoding=encoding )
[docs] @deprecated(deprecated_in="4.0.0", removed_in="4.1.0") def initSentenceToCharVecEncoder( textfile: str | PathLike, encoding: Optional[bool] = None ) -> SentenceToCharVecEncoder: """ Deprecated. Use initialize_SentenceToCharVecEncoder instead. """ return SentenceToCharVecEncoder.from_pretrained( textfile, encoding=encoding )