Source code for shorttext.utils.textpreprocessing


import re
import os
import codecs
from typing import TextIO
from functools import partial

import snowballstemmer


# tokenizer
def tokenize(s: str) -> list[str]:
    """Split a string into tokens on single space characters.

    The split uses the literal ``' '`` separator only: consecutive spaces
    produce empty-string tokens, and other whitespace (tabs, newlines)
    stays inside the tokens.

    Args:
        s: Input string to tokenize.

    Returns:
        List of tokens obtained by splitting ``s`` on ``' '``.
    """
    separator = ' '
    tokens = s.split(separator)
    return tokens
# stemmer
class StemmerSingleton:
    """Singleton wrapper around the snowball stemmer for English.

    The single shared instance and the underlying stemmer object are both
    stored as class attributes (``instance`` and ``stemmer``) the first
    time the class is instantiated; later instantiations return that same
    instance.
    """

    def __new__(cls):
        # Fast path: the shared instance has already been created.
        if hasattr(cls, 'instance'):
            return cls.instance

        # First instantiation: create the instance, then the stemmer.
        cls.instance = super().__new__(cls)
        cls.stemmer = snowballstemmer.stemmer('english')
        return cls.instance

    def __call__(self, s: str) -> str:
        """Stem a single word with the shared snowball English stemmer.

        Args:
            s: Word to stem.

        Returns:
            Stemmed word.
        """
        stemmed = self.stemmer.stemWord(s)
        return stemmed
def stemword(s: str) -> str:
    """Stem a word through the shared ``StemmerSingleton`` instance.

    Args:
        s: Word to stem.

    Returns:
        Stemmed word.
    """
    stemmer = StemmerSingleton()
    return stemmer(s)
def preprocess_text(text: str, pipeline: list[callable]) -> str:
    """Preprocess text by applying a pipeline of functions in order.

    Each function in the pipeline receives the output of the previous one
    (the first receives ``text``) and returns the transformed text, e.g.
    stemming, lemmatizing, or removing punctuation.

    Args:
        text: Input text to preprocess.
        pipeline: Functions that each transform a text string into another
            text string. Any iterable of callables is accepted; an empty
            pipeline returns ``text`` unchanged.

    Returns:
        The text after all pipeline functions have been applied.
    """
    # Iterate rather than recurse: no recursion-depth limit on long
    # pipelines and no repeated slicing of the pipeline list.
    for step in pipeline:
        text = step(text)
    return text
def tokenize_text(
        text: str,
        presplit_pipeline: list[callable],
        primitize_tokenizer: callable,
        postsplit_pipeline: list[callable],
        stopwordsfile: TextIO
) -> list[str]:
    """Tokenize text with pre-split and post-split pipelines.

    The text is passed through ``presplit_pipeline``, split into tokens by
    ``primitize_tokenizer``, each token is passed through
    ``postsplit_pipeline``, and finally tokens found in the stopword set
    are dropped.

    Args:
        text: Input text to tokenize.
        presplit_pipeline: Functions applied, in order, to the whole text
            before tokenization.
        primitize_tokenizer: Function splitting the text into a sequence
            of tokens.
        postsplit_pipeline: Functions applied, in order, to every token
            after tokenization.
        stopwordsfile: Iterable of stopword lines (typically an open text
            file); each line is stripped of surrounding whitespace. A
            seekable stream is rewound to its start before reading so
            that the same file object can be reused across calls.

    Returns:
        List of processed tokens that are not stopwords.
    """
    # A file object is exhausted after one pass; without rewinding, a second
    # call with the same object would see no stopwords and filter nothing.
    seekable = getattr(stopwordsfile, 'seekable', None)
    if callable(seekable) and seekable():
        stopwordsfile.seek(0)
    stopwordset = {stopword.strip() for stopword in stopwordsfile}

    for func in presplit_pipeline:
        text = func(text)

    # Copy into a fresh list so the tokenizer's return value is never
    # mutated and any iterable of tokens is accepted.
    tokens = list(primitize_tokenizer(text))
    for func in postsplit_pipeline:
        tokens = [func(token) for token in tokens]

    return [token for token in tokens if token not in stopwordset]
def text_preprocessor(pipeline: list[callable]) -> callable:
    """Build a text preprocessor bound to a fixed pipeline.

    Convenience wrapper that binds ``pipeline`` as the ``pipeline``
    keyword argument of ``preprocess_text`` using ``functools.partial``.

    Args:
        pipeline: List of functions that transform text to text.

    Returns:
        A callable that takes text and returns the preprocessed text.
    """
    bound_preprocessor = partial(preprocess_text, pipeline=pipeline)
    return bound_preprocessor
def oldschool_standard_text_preprocessor(stopwordsfile: TextIO) -> callable:
    """Create a standard text preprocessor.

    The returned preprocessor applies these steps in order:

    - Remove every character that is neither a word character nor
      whitespace (regex ``[^\\w\\s]``)
    - Remove the digits 0-9
    - Convert to lowercase
    - Remove stop words
    - Stem each remaining token with ``stemword``

    Args:
        stopwordsfile: Open file object with one stopword per line. It is
            read once and closed by this function, including when reading
            fails.

    Returns:
        A callable that takes text and returns preprocessed text.
    """
    # Load the stopwords; the finally clause guarantees the handle is
    # released even if iteration raises (e.g. on a decoding error).
    try:
        stopwordset = {stopword.strip() for stopword in stopwordsfile}
    finally:
        stopwordsfile.close()

    def remove_stopwords(s: str) -> str:
        """Drop tokens that appear in the stopword set."""
        return ' '.join(
            token for token in tokenize(s) if token not in stopwordset
        )

    def stem_tokens(s: str) -> str:
        """Stem every token of the text."""
        return ' '.join(stemword(token) for token in tokenize(s))

    # the pipeline
    pipeline = [
        lambda s: re.sub(r'[^\w\s]', '', s),
        lambda s: re.sub(r'[0-9]', '', s),
        lambda s: s.lower(),
        remove_stopwords,
        stem_tokens,
    ]
    return text_preprocessor(pipeline)
def standard_text_preprocessor_1() -> callable:
    """Create the standard text preprocessor backed by ``stopwords.txt``.

    The stop words are read from the ``stopwords.txt`` file located in the
    same directory as this module (described by the original documentation
    as the NLTK stop word list). The returned preprocessor is the one
    built by ``oldschool_standard_text_preprocessor``:

    - Remove special characters
    - Remove numerals
    - Convert to lowercase
    - Remove stop words
    - Stem words

    Returns:
        A callable that takes text and returns preprocessed text.
    """
    module_dir = os.path.dirname(__file__)
    stopwords_path = os.path.join(module_dir, 'stopwords.txt')
    # The open stream is handed to the factory below, which reads and
    # closes it.
    stopwords_stream = codecs.open(stopwords_path, 'r', 'utf-8')
    return oldschool_standard_text_preprocessor(stopwords_stream)
def standard_text_preprocessor_2() -> callable:
    """Create the standard text preprocessor backed by ``nonneg_stopwords.txt``.

    The stop words are read from the ``nonneg_stopwords.txt`` file located
    in the same directory as this module (described by the original
    documentation as the NLTK stop word list minus negation terms). The
    returned preprocessor is the one built by
    ``oldschool_standard_text_preprocessor``:

    - Remove special characters
    - Remove numerals
    - Convert to lowercase
    - Remove stop words
    - Stem words

    Returns:
        A callable that takes text and returns preprocessed text.
    """
    module_dir = os.path.dirname(__file__)
    stopwords_path = os.path.join(module_dir, 'nonneg_stopwords.txt')
    # The open stream is handed to the factory below, which reads and
    # closes it.
    stopwords_stream = codecs.open(stopwords_path, 'r', 'utf-8')
    return oldschool_standard_text_preprocessor(stopwords_stream)
def advanced_text_tokenizer_1() -> callable:
    """Create an advanced text tokenizer.

    The returned tokenizer is ``tokenize_text`` with all arguments except
    ``text`` bound, and applies these steps in order:

    - Remove every character that is neither a word character nor
      whitespace (regex ``[^\\w\\s]``)
    - Remove the digits 0-9
    - Convert to lowercase
    - Split into tokens with ``tokenize``
    - Stem each token with ``stemword``
    - Filter out tokens listed in ``nonneg_stopwords.txt``

    Returns:
        A callable that takes text and returns a list of tokens.
    """
    presplit_pipeline = [
        lambda s: re.sub(r'[^\w\s]', '', s),
        lambda s: re.sub(r'[0-9]', '', s),
        lambda s: s.lower()
    ]
    tokenizer = tokenize
    postsplit_pipeline = [
        lambda s: ' '.join([stemword(stemmed_token) for stemmed_token in tokenize(s)])
    ]

    # Read the stopword lines once and close the file. Binding the open
    # handle instead would leak it, and it would be exhausted after the
    # first tokenizer call, so later calls would filter nothing.
    # tokenize_text only iterates over its stopwordsfile argument, so a
    # list of lines is a valid, reusable substitute.
    this_dir = os.path.dirname(__file__)
    stopwords_path = os.path.join(this_dir, 'nonneg_stopwords.txt')
    with codecs.open(stopwords_path, 'r', 'utf-8') as stopwordsfile:
        stopword_lines = list(stopwordsfile)

    return partial(
        tokenize_text,
        presplit_pipeline=presplit_pipeline,
        primitize_tokenizer=tokenizer,
        postsplit_pipeline=postsplit_pipeline,
        stopwordsfile=stopword_lines
    )