Source code for shorttext.utils.textpreprocessing


import codecs
import io
import os
import re
from functools import partial
from typing import TextIO

import snowballstemmer


# tokenizer

[docs]
def tokenize(s: str) -> list[str]:
    """Split a string into tokens on single space characters.

    Only the space character ``' '`` acts as a separator. Consecutive
    spaces therefore produce empty-string tokens, and other whitespace
    (tabs, newlines) stays inside the tokens.

    Args:
        s: Input string to tokenize.

    Returns:
        List of the pieces of ``s`` found between space characters.
    """
    separator = ' '
    return s.split(separator)



# stemmer

[docs]
class StemmerSingleton:
    """Shared, lazily created English snowball stemmer.

    Every instantiation returns the same object. The underlying
    ``snowballstemmer`` English stemmer is built once, on first
    construction, and stored on the class.
    """

    def __new__(cls):
        try:
            # Fast path: the shared instance already exists.
            return cls.instance
        except AttributeError:
            # First construction: create the instance, then its stemmer.
            cls.instance = super().__new__(cls)
            cls.stemmer = snowballstemmer.stemmer('english')
            return cls.instance

    def __call__(self, s: str) -> str:
        """Stem a single word with the shared stemmer.

        Args:
            s: Word to stem.

        Returns:
            The result of ``stemWord`` for ``s``.
        """
        return self.stemmer.stemWord(s)





[docs]
def stemword(s: str) -> str:
    """Stem a word with the shared ``StemmerSingleton`` stemmer.

    Args:
        s: Word to stem.

    Returns:
        Stemmed word.
    """
    stemmer = StemmerSingleton()
    return stemmer(s)




[docs]
def preprocess_text(text: str, pipeline: list[callable]) -> str:
    """Preprocess text according to a given pipeline.

    Feeds ``text`` through each function of ``pipeline`` in order; the
    output of one step is the input of the next.

    Args:
        text: Input text to preprocess.
        pipeline: Functions that each transform a text string into another
            text string. An empty pipeline returns ``text`` unchanged.

    Returns:
        The text after all pipeline functions have been applied.
    """
    # Loop instead of recursing on ``pipeline[1:]``: no list copy per step
    # and no recursion-depth ceiling on the pipeline length.
    for transform in pipeline:
        text = transform(text)
    return text




[docs]
def tokenize_text(
        text: str,
        presplit_pipeline: list[callable],
        primitize_tokenizer: callable,
        postsplit_pipeline: list[callable],
        stopwordsfile: TextIO
) -> list[str]:
    """Tokenize text with preprocessing pipelines.

    Runs the pre-split pipeline on the whole text, splits it into tokens,
    runs the post-split pipeline on every token, and finally drops the
    tokens found in the stopword list. Stopwords are compared against the
    tokens *after* the post-split pipeline has been applied.

    Args:
        text: Input text to tokenize.
        presplit_pipeline: Functions applied in order to the whole text
            before tokenization.
        primitize_tokenizer: Function that splits text into tokens.
        postsplit_pipeline: Functions applied in order to each token after
            tokenization.
        stopwordsfile: Iterable of lines, one stopword per line
            (surrounding whitespace is stripped). A seekable file object is
            rewound to its start first, so the same handle can be reused
            for several calls. The file is not closed.

    Returns:
        List of tokens after preprocessing and stopword filtering.
    """
    # A handle that is bound once and reused (e.g. via functools.partial) is
    # at EOF after the first call; without rewinding, later calls would see
    # an empty stopword list and silently stop filtering.
    seekable = getattr(stopwordsfile, 'seekable', None)
    if callable(seekable) and seekable():
        stopwordsfile.seek(0)
    stopwordset = {stopword.strip() for stopword in stopwordsfile}

    for func in presplit_pipeline:
        text = func(text)

    # Build new lists instead of assigning by index, so the tokenizer may
    # return any iterable and its result is never mutated.
    tokens = primitize_tokenizer(text)
    for func in postsplit_pipeline:
        tokens = [func(token) for token in tokens]

    return [token for token in tokens if token not in stopwordset]




[docs]
def text_preprocessor(pipeline: list[callable]) -> callable:
    """Build a one-argument preprocessor bound to a fixed pipeline.

    The result is ``preprocess_text`` with its ``pipeline`` argument
    already filled in, so callers only have to supply the text.

    Args:
        pipeline: List of functions that transform text to text.

    Returns:
        A callable that takes text and returns preprocessed text.
    """
    bound_preprocessor = partial(preprocess_text, pipeline=pipeline)
    return bound_preprocessor




[docs]
def oldschool_standard_text_preprocessor(stopwordsfile: TextIO) -> callable:
    """Create a standard text preprocessor.

    Returns a text preprocessor that applies these steps in order:
    - Remove every character that is neither a word character nor whitespace
    - Remove the digits 0-9
    - Convert to lowercase
    - Remove stop words
    - Stem each remaining token with ``stemword``

    Args:
        stopwordsfile: File object with one stopword per line. It is read
            completely and then closed by this function, including when
            reading fails.

    Returns:
        A callable that takes text and returns preprocessed text.
    """
    # Read the stopwords; ``finally`` releases the handle on errors too
    # (e.g. a decoding failure half-way through the file).
    try:
        stopwordset = {stopword.strip() for stopword in stopwordsfile}
    finally:
        stopwordsfile.close()

    def remove_stopwords(s: str) -> str:
        # Drop the tokens that appear in the stopword set.
        return ' '.join(token for token in tokenize(s) if token not in stopwordset)

    def stem_tokens(s: str) -> str:
        # Stem every token and put the text back together.
        return ' '.join(stemword(token) for token in tokenize(s))

    pipeline = [
        lambda s: re.sub(r'[^\w\s]', '', s),
        lambda s: re.sub(r'[0-9]', '', s),
        lambda s: s.lower(),
        remove_stopwords,
        stem_tokens,
    ]
    return text_preprocessor(pipeline)




[docs]
def standard_text_preprocessor_1() -> callable:
    """Create the standard text preprocessor backed by ``stopwords.txt``.

    Opens ``stopwords.txt`` from the directory of this module (described
    by the original author as the NLTK stopword list) and hands it to
    ``oldschool_standard_text_preprocessor``, which applies:
    - Remove special characters
    - Remove numerals
    - Convert to lowercase
    - Remove stop words
    - Stem words

    Returns:
        A callable that takes text and returns preprocessed text.
    """
    # The stopword list is located next to this source file.
    module_dir = os.path.dirname(__file__)
    stopwords_path = os.path.join(module_dir, 'stopwords.txt')

    # The callee reads the file and closes it.
    return oldschool_standard_text_preprocessor(
        codecs.open(stopwords_path, 'r', 'utf-8')
    )




[docs]
def standard_text_preprocessor_2() -> callable:
    """Create the standard text preprocessor backed by ``nonneg_stopwords.txt``.

    Opens ``nonneg_stopwords.txt`` from the directory of this module
    (described by the original author as the NLTK stopword list minus
    negation terms) and hands it to
    ``oldschool_standard_text_preprocessor``, which applies:
    - Remove special characters
    - Remove numerals
    - Convert to lowercase
    - Remove stop words
    - Stem words

    Returns:
        A callable that takes text and returns preprocessed text.
    """
    # The stopword list is located next to this source file.
    module_dir = os.path.dirname(__file__)
    stopwords_path = os.path.join(module_dir, 'nonneg_stopwords.txt')

    # The callee reads the file and closes it.
    return oldschool_standard_text_preprocessor(
        codecs.open(stopwords_path, 'r', 'utf-8')
    )




[docs]
def advanced_text_tokenizer_1() -> callable:
    """Create an advanced text tokenizer.

    Returns a tokenizer function that applies these steps in order:
    - Remove every character that is neither a word character nor whitespace
    - Remove the digits 0-9
    - Convert to lowercase
    - Split into tokens with ``tokenize``
    - Stem each token with ``stemword``
    - Drop the (stemmed) tokens listed in ``nonneg_stopwords.txt``

    The stopword file, located next to this module, is read once when the
    tokenizer is created and closed immediately afterwards.

    Returns:
        A callable that takes text and returns a list of tokens.
    """
    presplit_pipeline = [
        lambda s: re.sub(r'[^\w\s]', '', s),
        lambda s: re.sub(r'[0-9]', '', s),
        lambda s: s.lower()
    ]
    # ``tokenize`` splits on single spaces, so a token never contains one;
    # stemming it directly gives the same result as tokenizing it again,
    # stemming the single piece and joining it back.
    postsplit_pipeline = [stemword]

    this_dir = os.path.dirname(__file__)
    stopwords_path = os.path.join(this_dir, 'nonneg_stopwords.txt')
    with codecs.open(stopwords_path, 'r', 'utf-8') as stopwordsfile:
        stopwords_text = stopwordsfile.read()

    def tokenizer(text: str) -> list[str]:
        # ``tokenize_text`` consumes its stopword stream on every call. A
        # single shared file handle would be at EOF after the first call
        # and disable stopword filtering, so pass a fresh in-memory stream.
        return tokenize_text(
            text,
            presplit_pipeline=presplit_pipeline,
            primitize_tokenizer=tokenize,
            postsplit_pipeline=postsplit_pipeline,
            stopwordsfile=io.StringIO(stopwords_text)
        )

    return tokenizer