Source code for shorttext.spell.norvig


# reference: https://norvig.com/spell-correct.html

import re
from collections import Counter
from typing import Generator

from . import SpellCorrector
from .editor import compute_set_edits1, compute_set_edits2


[docs] class NorvigSpellCorrector(SpellCorrector): """Spell corrector based on Peter Norvig's algorithm. Uses word frequency counts to suggest corrections for misspelled words by finding edits that exist in the vocabulary. Reference: https://norvig.com/spell-correct.html """
[docs] def __init__(self): """Initialize the spell corrector.""" self.train('')
[docs] def train(self, text: str) -> None: """Train on a text corpus. Builds a word frequency dictionary from the input text. Args: text: Training text corpus. """ self.words = re.findall('\\w+', text.lower()) self.WORDS = Counter(self.words) self.N = sum(self.WORDS.values())
[docs] def P(self, word: str) -> float: """Compute word probability from the training corpus. Args: word: Word to get probability for. Returns: Probability of the word appearing in the corpus. """ return self.WORDS[word] / float(self.N)
[docs] def correct(self, word: str) -> str: """Recommend spelling correction for a word. Args: word: Word to correct. Returns: Most likely correction, or the original word if no better option. """ return max(self.candidates(word), key=self.P)
[docs] def known(self, words: list[str]) -> set[str]: """Filter words found in the training vocabulary. Args: words: List of words to check. Returns: Subset of words that appear in the training corpus. """ return set(w for w in words if w in self.WORDS)
[docs] def candidates(self, word: str) -> Generator[str, None, None]: """Generate spelling correction candidates. Checks exact match, then edits of distance 1 and 2. Args: word: Word to find candidates for. Yields: Viable correction candidates. """ return (self.known([word]) or self.known(compute_set_edits1(word)) or self.known(compute_set_edits2(word)) or [word])