Source code for spellchecker.spellchecker

""" SpellChecker Module; simple, intuitive spell checker based on the post by
    Peter Norvig. See: https://norvig.com/spell-correct.html """
import gzip
import json
import pkgutil
import string
import typing
from collections import Counter
from collections.abc import Iterable

from spellchecker.utils import KeyT, PathOrStr, _parse_into_words, ensure_unicode, load_file, write_file



[docs]
class SpellChecker:
    """The SpellChecker class encapsulates the basics needed to accomplish a
    simple spell checking algorithm. It is based on the work by
    Peter Norvig (https://norvig.com/spell-correct.html)

    Args:
        language (str): The language of the dictionary to load or None for no dictionary. Supported languages are \
            `en`, `es`, `fr`, `it`, `pt`, `de`, `ru`, `ar`, `lv`, `eu`, and `nl`. Defaults to `en`. A list of \
            languages may be provided and all languages will be loaded.
        local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no language \
            will be loaded
        distance (int): The edit distance to use, either 1 or 2; any other value falls back to 2. Defaults to 2.
        tokenizer (function): A callable that splits a string into an iterable of words; when omitted the \
            default word parser of the library is used
        case_sensitive (bool): Flag to use a case sensitive dictionary or not, only available when not using a \
            language dictionary (the flag is forced to False whenever `language` is set).
    Note:
        Using a case sensitive dictionary can be slow to correct words."""

    # fixed attribute set: instances carry no per-instance __dict__
    __slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]

    def __init__(
        self,
        language: typing.Union[str, typing.Iterable[str], None] = "en",
        local_dictionary: typing.Optional[PathOrStr] = None,
        distance: int = 2,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
        case_sensitive: bool = False,
    ) -> None:
        """Build the checker and load either a local dictionary or the
        requested language dictionaries

        Args:
            language (str): A language code, an iterable of codes, or None to load nothing
            local_dictionary (str): Path to a word frequency file; takes precedence over `language`
            distance (int): The edit distance to use (1 or 2)
            tokenizer (function): Callable used to split text into words
            case_sensitive (bool): Only honored when `language` is falsy
        Raises:
            ValueError: If the resource of a requested language is missing or cannot be read"""
        self._distance = 2  # default
        self.distance = distance  # use the setter value check

        self._tokenizer = tokenizer if tokenizer else _parse_into_words

        # case sensitivity is switched off as soon as any language is requested
        self._case_sensitive = case_sensitive if not language else False
        self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)

        if local_dictionary:
            self._word_frequency.load_dictionary(local_dictionary)
        elif language:
            # a lone code (str/bytes) is itself iterable, so wrap it explicitly
            if not isinstance(language, Iterable) or isinstance(language, (str, bytes)):
                language = [language]
            for lang in language:
                filename = f"resources/{lang.lower()}.json.gz"
                try:
                    json_open = pkgutil.get_data("spellchecker", filename)
                except FileNotFoundError as exc:
                    msg = f"The provided dictionary language ({lang.lower()}) does not exist!"
                    raise ValueError(msg) from exc
                if not json_open:
                    # get_data() may hand back None instead of raising; without this guard the
                    # code below would hit an unbound name or re-load the previous language
                    msg = f"The provided dictionary language ({lang.lower()}) could not be loaded!"
                    raise ValueError(msg)
                lang_dict = json.loads(gzip.decompress(json_open).decode("utf-8"))
                self._word_frequency.load_json(lang_dict)

    def __contains__(self, key: KeyT) -> bool:
        """Support `key in spell` by asking the word frequency object"""
        return ensure_unicode(key) in self._word_frequency

    def __getitem__(self, key: KeyT) -> int:
        """Support `spell[key]` by returning the frequency stored for `key`"""
        return self._word_frequency[ensure_unicode(key)]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """setup iter support"""
        yield from self._word_frequency.dictionary


[docs]
    @classmethod
    def languages(cls) -> typing.Iterable[str]:
        """list: A list of all official languages supported by the library"""
        return ["en", "es", "fr", "it", "pt", "de", "ru", "ar", "lv", "eu", "nl"]


    @property
    def word_frequency(self) -> "WordFrequency":
        """WordFrequency: An encapsulation of the word frequency `dictionary`

        Note:
            Not settable"""
        return self._word_frequency

    @property
    def distance(self) -> int:
        """int: The maximum edit distance to calculate

        Note:
            Valid values are 1 or 2; if an invalid value is passed, defaults to 2"""
        return self._distance

    @distance.setter
    def distance(self, val: int) -> None:
        """set the distance parameter"""
        tmp = 2
        try:
            if 0 < int(val) <= 2:
                tmp = val
        except (ValueError, TypeError):
            pass
        self._distance = tmp


[docs]
    def split_words(self, text: KeyT) -> typing.Iterable[str]:
        """Break `text` into individual `words` with the tokenizer configured
        on this instance (the default parser unless one was passed in)

        Args:
            text (str): The text to split into individual words
        Returns:
            list(str): A listing of all words in the provided text"""
        return self._tokenizer(ensure_unicode(text))



[docs]
    def export(self, filepath: PathOrStr, encoding: str = "utf-8", gzipped: bool = True) -> None:
        """Write the word frequency dictionary to disk as JSON (keys sorted)
        so it can be loaded again later

        Args:
           filepath (str): The filepath to the exported dictionary
           encoding (str): The encoding of the resulting output
           gzipped (bool): Whether to gzip the dictionary or not"""
        frequencies = self.word_frequency.dictionary
        payload = json.dumps(frequencies, sort_keys=True)
        write_file(filepath, encoding, gzipped, payload)



[docs]
    def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
        """Calculate the frequency to the `word` provided as seen across the
        entire dictionary

        Args:
            word (str): The word for which the word probability is calculated
            total_words (int): The total number of words to use in the calculation; \
                use the default for using the whole word frequency
        Returns:
            float: The probability that the word is the correct word; 0.0 when the dictionary is empty"""
        if not total_words:
            total_words = self._word_frequency.total_words
        if not total_words:
            # nothing has been loaded yet: avoid dividing by zero
            return 0.0
        word = ensure_unicode(word)
        # go through WordFrequency.__getitem__ so the lookup is case folded
        # the same way as every other lookup on a case insensitive dictionary
        return self._word_frequency[word] / total_words



[docs]
    def correction(self, word: KeyT) -> typing.Optional[str]:
        """Pick the single most likely spelling for `word`

        Args:
            word (str): The word to correct
        Returns:
            str: The most likely candidate or None if no correction is present"""
        options = self.candidates(ensure_unicode(word))
        if not options:
            return None
        # sort first so that ties on frequency resolve to the alphabetically first word
        ranked = sorted(options)
        return max(ranked, key=self.__getitem__)



[docs]
    def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
        """Collect possible spelling corrections for `word`, widening the
        search to an edit distance of two only when distance one finds nothing

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            set: The set of words that are possible candidates or None if there are no candidates"""
        word = ensure_unicode(word)
        # already correct, or not something to check at all: hand the word back
        if self.known([word]) or not self._check_if_should_check(word):
            return {word}

        one_edit = list(self.edit_distance_1(word))
        matches = self.known(one_edit)
        if not matches and self._distance == 2:
            # reuse the distance 1 results as seeds for the distance 2 search
            matches = self.known(list(self.__edit_distance_alt(one_edit)))
        return matches if matches else None



[docs]
    def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
        """Filter `words` down to those present in the dictionary

        Args:
            words (list): List of words to determine which are in the corpus
        Returns:
            set: The set of those words from the input that are in the corpus"""
        dictionary = self._word_frequency.dictionary
        found = set()
        for raw in words:
            token = ensure_unicode(raw)
            if not self._case_sensitive:
                token = token.lower()
            if token in dictionary and self._check_if_should_check(token):
                found.add(token)
        return found



[docs]
    def unknown(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
        """Filter `words` down to those missing from the dictionary; words
        that should not be checked at all are left out of the result

        Args:
            words (list): List of words to determine which are not in the corpus
        Returns:
            set: The set of those words from the input that are not in the corpus"""
        dictionary = self._word_frequency.dictionary
        missing = set()
        for raw in words:
            token = ensure_unicode(raw)
            # the check runs on the word as given, before any case folding
            if not self._check_if_should_check(token):
                continue
            if not self._case_sensitive:
                token = token.lower()
            if token not in dictionary:
                missing.add(token)
        return missing



[docs]
    def edit_distance_1(self, word: KeyT) -> typing.Set[str]:
        """Build every string reachable from `word` with a single delete,
        transpose, replace or insert, drawing new characters only from the
        letters in the corpus

        Args:
            word (str): The word for which to calculate the edit distance
        Returns:
            set: The set of strings that are edit distance one from the provided word"""
        if self._case_sensitive:
            base = ensure_unicode(word)
        else:
            base = ensure_unicode(word).lower()
        if self._check_if_should_check(base) is False:
            return {base}

        alphabet = self._word_frequency.letters
        edits = set()
        for idx in range(len(base) + 1):
            head, tail = base[:idx], base[idx:]
            # inserts are possible at every split point, including the very end
            edits.update(head + ch + tail for ch in alphabet)
            if not tail:
                continue
            edits.add(head + tail[1:])  # delete
            edits.update(head + ch + tail[1:] for ch in alphabet)  # replace
            if len(tail) > 1:
                edits.add(head + tail[1] + tail[0] + tail[2:])  # transpose
        return edits



[docs]
    def edit_distance_2(self, word: KeyT) -> typing.List[str]:
        """Build the strings that are two edits away from `word` by applying
        `edit_distance_1` to each of its own results

        Args:
            word (str): The word for which to calculate the edit distance
        Returns:
            list: The strings that are edit distance two from the provided word (may contain duplicates)"""
        if self._case_sensitive:
            base = ensure_unicode(word)
        else:
            base = ensure_unicode(word).lower()
        two_away = []
        for first in self.edit_distance_1(base):
            two_away.extend(self.edit_distance_1(first))
        return two_away


    def __edit_distance_alt(self, words: typing.Iterable[KeyT]) -> typing.List[str]:
        """For each of the given words, collect the known words that are one
        further edit away, using only the letters in the corpus

        Args:
            words (list): The words for which to calculate the edit distance
        Returns:
            list: The known words that are one edit away from any of the provided words"""
        seeds = []
        for raw in words:
            token = ensure_unicode(raw)
            if self._check_if_should_check(token):
                seeds.append(token if self._case_sensitive else token.lower())

        found = []
        for seed in seeds:
            found.extend(self.known(self.edit_distance_1(seed)))
        return found

    def _check_if_should_check(self, word: str) -> bool:
        if len(word) == 1 and word in string.punctuation:
            return False
        if len(word) > self._word_frequency.longest_word_length + 3:  # allow removal of up to 2 letters
            return False
        if word.lower() == "nan":  # nan passes the float(word) so this will bypass that issue (#125)
            return True
        try:  # check if it is a number (int, float, etc)
            float(word)
            return False
        except ValueError:
            pass

        return True




[docs]
class WordFrequency:
    """Store the `dictionary` as a word frequency list while allowing for
    different methods to load the data and update over time

    Args:
        tokenizer (function): A callable that splits a string into an iterable of words; when omitted \
            the default word parser of the library is used
        case_sensitive (bool): When False (the default) lookups and most loaders lower-case words before \
            touching the dictionary"""

    # fixed attribute set: instances carry no per-instance __dict__
    __slots__ = [
        "_dictionary",
        "_total_words",
        "_unique_words",
        "_letters",
        "_tokenizer",
        "_case_sensitive",
        "_longest_word_length",
    ]

    def __init__(
        self,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
        case_sensitive: bool = False,
    ) -> None:
        """Start with an empty dictionary and zeroed statistics"""
        self._case_sensitive = case_sensitive
        # fall back to the default parser when no tokenizer is supplied
        self._tokenizer = _parse_into_words if tokenizer is None else tokenizer  # type:  ignore

        self._dictionary: typing.Counter = Counter()
        self._letters: typing.Set[str] = set()
        self._total_words = 0
        self._unique_words = 0
        self._longest_word_length = 0

    def __contains__(self, key: KeyT) -> bool:
        """Support `key in word_frequency`, lower-casing the key first unless
        the dictionary is case sensitive"""
        lookup = ensure_unicode(key)
        if not self._case_sensitive:
            lookup = lookup.lower()
        return lookup in self._dictionary

    def __getitem__(self, key: KeyT) -> int:
        """Support `word_frequency[key]`, lower-casing the key first unless
        the dictionary is case sensitive"""
        lookup = ensure_unicode(key)
        if not self._case_sensitive:
            lookup = lookup.lower()
        return self._dictionary[lookup]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """turn on iter support"""
        yield from self._dictionary


[docs]
    def pop(self, key: KeyT, default: typing.Optional[int] = None) -> typing.Optional[int]:
        """Take `key` out of the dictionary and hand back its count, or
        `default` when the key is absent

        Args:
            key (str): The key to remove
            default (obj): The value to return if key is not present
        Returns:
            int | None: Returns the number of instances of key, or `default` if not in the dictionary"""
        lookup = ensure_unicode(key)
        if not self._case_sensitive:
            lookup = lookup.lower()
        return self._dictionary.pop(lookup, default)


    @property
    def dictionary(self) -> typing.Dict[str, int]:
        """Counter: A counting dictionary of all words in the corpus and the number
        of times each has been seen

        Note:
            Not settable"""
        return self._dictionary

    @property
    def total_words(self) -> int:
        """int: The sum of all word occurrences in the word frequency dictionary

        Note:
            Not settable"""
        return self._total_words

    @property
    def unique_words(self) -> int:
        """int: The total number of unique words in the word frequency list

        Note:
            Not settable"""
        return self._unique_words

    @property
    def letters(self) -> typing.Set[str]:
        """set: The listing of all letters found within the corpus

        Note:
            Not settable"""
        return self._letters

    @property
    def longest_word_length(self) -> int:
        """int: The longest word length in the dictionary

        Note:
            Not settable"""
        return self._longest_word_length


[docs]
    def tokenize(self, text: KeyT) -> typing.Iterator[str]:
        """Split the provided string into words with the configured tokenizer,
        lower-casing each word unless the dictionary is case sensitive

        Args:
            text (str): The string object to tokenize
        Yields:
            str: The next `word` in the tokenized string
        Note:
            This is the same as the `spellchecker.split_words()` unless a tokenizer function was provided."""
        for token in self._tokenizer(ensure_unicode(text)):
            if self._case_sensitive:
                yield token
            else:
                yield token.lower()



[docs]
    def keys(self) -> typing.Iterator[str]:
        """Iterator over the key of the dictionary

        Yields:
            str: The next key in the dictionary
        Note:
            This is the same as `spellchecker.words()`"""
        yield from self._dictionary.keys()



[docs]
    def words(self) -> typing.Iterator[str]:
        """Iterator over the words in the dictionary

        Yields:
            str: The next word in the dictionary
        Note:
            This is the same as `spellchecker.keys()`"""
        yield from self._dictionary.keys()



[docs]
    def items(self) -> typing.Generator[typing.Tuple[str, int], None, None]:
        """Iterator over the words in the dictionary

        Yields:
            str: The next word in the dictionary
            int: The number of instances in the dictionary
        Note:
            This is the same as `dict.items()`"""
        yield from self._dictionary.items()



[docs]
    def load_dictionary(self, filename: PathOrStr, encoding: str = "utf-8") -> None:
        """Load in a pre-built word frequency list

        Args:
            filename (str): The filepath to the json (optionally gzipped) file to be loaded
            encoding (str): The encoding of the dictionary
        Note:
            For a case insensitive dictionary the words are lower-cased after the JSON has been parsed, \
            and the counts of words that only differ by case are added together"""
        with load_file(filename, encoding) as data:
            loaded = json.loads(data)

        if not self._case_sensitive:
            # Fold case on the parsed words, not on the raw text: lower-casing the text
            # leaves capitals written as \uXXXX escapes untouched, and colliding keys
            # would silently overwrite each other while the JSON is being parsed
            if isinstance(loaded, dict):
                folded: typing.Counter = Counter()
                for word, count in loaded.items():
                    folded[word.lower()] += count
                loaded = folded
            else:  # e.g. a plain JSON array of words
                loaded = [word.lower() for word in loaded]

        self._dictionary.update(loaded)
        self._update_dictionary()



[docs]
    def load_json(self, data: typing.Dict[str, int]) -> None:
        """Load in a pre-built word frequency list

        Args:
            data (dict): The dictionary to be loaded"""
        self._dictionary.update(data)
        self._update_dictionary()



[docs]
    def load_text_file(
        self,
        filename: PathOrStr,
        encoding: str = "utf-8",
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
    ) -> None:
        """Read a text file and feed its contents to `load_text` to build up
        the word frequency list

        Args:
            filename (str): The filepath to the text file to be loaded
            encoding (str): The encoding of the text file
            tokenizer (function): The function to use to tokenize a string
        """
        with load_file(filename, encoding=encoding) as contents:
            self.load_text(contents, tokenizer)



[docs]
    def load_text(
        self,
        text: KeyT,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
    ) -> None:
        """Tokenize `text` and add the resulting words to the word frequency list

        Args:
            text (str): The text to be loaded
            tokenizer (function): The function to use to tokenize a string; the tokenizer of this \
                object is used when omitted
        """
        text = ensure_unicode(text)
        if not tokenizer:
            words = self.tokenize(text)  # type: ignore[assignment]
        else:
            words = []
            for token in tokenizer(text):
                words.append(token if self._case_sensitive else token.lower())

        self._dictionary.update(words)
        self._update_dictionary()



[docs]
    def load_words(self, words: typing.Iterable[KeyT]) -> None:
        """Add each of the given words once to the word frequency list

        Args:
            words (list): The list of words to be loaded"""
        normalized = []
        for raw in words:
            token = ensure_unicode(raw)
            normalized.append(token if self._case_sensitive else token.lower())
        self._dictionary.update(normalized)
        self._update_dictionary()



[docs]
    def add(self, word: KeyT, val: int = 1) -> None:
        """Add a single word, with the given count, to the word frequency list

        Args:
            word (str): The word to add
            val (int): The number of times to insert the word"""
        key = ensure_unicode(word)
        if not self._case_sensitive:
            key = key.lower()
        self.load_json({key: val})



[docs]
    def remove_words(self, words: typing.Iterable[KeyT]) -> None:
        """Drop every word in `words` from the word frequency list and
        refresh the statistics once at the end

        Args:
            words (list): The list of words to remove"""
        # materialize first so the input is fully read before anything is removed
        targets = list(map(ensure_unicode, words))
        for target in targets:
            self.pop(target)
        self._update_dictionary()



[docs]
    def remove(self, word: KeyT) -> None:
        """Drop a single word from the word frequency list and refresh the
        statistics; a word that is not present is ignored

        Args:
            word (str): The word to remove"""
        self.pop(word)
        self._update_dictionary()



[docs]
    def remove_by_threshold(self, threshold: int = 5) -> None:
        """Remove all words at, or below, the provided threshold

        Args:
            threshold (int): The threshold at which a word is to be removed"""
        to_remove = [k for k, v in self._dictionary.items() if v <= threshold]
        self.remove_words(to_remove)


    def _update_dictionary(self) -> None:
        """Update the word frequency object"""
        self._longest_word_length = 0
        self._total_words = sum(self._dictionary.values())
        self._unique_words = len(self._dictionary.keys())
        self._letters = set()
        for key in self._dictionary:
            if len(key) > self._longest_word_length:
                self._longest_word_length = len(key)
            self._letters.update(key)