Make Spellchecker() load builtin dictionaries by default

nthykier · nthykier · commit f93959324537 · 2024-05-17T20:16:53.000Z
This makes the API easier to use out of the box.
diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
@@ -16,4 +16,4 @@ jobs:
         with:
           check_filenames: true
           # When using this Action in other repos, the --skip option below can be removed
-          skip: "./.git,./codespell_lib/data,./example/code.c,test_basic.py,*.pyc,README.rst,pyproject-codespell.precommit-toml"
+          skip: "./.git,./codespell_lib/data,./example/code.c,spellchecker.py,test_basic.py,*.pyc,README.rst,pyproject-codespell.precommit-toml"
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
@@ -39,12 +39,19 @@
     Tuple,
 )
 
+from ._text_util import fix_case
+
 # autogenerated by setuptools_scm
 from ._version import (  # type: ignore[import-not-found]
     __version__ as VERSION,  # noqa: N812
 )
-from .spellchecker import Spellchecker, LineTokenizer, DetectedMisspelling
-from ._text_util import fix_case
+from .spellchecker import (
+    DetectedMisspelling,
+    LineTokenizer,
+    Spellchecker,
+    _builtin_default,
+    _builtin_dictionaries,
+)
 
 word_regex_def = r"[\w\-'’]+"  # noqa: RUF001
 # While we want to treat characters like ( or " as okay for a starting break,
@@ -59,75 +66,6 @@
 \t%prog [OPTIONS] [file1 file2 ... fileN]
 """
 
-supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
-supported_languages = supported_languages_en
-
-# Users might want to link this file into /usr/local/bin, so we resolve the
-# symbolic link path to the real path if necessary.
-_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
-_builtin_dictionaries = (
-    # name, desc, name, err in aspell, correction in aspell, \
-    # err dictionary array, rep dictionary array
-    # The arrays must contain the names of aspell dictionaries
-    # The aspell tests here aren't the ideal state, but the None's are
-    # realistic for obscure words
-    ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
-    (
-        "rare",
-        "for rare (but valid) words that are likely to be errors",
-        "_rare",
-        None,
-        None,
-        None,
-        None,
-    ),
-    (
-        "informal",
-        "for making informal words more formal",
-        "_informal",
-        True,
-        True,
-        supported_languages_en,
-        supported_languages_en,
-    ),
-    (
-        "usage",
-        "for replacing phrasing with recommended terms",
-        "_usage",
-        None,
-        None,
-        None,
-        None,
-    ),
-    (
-        "code",
-        "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)",  # noqa: E501
-        "_code",
-        None,
-        None,
-        None,
-        None,
-    ),
-    (
-        "names",
-        "for valid proper names that might be typos",
-        "_names",
-        None,
-        None,
-        None,
-        None,
-    ),
-    (
-        "en-GB_to_en-US",
-        "for corrections from en-GB to en-US",
-        "_en-GB_to_en-US",
-        True,
-        True,
-        ("en_GB",),
-        ("en_US",),
-    ),
-)
-_builtin_default = "clear,rare"
 
 # docs say os.EX_USAGE et al. are only available on Unix systems, so to be safe
 # we protect and just use the values they are on macOS and Linux
@@ -1145,37 +1083,45 @@ def main(*args: str) -> int:
     dictionaries = flatten_clean_comma_separated_arguments(options.dictionary or ["-"])
 
     use_dictionaries = []
+    builtin_dictionaries: List[str] = []
     for dictionary in dictionaries:
         if dictionary == "-":
-            # figure out which builtin dictionaries to use
-            use = sorted(set(options.builtin.split(",")))
-            for u in use:
+            # validate and clean up the builtin dictionary names to use
+            builtin_dictionaries = sorted(set(options.builtin.split(",")))
+            for name in builtin_dictionaries:
                 for builtin in _builtin_dictionaries:
-                    if builtin[0] == u:
-                        use_dictionaries.append(
-                            os.path.join(_data_root, f"dictionary{builtin[2]}.txt")
-                        )
+                    if builtin[0] == name:
+                        # Valid
                         break
                 else:
                     print(
-                        f"ERROR: Unknown builtin dictionary: {u}",
+                        f"ERROR: Unknown builtin dictionary: {name}",
                         file=sys.stderr,
                     )
                     parser.print_help()
                     return EX_USAGE
-        else:
-            if not os.path.isfile(dictionary):
-                print(
-                    f"ERROR: cannot find dictionary file: {dictionary}",
-                    file=sys.stderr,
-                )
-                parser.print_help()
-                return EX_USAGE
-            use_dictionaries.append(dictionary)
-    spellchecker = Spellchecker()
+        elif not os.path.isfile(dictionary):
+            print(
+                f"ERROR: cannot find dictionary file: {dictionary}",
+                file=sys.stderr,
+            )
+            parser.print_help()
+            return EX_USAGE
+        use_dictionaries.append(dictionary)
+    # Due to the command line options, we need to manually load builtin dictionaries.
+    spellchecker = Spellchecker(builtin_dictionaries=())
     spellchecker.ignore_words_cased = ignore_words_cased
     for dictionary in use_dictionaries:
-        spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
+        if dictionary == "-":
+            spellchecker.load_builtin_dictionaries(
+                builtin_dictionaries,
+                ignore_words=ignore_words,
+            )
+        else:
+            spellchecker.load_dictionary_from_file(
+                dictionary,
+                ignore_words=ignore_words,
+            )
     colors = TermColors()
     if not options.colors:
         colors.disable()
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
@@ -15,7 +15,9 @@
 Copyright (C) 2010-2011  Lucas De Marchi <lucas.de.marchi@gmail.com>
 Copyright (C) 2011  ProFUSION embedded systems
 """
+
 import re
+import os
 from typing import Dict, Sequence, Container, Optional, Iterable, Callable
 
 # Pass all misspellings through this translation table to generate
@@ -26,6 +28,91 @@
 LineTokenizer = Callable[[str], Iterable[re.Match[str]]]
 
 
+supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
+supported_languages = supported_languages_en
+
+# Users might want to link this file into /usr/local/bin, so we resolve the
+# symbolic link path to the real path if necessary.
+_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
+_builtin_dictionaries = (
+    # name, desc, file name suffix, err in aspell, correction in aspell, \
+    # err dictionary array, rep dictionary array
+    # The arrays must contain the names of aspell dictionaries
+    # The aspell tests here aren't the ideal state, but the None's are
+    # realistic for obscure words
+    ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
+    (
+        "rare",
+        "for rare (but valid) words that are likely to be errors",
+        "_rare",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "informal",
+        "for making informal words more formal",
+        "_informal",
+        True,
+        True,
+        supported_languages_en,
+        supported_languages_en,
+    ),
+    (
+        "usage",
+        "for replacing phrasing with recommended terms",
+        "_usage",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "code",
+        "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)",  # noqa: E501
+        "_code",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "names",
+        "for valid proper names that might be typos",
+        "_names",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "en-GB_to_en-US",
+        "for corrections from en-GB to en-US",
+        "_en-GB_to_en-US",
+        True,
+        True,
+        ("en_GB",),
+        ("en_US",),
+    ),
+)
+_builtin_default = "clear,rare"
+
+_builtin_default_as_tuple = tuple(_builtin_default.split(","))
+
+
+class UnknownBuiltinDictionaryError(ValueError):
+    def __init__(self, name: str) -> None:
+        super().__init__(f"Unknown built-in dictionary: {name}")
+
+
+class BuiltinDictionariesAlreadyLoadedError(TypeError):
+    def __init__(self) -> None:
+        super().__init__(
+            "load_builtin_dictionaries must not be called more than once",
+        )
+
+
 class Misspelling:
     def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
         self.candidates = candidates
@@ -43,9 +130,42 @@ def __init__(self, word: str, lword: str, misspelling: Misspelling, match: re.Ma
 
 
 class Spellchecker:
-    def __init__(self) -> None:
+    """The spellchecking dictionaries of codespell
+
+    The Spellchecker is responsible for spellchecking words or lines. It maintains state
+    for known typos, their corrections and known ignored words.
+
+        >>> import re
+        >>> s = Spellchecker()
+        >>> # Very simple tokenizer
+        >>> tokenizer = re.compile(r"[^ ]+").finditer
+        >>> line = "A touple tpyo but also correct words appear" # codespell:ignore
+        >>> issues = list(s.spellcheck_line(line, tokenizer))
+        >>> assert len(issues) == 2
+        >>> issues[0].word
+        'touple'
+        >>> list(issues[0].misspelling.candidates)
+        ['tuple', 'couple', 'topple', 'toupee']
+        >>> issues[0].misspelling.fix
+        False
+        >>> issues[1].word
+        'tpyo'
+        >>> list(issues[1].misspelling.candidates)
+        ['typo']
+        >>> issues[1].misspelling.fix
+        True
+    """
+
+    def __init__(
+        self,
+        *,
+        builtin_dictionaries: Optional[Sequence[str]] = _builtin_default_as_tuple,
+    ) -> None:
         self._misspellings: Dict[str, Misspelling] = {}
+        self._builtin_loaded = False
         self.ignore_words_cased: Container[str] = frozenset()
+        if builtin_dictionaries:
+            self.load_builtin_dictionaries(builtin_dictionaries)
 
     def spellcheck_line(
         self,
@@ -84,9 +204,68 @@ def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
         """
         return self._misspellings.get(word)
 
-    def add_from_file(self, filename: str, *, ignore_words: Container[str] = frozenset()) -> None:
+    def load_builtin_dictionaries(
+        self,
+        builtin_dictionaries: Iterable[str] = _builtin_default_as_tuple,
+        *,
+        ignore_words: Container[str] = frozenset(),
+    ) -> None:
+        """Load codespell builtin dictionaries (for manual dictionary load order)
+
+        This method enables you to load builtin dictionaries in a special order relative
+        to custom dictionaries. To use this method, you must ensure that the constructor
+        did *not* load any builtin dictionaries.
+
+           >>> s = Spellchecker(builtin_dictionaries=None)
+           >>> # A couple of s.load_dictionary_from_file(...) lines here
+           >>> s.load_builtin_dictionaries(["clear"])
+
+        This method updates the spellchecker to include the corrections listed
+        in the named builtin dictionaries. Load order is important. When multiple
+        corrections are loaded for the same typo, then the last loaded
+        corrections for that typo will be used.
+
+        :param builtin_dictionaries: Names of the codespell dictionaries to load
+        :param ignore_words: Words to ignore from this dictionary.
+        """
+        if self._builtin_loaded:
+            # It would work, but if you are doing manual load order, then you
+            # will probably want to be sure that you get it correct.
+            raise BuiltinDictionariesAlreadyLoadedError()
+        for name in sorted(set(builtin_dictionaries)):
+            self._load_builtin_dictionary(name, ignore_words=ignore_words)
+        self._builtin_loaded = True
+
+    def _load_builtin_dictionary(
+        self,
+        name: str,
+        *,
+        ignore_words: Container[str] = frozenset(),
+    ) -> None:
+        for builtin in _builtin_dictionaries:
+            if builtin[0] == name:
+                filename = os.path.join(_data_root, f"dictionary{builtin[2]}.txt")
+                self.load_dictionary_from_file(filename, ignore_words=ignore_words)
+                return
+        raise UnknownBuiltinDictionaryError(name)
+
+    def load_dictionary_from_file(
+        self,
+        filename: str,
+        *,
+        ignore_words: Container[str] = frozenset(),
+    ) -> None:
         """Parse a codespell dictionary
 
+        This is primarily useful for loading custom dictionaries not provided by
+        codespell. This is the API version of the `-D` / `--dictionary` command
+        line option, except that it only accepts files (and not the special `-`).
+
+        This method updates the spellchecker to include the corrections listed in
+        the file. Load order is important. When multiple corrections are loaded
+        for the same typo, then the last loaded corrections for that typo will
+        be used.
+
         :param filename: The codespell dictionary file to parse
         :param ignore_words: Words to ignore from this dictionary.
         """
diff --git a/codespell_lib/tests/test_dictionary.py b/codespell_lib/tests/test_dictionary.py
@@ -7,10 +7,10 @@
 
 import pytest
 
-from codespell_lib._codespell import (
+from codespell_lib._codespell import word_regex_def
+from codespell_lib.spellchecker import (
     _builtin_dictionaries,
     supported_languages,
-    word_regex_def,
 )
 
 spellers = {}
diff --git a/pyproject-codespell.precommit-toml b/pyproject-codespell.precommit-toml