Skip to content

Commit f939593

Browse files
committed
Make Spellchecker() load builtin dictionaries by default
This makes the API easier to use out of the box.
1 parent 0d86cc6 commit f939593

File tree

5 files changed

+221
-96
lines changed

5 files changed

+221
-96
lines changed

.github/workflows/codespell.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ jobs:
1616
with:
1717
check_filenames: true
1818
# When using this Action in other repos, the --skip option below can be removed
19-
skip: "./.git,./codespell_lib/data,./example/code.c,test_basic.py,*.pyc,README.rst,pyproject-codespell.precommit-toml"
19+
skip: "./.git,./codespell_lib/data,./example/code.c,spellchecker.py,test_basic.py,*.pyc,README.rst,pyproject-codespell.precommit-toml"

codespell_lib/_codespell.py

Lines changed: 36 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,19 @@
3939
Tuple,
4040
)
4141

42+
from ._text_util import fix_case
43+
4244
# autogenerated by setuptools_scm
4345
from ._version import ( # type: ignore[import-not-found]
4446
__version__ as VERSION, # noqa: N812
4547
)
46-
from .spellchecker import Spellchecker, LineTokenizer, DetectedMisspelling
47-
from ._text_util import fix_case
48+
from .spellchecker import (
49+
DetectedMisspelling,
50+
LineTokenizer,
51+
Spellchecker,
52+
_builtin_default,
53+
_builtin_dictionaries,
54+
)
4855

4956
word_regex_def = r"[\w\-'’]+" # noqa: RUF001
5057
# While we want to treat characters like ( or " as okay for a starting break,
@@ -59,75 +66,6 @@
5966
\t%prog [OPTIONS] [file1 file2 ... fileN]
6067
"""
6168

62-
supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
63-
supported_languages = supported_languages_en
64-
65-
# Users might want to link this file into /usr/local/bin, so we resolve the
66-
# symbolic link path to the real path if necessary.
67-
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
68-
_builtin_dictionaries = (
69-
# name, desc, name, err in aspell, correction in aspell, \
70-
# err dictionary array, rep dictionary array
71-
# The arrays must contain the names of aspell dictionaries
72-
# The aspell tests here aren't the ideal state, but the None's are
73-
# realistic for obscure words
74-
("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
75-
(
76-
"rare",
77-
"for rare (but valid) words that are likely to be errors",
78-
"_rare",
79-
None,
80-
None,
81-
None,
82-
None,
83-
),
84-
(
85-
"informal",
86-
"for making informal words more formal",
87-
"_informal",
88-
True,
89-
True,
90-
supported_languages_en,
91-
supported_languages_en,
92-
),
93-
(
94-
"usage",
95-
"for replacing phrasing with recommended terms",
96-
"_usage",
97-
None,
98-
None,
99-
None,
100-
None,
101-
),
102-
(
103-
"code",
104-
"for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)", # noqa: E501
105-
"_code",
106-
None,
107-
None,
108-
None,
109-
None,
110-
),
111-
(
112-
"names",
113-
"for valid proper names that might be typos",
114-
"_names",
115-
None,
116-
None,
117-
None,
118-
None,
119-
),
120-
(
121-
"en-GB_to_en-US",
122-
"for corrections from en-GB to en-US",
123-
"_en-GB_to_en-US",
124-
True,
125-
True,
126-
("en_GB",),
127-
("en_US",),
128-
),
129-
)
130-
_builtin_default = "clear,rare"
13169

13270
# docs say os.EX_USAGE et al. are only available on Unix systems, so to be safe
13371
# we protect and just use the values they are on macOS and Linux
@@ -1145,37 +1083,45 @@ def main(*args: str) -> int:
11451083
dictionaries = flatten_clean_comma_separated_arguments(options.dictionary or ["-"])
11461084

11471085
use_dictionaries = []
1086+
builtin_dictionaries: List[str] = []
11481087
for dictionary in dictionaries:
11491088
if dictionary == "-":
1150-
# figure out which builtin dictionaries to use
1151-
use = sorted(set(options.builtin.split(",")))
1152-
for u in use:
1089+
# validate and clean up the builtin dictionary names to use
1090+
builtin_dictionaries = sorted(set(options.builtin.split(",")))
1091+
for name in builtin_dictionaries:
11531092
for builtin in _builtin_dictionaries:
1154-
if builtin[0] == u:
1155-
use_dictionaries.append(
1156-
os.path.join(_data_root, f"dictionary{builtin[2]}.txt")
1157-
)
1093+
if builtin[0] == name:
1094+
# Valid
11581095
break
11591096
else:
11601097
print(
1161-
f"ERROR: Unknown builtin dictionary: {u}",
1098+
f"ERROR: Unknown builtin dictionary: {name}",
11621099
file=sys.stderr,
11631100
)
11641101
parser.print_help()
11651102
return EX_USAGE
1166-
else:
1167-
if not os.path.isfile(dictionary):
1168-
print(
1169-
f"ERROR: cannot find dictionary file: {dictionary}",
1170-
file=sys.stderr,
1171-
)
1172-
parser.print_help()
1173-
return EX_USAGE
1174-
use_dictionaries.append(dictionary)
1175-
spellchecker = Spellchecker()
1103+
elif not os.path.isfile(dictionary):
1104+
print(
1105+
f"ERROR: cannot find dictionary file: {dictionary}",
1106+
file=sys.stderr,
1107+
)
1108+
parser.print_help()
1109+
return EX_USAGE
1110+
use_dictionaries.append(dictionary)
1111+
# Due to the command line options, we need to manually load builtin dictionaries.
1112+
spellchecker = Spellchecker(builtin_dictionaries=())
11761113
spellchecker.ignore_words_cased = ignore_words_cased
11771114
for dictionary in use_dictionaries:
1178-
spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
1115+
if dictionary == "-":
1116+
spellchecker.load_builtin_dictionaries(
1117+
builtin_dictionaries,
1118+
ignore_words=ignore_words,
1119+
)
1120+
else:
1121+
spellchecker.load_dictionary_from_file(
1122+
dictionary,
1123+
ignore_words=ignore_words,
1124+
)
11791125
colors = TermColors()
11801126
if not options.colors:
11811127
colors.disable()

codespell_lib/spellchecker.py

Lines changed: 181 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
Copyright (C) 2010-2011 Lucas De Marchi <[email protected]>
1616
Copyright (C) 2011 ProFUSION embedded systems
1717
"""
18+
1819
import re
20+
import os
1921
from typing import Dict, Sequence, Container, Optional, Iterable, Callable
2022

2123
# Pass all misspellings through this translation table to generate
@@ -26,6 +28,91 @@
2628
LineTokenizer = Callable[[str], Iterable[re.Match[str]]]
2729

2830

31+
supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
32+
supported_languages = supported_languages_en
33+
34+
# Users might want to link this file into /usr/local/bin, so we resolve the
35+
# symbolic link path to the real path if necessary.
36+
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
37+
_builtin_dictionaries = (
38+
# name, desc, file suffix, err in aspell, correction in aspell, \
39+
# err dictionary array, rep dictionary array
40+
# The arrays must contain the names of aspell dictionaries
41+
# The aspell tests here aren't the ideal state, but the None's are
42+
# realistic for obscure words
43+
("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
44+
(
45+
"rare",
46+
"for rare (but valid) words that are likely to be errors",
47+
"_rare",
48+
None,
49+
None,
50+
None,
51+
None,
52+
),
53+
(
54+
"informal",
55+
"for making informal words more formal",
56+
"_informal",
57+
True,
58+
True,
59+
supported_languages_en,
60+
supported_languages_en,
61+
),
62+
(
63+
"usage",
64+
"for replacing phrasing with recommended terms",
65+
"_usage",
66+
None,
67+
None,
68+
None,
69+
None,
70+
),
71+
(
72+
"code",
73+
"for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)", # noqa: E501
74+
"_code",
75+
None,
76+
None,
77+
None,
78+
None,
79+
),
80+
(
81+
"names",
82+
"for valid proper names that might be typos",
83+
"_names",
84+
None,
85+
None,
86+
None,
87+
None,
88+
),
89+
(
90+
"en-GB_to_en-US",
91+
"for corrections from en-GB to en-US",
92+
"_en-GB_to_en-US",
93+
True,
94+
True,
95+
("en_GB",),
96+
("en_US",),
97+
),
98+
)
99+
_builtin_default = "clear,rare"
100+
101+
_builtin_default_as_tuple = tuple(_builtin_default.split(","))
102+
103+
104+
class UnknownBuiltinDictionaryError(ValueError):
105+
def __init__(self, name: str) -> None:
106+
super().__init__(f"Unknown built-in dictionary: {name}")
107+
108+
109+
class BuiltinDictionariesAlreadyLoadedError(TypeError):
110+
def __init__(self) -> None:
111+
super().__init__(
112+
"load_builtin_dictionaries must not be called more than once",
113+
)
114+
115+
29116
class Misspelling:
30117
def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
31118
self.candidates = candidates
@@ -43,9 +130,42 @@ def __init__(self, word: str, lword: str, misspelling: Misspelling, match: re.Ma
43130

44131

45132
class Spellchecker:
46-
def __init__(self) -> None:
133+
"""The spellchecking dictionaries of codespell
134+
135+
The Spellchecker is responsible for spellchecking words or lines. It maintains state
136+
for known typos, their corrections and known ignored words.
137+
138+
>>> import re
139+
>>> s = Spellchecker()
140+
>>> # Very simple tokenizer
141+
>>> tokenizer = re.compile(r"[^ ]+").finditer
142+
>>> line = "A touple tpyo but also correct words appear" # codespell:ignore
143+
>>> issues = list(s.spellcheck_line(line, tokenizer))
144+
>>> assert len(issues) == 2
145+
>>> issues[0].word
146+
'touple'
147+
>>> list(issues[0].misspelling.candidates)
148+
['tuple', 'couple', 'topple', 'toupee']
149+
>>> issues[0].misspelling.fix
150+
False
151+
>>> issues[1].word
152+
'tpyo'
153+
>>> list(issues[1].misspelling.candidates)
154+
['typo']
155+
>>> issues[1].misspelling.fix
156+
True
157+
"""
158+
159+
def __init__(
160+
self,
161+
*,
162+
builtin_dictionaries: Optional[Sequence[str]] = _builtin_default_as_tuple,
163+
) -> None:
47164
self._misspellings: Dict[str, Misspelling] = {}
165+
self._builtin_loaded = False
48166
self.ignore_words_cased: Container[str] = frozenset()
167+
if builtin_dictionaries:
168+
self.load_builtin_dictionaries(builtin_dictionaries)
49169

50170
def spellcheck_line(
51171
self,
@@ -84,9 +204,68 @@ def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
84204
"""
85205
return self._misspellings.get(word)
86206

87-
def add_from_file(self, filename: str, *, ignore_words: Container[str] = frozenset()) -> None:
207+
def load_builtin_dictionaries(
208+
self,
209+
builtin_dictionaries: Iterable[str] = _builtin_default_as_tuple,
210+
*,
211+
ignore_words: Container[str] = frozenset(),
212+
) -> None:
213+
"""Load codespell builtin dictionaries (for manual dictionary load order)
214+
215+
This method enables you to load builtin dictionaries in a special order relative
216+
to custom dictionaries. To use this method, you must ensure that the constructor
217+
did *not* load any builtin dictionaries.
218+
219+
>>> s = Spellchecker(builtin_dictionaries=None)
220+
>>> # A couple of s.load_dictionary_from_file(...) lines here
221+
>>> s.load_builtin_dictionaries(["clear"])
222+
223+
This method updates the spellchecker to include any corrections listed
224+
in the file. Load order is important. When multiple corrections are
225+
loaded for the same typo, then the last loaded corrections for that
226+
typo will be used.
227+
228+
:param builtin_dictionaries: Names of the codespell dictionaries to load
229+
:param ignore_words: Words to ignore from this dictionary.
230+
"""
231+
if self._builtin_loaded:
232+
# It would work, but if you are doing manual load order, then probably
233+
# you will want to be sure that you get it correct.
234+
raise BuiltinDictionariesAlreadyLoadedError()
235+
for name in sorted(set(builtin_dictionaries)):
236+
self._load_builtin_dictionary(name, ignore_words=ignore_words)
237+
self._builtin_loaded = True
238+
239+
def _load_builtin_dictionary(
240+
self,
241+
name: str,
242+
*,
243+
ignore_words: Container[str] = frozenset(),
244+
) -> None:
245+
for builtin in _builtin_dictionaries:
246+
if builtin[0] == name:
247+
filename = os.path.join(_data_root, f"dictionary{builtin[2]}.txt")
248+
self.load_dictionary_from_file(filename, ignore_words=ignore_words)
249+
return
250+
raise UnknownBuiltinDictionaryError(name)
251+
252+
def load_dictionary_from_file(
253+
self,
254+
filename: str,
255+
*,
256+
ignore_words: Container[str] = frozenset(),
257+
) -> None:
88258
"""Parse a codespell dictionary
89259
260+
This is primarily useful for loading custom dictionaries not provided by
261+
codespell. This is the API version of the `-D` / `--dictionary` command
262+
line option except it only accepts files (and not the special `-`).
263+
264+
This method updates the spellchecker to include any corrections listed in
265+
the file. Load order is important. When multiple corrections are loaded
266+
for the same typo, then the last loaded corrections for that typo will
267+
be used.
268+
90269
:param filename: The codespell dictionary file to parse
91270
:param ignore_words: Words to ignore from this dictionary.
92271
"""

codespell_lib/tests/test_dictionary.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77

88
import pytest
99

10-
from codespell_lib._codespell import (
10+
from codespell_lib._codespell import word_regex_def
11+
from codespell_lib.spellchecker import (
1112
_builtin_dictionaries,
1213
supported_languages,
13-
word_regex_def,
1414
)
1515

1616
spellers = {}

0 commit comments

Comments
 (0)