Skip to content

Commit cdde333

Browse files
committed
Support non-regex based tokens for spellcheck_line
The `Spellchecker` only needs the `group()` and `start()` methods from the `re.Match`. With a bit of generics and typing protocols, we can make the `Spellchecker` work with any token type that provides those methods. The `codespell` command line tool still assumes `re.Match`, but it can get that via its own line tokenizer, so it all works out for everyone.
1 parent 9d9425b commit cdde333

File tree

2 files changed

+145
-16
lines changed

2 files changed

+145
-16
lines changed

codespell_lib/_codespell.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -716,7 +716,7 @@ def is_text_file(filename: str) -> bool:
716716

717717
def ask_for_word_fix(
718718
line: str,
719-
issue: DetectedMisspelling,
719+
issue: "DetectedMisspelling[re.Match[str]]",
720720
interactivity: int,
721721
colors: TermColors,
722722
) -> Tuple[bool, Sequence[str]]:
@@ -725,7 +725,7 @@ def ask_for_word_fix(
725725
if interactivity <= 0:
726726
return misspelling.fix, fix_case(wrongword, misspelling.candidates)
727727

728-
match = issue.re_match
728+
match = issue.token
729729

730730
line_ui = (
731731
f"{line[:match.start()]}"
@@ -841,7 +841,7 @@ def line_tokenizer_factory(
841841
uri_regex: Pattern[str],
842842
word_regex: Pattern[str],
843843
ignore_word_regex: Optional[Pattern[str]],
844-
) -> LineTokenizer:
844+
) -> "LineTokenizer[re.Match[str]]":
845845
def line_tokenizer(line: str) -> Iterable[Match[str]]:
846846
# If all URI spelling errors will be ignored, erase any URI before
847847
# extracting words. Otherwise, apply ignores after extracting words.

codespell_lib/spellchecker.py

Lines changed: 142 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,135 @@
1616
Copyright (C) 2011 ProFUSION embedded systems
1717
"""
1818

19+
import os
20+
import re
1921
from typing import (
20-
Callable,
2122
Container,
2223
Dict,
24+
Generic,
2325
Iterable,
24-
Match,
2526
Optional,
27+
Protocol,
2628
Sequence,
29+
TypeVar,
2730
)
2831

2932
# Pass all misspellings through this translation table to generate
3033
# alternative misspellings and fixes.
3134
alt_chars = (("'", "’"),) # noqa: RUF001
3235

36+
T_co = TypeVar("T_co", bound="Token", covariant=True)
3337

34-
LineTokenizer = Callable[[str], Iterable[Match[str]]]
38+
39+
supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
40+
supported_languages = supported_languages_en
41+
42+
# Users might want to link this file into /usr/local/bin, so we resolve the
43+
# symbolic link path to the real path if necessary.
44+
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
45+
_builtin_dictionaries = (
46+
# name, desc, name, err in aspell, correction in aspell, \
47+
# err dictionary array, rep dictionary array
48+
# The arrays must contain the names of aspell dictionaries
49+
# The aspell tests here aren't the ideal state, but the None's are
50+
# realistic for obscure words
51+
("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
52+
(
53+
"rare",
54+
"for rare (but valid) words that are likely to be errors",
55+
"_rare",
56+
None,
57+
None,
58+
None,
59+
None,
60+
),
61+
(
62+
"informal",
63+
"for making informal words more formal",
64+
"_informal",
65+
True,
66+
True,
67+
supported_languages_en,
68+
supported_languages_en,
69+
),
70+
(
71+
"usage",
72+
"for replacing phrasing with recommended terms",
73+
"_usage",
74+
None,
75+
None,
76+
None,
77+
None,
78+
),
79+
(
80+
"code",
81+
"for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)", # noqa: E501
82+
"_code",
83+
None,
84+
None,
85+
None,
86+
None,
87+
),
88+
(
89+
"names",
90+
"for valid proper names that might be typos",
91+
"_names",
92+
None,
93+
None,
94+
None,
95+
None,
96+
),
97+
(
98+
"en-GB_to_en-US",
99+
"for corrections from en-GB to en-US",
100+
"_en-GB_to_en-US",
101+
True,
102+
True,
103+
("en_GB",),
104+
("en_US",),
105+
),
106+
)
107+
_builtin_default = "clear,rare"
108+
109+
_builtin_default_as_tuple = tuple(_builtin_default.split(","))
110+
111+
112+
class UnknownBuiltinDictionaryError(ValueError):
113+
def __init__(self, name: str) -> None:
114+
super().__init__(f"Unknown built-in dictionary: {name}")
115+
116+
117+
class BuiltinDictionariesAlreadyLoadedError(TypeError):
118+
def __init__(self) -> None:
119+
super().__init__(
120+
"load_builtin_dictionaries must not be called more than once",
121+
)
122+
123+
124+
class LineTokenizer(Protocol[T_co]):
125+
    """Callable that splits a line into multiple tokens to be spellchecked.
126+
127+
Generally, a regex will do for simple cases. A probably too simple one is:
128+
129+
>>> tokenizer = re.compile(r"[^ ]+").finditer
130+
131+
For more complex cases, either use more complex regexes or custom tokenization
132+
code.
133+
"""
134+
135+
def __call__(self, line: str) -> Iterable[T_co]: ...
136+
137+
138+
class Token(Protocol):
139+
"""Describes a token
140+
141+
This is a protocol to support `re.Match[str]` (which codespell uses) and any
142+
other tokenization method that our API consumers might be using.
143+
"""
144+
145+
def group(self) -> str: ...
146+
147+
def start(self) -> int: ...
35148

36149

37150
class Misspelling:
@@ -41,13 +154,18 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
41154
self.reason = reason
42155

43156

44-
class DetectedMisspelling:
45-
46-
def __init__(self, word: str, lword: str, misspelling: Misspelling, match: Match[str]) -> None:
157+
class DetectedMisspelling(Generic[T_co]):
158+
def __init__(
159+
self,
160+
word: str,
161+
lword: str,
162+
misspelling: Misspelling,
163+
token: T_co,
164+
) -> None:
47165
self.word = word
48166
self.lword = lword
49167
self.misspelling = misspelling
50-
self.re_match = match
168+
self.token = token
51169

52170

53171
class Spellchecker:
@@ -58,14 +176,25 @@ def __init__(self) -> None:
58176
def spellcheck_line(
59177
self,
60178
line: str,
61-
tokenizer: Callable[[str], Iterable[re.Match[str]]],
179+
tokenizer: LineTokenizer[T_co],
62180
*,
63181
extra_words_to_ignore: Container[str] = frozenset()
64-
) -> Iterable[DetectedMisspelling]:
182+
) -> Iterable[DetectedMisspelling[T_co]]:
183+
"""Tokenize and spellcheck a line
184+
185+
        Split the line into tokens using the provided tokenizer. See the doc
186+
string for the class for an example.
187+
188+
:param line: The line to spellcheck.
189+
:param tokenizer: A callable that will tokenize the line
190+
:param extra_words_to_ignore: Extra words to ignore for this particular line
191+
(such as content from a `codespell:ignore` comment)
192+
"""
65193
misspellings = self._misspellings
66194
ignore_words_cased = self.ignore_words_cased
67-
for match in tokenizer(line):
68-
word = match.group()
195+
196+
for token in tokenizer(line):
197+
word = token.group()
69198
if word in ignore_words_cased:
70199
continue
71200
lword = word.lower()
@@ -74,7 +203,7 @@ def spellcheck_line(
74203
# Sometimes we find a 'misspelling' which is actually a valid word
75204
# preceded by a string escape sequence. Ignore such cases as
76205
# they're usually false alarms; see issue #17 among others.
77-
char_before_idx = match.start() - 1
206+
char_before_idx = token.start() - 1
78207
if (
79208
char_before_idx >= 0
80209
and line[char_before_idx] == "\\"
@@ -83,7 +212,7 @@ def spellcheck_line(
83212
and lword[1:] not in misspellings
84213
):
85214
continue
86-
yield DetectedMisspelling(word, lword, misspelling, match)
215+
yield DetectedMisspelling(word, lword, misspelling, token)
87216

88217
def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
89218
"""Check a given word against the loaded dictionaries

0 commit comments

Comments
 (0)