Skip to content

Commit 94ad8f6

Browse files
committed
Support non-regex based tokens for spellcheck_line
The `Spellchecker` only needs the `group()` and `start()` methods from the `re.Match`. With a bit of generics and typing protocols, we can make the `Spellchecker` work with any token type that provides those methods. The `codespell` command line tool still assumes `re.Match`, but it can get that via its own line tokenizer, so it all works out for everyone.
1 parent f939593 commit 94ad8f6

File tree

2 files changed

+66
-16
lines changed

2 files changed

+66
-16
lines changed

codespell_lib/_codespell.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -649,7 +649,7 @@ def is_text_file(filename: str) -> bool:
649649

650650
def ask_for_word_fix(
651651
line: str,
652-
issue: DetectedMisspelling,
652+
issue: "DetectedMisspelling[re.Match[str]]",
653653
interactivity: int,
654654
colors: TermColors,
655655
) -> Tuple[bool, Sequence[str]]:
@@ -658,7 +658,7 @@ def ask_for_word_fix(
658658
if interactivity <= 0:
659659
return misspelling.fix, fix_case(wrongword, misspelling.candidates)
660660

661-
match = issue.re_match
661+
match = issue.token
662662

663663
line_ui = (
664664
f"{line[:match.start()]}"
@@ -774,7 +774,7 @@ def line_tokenizer_factory(
774774
uri_regex: Pattern[str],
775775
word_regex: Pattern[str],
776776
ignore_word_regex: Optional[Pattern[str]],
777-
) -> LineTokenizer:
777+
) -> "LineTokenizer[re.Match[str]]":
778778
def line_tokenizer(line: str) -> Iterable[Match[str]]:
779779
# If all URI spelling errors will be ignored, erase any URI before
780780
# extracting words. Otherwise, apply ignores after extracting words.

codespell_lib/spellchecker.py

Lines changed: 63 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,22 @@
1818

1919
import re
2020
import os
21-
from typing import Dict, Sequence, Container, Optional, Iterable, Callable
21+
from typing import (
22+
Dict,
23+
Sequence,
24+
Container,
25+
Optional,
26+
Iterable,
27+
Protocol,
28+
Generic,
29+
TypeVar,
30+
)
2231

2332
# Pass all misspellings through this translation table to generate
2433
# alternative misspellings and fixes.
2534
alt_chars = (("'", "’"),) # noqa: RUF001
2635

27-
28-
LineTokenizer = Callable[[str], Iterable[re.Match[str]]]
36+
T_co = TypeVar("T_co", bound="Token", covariant=True)
2937

3038

3139
supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
@@ -113,20 +121,51 @@ def __init__(self) -> None:
113121
)
114122

115123

124+
class LineTokenizer(Protocol[T_co]):
125+
"""Callable that splits a line into multiple tokens to be spellchecked
126+
127+
Generally, a regex will do for simple cases. A probably too simple one is:
128+
129+
>>> tokenizer = re.compile(r"[^ ]+").finditer
130+
131+
For more complex cases, either use more complex regexes or custom tokenization
132+
code.
133+
"""
134+
135+
def __call__(self, line: str) -> Iterable[T_co]: ...
136+
137+
138+
class Token(Protocol):
139+
"""Describes a token
140+
141+
This is a protocol to support `re.Match[str]` (which codespell uses) and any
142+
other tokenization method that our API consumers might be using.
143+
"""
144+
145+
def group(self) -> str: ...
146+
147+
def start(self) -> int: ...
148+
149+
116150
class Misspelling:
117151
def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
118152
self.candidates = candidates
119153
self.fix = fix
120154
self.reason = reason
121155

122156

123-
class DetectedMisspelling:
124-
125-
def __init__(self, word: str, lword: str, misspelling: Misspelling, match: re.Match[str]) -> None:
157+
class DetectedMisspelling(Generic[T_co]):
158+
def __init__(
159+
self,
160+
word: str,
161+
lword: str,
162+
misspelling: Misspelling,
163+
token: T_co,
164+
) -> None:
126165
self.word = word
127166
self.lword = lword
128167
self.misspelling = misspelling
129-
self.re_match = match
168+
self.token = token
130169

131170

132171
class Spellchecker:
@@ -170,14 +209,25 @@ def __init__(
170209
def spellcheck_line(
171210
self,
172211
line: str,
173-
tokenizer: Callable[[str], Iterable[re.Match[str]]],
212+
tokenizer: LineTokenizer[T_co],
174213
*,
175214
extra_words_to_ignore: Container[str] = frozenset()
176-
) -> Iterable[DetectedMisspelling]:
215+
) -> Iterable[DetectedMisspelling[T_co]]:
216+
"""Tokenize and spellcheck a line
217+
218+
Split the line into tokens using the provided tokenizer. See the doc
219+
string for the class for an example.
220+
221+
:param line: The line to spellcheck.
222+
:param tokenizer: A callable that will tokenize the line
223+
:param extra_words_to_ignore: Extra words to ignore for this particular line
224+
(such as content from a `codespell:ignore` comment)
225+
"""
177226
misspellings = self._misspellings
178227
ignore_words_cased = self.ignore_words_cased
179-
for match in tokenizer(line):
180-
word = match.group()
228+
229+
for token in tokenizer(line):
230+
word = token.group()
181231
if word in ignore_words_cased:
182232
continue
183233
lword = word.lower()
@@ -186,7 +236,7 @@ def spellcheck_line(
186236
# Sometimes we find a 'misspelling' which is actually a valid word
187237
# preceded by a string escape sequence. Ignore such cases as
188238
# they're usually false alarms; see issue #17 among others.
189-
char_before_idx = match.start() - 1
239+
char_before_idx = token.start() - 1
190240
if (
191241
char_before_idx >= 0
192242
and line[char_before_idx] == "\\"
@@ -195,7 +245,7 @@ def spellcheck_line(
195245
and lword[1:] not in misspellings
196246
):
197247
continue
198-
yield DetectedMisspelling(word, lword, misspelling, match)
248+
yield DetectedMisspelling(word, lword, misspelling, token)
199249

200250
def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
201251
"""Check a given word against the loaded dictionaries

0 commit comments

Comments
 (0)