1818
import re
import os
from typing import (
    Dict,
    Sequence,
    Container,
    Optional,
    Iterable,
    Protocol,
    Generic,
    TypeVar,
    Callable,
)
2231
# Pass all misspellings through this translation table to generate
# alternative misspellings and fixes.
alt_chars = (("'", "’"),)  # noqa: RUF001

# Type variable for the token type produced by a tokenizer. It is bound to
# the ``Token`` protocol (forward reference, resolved by type checkers) and
# covariant so that a tokenizer yielding a more specific token type is
# accepted wherever one yielding ``Token`` is expected.
T_co = TypeVar("T_co", bound="Token", covariant=True)


supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
@@ -113,20 +121,51 @@ def __init__(self) -> None:
113121 )
114122
115123
class LineTokenizer(Protocol[T_co]):
    """Callable that splits a line into multiple tokens to be spellchecked.

    Generally, a regex will do for simple cases. A probably too simple one is:

        >>> tokenizer = re.compile(r"[^ ]+").finditer

    For more complex cases, either use more complex regexes or custom
    tokenization code.
    """

    def __call__(self, line: str) -> Iterable[T_co]:
        """Split ``line`` into tokens.

        :param line: The line to tokenize.
        :returns: An iterable of the tokens found in the line.
        """
        ...
136+
137+
class Token(Protocol):
    """Describes a token.

    This is a protocol to support `re.Match[str]` (which codespell uses) and
    any other tokenization method that our API consumers might be using.
    """

    def group(self) -> str:
        """Return the text of the token (the word to be spellchecked)."""
        ...

    def start(self) -> int:
        """Return the index in the line at which the token starts."""
        ...
148+
149+
class Misspelling:
    """Plain record describing one known misspelling.

    The constructor only stores its arguments as attributes of the same
    name; no validation or copying is performed.

    :param candidates: Stored as ``candidates`` (presumably the suggested
        corrections -- confirm against the dictionary loader).
    :param fix: Stored as ``fix`` (presumably whether the misspelling may be
        fixed automatically -- confirm against the callers).
    :param reason: Stored as ``reason`` (presumably an explanatory note
        attached to the entry -- confirm against the dictionary loader).
    """

    def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
        self.candidates = candidates
        self.fix = fix
        self.reason = reason
121155
122156
class DetectedMisspelling(Generic[T_co]):
    """A misspelling detected in a line, together with its originating token.

    The class is generic over the token type so that callers get back the
    same token type their tokenizer produced.

    :param word: The text of the token as it appeared in the line.
    :param lword: The lower-cased form of ``word``.
    :param misspelling: The :class:`Misspelling` entry associated with the
        word.
    :param token: The token produced by the tokenizer for this word.
    """

    def __init__(
        self,
        word: str,
        lword: str,
        misspelling: Misspelling,
        token: T_co,
    ) -> None:
        self.word = word
        self.lword = lword
        self.misspelling = misspelling
        self.token = token
130169
131170
132171class Spellchecker :
@@ -170,14 +209,25 @@ def __init__(
170209 def spellcheck_line (
171210 self ,
172211 line : str ,
173- tokenizer : Callable [[ str ], Iterable [ re . Match [ str ]] ],
212+ tokenizer : LineTokenizer [ T_co ],
174213 * ,
175214 extra_words_to_ignore : Container [str ] = frozenset ()
176- ) -> Iterable [DetectedMisspelling ]:
215+ ) -> Iterable [DetectedMisspelling [T_co ]]:
216+ """Tokenize and spellcheck a line
217+
218+ Split the line into tokens using the provided tokenizer. See the
219+ docstring for the class for an example.
220+
221+ :param line: The line to spellcheck.
222+ :param tokenizer: A callable that will tokenize the line
223+ :param extra_words_to_ignore: Extra words to ignore for this particular line
224+ (such as content from a `codespell:ignore` comment)
225+ """
177226 misspellings = self ._misspellings
178227 ignore_words_cased = self .ignore_words_cased
179- for match in tokenizer (line ):
180- word = match .group ()
228+
229+ for token in tokenizer (line ):
230+ word = token .group ()
181231 if word in ignore_words_cased :
182232 continue
183233 lword = word .lower ()
@@ -186,7 +236,7 @@ def spellcheck_line(
186236 # Sometimes we find a 'misspelling' which is actually a valid word
187237 # preceded by a string escape sequence. Ignore such cases as
188238 # they're usually false alarms; see issue #17 among others.
189- char_before_idx = match .start () - 1
239+ char_before_idx = token .start () - 1
190240 if (
191241 char_before_idx >= 0
192242 and line [char_before_idx ] == "\\ "
@@ -195,7 +245,7 @@ def spellcheck_line(
195245 and lword [1 :] not in misspellings
196246 ):
197247 continue
198- yield DetectedMisspelling (word , lword , misspelling , match )
248+ yield DetectedMisspelling (word , lword , misspelling , token )
199249
200250 def check_lower_cased_word (self , word : str ) -> Optional [Misspelling ]:
201251 """Check a given word against the loaded dictionaries
0 commit comments