1616Copyright (C) 2011 ProFUSION embedded systems
1717"""
1818
19+ import os
20+ import re
1921from typing import (
20- Callable ,
2122 Container ,
2223 Dict ,
24+ Generic ,
2325 Iterable ,
24- Match ,
2526 Optional ,
27+ Protocol ,
2628 Sequence ,
29+ TypeVar ,
2730)
2831
# Every misspelling is run through this translation table to derive
# alternative spellings (and their fixes) with typographic apostrophes.
alt_chars = (("'", "’"),)  # noqa: RUF001

# Covariant type variable for the token protocol; a tokenizer may yield any
# concrete token type (for codespell itself that is re.Match[str]).
T_co = TypeVar("T_co", bound="Token", covariant=True)


supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
supported_languages = supported_languages_en

# Users might want to link this file into /usr/local/bin, so we resolve the
# symbolic link path to the real path if necessary.
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")

# Entry layout:
#   (name, description, dictionary-file suffix,
#    "err" present in aspell, correction present in aspell,
#    aspell dictionaries for the errors, aspell dictionaries for the fixes)
# The dictionary arrays must contain names of aspell dictionaries.
# The aspell tests here aren't the ideal state, but the None's are
# realistic for obscure words.
_builtin_dictionaries = (
    ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
    (
        "rare",
        "for rare (but valid) words that are likely to be errors",
        "_rare",
        None,
        None,
        None,
        None,
    ),
    (
        "informal",
        "for making informal words more formal",
        "_informal",
        True,
        True,
        supported_languages_en,
        supported_languages_en,
    ),
    (
        "usage",
        "for replacing phrasing with recommended terms",
        "_usage",
        None,
        None,
        None,
        None,
    ),
    (
        "code",
        "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)",  # noqa: E501
        "_code",
        None,
        None,
        None,
        None,
    ),
    (
        "names",
        "for valid proper names that might be typos",
        "_names",
        None,
        None,
        None,
        None,
    ),
    (
        "en-GB_to_en-US",
        "for corrections from en-GB to en-US",
        "_en-GB_to_en-US",
        True,
        True,
        ("en_GB",),
        ("en_US",),
    ),
)
_builtin_default = "clear,rare"

_builtin_default_as_tuple = tuple(_builtin_default.split(","))
111+
class UnknownBuiltinDictionaryError(ValueError):
    """Raised when a requested built-in dictionary name is not recognized."""

    def __init__(self, name: str) -> None:
        message = f"Unknown built-in dictionary: {name}"
        super().__init__(message)
115+
116+
class BuiltinDictionariesAlreadyLoadedError(TypeError):
    """Raised on an attempt to load the built-in dictionaries twice."""

    def __init__(self) -> None:
        message = "load_builtin_dictionaries must not be called more than once"
        super().__init__(message)
122+
123+
class LineTokenizer(Protocol[T_co]):
    """Callable splitting a line into tokens to be spellchecked.

    A regex generally suffices for simple cases; a (probably too simple)
    example is:

    >>> tokenizer = re.compile(r"[^ ]+").finditer

    More complex cases call for either a more elaborate regex or custom
    tokenization code.
    """

    def __call__(self, line: str) -> Iterable[T_co]: ...
136+
137+
class Token(Protocol):
    """Structural interface for a single token.

    Defined as a protocol so that both `re.Match[str]` (which codespell uses)
    and any other token type produced by API consumers' tokenizers are
    accepted.
    """

    # The token's text content.
    def group(self) -> str: ...

    # The token's starting offset within the line.
    def start(self) -> int: ...
35148
36149
37150class Misspelling :
@@ -41,13 +154,18 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
41154 self .reason = reason
42155
43156
class DetectedMisspelling(Generic[T_co]):
    """A misspelling detected on a line, bundled with its source token.

    :param word: The misspelled word exactly as it appeared in the line.
    :param lword: Lower-cased form of ``word`` (the dictionary lookup key).
    :param misspelling: The matching dictionary entry (candidates/fix/reason).
    :param token: The token that produced ``word`` (e.g. an ``re.Match[str]``),
        which callers can use to recover the word's position in the line.
    """

    def __init__(
        self,
        word: str,
        lword: str,
        misspelling: Misspelling,
        token: T_co,
    ) -> None:
        self.word = word
        self.lword = lword
        self.misspelling = misspelling
        self.token = token

    def __repr__(self) -> str:
        # Debug-friendly representation; added for easier troubleshooting of
        # API consumers' tokenizers.
        return (
            f"{type(self).__name__}(word={self.word!r}, lword={self.lword!r}, "
            f"misspelling={self.misspelling!r}, token={self.token!r})"
        )
51169
52170
53171class Spellchecker :
@@ -58,14 +176,25 @@ def __init__(self) -> None:
58176 def spellcheck_line (
59177 self ,
60178 line : str ,
61- tokenizer : Callable [[ str ], Iterable [ re . Match [ str ]] ],
179+ tokenizer : LineTokenizer [ T_co ],
62180 * ,
63181 extra_words_to_ignore : Container [str ] = frozenset ()
64- ) -> Iterable [DetectedMisspelling ]:
182+ ) -> Iterable [DetectedMisspelling [T_co ]]:
183+ """Tokenize and spellcheck a line
184+
185+ Split the line into tokens based using the provided tokenizer. See the doc
186+ string for the class for an example.
187+
188+ :param line: The line to spellcheck.
189+ :param tokenizer: A callable that will tokenize the line
190+ :param extra_words_to_ignore: Extra words to ignore for this particular line
191+ (such as content from a `codespell:ignore` comment)
192+ """
65193 misspellings = self ._misspellings
66194 ignore_words_cased = self .ignore_words_cased
67- for match in tokenizer (line ):
68- word = match .group ()
195+
196+ for token in tokenizer (line ):
197+ word = token .group ()
69198 if word in ignore_words_cased :
70199 continue
71200 lword = word .lower ()
@@ -74,7 +203,7 @@ def spellcheck_line(
74203 # Sometimes we find a 'misspelling' which is actually a valid word
75204 # preceded by a string escape sequence. Ignore such cases as
76205 # they're usually false alarms; see issue #17 among others.
77- char_before_idx = match .start () - 1
206+ char_before_idx = token .start () - 1
78207 if (
79208 char_before_idx >= 0
80209 and line [char_before_idx ] == "\\ "
@@ -83,7 +212,7 @@ def spellcheck_line(
83212 and lword [1 :] not in misspellings
84213 ):
85214 continue
86- yield DetectedMisspelling (word , lword , misspelling , match )
215+ yield DetectedMisspelling (word , lword , misspelling , token )
87216
88217 def check_lower_cased_word (self , word : str ) -> Optional [Misspelling ]:
89218 """Check a given word against the loaded dictionaries
0 commit comments