1515Copyright (C) 2010-2011 Lucas De Marchi <[email protected] > 1616Copyright (C) 2011 ProFUSION embedded systems
1717"""
18+
1819import re
20+ import os
1921from typing import Dict , Sequence , Container , Optional , Iterable , Callable
2022
2123# Pass all misspellings through this translation table to generate
# A line tokenizer is any callable that takes a line of text and yields the
# regex matches for the tokens in it (for example, the ``finditer`` method of
# a compiled pattern).
LineTokenizer = Callable[[str], Iterable[re.Match[str]]]
2729
2830
# English language codes; these are used as aspell dictionary names in the
# built-in dictionary table.
supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
# All supported languages; at present this is exactly the English set.
supported_languages = supported_languages_en

# Directory that holds the bundled dictionary files.
# Users might want to link this file into /usr/local/bin, so we resolve the
# symbolic link path to the real path if necessary.
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
_builtin_dictionaries = (
    # Each entry is a 7-tuple:
    #   (name, description, file name suffix, err in aspell,
    #    correction in aspell, err dictionary array, rep dictionary array)
    # The name (index 0) selects the dictionary; the suffix (index 2) is
    # inserted into "dictionary<suffix>.txt" to form the data file name.
    # The arrays must contain the names of aspell dictionaries.
    # The aspell tests here aren't the ideal state, but the None's are
    # realistic for obscure words.
    ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
    (
        "rare",
        "for rare (but valid) words that are likely to be errors",
        "_rare",
        None,
        None,
        None,
        None,
    ),
    (
        "informal",
        "for making informal words more formal",
        "_informal",
        True,
        True,
        supported_languages_en,
        supported_languages_en,
    ),
    (
        "usage",
        "for replacing phrasing with recommended terms",
        "_usage",
        None,
        None,
        None,
        None,
    ),
    (
        "code",
        "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)",  # noqa: E501
        "_code",
        None,
        None,
        None,
        None,
    ),
    (
        "names",
        "for valid proper names that might be typos",
        "_names",
        None,
        None,
        None,
        None,
    ),
    (
        "en-GB_to_en-US",
        "for corrections from en-GB to en-US",
        "_en-GB_to_en-US",
        True,
        True,
        ("en_GB",),
        ("en_US",),
    ),
)
# Comma-separated names of the built-in dictionaries that are used by default.
_builtin_default = "clear,rare"

# The same default split into a tuple of names, so it can be passed wherever
# an iterable of dictionary names is expected.
_builtin_default_as_tuple = tuple(_builtin_default.split(","))
102+
103+
class UnknownBuiltinDictionaryError(ValueError):
    """Raised when a requested built-in dictionary name is not recognised."""

    def __init__(self, name: str) -> None:
        # Build the message first so the offending name is easy to spot.
        message = f"Unknown built-in dictionary: {name}"
        super().__init__(message)
107+
108+
class BuiltinDictionariesAlreadyLoadedError(TypeError):
    """Raised when the built-in dictionaries are requested to be loaded twice."""

    def __init__(self) -> None:
        message = "load_builtin_dictionaries must not be called more than once"
        super().__init__(message)
114+
115+
29116class Misspelling :
30117 def __init__ (self , candidates : Sequence [str ], fix : bool , reason : str ) -> None :
31118 self .candidates = candidates
@@ -43,9 +130,42 @@ def __init__(self, word: str, lword: str, misspelling: Misspelling, match: re.Ma
43130
44131
45132class Spellchecker :
46- def __init__ (self ) -> None :
133+ """The spellchecking dictionaries of codespell
134+
135+ The Spellchecker is responsible for spellchecking words or lines. It maintains state
136+ for known typos, their corrections and known ignored words.
137+
138+ >>> import re
139+ >>> s = Spellchecker()
140+ >>> # Very simple tokenizer
141+ >>> tokenizer = re.compile(r"[^ ]+").finditer
142+ >>> line = "A touple tpyo but also correct words appear" # codespell:ignore
143+ >>> issues = list(s.spellcheck_line(line, tokenizer))
    >>> len(issues) == 2
    True
145+ >>> issues[0].word
146+ 'touple'
147+ >>> list(issues[0].misspelling.candidates)
148+ ['tuple', 'couple', 'topple', 'toupee']
149+ >>> issues[0].misspelling.fix
150+ False
151+ >>> issues[1].word
152+ 'tpyo'
153+ >>> list(issues[1].misspelling.candidates)
154+ ['typo']
155+ >>> issues[1].misspelling.fix
156+ True
157+ """
158+
def __init__(
    self,
    *,
    builtin_dictionaries: Optional[Sequence[str]] = _builtin_default_as_tuple,
) -> None:
    """Create a spellchecker, optionally pre-loaded with built-in dictionaries.

    :param builtin_dictionaries: Names of the built-in dictionaries to load
      right away. Pass ``None`` (or an empty sequence) to start without any
      built-in dictionary, e.g. to control the load order manually.
    """
    # Start from an empty state: nothing ignored, nothing loaded.
    self.ignore_words_cased: Container[str] = frozenset()
    self._builtin_loaded = False
    self._misspellings: Dict[str, Misspelling] = {}

    if not builtin_dictionaries:
        return
    self.load_builtin_dictionaries(builtin_dictionaries)
49169
50170 def spellcheck_line (
51171 self ,
@@ -84,9 +204,68 @@ def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
84204 """
85205 return self ._misspellings .get (word )
86206
87- def add_from_file (self , filename : str , * , ignore_words : Container [str ] = frozenset ()) -> None :
def load_builtin_dictionaries(
    self,
    builtin_dictionaries: Iterable[str] = _builtin_default_as_tuple,
    *,
    ignore_words: Container[str] = frozenset(),
) -> None:
    """Load codespell builtin dictionaries (for manual dictionary load order)

    This method enables you to load builtin dictionaries in a special order relative
    to custom dictionaries. To use this method, you must ensure that the constructor
    did *not* load any builtin dictionaries.

    >>> s = Spellchecker(builtin_dictionaries=None)
    >>> # A couple of s.load_dictionary_from_file(...) lines here
    >>> s.load_builtin_dictionaries("clear")

    This method updates the spellchecker to include any corrections listed
    in the dictionaries. Load order is important. When multiple corrections
    are loaded for the same typo, then the last loaded corrections for that
    typo will be used.

    The requested names are de-duplicated and loaded in sorted order,
    regardless of the order in which they were passed.

    :param builtin_dictionaries: Names of the codespell dictionaries to load.
      A single name may also be given as a plain string.
    :param ignore_words: Words to ignore from these dictionaries.
    :raises BuiltinDictionariesAlreadyLoadedError: If the builtin dictionaries
      have already been loaded into this spellchecker.
    :raises UnknownBuiltinDictionaryError: If a name does not match any
      builtin dictionary.
    """
    if self._builtin_loaded:
        # It would work, but if you are doing manual load order, then you
        # probably want to be sure that you get it right.
        raise BuiltinDictionariesAlreadyLoadedError()
    if isinstance(builtin_dictionaries, str):
        # A bare string is itself an iterable of characters; without this
        # guard "clear" would be looked up as "a", "c", "e", "l" and "r".
        builtin_dictionaries = (builtin_dictionaries,)
    for name in sorted(set(builtin_dictionaries)):
        self._load_builtin_dictionary(name, ignore_words=ignore_words)
    self._builtin_loaded = True
238+
def _load_builtin_dictionary(
    self,
    name: str,
    *,
    ignore_words: Container[str] = frozenset(),
) -> None:
    """Load the single built-in dictionary registered under ``name``.

    :param name: Name of the built-in dictionary (the first field of an
      entry in the built-in dictionary table).
    :param ignore_words: Words to ignore from this dictionary.
    :raises UnknownBuiltinDictionaryError: If no built-in dictionary has
      that name.
    """
    # Pick the first table entry whose name matches, if any.
    entry = next(
        (candidate for candidate in _builtin_dictionaries if candidate[0] == name),
        None,
    )
    if entry is None:
        raise UnknownBuiltinDictionaryError(name)
    # The third field is the suffix that distinguishes the data file.
    path = os.path.join(_data_root, f"dictionary{entry[2]}.txt")
    self.load_dictionary_from_file(path, ignore_words=ignore_words)
251+
252+ def load_dictionary_from_file (
253+ self ,
254+ filename : str ,
255+ * ,
256+ ignore_words : Container [str ] = frozenset (),
257+ ) -> None :
88258 """Parse a codespell dictionary
89259
        This is primarily useful for loading custom dictionaries not provided by
        codespell. This is the API version of the `-D` / `--dictionary` command
        line option except it only accepts files (and not the special `-`).
263+
        This method updates the spellchecker to include any corrections listed
        in the file. Load order is important. When multiple corrections are
        loaded for the same typo, then the last loaded corrections for that
        typo will be used.
268+
90269 :param filename: The codespell dictionary file to parse
91270 :param ignore_words: Words to ignore from this dictionary.
92271 """
0 commit comments