Skip to content

Commit a3023f9

Browse files
committed
Pure python spell checker
1 parent dab4e11 commit a3023f9

File tree

6 files changed

+82
-60
lines changed

6 files changed

+82
-60
lines changed

comment_spell_check/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
""" comment_spell_check """
2+
from comment_spell_check import comment_spell_check
3+
4+
def main():
    """Console entry point: hand control to the implementation module's ``main``."""
    comment_spell_check.main()

comment_spell_check/additional_dictionary.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,6 +1476,7 @@ optimizied
14761476
or
14771477
or_eq
14781478
orangered
1479+
org
14791480
organising
14801481
orthogonally
14811482
orthonormal

comment_spell_check/comment_spell_check.py

Lines changed: 58 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,12 @@
2929
from pathlib import Path
3030
from importlib.metadata import version, PackageNotFoundError
3131

32-
from enchant.checker import SpellChecker
33-
from enchant.tokenize import EmailFilter, URLFilter
34-
from enchant import Dict
35-
3632
from comment_parser import comment_parser
3733

38-
try:
39-
from comment_spell_check.lib import bibtex_loader
40-
except ImportError:
41-
from lib import bibtex_loader
34+
from spellchecker import SpellChecker
35+
36+
from lib import bibtex_loader
37+
from lib import create_checker
4238

4339
__version__ = "unknown"
4440

@@ -113,13 +109,27 @@ def load_text_file(filename):
113109
def spell_check_words(spell_checker: SpellChecker, words: list[str]):
    """Return True only when every word is known to the spell checker.

    A word is accepted when either its as-written form or its lowercase
    form is found in ``spell_checker``.  Checking stops at the first
    unknown word; an empty list yields True.
    """
    return all(
        candidate in spell_checker or candidate.lower() in spell_checker
        for candidate in words
    )
119115

116+
def find_misspellings(
    spell: "SpellChecker", line: str, verbose: bool = False
) -> list[str]:
    """Find misspellings in a line of text.

    The line is split into alphabetic words.  An apostrophe is kept when it
    sits between two letters, so contractions and possessives such as
    ``don't`` or ``file's`` are checked as a single word instead of being
    broken into fragments like ``don`` and ``t``.  Every other non-letter
    character (digits, punctuation, leading/trailing quotes) separates words.

    Args:
        spell: Object supporting ``word in spell`` for known words.
        line: The text to check.
        verbose: When True, print each misspelled word as it is found.

    Returns:
        The misspelled words in order of appearance, spelled as they occur
        in ``line``; a word repeated in the line is reported each time.
    """
    mistakes = []
    for word in re.findall(r"[a-zA-Z]+(?:'[a-zA-Z]+)*", line):
        # A word is fine if either its lowercase or its as-written form is known.
        if word.lower() in spell or word in spell:
            continue
        if verbose:
            print(f"Misspelled word: {word}")
        mistakes.append(word)
    return mistakes
120130

121131
def spell_check_comment(
122-
spell_checker: SpellChecker,
132+
spell: SpellChecker,
123133
c: comment_parser.common.Comment,
124134
prefixes: list[str] = None,
125135
output_lvl=2,
@@ -129,11 +139,10 @@ def spell_check_comment(
129139
if output_lvl > 1:
130140
print(f"Line {c.line_number()}: {c}")
131141

132-
mistakes = []
133-
spell_checker.set_text(c.text())
142+
bad_words = find_misspellings(spell, c.text(), verbose=output_lvl > 1)
134143

135-
for error in spell_checker:
136-
error_word = error.word
144+
mistakes = []
145+
for error_word in bad_words:
137146

138147
if output_lvl > 1:
139148
print(f" Error: {error_word}")
@@ -150,8 +159,7 @@ def spell_check_comment(
150159
" Stripping contraction: "
151160
+ f"{original_error_word} -> {error_word}"
152161
)
153-
if spell_checker.check(error_word):
154-
valid = True
162+
valid = error_word in spell
155163
break
156164

157165
if valid:
@@ -177,17 +185,14 @@ def spell_check_comment(
177185
if output_lvl > 1:
178186
print(f" Trying without '{pre}' prefix: {error_word} -> {wrd}")
179187
try:
180-
if spell_checker.check(wrd):
181-
valid = True
182-
else:
188+
valid = wrd in spell
189+
if not valid:
183190
# Try splitting camel case words and checking each sub-words
184191
if output_lvl > 1:
185-
print(f" Trying splitting camel case word: {wrd}")
192+
print("Trying splitting camel case word: {wrd}")
186193
sub_words = split_camel_case(wrd)
187-
if output_lvl > 1:
188-
print(" Sub-words: ", sub_words)
189194
if len(sub_words) > 1 and spell_check_words(
190-
spell_checker, sub_words
195+
spell, sub_words
191196
):
192197
valid = True
193198
break
@@ -201,11 +206,11 @@ def spell_check_comment(
201206
if output_lvl > 1:
202207
print(f" Trying splitting camel case word: {error_word}")
203208
sub_words = split_camel_case(error_word)
204-
if len(sub_words) > 1 and spell_check_words(spell_checker, sub_words):
209+
if len(sub_words) > 1 and spell_check_words(spell, sub_words):
205210
continue
206211

207212
if output_lvl > 1:
208-
msg = f" Error: '{error_word}', suggestions: {spell_checker.suggest()}"
213+
msg = f" error: '{error_word}', suggestions: {spell.candidates(error_word)}"
209214
else:
210215
msg = error_word
211216
mistakes.append(msg)
@@ -426,34 +431,33 @@ def add_dict(enchant_dict, filename, verbose=False):
426431
enchant_dict.add(wrd)
427432

428433

429-
def create_spell_checker(args, output_lvl):
430-
"""Create a SpellChecker."""
431-
432-
my_dict = Dict("en_US")
433-
434-
# Load the dictionary files
435-
#
434+
def build_dictionary_list(args) -> list[Path]:
    """Build the list of dictionary files to use for spell checking.

    The list starts with ``additional_dictionary.txt`` located next to this
    module, followed by every existing path named in ``args.dict``.  A file
    that cannot be found is reported with a warning and left out.

    Args:
        args: Parsed command-line arguments; only ``args.dict`` is read.  Any
            value that is not a list is treated as "no extra dictionaries".

    Returns:
        The dictionary files that exist, as ``Path`` objects, with the
        built-in dictionary first.
    """
    dict_list = []

    initial_dct = Path(__file__).parent / "additional_dictionary.txt"
    if initial_dct.exists():
        dict_list.append(initial_dct)
    else:
        print("Warning: initial dictionary not found.", initial_dct)

    if not isinstance(args.dict, list):
        return dict_list

    for d in args.dict:
        dpath = Path(d)
        if dpath.exists():
            dict_list.append(dpath)
        else:
            # Report a requested dictionary that is missing instead of
            # dropping it silently, matching the built-in dictionary warning.
            print("Warning: dictionary not found.", dpath)

    return dict_list
454453

455-
return spell_checker
456454

455+
def add_bibtex_words(spell, bibtex_files, verbose=False):
    """Load the words of every given bibtex file into the spell checker.

    Each file is passed, in order, to ``bibtex_loader.add_bibtex`` together
    with ``spell``.  With ``verbose`` set, the file name is printed before
    the file is loaded and the flag is forwarded to the loader.
    """
    load = bibtex_loader.add_bibtex
    for path in bibtex_files:
        if verbose:
            print(f"Loading bibtex file: {path}")
        load(spell, path, verbose=verbose)
457461

458462
def main():
459463
"""comment_spell_check main function."""
@@ -469,7 +473,12 @@ def main():
469473
if args.miss:
470474
output_lvl = -1
471475

472-
spell_checker = create_spell_checker(args, output_lvl)
476+
dict_list = build_dictionary_list(args)
477+
478+
spell = create_checker.create_checker(dict_list, output_lvl>1)
479+
480+
if args.bibtex:
481+
add_bibtex_words(spell, args.bibtex, verbose=output_lvl > 1)
473482

474483
file_list = []
475484
if len(args.filenames):
@@ -515,7 +524,7 @@ def main():
515524
print(f"\nChecking {x}")
516525
result = spell_check_file(
517526
x,
518-
spell_checker,
527+
spell,
519528
args.mime_type,
520529
output_lvl=output_lvl,
521530
prefixes=prefixes,
@@ -532,7 +541,7 @@ def main():
532541
# f is a file, so spell check it
533542
result = spell_check_file(
534543
f,
535-
spell_checker,
544+
spell,
536545
args.mime_type,
537546
output_lvl=output_lvl,
538547
prefixes=prefixes,
Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
12
""" Load Bibtex files into a spell checking dictionary. """
23

34
import bibtexparser
5+
import spellchecker
46

5-
6-
def split_bibtex_name(name):
7+
def split_bibtex_name(name:str):
78
"""
89
Split a Bibtex name, which is two words separated by a number.
910
"""
@@ -17,19 +18,22 @@ def split_bibtex_name(name):
1718
return words
1819

1920

20-
def add_bibtex(enchant_dict, filename, verbose=False):
21-
"""Update ``enchant_dict`` spell checking dictionary with names
21+
def add_bibtex(spell: spellchecker.SpellChecker, filename: str, verbose: bool = False):
    """Update the ``spell`` spell checking dictionary with names
    from ``filename``, a Bibtex file.

    Every entry key of the Bibtex database is split with
    ``split_bibtex_name`` and the collected words are loaded into
    ``spell.word_frequency`` in a single call.

    Args:
        spell: The spell checker whose dictionary is extended.
        filename: Path of the Bibtex file; it is read as UTF-8 text.
        verbose: When True, print the file name and the list of words added.
    """

    if verbose:
        print(f"Bibtex file: {filename}")

    # The file only needs to stay open while it is being parsed.
    with open(filename, "rt", encoding="utf-8") as biblatex_file:
        bib_database = bibtexparser.load(biblatex_file)

    word_list = []
    for key in bib_database.get_entry_dict():
        word_list.extend(split_bibtex_name(key))

    if verbose:
        print(f"Words: {word_list}")
    spell.word_frequency.load_words(word_list)

tests/example.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
int test_int;
3131
int hello_world() {
32-
// Sup, bro?
32+
// Sup, dude?
3333
print("Mmmm, pie.");
3434
}
3535

tests/test_comment_spell_check.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def test_basic(self):
3535
[
3636
"python",
3737
"comment_spell_check.py",
38-
"--verbose",
38+
"--miss",
3939
"--dict",
4040
"../tests/dict.txt",
4141
"--prefix",
@@ -91,7 +91,6 @@ def test_bibtex(self):
9191
[
9292
"python",
9393
"comment_spell_check.py",
94-
"--verbose",
9594
"--bibtex",
9695
"../tests/itk.bib",
9796
"../tests/bibtest.py",
@@ -100,3 +99,6 @@ def test_bibtex(self):
10099
stdout=subprocess.PIPE,
101100
)
102101
self.assertEqual(runresult.returncode, 0, runresult.stdout)
102+
103+
if __name__ == "__main__":
104+
unittest.main()

0 commit comments

Comments
 (0)