Skip to content

Commit 1b40eb0

Browse files
committed
Pure python spell checker
This PR changes the spell checker used from pyenchant to pyspellchecker. The latter is pure Python so no C library is required.
1 parent dab4e11 commit 1b40eb0

File tree

9 files changed

+161
-64
lines changed

9 files changed

+161
-64
lines changed

comment_spell_check/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
""" comment_spell_check """
2+
3+
from comment_spell_check import comment_spell_check
4+
5+
6+
def main():
    """Console-script entry point.

    Delegates to ``main`` in the ``comment_spell_check.comment_spell_check``
    module.
    """
    comment_spell_check.main()

comment_spell_check/additional_dictionary.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,6 +1476,7 @@ optimizied
14761476
or
14771477
or_eq
14781478
orangered
1479+
org
14791480
organising
14801481
orthogonally
14811482
orthonormal

comment_spell_check/comment_spell_check.py

Lines changed: 67 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#
1919
# ==========================================================================*/
2020

21-
""" spell check the comments in code. """
21+
"""spell check the comments in code."""
2222

2323
import sys
2424
import os
@@ -29,16 +29,12 @@
2929
from pathlib import Path
3030
from importlib.metadata import version, PackageNotFoundError
3131

32-
from enchant.checker import SpellChecker
33-
from enchant.tokenize import EmailFilter, URLFilter
34-
from enchant import Dict
35-
3632
from comment_parser import comment_parser
3733

38-
try:
39-
from comment_spell_check.lib import bibtex_loader
40-
except ImportError:
41-
from lib import bibtex_loader
34+
from spellchecker import SpellChecker
35+
36+
from lib import bibtex_loader
37+
from lib import create_checker
4238

4339
__version__ = "unknown"
4440

@@ -113,13 +109,31 @@ def load_text_file(filename):
113109
def spell_check_words(spell_checker: SpellChecker, words: list[str]):
114110
"""Check each word and report False if at least one has an spelling error."""
115111
for word in words:
116-
if not spell_checker.check(word):
112+
if not (word in spell_checker or word.lower() in spell_checker):
117113
return False
118114
return True
119115

120116

117+
def find_misspellings(
    spell: SpellChecker, line: str, verbose: bool = False
) -> list[str]:
    """Return the words in ``line`` that ``spell`` does not recognise.

    Every maximal run of ASCII letters in ``line`` is treated as one word;
    digits, punctuation and all other characters act as separators.  A word
    is accepted when either its lower-cased form or its exact spelling is
    found in ``spell``.

    Args:
        spell: Object supporting ``word in spell`` membership tests.
        line: Text to check.
        verbose: When true, print each unrecognised word as it is found.

    Returns:
        The unrecognised words in order of appearance, with their original
        capitalisation.  A word that occurs several times is listed each
        time it occurs.
    """
    mistakes = []

    # A single findall pass extracts the letter runs directly, instead of
    # blanking out every non-letter and splitting the result.
    for word in re.findall(r"[a-zA-Z]+", line):
        if word.lower() in spell or word in spell:
            continue
        if verbose:
            print(f"Misspelled word: {word}")
        mistakes.append(word)

    return mistakes
133+
134+
121135
def spell_check_comment(
122-
spell_checker: SpellChecker,
136+
spell: SpellChecker,
123137
c: comment_parser.common.Comment,
124138
prefixes: list[str] = None,
125139
output_lvl=2,
@@ -129,12 +143,10 @@ def spell_check_comment(
129143
if output_lvl > 1:
130144
print(f"Line {c.line_number()}: {c}")
131145

132-
mistakes = []
133-
spell_checker.set_text(c.text())
134-
135-
for error in spell_checker:
136-
error_word = error.word
146+
bad_words = find_misspellings(spell, c.text(), verbose=output_lvl > 1)
137147

148+
mistakes = []
149+
for error_word in bad_words:
138150
if output_lvl > 1:
139151
print(f" Error: {error_word}")
140152

@@ -150,8 +162,7 @@ def spell_check_comment(
150162
" Stripping contraction: "
151163
+ f"{original_error_word} -> {error_word}"
152164
)
153-
if spell_checker.check(error_word):
154-
valid = True
165+
valid = error_word in spell
155166
break
156167

157168
if valid:
@@ -177,18 +188,13 @@ def spell_check_comment(
177188
if output_lvl > 1:
178189
print(f" Trying without '{pre}' prefix: {error_word} -> {wrd}")
179190
try:
180-
if spell_checker.check(wrd):
181-
valid = True
182-
else:
191+
valid = wrd in spell
192+
if not valid:
183193
# Try splitting camel case words and checking each sub-words
184194
if output_lvl > 1:
185-
print(f" Trying splitting camel case word: {wrd}")
195+
print(f" Trying splitting camel case word: {wrd}")
186196
sub_words = split_camel_case(wrd)
187-
if output_lvl > 1:
188-
print(" Sub-words: ", sub_words)
189-
if len(sub_words) > 1 and spell_check_words(
190-
spell_checker, sub_words
191-
):
197+
if len(sub_words) > 1 and spell_check_words(spell, sub_words):
192198
valid = True
193199
break
194200
except TypeError:
@@ -201,11 +207,14 @@ def spell_check_comment(
201207
if output_lvl > 1:
202208
print(f" Trying splitting camel case word: {error_word}")
203209
sub_words = split_camel_case(error_word)
204-
if len(sub_words) > 1 and spell_check_words(spell_checker, sub_words):
210+
if len(sub_words) > 1 and spell_check_words(spell, sub_words):
205211
continue
206212

207213
if output_lvl > 1:
208-
msg = f" Error: '{error_word}', suggestions: {spell_checker.suggest()}"
214+
msg = (
215+
f" error: '{error_word}', "
216+
+ f"suggestions: {spell.candidates(error_word)}"
217+
)
209218
else:
210219
msg = error_word
211220
mistakes.append(msg)
@@ -426,33 +435,33 @@ def add_dict(enchant_dict, filename, verbose=False):
426435
enchant_dict.add(wrd)
427436

428437

429-
def create_spell_checker(args, output_lvl):
430-
"""Create a SpellChecker."""
431-
432-
my_dict = Dict("en_US")
433-
434-
# Load the dictionary files
435-
#
438+
def build_dictionary_list(args):
439+
"""build a list of dictionaries to use for spell checking."""
440+
dict_list = []
436441
initial_dct = Path(__file__).parent / "additional_dictionary.txt"
437-
if not initial_dct.exists():
438-
initial_dct = None
442+
443+
if initial_dct.exists():
444+
dict_list.append(initial_dct)
439445
else:
440-
add_dict(my_dict, str(initial_dct), any([args.brief, output_lvl >= 0]))
446+
print("Warning: initial dictionary not found.", initial_dct)
441447

442-
if args.dict is not None:
443-
for d in args.dict:
444-
add_dict(my_dict, d, any([args.brief, output_lvl >= 0]))
448+
if not isinstance(args.dict, list):
449+
return dict_list
445450

446-
# Load the bibliography files
447-
#
448-
if args.bibtex is not None:
449-
for bib in args.bibtex:
450-
bibtex_loader.add_bibtex(my_dict, bib, any([args.brief, output_lvl >= 0]))
451+
for d in args.dict:
452+
dpath = Path(d)
453+
if dpath.exists():
454+
dict_list.append(dpath)
451455

452-
# Create the spell checking object
453-
spell_checker = SpellChecker(my_dict, filters=[EmailFilter, URLFilter])
456+
return dict_list
454457

455-
return spell_checker
458+
459+
def add_bibtex_words(spell, bibtex_files, verbose=False):
    """Feed every file in ``bibtex_files`` to ``bibtex_loader.add_bibtex``.

    ``spell`` and ``verbose`` are passed through unchanged for each file.
    When ``verbose`` is true, the name of each file is printed before it is
    loaded.
    """
    for bib_path in bibtex_files:
        if verbose:
            print(f"Loading bibtex file: {bib_path}")
        bibtex_loader.add_bibtex(spell, bib_path, verbose=verbose)
456465

457466

458467
def main():
@@ -469,7 +478,12 @@ def main():
469478
if args.miss:
470479
output_lvl = -1
471480

472-
spell_checker = create_spell_checker(args, output_lvl)
481+
dict_list = build_dictionary_list(args)
482+
483+
spell = create_checker.create_checker(dict_list, output_lvl > 1)
484+
485+
if args.bibtex:
486+
add_bibtex_words(spell, args.bibtex, verbose=output_lvl > 1)
473487

474488
file_list = []
475489
if len(args.filenames):
@@ -515,7 +529,7 @@ def main():
515529
print(f"\nChecking {x}")
516530
result = spell_check_file(
517531
x,
518-
spell_checker,
532+
spell,
519533
args.mime_type,
520534
output_lvl=output_lvl,
521535
prefixes=prefixes,
@@ -532,7 +546,7 @@ def main():
532546
# f is a file, so spell check it
533547
result = spell_check_file(
534548
f,
535-
spell_checker,
549+
spell,
536550
args.mime_type,
537551
output_lvl=output_lvl,
538552
prefixes=prefixes,
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# he/him/his
Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
""" Load Bibtex files into a spell checking dictionary. """
22

33
import bibtexparser
4+
import spellchecker
45

56

6-
def split_bibtex_name(name):
7+
def split_bibtex_name(name: str):
78
"""
89
Split a Bibtex name, which is two words separated by a number.
910
"""
@@ -17,19 +18,22 @@ def split_bibtex_name(name):
1718
return words
1819

1920

20-
def add_bibtex(enchant_dict, filename, verbose=False):
21-
"""Update ``enchant_dict`` spell checking dictionary with names
21+
def add_bibtex(spell: spellchecker.SpellChecker, filename: str, verbose: bool = False):
22+
"""Update ``spell`` spell checking dictionary with names
2223
from ``filename``, a Bibtex file."""
2324

2425
if verbose:
2526
print(f"Bibtex file: {filename}")
2627

28+
word_list = []
29+
2730
with open(filename, "rt", encoding="utf-8") as biblatex_file:
2831
bib_database = bibtexparser.load(biblatex_file)
2932

3033
for k in bib_database.get_entry_dict().keys():
3134
words = split_bibtex_name(k)
32-
for w in words:
33-
enchant_dict.add(w)
34-
if verbose:
35-
print("Added Bibtex word:", w)
35+
word_list.extend(words)
36+
37+
if verbose:
38+
print(f"Words: {word_list}")
39+
spell.word_frequency.load_words(word_list)
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#! /usr/bin/env python
2+
3+
""" Create a case sensitive spell checker with the English dictionary and
4+
additional dictionaries if provided.
5+
"""
6+
7+
import sys
8+
import importlib.resources
9+
import spellchecker
10+
11+
12+
def create_checker(dict_list="", verbose=False):
    """Create a case sensitive spell checker with the English dictionary and
    additional dictionaries if provided.

    Args:
        dict_list: Additional dictionary text files to load.  May be any
            iterable of paths (list, tuple, set, generator, ...) or a single
            path given as a string or ``os.PathLike``.  An empty string, an
            empty iterable or a non-iterable value loads no additional
            dictionaries.
        verbose: When true, print each dictionary file as it is loaded.

    Returns:
        The ``spellchecker.SpellChecker`` instance with all dictionaries
        loaded.
    """
    # Local import: only needed here, to recognise path-like arguments.
    import os

    # Create an empty SpellChecker object (language=None), because we want
    # a case sensitive checker.
    checker = spellchecker.SpellChecker(language=None, case_sensitive=True)

    # Load the English dictionary from the spellchecker package resources.
    english_dict = str(
        importlib.resources.files(spellchecker) / "resources" / "en.json.gz"
    )
    if verbose:
        print("Loading English dictionary from: ", english_dict)
    checker.word_frequency.load_dictionary(english_dict)

    # Normalise ``dict_list`` so that non-list iterables and single paths
    # are loaded rather than silently ignored.
    if isinstance(dict_list, (str, os.PathLike)):
        # A lone path counts as one dictionary; "" (the default) means none.
        dictionaries = [dict_list] if str(dict_list) else []
    else:
        try:
            dictionaries = list(dict_list)
        except TypeError:
            # Not iterable (e.g. None): load no additional dictionaries,
            # as before.
            dictionaries = []

    # Load the additional dictionaries.
    for d in dictionaries:
        if verbose:
            print("Loading additional dictionary from: ", d)
        checker.word_frequency.load_text_file(d)

    return checker
37+
38+
39+
if __name__ == "__main__":
    # Manual smoke test: every command-line argument is passed to
    # create_checker as an additional dictionary file.
    extra_dictionaries = sys.argv[1:]
    print(extra_dictionaries)
    spell = create_checker(extra_dictionaries, True)

    # Report the sample words the checker does not know ("hapenning" is
    # misspelled on purpose).
    for unknown_word in spell.unknown(["something", "is", "hapenning", "here"]):
        print("\nMisspelled: ", unknown_word)
        # The one `most likely` answer.
        print(spell.correction(unknown_word))
        # The list of `likely` options.
        print(spell.candidates(unknown_word))

    # Test if case sensitive checking from the additional dictionaries works.
    for sample in (
        "Zsize",
        "Zuerich",
        "accesor",
        "accessor",
        "zsize",
        "zuerich",
        "sdfasdfas",
    ):
        print(sample, sample in spell)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
comment_parser
2-
pyenchant
2+
pyspellchecker
33
bibtexparser

tests/example.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
int test_int;
3131
int hello_world() {
32-
// Sup, bro?
32+
// Sup, dude?
3333
print("Mmmm, pie.");
3434
}
3535

tests/test_comment_spell_check.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def test_basic(self):
3535
[
3636
"python",
3737
"comment_spell_check.py",
38-
"--verbose",
38+
"--miss",
3939
"--dict",
4040
"../tests/dict.txt",
4141
"--prefix",
@@ -91,7 +91,6 @@ def test_bibtex(self):
9191
[
9292
"python",
9393
"comment_spell_check.py",
94-
"--verbose",
9594
"--bibtex",
9695
"../tests/itk.bib",
9796
"../tests/bibtest.py",
@@ -100,3 +99,7 @@ def test_bibtex(self):
10099
stdout=subprocess.PIPE,
101100
)
102101
self.assertEqual(runresult.returncode, 0, runresult.stdout)
102+
103+
104+
if __name__ == "__main__":
105+
unittest.main()

0 commit comments

Comments
 (0)