Pure Python spell checker

dave3d · dave3d · commit 1b40eb09142e · 2025-03-06T15:25:03.000-05:00
This PR changes the spell checker used from pyenchant to pyspellchecker.
The latter is pure Python so no C library is required.
diff --git a/comment_spell_check/__init__.py b/comment_spell_check/__init__.py
@@ -0,0 +1,8 @@
+""" comment_spell_check """
+
+from comment_spell_check import comment_spell_check
+
+
+def main():
+    """Entry point for the application script; delegates to comment_spell_check.main()."""
+    comment_spell_check.main()
diff --git a/comment_spell_check/additional_dictionary.txt b/comment_spell_check/additional_dictionary.txt
@@ -1476,6 +1476,7 @@ optimizied
 or
 or_eq
 orangered
+org
 organising
 orthogonally
 orthonormal
diff --git a/comment_spell_check/comment_spell_check.py b/comment_spell_check/comment_spell_check.py
@@ -18,7 +18,7 @@
 #
 # ==========================================================================*/
 
-""" spell check the comments in code. """
+"""spell check the comments in code."""
 
 import sys
 import os
@@ -29,16 +29,12 @@
 from pathlib import Path
 from importlib.metadata import version, PackageNotFoundError
 
-from enchant.checker import SpellChecker
-from enchant.tokenize import EmailFilter, URLFilter
-from enchant import Dict
-
 from comment_parser import comment_parser
 
-try:
-    from comment_spell_check.lib import bibtex_loader
-except ImportError:
-    from lib import bibtex_loader
+from spellchecker import SpellChecker
+
+from lib import bibtex_loader
+from lib import create_checker
 
 __version__ = "unknown"
 
@@ -113,13 +109,31 @@ def load_text_file(filename):
 def spell_check_words(spell_checker: SpellChecker, words: list[str]):
     """Check each word and report False if at least one has an spelling error."""
     for word in words:
-        if not spell_checker.check(word):
+        if not (word in spell_checker or word.lower() in spell_checker):
             return False
     return True
 
 
+def find_misspellings(
+    spell: SpellChecker, line: str, verbose: bool = False
+) -> list[str]:
+    """Return the words of ``line`` that ``spell`` does not contain.
+
+    Every character that is not an ASCII letter is treated as a word
+    separator.  A word is accepted when either its lower-cased form or its
+    exact spelling is in ``spell``; every other word is returned, in order
+    of appearance and with duplicates kept.  When ``verbose`` is set each
+    rejected word is also printed.
+    """
+
+    # Blank out digits, punctuation, etc. so a plain split() yields the words.
+    tokens = re.sub(r"[^a-zA-Z]", " ", line).split()
+
+    unknown = []
+    for token in tokens:
+        if token.lower() in spell or token in spell:
+            continue
+        if verbose:
+            print(f"Misspelled word: {token}")
+        unknown.append(token)
+    return unknown
+
+
 def spell_check_comment(
-    spell_checker: SpellChecker,
+    spell: SpellChecker,
     c: comment_parser.common.Comment,
     prefixes: list[str] = None,
     output_lvl=2,
@@ -129,12 +143,10 @@ def spell_check_comment(
     if output_lvl > 1:
         print(f"Line {c.line_number()}: {c}")
 
-    mistakes = []
-    spell_checker.set_text(c.text())
-
-    for error in spell_checker:
-        error_word = error.word
+    bad_words = find_misspellings(spell, c.text(), verbose=output_lvl > 1)
 
+    mistakes = []
+    for error_word in bad_words:
         if output_lvl > 1:
             print(f"    Error: {error_word}")
 
@@ -150,8 +162,7 @@ def spell_check_comment(
                         "    Stripping contraction: "
                         + f"{original_error_word} -> {error_word}"
                     )
-                if spell_checker.check(error_word):
-                    valid = True
+                valid = error_word in spell
                 break
 
         if valid:
@@ -177,18 +188,13 @@ def spell_check_comment(
                 if output_lvl > 1:
                     print(f"    Trying without '{pre}' prefix: {error_word} -> {wrd}")
                 try:
-                    if spell_checker.check(wrd):
-                        valid = True
-                    else:
+                    valid = wrd in spell
+                    if not valid:
                         # Try splitting camel case words and checking each sub-words
                         if output_lvl > 1:
-                            print(f"    Trying splitting camel case word: {wrd}")
+                            print("Trying splitting camel case word: {wrd}")
                         sub_words = split_camel_case(wrd)
-                        if output_lvl > 1:
-                            print("    Sub-words: ", sub_words)
-                        if len(sub_words) > 1 and spell_check_words(
-                            spell_checker, sub_words
-                        ):
+                        if len(sub_words) > 1 and spell_check_words(spell, sub_words):
                             valid = True
                             break
                 except TypeError:
@@ -201,11 +207,14 @@ def spell_check_comment(
         if output_lvl > 1:
             print(f"    Trying splitting camel case word: {error_word}")
         sub_words = split_camel_case(error_word)
-        if len(sub_words) > 1 and spell_check_words(spell_checker, sub_words):
+        if len(sub_words) > 1 and spell_check_words(spell, sub_words):
             continue
 
         if output_lvl > 1:
-            msg = f"    Error: '{error_word}', suggestions: {spell_checker.suggest()}"
+            msg = (
+                f"    error: '{error_word}', "
+                + f"suggestions: {spell.candidates(error_word)}"
+            )
         else:
             msg = error_word
         mistakes.append(msg)
@@ -426,33 +435,33 @@ def add_dict(enchant_dict, filename, verbose=False):
             enchant_dict.add(wrd)
 
 
-def create_spell_checker(args, output_lvl):
-    """Create a SpellChecker."""
-
-    my_dict = Dict("en_US")
-
-    # Load the dictionary files
-    #
+def build_dictionary_list(args):
+    """build a list of dictionaries to use for spell checking."""
+    dict_list = []
     initial_dct = Path(__file__).parent / "additional_dictionary.txt"
-    if not initial_dct.exists():
-        initial_dct = None
+
+    if initial_dct.exists():
+        dict_list.append(initial_dct)
     else:
-        add_dict(my_dict, str(initial_dct), any([args.brief, output_lvl >= 0]))
+        print("Warning: initial dictionary not found.", initial_dct)
 
-    if args.dict is not None:
-        for d in args.dict:
-            add_dict(my_dict, d, any([args.brief, output_lvl >= 0]))
+    if not isinstance(args.dict, list):
+        return dict_list
 
-    # Load the bibliography files
-    #
-    if args.bibtex is not None:
-        for bib in args.bibtex:
-            bibtex_loader.add_bibtex(my_dict, bib, any([args.brief, output_lvl >= 0]))
+    for d in args.dict:
+        dpath = Path(d)
+        if dpath.exists():
+            dict_list.append(dpath)
 
-    # Create the spell checking object
-    spell_checker = SpellChecker(my_dict, filters=[EmailFilter, URLFilter])
+    return dict_list
 
-    return spell_checker
+
+def add_bibtex_words(spell, bibtex_files, verbose=False):
+    """Feed every file in ``bibtex_files`` to ``bibtex_loader.add_bibtex``.
+
+    ``spell`` and ``verbose`` are passed through unchanged for each file;
+    when ``verbose`` is set the file name is printed before it is loaded.
+    """
+    for bib_path in bibtex_files:
+        if verbose:
+            print(f"Loading bibtex file: {bib_path}")
+        bibtex_loader.add_bibtex(spell, bib_path, verbose=verbose)
 
 
 def main():
@@ -469,7 +478,12 @@ def main():
     if args.miss:
         output_lvl = -1
 
-    spell_checker = create_spell_checker(args, output_lvl)
+    dict_list = build_dictionary_list(args)
+
+    spell = create_checker.create_checker(dict_list, output_lvl > 1)
+
+    if args.bibtex:
+        add_bibtex_words(spell, args.bibtex, verbose=output_lvl > 1)
 
     file_list = []
     if len(args.filenames):
@@ -515,7 +529,7 @@ def main():
                     print(f"\nChecking {x}")
                 result = spell_check_file(
                     x,
-                    spell_checker,
+                    spell,
                     args.mime_type,
                     output_lvl=output_lvl,
                     prefixes=prefixes,
@@ -532,7 +546,7 @@ def main():
             # f is a file, so spell check it
             result = spell_check_file(
                 f,
-                spell_checker,
+                spell,
                 args.mime_type,
                 output_lvl=output_lvl,
                 prefixes=prefixes,
diff --git a/comment_spell_check/lib/__init__.py b/comment_spell_check/lib/__init__.py
@@ -0,0 +1 @@
+# he/him/his
diff --git a/comment_spell_check/lib/bibtex_loader.py b/comment_spell_check/lib/bibtex_loader.py
@@ -1,9 +1,10 @@
 """ Load Bibtex files into a spell checking dictionary. """
 
 import bibtexparser
+import spellchecker
 
 
-def split_bibtex_name(name):
+def split_bibtex_name(name: str):
     """
     Split a Bibtex name, which is two words seperated by a number.
     """
@@ -17,19 +18,22 @@ def split_bibtex_name(name):
     return words
 
 
-def add_bibtex(enchant_dict, filename, verbose=False):
-    """Update ``enchant_dict`` spell checking dictionary with names
+def add_bibtex(spell: spellchecker.SpellChecker, filename: str, verbose: bool = False):
+    """Update ``spell`` spell checking dictionary with names
     from ``filename``, a Bibtex file."""
 
     if verbose:
         print(f"Bibtex file: {filename}")
 
+    word_list = []
+
     with open(filename, "rt", encoding="utf-8") as biblatex_file:
         bib_database = bibtexparser.load(biblatex_file)
 
         for k in bib_database.get_entry_dict().keys():
             words = split_bibtex_name(k)
-            for w in words:
-                enchant_dict.add(w)
-                if verbose:
-                    print("Added Bibtex word:", w)
+            word_list.extend(words)
+
+        if verbose:
+            print(f"Words: {word_list}")
+        spell.word_frequency.load_words(word_list)
diff --git a/comment_spell_check/lib/create_checker.py b/comment_spell_check/lib/create_checker.py
@@ -0,0 +1,66 @@
+#! /usr/bin/env python
+
+""" Create a case sensitive spell checker with the English dictionary and
+    additional dictionaries if provided.
+"""
+
+import sys
+import importlib.resources
+import spellchecker
+
+
+def create_checker(dict_list="", verbose=False):
+    """Create a case sensitive spell checker with the English dictionary and
+    additional dictionaries if provided.
+
+    Args:
+        dict_list: list or tuple of text-file paths whose words are added to
+            the checker.  Any other value, including the default empty
+            string, means that no additional dictionaries are loaded.
+        verbose: when true, print each dictionary file before it is loaded.
+
+    Returns:
+        The ``spellchecker.SpellChecker`` instance with all words loaded.
+    """
+
+    # create an empty SpellChecker object, because we want a case
+    # sensitive checker
+    checker = spellchecker.SpellChecker(language=None, case_sensitive=True)
+
+    # load the English dictionary from the spellchecker package; join the
+    # path components instead of concatenating strings with a hard-coded "/"
+    lib_path = importlib.resources.files(spellchecker)
+    english_dict = str(lib_path / "resources" / "en.json.gz")
+    if verbose:
+        print("Loading English dictionary from: ", english_dict)
+    checker.word_frequency.load_dictionary(english_dict)
+
+    # load the additional dictionaries; a tuple is accepted as well as a
+    # list, anything else (e.g. the "" default) is ignored
+    if isinstance(dict_list, (list, tuple)):
+        for d in dict_list:
+            if verbose:
+                print("Loading additional dictionary from: ", d)
+            checker.word_frequency.load_text_file(d)
+
+    return checker
+
+
+if __name__ == "__main__":
+    # Manual smoke test: every command line argument is passed to
+    # create_checker as an additional dictionary.
+    extra_dicts = sys.argv[1:]
+    print(extra_dicts)
+    spell = create_checker(extra_dicts, True)
+
+    # report the sample words that the checker does not know
+    for word in spell.unknown(["something", "is", "hapenning", "here"]):
+        print("\nMisspelled: ", word)
+        # the one `most likely` answer
+        print(spell.correction(word))
+        # the list of `likely` options
+        print(spell.candidates(word))
+
+    # test if case sensitive checking from the additional dictionaries works
+    case_probes = (
+        "Zsize",
+        "Zuerich",
+        "accesor",
+        "accessor",
+        "zsize",
+        "zuerich",
+        "sdfasdfas",
+    )
+    for probe in case_probes:
+        print(probe, probe in spell)
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
 comment_parser
-pyenchant
+pyspellchecker
 bibtexparser
diff --git a/tests/example.h b/tests/example.h
@@ -29,7 +29,7 @@
 
 int test_int;
 int hello_world() {
-  // Sup, bro?
+  // Sup, dude?
   print("Mmmm, pie.");
 }
 
diff --git a/tests/test_comment_spell_check.py b/tests/test_comment_spell_check.py
@@ -35,7 +35,7 @@ def test_basic(self):
             [
                 "python",
                 "comment_spell_check.py",
-                "--verbose",
+                "--miss",
                 "--dict",
                 "../tests/dict.txt",
                 "--prefix",
@@ -91,7 +91,6 @@ def test_bibtex(self):
             [
                 "python",
                 "comment_spell_check.py",
-                "--verbose",
                 "--bibtex",
                 "../tests/itk.bib",
                 "../tests/bibtest.py",
@@ -100,3 +99,7 @@ def test_bibtex(self):
             stdout=subprocess.PIPE,
         )
         self.assertEqual(runresult.returncode, 0, runresult.stdout)
+
+
+if __name__ == "__main__":
+    # Run the test cases when this file is executed directly.
+    unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@`
`29`	`29`
`30`	`30`	`int test_int;`
`31`	`31`	`int hello_world() {`
`32`		`- // Sup, bro?`
	`32`	`+ // Sup, dude?`
`33`	`33`	`print("Mmmm, pie.");`
`34`	`34`	`}`
`35`	`35`