Pure python spell checker

dave3d · dave3d · commit a3023f96ab19 · 2025-03-06T14:37:15.000-05:00
diff --git a/comment_spell_check/__init__.py b/comment_spell_check/__init__.py
@@ -0,0 +1,6 @@
+""" comment_spell_check """
+from comment_spell_check import comment_spell_check
+
+def main():
+    """Entry point for the application script"""
+    comment_spell_check.main()
diff --git a/comment_spell_check/additional_dictionary.txt b/comment_spell_check/additional_dictionary.txt
@@ -1476,6 +1476,7 @@ optimizied
 or
 or_eq
 orangered
+org
 organising
 orthogonally
 orthonormal
diff --git a/comment_spell_check/comment_spell_check.py b/comment_spell_check/comment_spell_check.py
@@ -29,16 +29,12 @@
 from pathlib import Path
 from importlib.metadata import version, PackageNotFoundError
 
-from enchant.checker import SpellChecker
-from enchant.tokenize import EmailFilter, URLFilter
-from enchant import Dict
-
 from comment_parser import comment_parser
 
-try:
-    from comment_spell_check.lib import bibtex_loader
-except ImportError:
-    from lib import bibtex_loader
+from spellchecker import SpellChecker
+
+from lib import bibtex_loader
+from lib import create_checker
 
 __version__ = "unknown"
 
@@ -113,13 +109,27 @@ def load_text_file(filename):
 def spell_check_words(spell_checker: SpellChecker, words: list[str]):
     """Check each word and report False if at least one has an spelling error."""
     for word in words:
-        if not spell_checker.check(word):
+        if not (word in spell_checker or word.lower() in spell_checker):
             return False
     return True
 
+def find_misspellings(spell: SpellChecker, line: str, verbose: bool = False) -> list[str]:
+    """Find misspellings in a line of text."""
+
+    l2 = re.sub(r'[^a-zA-Z]', ' ', line)
+    words = l2.split()
+
+    mistakes = []
+
+    for word in words:
+        if not (word.lower() in spell or word in spell):
+            if verbose:
+                print(f"Misspelled word: {word}")
+            mistakes.append(word)
+    return mistakes
 
 def spell_check_comment(
-    spell_checker: SpellChecker,
+    spell: SpellChecker,
     c: comment_parser.common.Comment,
     prefixes: list[str] = None,
     output_lvl=2,
@@ -129,11 +139,10 @@ def spell_check_comment(
     if output_lvl > 1:
         print(f"Line {c.line_number()}: {c}")
 
-    mistakes = []
-    spell_checker.set_text(c.text())
+    bad_words = find_misspellings(spell, c.text(), verbose=output_lvl > 1)
 
-    for error in spell_checker:
-        error_word = error.word
+    mistakes = []
+    for error_word in bad_words:
 
         if output_lvl > 1:
             print(f"    Error: {error_word}")
@@ -150,8 +159,7 @@ def spell_check_comment(
                         "    Stripping contraction: "
                         + f"{original_error_word} -> {error_word}"
                     )
-                if spell_checker.check(error_word):
-                    valid = True
+                valid = error_word in spell
                 break
 
         if valid:
@@ -177,17 +185,14 @@ def spell_check_comment(
                 if output_lvl > 1:
                     print(f"    Trying without '{pre}' prefix: {error_word} -> {wrd}")
                 try:
-                    if spell_checker.check(wrd):
-                        valid = True
-                    else:
+                    valid = wrd in spell
+                    if not valid:
                         # Try splitting camel case words and checking each sub-words
                         if output_lvl > 1:
-                            print(f"    Trying splitting camel case word: {wrd}")
+                            print(f"Trying splitting camel case word: {wrd}")
                         sub_words = split_camel_case(wrd)
-                        if output_lvl > 1:
-                            print("    Sub-words: ", sub_words)
                         if len(sub_words) > 1 and spell_check_words(
-                            spell_checker, sub_words
+                            spell, sub_words
                         ):
                             valid = True
                             break
@@ -201,11 +206,11 @@ def spell_check_comment(
         if output_lvl > 1:
             print(f"    Trying splitting camel case word: {error_word}")
         sub_words = split_camel_case(error_word)
-        if len(sub_words) > 1 and spell_check_words(spell_checker, sub_words):
+        if len(sub_words) > 1 and spell_check_words(spell, sub_words):
             continue
 
         if output_lvl > 1:
-            msg = f"    Error: '{error_word}', suggestions: {spell_checker.suggest()}"
+            msg = f"    error: '{error_word}', suggestions: {spell.candidates(error_word)}"
         else:
             msg = error_word
         mistakes.append(msg)
@@ -426,34 +431,33 @@ def add_dict(enchant_dict, filename, verbose=False):
             enchant_dict.add(wrd)
 
 
-def create_spell_checker(args, output_lvl):
-    """Create a SpellChecker."""
-
-    my_dict = Dict("en_US")
-
-    # Load the dictionary files
-    #
+def build_dictionary_list(args):
+    """Build a list of dictionary file paths to use for spell checking."""
+    dict_list = []
     initial_dct = Path(__file__).parent / "additional_dictionary.txt"
-    if not initial_dct.exists():
-        initial_dct = None
+
+    if initial_dct.exists():
+        dict_list.append(initial_dct)
     else:
-        add_dict(my_dict, str(initial_dct), any([args.brief, output_lvl >= 0]))
+        print("Warning: initial dictionary not found.", initial_dct)
 
-    if args.dict is not None:
-        for d in args.dict:
-            add_dict(my_dict, d, any([args.brief, output_lvl >= 0]))
+    if not isinstance(args.dict, list):
+        return dict_list
 
-    # Load the bibliography files
-    #
-    if args.bibtex is not None:
-        for bib in args.bibtex:
-            bibtex_loader.add_bibtex(my_dict, bib, any([args.brief, output_lvl >= 0]))
+    for d in args.dict:
+        dpath = Path(d)
+        if dpath.exists():
+            dict_list.append(dpath)
 
-    # Create the spell checking object
-    spell_checker = SpellChecker(my_dict, filters=[EmailFilter, URLFilter])
+    return dict_list
 
-    return spell_checker
 
+def add_bibtex_words(spell, bibtex_files, verbose=False):
+    """Add words from bibtex files to the spell checker."""
+    for bibtex_file in bibtex_files:
+        if verbose:
+            print(f"Loading bibtex file: {bibtex_file}")
+        bibtex_loader.add_bibtex(spell, bibtex_file, verbose=verbose)
 
 def main():
     """comment_spell_check main function."""
@@ -469,7 +473,12 @@ def main():
     if args.miss:
         output_lvl = -1
 
-    spell_checker = create_spell_checker(args, output_lvl)
+    dict_list = build_dictionary_list(args)
+
+    spell = create_checker.create_checker(dict_list, output_lvl>1)
+
+    if args.bibtex:
+        add_bibtex_words(spell, args.bibtex, verbose=output_lvl > 1)
 
     file_list = []
     if len(args.filenames):
@@ -515,7 +524,7 @@ def main():
                     print(f"\nChecking {x}")
                 result = spell_check_file(
                     x,
-                    spell_checker,
+                    spell,
                     args.mime_type,
                     output_lvl=output_lvl,
                     prefixes=prefixes,
@@ -532,7 +541,7 @@ def main():
             # f is a file, so spell check it
             result = spell_check_file(
                 f,
-                spell_checker,
+                spell,
                 args.mime_type,
                 output_lvl=output_lvl,
                 prefixes=prefixes,
diff --git a/comment_spell_check/lib/bibtex_loader.py b/comment_spell_check/lib/bibtex_loader.py
@@ -1,9 +1,10 @@
+
 """ Load Bibtex files into a spell checking dictionary. """
 
 import bibtexparser
+import spellchecker
 
-
-def split_bibtex_name(name):
+def split_bibtex_name(name:str):
     """
-    Split a Bibtex name, which is two words seperated by a number.
+    Split a Bibtex name, which is two words separated by a number.
     """
@@ -17,19 +18,22 @@ def split_bibtex_name(name):
     return words
 
 
-def add_bibtex(enchant_dict, filename, verbose=False):
-    """Update ``enchant_dict`` spell checking dictionary with names
+def add_bibtex(spell: spellchecker.SpellChecker, filename:str, verbose:bool=False):
+    """Update ``spell`` spell checking dictionary with names
     from ``filename``, a Bibtex file."""
 
     if verbose:
         print(f"Bibtex file: {filename}")
 
+    word_list = []
+
     with open(filename, "rt", encoding="utf-8") as biblatex_file:
         bib_database = bibtexparser.load(biblatex_file)
 
         for k in bib_database.get_entry_dict().keys():
             words = split_bibtex_name(k)
-            for w in words:
-                enchant_dict.add(w)
-                if verbose:
-                    print("Added Bibtex word:", w)
+            word_list.extend(words)
+
+        if verbose:
+            print(f"Words: {word_list}")
+        spell.word_frequency.load_words(word_list)
diff --git a/tests/example.h b/tests/example.h
@@ -29,7 +29,7 @@
 
 int test_int;
 int hello_world() {
-  // Sup, bro?
+  // Sup, dude?
   print("Mmmm, pie.");
 }
 
diff --git a/tests/test_comment_spell_check.py b/tests/test_comment_spell_check.py
@@ -35,7 +35,7 @@ def test_basic(self):
             [
                 "python",
                 "comment_spell_check.py",
-                "--verbose",
+                "--miss",
                 "--dict",
                 "../tests/dict.txt",
                 "--prefix",
@@ -91,7 +91,6 @@ def test_bibtex(self):
             [
                 "python",
                 "comment_spell_check.py",
-                "--verbose",
                 "--bibtex",
                 "../tests/itk.bib",
                 "../tests/bibtest.py",
@@ -100,3 +99,6 @@ def test_bibtex(self):
             stdout=subprocess.PIPE,
         )
         self.assertEqual(runresult.returncode, 0, runresult.stdout)
+
+if __name__ == "__main__":
+    unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@`
`29`	`29`
`30`	`30`	`int test_int;`
`31`	`31`	`int hello_world() {`
`32`		`- // Sup, bro?`
	`32`	`+ // Sup, dude?`
`33`	`33`	`print("Mmmm, pie.");`
`34`	`34`	`}`
`35`	`35`