Refactor with a pure Python spell checker

dave3d · dave3d · commit 89d4c1e81437 · 2025-05-02T14:41:05.000-04:00
This is a major re-write of the code.  It now uses pyspellchecker,
a pure Python package, replacing pyenchant, which required the C
enchant library underneath.

Also, the code has been re-organized to separate functions into
smaller, more manageable parts and to adhere to Pylint's coding standards.
diff --git a/comment_spell_check/__init__.py b/comment_spell_check/__init__.py
diff --git a/comment_spell_check/utils/__init__.py b/comment_spell_check/utils/__init__.py
diff --git a/comment_spell_check/utils/bibtex_loader.py b/comment_spell_check/utils/bibtex_loader.py
@@ -0,0 +1,39 @@
+"""Load Bibtex files into a spell checking dictionary."""
+
+import logging
+import bibtexparser
+import spellchecker
+
+
+def split_bibtex_name(name: str):
+    """Break a Bibtex key into words, treating every ASCII digit as a gap.
+
+    Each character 0-9 in ``name`` is replaced by a space and the result is
+    split on whitespace, e.g. ``"Smith2020deep"`` -> ``["Smith", "deep"]``.
+    """
+
+    digits = "0123456789"
+
+    # blank out the digits, then let split() drop the gaps
+    spaced = "".join(" " if ch in digits else ch for ch in name)
+    return spaced.split()
+
+
+def add_bibtex(spell: spellchecker.SpellChecker, filename: str):
+    """Feed the entry keys of a Bibtex file into a spell checking dictionary.
+
+    ``filename`` is parsed with ``bibtexparser``; every entry key is broken
+    up by ``split_bibtex_name`` and the resulting words are loaded into
+    ``spell.word_frequency``.
+    """
+    log = logging.getLogger("comment_spell_check.bibtex_loader")
+    log.info("Bibtex file: %s", filename)
+
+    with open(filename, "rt", encoding="utf-8") as handle:
+        database = bibtexparser.load(handle)
+        # flatten the words of every entry key into a single list
+        keys = database.get_entry_dict().keys()
+        bib_words = [w for key in keys for w in split_bibtex_name(key)]
+
+        log.info("Words: %s", bib_words)
+        spell.word_frequency.load_words(bib_words)
diff --git a/comment_spell_check/utils/create_checker.py b/comment_spell_check/utils/create_checker.py
@@ -0,0 +1,34 @@
+"""Create a case sensitive spell checker with the English dictionary and
+additional dictionaries if provided.
+"""
+
+import logging
+import importlib.resources
+import spellchecker
+
+
+def create_checker(dict_list: list[str] = None) -> spellchecker.SpellChecker:
+    """Build a case sensitive ``SpellChecker`` preloaded with English.
+
+    Every entry of ``dict_list`` is then loaded as an extra dictionary file;
+    anything that is not a ``list`` (including the default None) is ignored.
+    """
+
+    log = logging.getLogger("comment_spell_check.create_checker")
+
+    # start from an empty checker, because it has to be case sensitive
+    speller = spellchecker.SpellChecker(language=None, case_sensitive=True)
+
+    # the English word list is read from inside the spellchecker package
+    package_dir = importlib.resources.files(spellchecker)
+    english_dict = str(package_dir) + "/resources/en.json.gz"
+    log.info("Loading English dictionary from: %s", english_dict)
+    speller.word_frequency.load_dictionary(english_dict)
+
+    # extra dictionaries supplied by the caller, if any
+    extra_dicts = dict_list if isinstance(dict_list, list) else []
+    for path in extra_dicts:
+        log.info("Loading additional dictionary from: %s", path)
+        speller.word_frequency.load_text_file(path)
+
+    return speller
diff --git a/comment_spell_check/utils/parseargs.py b/comment_spell_check/utils/parseargs.py
@@ -0,0 +1,137 @@
+import argparse
+from importlib.metadata import version, PackageNotFoundError
+
+# Use the version recorded in the installed distribution's metadata; when
+# that lookup fails because nothing is installed, keep a placeholder.
+try:
+    __version__ = version("comment_spell_check")
+except PackageNotFoundError:
+    # package is not installed
+    __version__ = "unknown"
+
+
+def create_parser():
+    """Create the ``argparse.ArgumentParser`` that defines every
+    command-line option of the comment spell checker and return it."""
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("filenames", nargs="*")
+
+    parser.add_argument(
+        "--brief",
+        "-b",
+        action="store_true",
+        dest="brief",
+        help="Make output brief",
+    )
+
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        dest="verbose",
+        help="Make output verbose",
+    )
+
+    parser.add_argument(
+        "--first",
+        "-f",
+        action="store_true",
+        dest="first",
+        help="Show only first occurrence of a misspelling",
+    )
+
+    parser.add_argument(
+        "--vim",
+        "-V",
+        action="store_true",
+        dest="vim",
+        help="Output results in vim command format",
+    )
+
+    parser.add_argument(
+        "--dict",
+        "-d",
+        "--ignore-words",
+        "-I",
+        action="append",
+        dest="dict",
+        help="File that contains words that will be ignored."
+        " Argument can be passed multiple times."
+        " File must contain 1 word per line.",
+    )
+
+    parser.add_argument(
+        "--exclude",
+        "-e",
+        action="append",
+        dest="exclude",
+        help="Specify regex for excluding files."
+        " Argument can be passed multiple times.",
+    )
+
+    parser.add_argument(
+        "--skip",
+        "-S",
+        action="append",
+        help="Comma-separated list of files to skip. It "
+        "accepts globs as well. E.g.: if you want "
+        "comment_spell_check.py to skip .eps and .txt files, "
+        'you\'d give "*.eps,*.txt" to this option.'
+        " Argument can be passed multiple times.",
+    )
+
+    parser.add_argument(
+        "--prefix",
+        "-p",
+        action="append",
+        default=[],
+        dest="prefixes",
+        help="Add word prefix. Argument can be passed multiple times.",
+    )
+
+    parser.add_argument(
+        "--miss",
+        "-m",
+        action="store_true",
+        dest="miss",
+        help="Only output the misspelt words",
+    )
+
+    # NOTE(review): "append" extends the default list, so ".h" is always
+    # part of args.suffix even when --suffix is given explicitly --
+    # confirm that this is intended.
+    parser.add_argument(
+        "--suffix",
+        "-s",
+        action="append",
+        default=[".h"],
+        dest="suffix",
+        help="File name suffix. Argument can be passed multiple times.",
+    )
+
+    parser.add_argument(
+        "--type",
+        "-t",
+        action="store",
+        default="",
+        dest="mime_type",
+        help="Set file mime type. File name suffix will be ignored.",
+    )
+
+    parser.add_argument(
+        "--bibtex",
+        action="append",
+        dest="bibtex",
+        help="Bibtex file to load for additional dictionary words.",
+    )
+
+    parser.add_argument("--version", action="version", version=f"{__version__}")
+    return parser
+
+
+def parse_args(parser=None):
+    """Parse the command line with ``parser``; a fresh ``create_parser()``
+    result is used when ``parser`` is None (built per call, not at import)."""
+    parser = create_parser() if parser is None else parser
+    return parser.parse_args()
diff --git a/comment_spell_check/utils/url_remove.py b/comment_spell_check/utils/url_remove.py
@@ -0,0 +1,20 @@
+"""Module to remove URLs from a string."""
+
+import re
+
+
+def remove_urls(text):
+    """
+    Removes URLs from a string using a regular expression.
+
+    A token is dropped up to the next whitespace when it starts with
+    ``http://``/``https://`` (any host, e.g. ``localhost``) or when it is
+    a bare dotted name such as ``www.example.com/page``.
+
+    Args:
+        text: The input string.
+    Returns:
+        The string with URLs removed.
+    """
+    pattern = r"https?://\S+|[\w.-]+\.[\w.-]+\S*"
+    return re.sub(pattern, "", text, flags=re.IGNORECASE)