Skip to content

Commit 89d4c1e

Browse files
committed
Refactor with a pure Python spell checker
This is a major rewrite of the code. It now uses pyspellchecker, a pure Python package, replacing pyenchant, which required the C enchant library underneath. The code has also been reorganized to separate functions into smaller, more manageable parts and to adhere to Pylint's coding standards.
1 parent 59d771f commit 89d4c1e

File tree

6 files changed

+230
-0
lines changed

6 files changed

+230
-0
lines changed

comment_spell_check/__init__.py

Whitespace-only changes.

comment_spell_check/utils/__init__.py

Whitespace-only changes.
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Load Bibtex files into a spell checking dictionary."""
2+
3+
import logging
4+
import bibtexparser
5+
import spellchecker
6+
7+
8+
def split_bibtex_name(name: str) -> list[str]:
    """
    Split a Bibtex name, which is two words separated by a number.

    Every decimal digit in ``name`` is treated as a separator, so a key
    such as ``"Smith2020Deep"`` yields ``["Smith", "Deep"]``.

    Args:
        name: The Bibtex name to split.

    Returns:
        The non-empty pieces of ``name`` found between digits (and between
        any whitespace already present in ``name``).
    """

    # Map any digit to a space.  The dict form of ``str.maketrans`` is used
    # because the two-string form requires both strings to have the same
    # length and raises ``ValueError`` otherwise.
    digits_to_space = str.maketrans(dict.fromkeys("0123456789", " "))

    # ``split()`` without arguments collapses runs of whitespace and never
    # returns empty strings.
    return name.translate(digits_to_space).split()
20+
21+
22+
def add_bibtex(spell: spellchecker.SpellChecker, filename: str):
    """Add the names found in a Bibtex file to a spell checking dictionary.

    ``filename`` is parsed with ``bibtexparser``; every key of the parsed
    database's entry dictionary is split into words with
    ``split_bibtex_name`` and the collected words are loaded into
    ``spell.word_frequency``.
    """

    log = logging.getLogger("comment_spell_check.bibtex_loader")
    log.info("Bibtex file: %s", filename)

    with open(filename, "rt", encoding="utf-8") as handle:
        database = bibtexparser.load(handle)

    # Flatten the words of every entry key into a single list.
    names = [
        word
        for key in database.get_entry_dict().keys()
        for word in split_bibtex_name(key)
    ]

    log.info("Words: %s", names)
    spell.word_frequency.load_words(names)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Create a case sensitive spell checker with the English dictionary and
2+
additional dictionaries if provided.
3+
"""
4+
5+
import logging
6+
import importlib.resources
7+
import spellchecker
8+
9+
10+
def create_checker(dict_list: list[str] = None) -> spellchecker.SpellChecker:
    """Create a case sensitive spell checker with the English dictionary and
    additional dictionaries if provided.

    Args:
        dict_list: Optional list of text files whose words are added to the
            checker.  Any value that is not a ``list`` (including the
            default ``None``) is ignored.

    Returns:
        The configured ``spellchecker.SpellChecker`` instance.
    """

    logger = logging.getLogger("comment_spell_check.create_checker")

    # create an empty SpellChecker object, because we want a case
    # sensitive checker
    checker = spellchecker.SpellChecker(language=None, case_sensitive=True)

    # load the English dictionary; the path is joined component by
    # component instead of concatenating strings with a hard-coded "/"
    english_dict = str(
        importlib.resources.files(spellchecker) / "resources" / "en.json.gz"
    )
    logger.info("Loading English dictionary from: %s", english_dict)
    checker.word_frequency.load_dictionary(english_dict)

    # load the additional dictionaries; iterating an empty list is a no-op,
    # so no separate length check is needed
    if isinstance(dict_list, list):
        for dict_path in dict_list:
            logger.info("Loading additional dictionary from: %s", dict_path)
            checker.word_frequency.load_text_file(dict_path)

    return checker
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
import argparse
2+
from importlib.metadata import version, PackageNotFoundError
3+
4+
try:
    __version__ = version("comment_spell_check")
except PackageNotFoundError:
    # package is not installed, so there is no metadata to read
    __version__ = "unknown"
11+
12+
13+
def create_parser():
    """Build the command-line argument parser.

    Returns:
        An ``argparse.ArgumentParser`` configured with the positional
        ``filenames`` argument and all of the tool's options.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("filenames", nargs="*")

    parser.add_argument(
        "--brief",
        "-b",
        action="store_true",
        default=False,
        dest="brief",
        help="Make output brief",
    )

    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        dest="verbose",
        help="Make output verbose",
    )

    parser.add_argument(
        "--first",
        "-f",
        action="store_true",
        default=False,
        dest="first",
        help="Show only first occurrence of a misspelling",
    )

    parser.add_argument(
        "--vim",
        "-V",
        action="store_true",
        default=False,
        dest="vim",
        help="Output results in vim command format",
    )

    # ``--ignore-words`` / ``-I`` are aliases that store into the same
    # ``dict`` destination as ``--dict`` / ``-d``.
    parser.add_argument(
        "--dict",
        "-d",
        "--ignore-words",
        "-I",
        action="append",
        dest="dict",
        help="File that contains words that will be ignored."
        " Argument can be passed multiple times."
        " File must contain 1 word per line.",
    )

    parser.add_argument(
        "--exclude",
        "-e",
        action="append",
        dest="exclude",
        help="Specify regex for excluding files."
        " Argument can be passed multiple times.",
    )

    parser.add_argument(
        "--skip",
        "-S",
        action="append",
        help="Comma-separated list of files to skip. It "
        "accepts globs as well. E.g.: if you want "
        "comment_spell_check.py to skip .eps and .txt files, "
        'you\'d give "*.eps,*.txt" to this option.'
        " Argument can be passed multiple times.",
    )

    parser.add_argument(
        "--prefix",
        "-p",
        action="append",
        default=[],
        dest="prefixes",
        help="Add word prefix. Argument can be passed multiple times.",
    )

    parser.add_argument(
        "--miss",
        "-m",
        action="store_true",
        default=False,
        dest="miss",
        help="Only output the misspelt words",
    )

    # NOTE: with ``action="append"`` and a non-empty default, values given
    # on the command line are appended after the default ".h" entry rather
    # than replacing it.
    parser.add_argument(
        "--suffix",
        "-s",
        action="append",
        default=[".h"],
        dest="suffix",
        help="File name suffix. Argument can be passed multiple times.",
    )

    parser.add_argument(
        "--type",
        "-t",
        action="store",
        default="",
        dest="mime_type",
        help="Set file mime type. File name suffix will be ignored.",
    )

    parser.add_argument(
        "--bibtex",
        action="append",
        dest="bibtex",
        help="Bibtex file to load for additional dictionary words.",
    )

    parser.add_argument("--version", action="version", version=__version__)
    return parser
131+
132+
133+
def parse_args(parser=None):
    """Parse the command-line arguments.

    Args:
        parser: The argument parser to use.  When omitted (or ``None``), a
            parser is built with ``create_parser()`` at call time.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """

    # Build the default parser lazily.  A ``create_parser()`` call in the
    # signature is evaluated once, when the function is defined, so the
    # parser would be constructed on import and shared by every call.
    if parser is None:
        parser = create_parser()

    return parser.parse_args()
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""Module to remove URLs from a string."""
2+
3+
import re
4+
5+
6+
def remove_urls(text):
    """
    Strip URLs out of a string with a regular expression.

    Args:
        text: The input string.

    Returns:
        ``text`` with every match of the URL pattern replaced by the empty
        string.
    """
    return re.sub(
        r"(?:https?:\/\/)?[\w.-]+\.[\w.-]+[^\s]*",
        "",
        text,
        flags=re.IGNORECASE,
    )

0 commit comments

Comments
 (0)