Skip to content

Commit e830c50

Browse files
committed
Rewrite line spellchecking and move most of it into the Spellchecker
With this rewrite, performance improved slightly and is now down to 7% slower than the baseline (6s vs. 5.6s). An over-indentation is deliberately left in this commit, since that makes the commit easier to review (without having to ignore whitespace changes).
1 parent 280e858 commit e830c50

File tree

2 files changed

+61
-32
lines changed

2 files changed

+61
-32
lines changed

codespell_lib/_codespell.py

Lines changed: 16 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,13 @@
3737
Sequence,
3838
Set,
3939
Tuple,
40-
Callable,
4140
)
4241

4342
# autogenerated by setuptools_scm
4443
from ._version import ( # type: ignore[import-not-found]
4544
__version__ as VERSION, # noqa: N812
4645
)
47-
from .spellchecker import Misspelling, Spellchecker
46+
from .spellchecker import Spellchecker, LineTokenizer, DetectedMisspelling
4847
from ._text_util import fix_case
4948

5049
word_regex_def = r"[\w\-'’]+" # noqa: RUF001
@@ -712,15 +711,17 @@ def is_text_file(filename: str) -> bool:
712711

713712
def ask_for_word_fix(
714713
line: str,
715-
match: Match[str],
716-
misspelling: Misspelling,
714+
issue: DetectedMisspelling,
717715
interactivity: int,
718716
colors: TermColors,
719717
) -> Tuple[bool, Sequence[str]]:
720-
wrongword = match.group()
718+
wrongword = issue.word
719+
misspelling = issue.misspelling
721720
if interactivity <= 0:
722721
return misspelling.fix, fix_case(wrongword, misspelling.candidates)
723722

723+
match = issue.re_match
724+
724725
line_ui = (
725726
f"{line[:match.start()]}"
726727
f"{colors.WWORD}{wrongword}{colors.DISABLE}"
@@ -835,7 +836,7 @@ def line_tokenizer_factory(
835836
uri_regex: Pattern[str],
836837
word_regex: Pattern[str],
837838
ignore_word_regex: Optional[Pattern[str]],
838-
) -> Callable[[str], Iterable[re.Match[str]]]:
839+
) -> LineTokenizer:
839840
def line_tokenizer(line: str) -> Iterable[Match[str]]:
840841
# If all URI spelling errors will be ignored, erase any URI before
841842
# extracting words. Otherwise, apply ignores after extracting words.
@@ -863,7 +864,6 @@ def parse_file(
863864
colors: TermColors,
864865
summary: Optional[Summary],
865866
spellchecker: Spellchecker,
866-
ignore_words_cased: Set[str],
867867
exclude_lines: Set[str],
868868
file_opener: FileOpener,
869869
word_regex: Pattern[str],
@@ -884,7 +884,7 @@ def parse_file(
884884
else:
885885
if options.check_filenames:
886886
for word in extract_words(filename, word_regex, ignore_word_regex):
887-
if word in ignore_words_cased:
887+
if word in spellchecker.ignore_words_cased:
888888
continue
889889
lword = word.lower()
890890
misspelling = spellchecker.check_lower_cased_word(lword)
@@ -958,25 +958,12 @@ def parse_file(
958958
fixed_words = set()
959959
asked_for = set()
960960

961-
for match in line_tokenizer(line):
962-
word = match.group()
963-
if word in ignore_words_cased:
964-
continue
965-
lword = word.lower()
966-
misspelling = spellchecker.check_lower_cased_word(lword)
967-
if misspelling is not None and lword not in extra_words_to_ignore:
968-
# Sometimes we find a 'misspelling' which is actually a valid word
969-
# preceded by a string escape sequence. Ignore such cases as
970-
# they're usually false alarms; see issue #17 among others.
971-
char_before_idx = match.start() - 1
972-
if (
973-
char_before_idx >= 0
974-
and line[char_before_idx] == "\\"
975-
# bell, backspace, formfeed, newline, carriage-return, tab, vtab.
976-
and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
977-
and spellchecker.check_lower_cased_word(lword[1:]) is None
978-
):
979-
continue
961+
issues = spellchecker.spellcheck_line(line, line_tokenizer, extra_words_to_ignore=extra_words_to_ignore)
962+
for issue in issues:
963+
# TODO: De-indent in next commit
964+
misspelling = issue.misspelling
965+
word = issue.word
966+
lword = issue.lword
980967

981968
context_shown = False
982969
fix = misspelling.fix
@@ -988,8 +975,7 @@ def parse_file(
988975
print_context(lines, i, context)
989976
fix, candidates = ask_for_word_fix(
990977
lines[i],
991-
match,
992-
misspelling,
978+
issue,
993979
options.interactive,
994980
colors=colors,
995981
)
@@ -1193,6 +1179,7 @@ def main(*args: str) -> int:
11931179
return EX_USAGE
11941180
use_dictionaries.append(dictionary)
11951181
spellchecker = Spellchecker()
1182+
spellchecker.ignore_words_cased = ignore_words_cased
11961183
for dictionary in use_dictionaries:
11971184
spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
11981185
colors = TermColors()
@@ -1270,7 +1257,6 @@ def main(*args: str) -> int:
12701257
colors,
12711258
summary,
12721259
spellchecker,
1273-
ignore_words_cased,
12741260
exclude_lines,
12751261
file_opener,
12761262
word_regex,
@@ -1295,7 +1281,6 @@ def main(*args: str) -> int:
12951281
colors,
12961282
summary,
12971283
spellchecker,
1298-
ignore_words_cased,
12991284
exclude_lines,
13001285
file_opener,
13011286
word_regex,

codespell_lib/spellchecker.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,67 @@
1515
Copyright (C) 2010-2011 Lucas De Marchi <[email protected]>
1616
Copyright (C) 2011 ProFUSION embedded systems
1717
"""
18-
from typing import Dict, Sequence, Container, Optional
18+
import re
19+
from typing import Dict, Sequence, Container, Optional, Iterable, Callable
1920

2021
# Pass all misspellings through this translation table to generate
2122
# alternative misspellings and fixes.
2223
alt_chars = (("'", "’"),) # noqa: RUF001
2324

2425

26+
LineTokenizer = Callable[[str], Iterable[re.Match[str]]]
27+
28+
2529
class Misspelling:
2630
def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
2731
self.candidates = candidates
2832
self.fix = fix
2933
self.reason = reason
3034

3135

36+
class DetectedMisspelling:
37+
38+
def __init__(self, word: str, lword: str, misspelling: Misspelling, match: re.Match[str]) -> None:
39+
self.word = word
40+
self.lword = lword
41+
self.misspelling = misspelling
42+
self.re_match = match
43+
44+
3245
class Spellchecker:
3346
def __init__(self) -> None:
3447
self._misspellings: Dict[str, Misspelling] = {}
48+
self.ignore_words_cased: Container[str] = frozenset()
49+
50+
def spellcheck_line(
51+
self,
52+
line: str,
53+
tokenizer: Callable[[str], Iterable[re.Match[str]]],
54+
*,
55+
extra_words_to_ignore: Container[str] = frozenset()
56+
) -> Iterable[DetectedMisspelling]:
57+
misspellings = self._misspellings
58+
ignore_words_cased = self.ignore_words_cased
59+
for match in tokenizer(line):
60+
word = match.group()
61+
if word in ignore_words_cased:
62+
continue
63+
lword = word.lower()
64+
misspelling = misspellings.get(lword)
65+
if misspelling is not None and lword not in extra_words_to_ignore:
66+
# Sometimes we find a 'misspelling' which is actually a valid word
67+
# preceded by a string escape sequence. Ignore such cases as
68+
# they're usually false alarms; see issue #17 among others.
69+
char_before_idx = match.start() - 1
70+
if (
71+
char_before_idx >= 0
72+
and line[char_before_idx] == "\\"
73+
# bell, backspace, formfeed, newline, carriage-return, tab, vtab.
74+
and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
75+
and lword[1:] not in misspellings
76+
):
77+
continue
78+
yield DetectedMisspelling(word, lword, misspelling, match)
3579

3680
def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
3781
"""Check a given word against the loaded dictionaries

0 commit comments

Comments
 (0)