Skip to content

Commit 280e858

Browse files
committed
Refactor line tokenization to simplify an outer loop
The refactor is a stepping stone towards the next commit where the inner loop is moved to the `Spellchecker`.
1 parent 1ec338b commit 280e858

File tree

1 file changed

+37
-17
lines changed

1 file changed

+37
-17
lines changed

codespell_lib/_codespell.py

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
Sequence,
3838
Set,
3939
Tuple,
40+
Callable,
4041
)
4142

4243
# autogenerated by setuptools_scm
@@ -829,6 +830,34 @@ def apply_uri_ignore_words(
829830
return check_matches
830831

831832

833+
def line_tokenizer_factory(
    uri_ignore_words: Set[str],
    uri_regex: Pattern[str],
    word_regex: Pattern[str],
    ignore_word_regex: Optional[Pattern[str]],
) -> Callable[[str], Iterable[Match[str]]]:
    """Build a callable that extracts the word matches of a single line.

    The returned function closes over the regexes and the URI ignore words,
    so a caller only has to hand it the line to tokenize.

    Args:
        uri_ignore_words: Words to ignore inside URIs; the special entry
            ``"*"`` means every word inside a URI is ignored.
        uri_regex: Pattern used to locate URIs in the line.
        word_regex: Pattern forwarded to ``extract_words_iter``.
        ignore_word_regex: Optional pattern forwarded to
            ``extract_words_iter``.

    Returns:
        A function taking a line and returning an iterable of regex matches.
    """
    # NOTE: the return annotation uses ``Match`` (as the inner function
    # does) rather than ``re.Match[str]``; the latter is evaluated at
    # definition time and is not subscriptable before Python 3.9.

    def line_tokenizer(line: str) -> Iterable[Match[str]]:
        # If all URI spelling errors will be ignored, erase any URI before
        # extracting words. Otherwise, apply ignores after extracting words.
        # This ensures that if a URI ignore word occurs both inside a URI and
        # outside, it will still be a spelling error.
        if "*" in uri_ignore_words:
            line = uri_regex.sub(" ", line)
            return extract_words_iter(line, word_regex, ignore_word_regex)

        check_matches = extract_words_iter(line, word_regex, ignore_word_regex)
        return apply_uri_ignore_words(
            check_matches,
            line,
            word_regex,
            ignore_word_regex,
            uri_regex,
            uri_ignore_words,
        )

    return line_tokenizer
859+
860+
832861
def parse_file(
833862
filename: str,
834863
colors: TermColors,
@@ -906,6 +935,13 @@ def parse_file(
906935
except OSError:
907936
return bad_count
908937

938+
line_tokenizer = line_tokenizer_factory(
939+
uri_ignore_words,
940+
uri_regex,
941+
word_regex,
942+
ignore_word_regex,
943+
)
944+
909945
for i, line in enumerate(lines):
910946
if line.rstrip() in exclude_lines:
911947
continue
@@ -922,23 +958,7 @@ def parse_file(
922958
fixed_words = set()
923959
asked_for = set()
924960

925-
# If all URI spelling errors will be ignored, erase any URI before
926-
# extracting words. Otherwise, apply ignores after extracting words.
927-
# This ensures that if a URI ignore word occurs both inside a URI and
928-
# outside, it will still be a spelling error.
929-
if "*" in uri_ignore_words:
930-
line = uri_regex.sub(" ", line)
931-
check_matches = extract_words_iter(line, word_regex, ignore_word_regex)
932-
if "*" not in uri_ignore_words:
933-
check_matches = apply_uri_ignore_words(
934-
check_matches,
935-
line,
936-
word_regex,
937-
ignore_word_regex,
938-
uri_regex,
939-
uri_ignore_words,
940-
)
941-
for match in check_matches:
961+
for match in line_tokenizer(line):
942962
word = match.group()
943963
if word in ignore_words_cased:
944964
continue

0 commit comments

Comments
 (0)