-
-
Notifications
You must be signed in to change notification settings - Fork 3.2k
Fix pathologically slow assertion diffs for large inputs (#8998) #14543
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| Assertion failures comparing very large strings, lists, or dataclasses no longer hang for a long time (sometimes minutes) while building the diff. | ||
|
|
||
| When the inputs are large enough that :func:`difflib.ndiff` would be pathologically slow, pytest now falls back to a faster line-level diff and notes this in the output. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from collections.abc import Iterator | ||
| from collections.abc import Sequence | ||
|
|
||
| from _pytest.assertion._typing import _HighlightFunc | ||
|
|
||
|
|
||
| # Above this combined input size (in characters), ``difflib.ndiff`` becomes | ||
| # pathologically slow: its character-level "fancy replace" step is quadratic in | ||
| # the size of the differing region, so a few tens of kilobytes of differing text | ||
| # can hang for minutes (see issue #8998). | ||
| NDIFF_MAX_INPUT_SIZE = 10_000 | ||
|
|
||
| # Above this number of lines, both ``ndiff`` and ``unified_diff`` get slow, | ||
| # since the underlying ``SequenceMatcher`` is quadratic in the number of lines | ||
| # (a large nested structure can pretty-print to hundreds of thousands of lines). | ||
| # We both fall back and cap the fallback's input at this many lines. | ||
| DIFF_MAX_LINES = 1_000 | ||
|
|
||
|
|
||
| def ndiff_too_slow(left_lines: Sequence[str], right_lines: Sequence[str]) -> bool: | ||
| """Return True if ``difflib.ndiff`` would likely be pathologically slow.""" | ||
| if len(left_lines) > DIFF_MAX_LINES or len(right_lines) > DIFF_MAX_LINES: | ||
| return True | ||
| size = sum(len(line) for line in left_lines) + sum( | ||
| len(line) for line in right_lines | ||
| ) | ||
| return size > NDIFF_MAX_INPUT_SIZE | ||
|
|
||
|
|
||
| def fast_unified_diff( | ||
| left_lines: Sequence[str], | ||
| right_lines: Sequence[str], | ||
| highlighter: _HighlightFunc, | ||
| ) -> Iterator[str]: | ||
| """Yield a fast, coarse line-level diff for inputs too large for ``ndiff``. | ||
|
|
||
| Unlike ``ndiff`` this does not produce character-level "?" guide lines, and | ||
| it only diffs the first ``DIFF_MAX_LINES`` lines of each side, but it | ||
| completes in milliseconds where ``ndiff`` would hang (see issue #8998). | ||
|
|
||
| "right" is the expected base against which we compare "left", | ||
| see https://github.com/pytest-dev/pytest/issues/3333. | ||
| """ | ||
| from difflib import unified_diff | ||
|
|
||
| yield ( | ||
| f"Diff too large to compute in full (over {NDIFF_MAX_INPUT_SIZE} " | ||
| "characters); showing a faster line-level diff instead:" | ||
| ) | ||
|
Comment on lines
+48
to
+51
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Message is wrong here, could be either too many line or too many chars. |
||
| left = [line.rstrip("\n") for line in left_lines[:DIFF_MAX_LINES]] | ||
| right = [line.rstrip("\n") for line in right_lines[:DIFF_MAX_LINES]] | ||
| hidden = max(len(left_lines), len(right_lines)) - DIFF_MAX_LINES | ||
| if hidden > 0: | ||
| yield f"Diffing only the first {DIFF_MAX_LINES} lines; {hidden} more hidden" | ||
| diff = unified_diff(right, left, n=3, lineterm="") | ||
| # The first two lines are the always-empty "--- "/"+++ " file headers. | ||
| next(diff, None) | ||
| next(diff, None) | ||
| yield from highlighter("\n".join(diff), lexer="diff").splitlines() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,8 @@ | |
| from collections.abc import Iterator | ||
|
|
||
| from _pytest._io.saferepr import saferepr | ||
| from _pytest.assertion._diff import fast_unified_diff | ||
| from _pytest.assertion._diff import ndiff_too_slow | ||
| from _pytest.assertion._typing import _AssertionTextDiffStyle | ||
| from _pytest.assertion._typing import _HighlightFunc | ||
| from _pytest.assertion.highlight import dummy_highlighter | ||
|
|
@@ -75,13 +77,15 @@ def _diff_text( | |
| left = repr(str(left)) | ||
| right = repr(str(right)) | ||
| yield "Strings contain only whitespace, escaping them using repr()" | ||
| left_lines = left.splitlines(keepends) | ||
| right_lines = right.splitlines(keepends) | ||
|
Comment on lines
+80
to
+81
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we have to split lines ? Can't we just count the line separator ? |
||
| if ndiff_too_slow(left_lines, right_lines): | ||
| yield from fast_unified_diff(left_lines, right_lines, highlighter) | ||
| return | ||
| # "right" is the expected base against which we compare "left", | ||
| # see https://github.com/pytest-dev/pytest/issues/3333 | ||
| yield from highlighter( | ||
| "\n".join( | ||
| line.strip("\n") | ||
| for line in ndiff(right.splitlines(keepends), left.splitlines(keepends)) | ||
| ), | ||
| "\n".join(line.strip("\n") for line in ndiff(right_lines, left_lines)), | ||
| lexer="diff", | ||
| ).splitlines() | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
| from _pytest.assertion import truncate | ||
| from _pytest.assertion import util | ||
| from _pytest.assertion._compare_any import _compare_eq_cls | ||
| from _pytest.assertion._diff import ndiff_too_slow | ||
| from _pytest.assertion.compare_text import _compare_eq_text | ||
| from _pytest.config import Config as _Config | ||
| from _pytest.monkeypatch import MonkeyPatch | ||
|
|
@@ -459,6 +460,19 @@ def callequal( | |
| ) | ||
|
|
||
|
|
||
| class TestNdiffTooSlow: | ||
| """Heuristic guarding against pathologically slow diffs (#8998).""" | ||
|
|
||
| def test_small_input_uses_ndiff(self) -> None: | ||
| assert ndiff_too_slow(["spam"], ["eggs"]) is False | ||
|
|
||
| def test_many_characters_is_too_slow(self) -> None: | ||
| assert ndiff_too_slow(["a" * 6000], ["b" * 6000]) is True | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's mock the values, we don't have to actually construct an enormous list to test the behavior |
||
|
|
||
| def test_many_lines_is_too_slow(self) -> None: | ||
| assert ndiff_too_slow(["x"] * 1001, ["y"]) is True | ||
|
|
||
|
|
||
| class TestAssert_reprcompare: | ||
| def test_different_types(self) -> None: | ||
| assert callequal([0, 1], "foo") is None | ||
|
|
@@ -513,6 +527,32 @@ def test_text_skipping_verbose(self) -> None: | |
| assert "- " + "a" * 50 + "eggs" in lines | ||
| assert "+ " + "a" * 50 + "spam" in lines | ||
|
|
||
| def test_text_diff_large_input_skips_ndiff(self) -> None: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's also mock here |
||
| # A single huge differing line is above the character cutoff and falls | ||
| # back to a fast line-level diff instead of the pathologically slow | ||
| # ndiff (#8998). | ||
| left = "a" + "x" * 20000 | ||
| right = "b" + "y" * 20000 | ||
| lines = callequal(left, right, verbose=1) | ||
| assert lines is not None | ||
| assert any("Diff too large to compute in full" in line for line in lines) | ||
| # The character-level "?" guide lines produced by ndiff must not appear. | ||
| assert not any(line.startswith("? ") for line in lines) | ||
|
|
||
| def test_text_diff_many_lines_skips_ndiff(self) -> None: | ||
| # Many lines are above the line cutoff and fall back, capping the | ||
| # number of lines actually diffed (#8998). | ||
| left = "\n".join(f"left line {i}" for i in range(2000)) | ||
| right = "\n".join(f"right line {i}" for i in range(2000)) | ||
| lines = callequal(left, right, verbose=1) | ||
| assert lines is not None | ||
| assert any("Diff too large to compute in full" in line for line in lines) | ||
| assert any("Diffing only the first 1000 lines" in line for line in lines) | ||
| assert not any(line.startswith("? ") for line in lines) | ||
| # The fallback still shows which lines differ. | ||
| assert "-right line 0" in lines | ||
| assert "+left line 0" in lines | ||
|
|
||
| def test_multiline_text_diff(self) -> None: | ||
| left = "foo\nspam\nbar" | ||
| right = "foo\neggs\nbar" | ||
|
|
@@ -673,6 +713,17 @@ def test_iterable_quiet(self) -> None: | |
| "Use -v to get more diff", | ||
| ] | ||
|
|
||
| def test_iterable_large_input_skips_ndiff(self) -> None: | ||
| # Large iterables fall back to a fast line-level diff instead of the | ||
| # pathologically slow ndiff over their pprint output (#8998). | ||
| left = [f"item-{i}" for i in range(2000)] | ||
| right = [f"other-{i}" for i in range(2000)] | ||
| lines = callequal(left, right, verbose=1) | ||
| assert lines is not None | ||
| assert "Full diff:" in lines | ||
| assert any("Diff too large to compute in full" in line for line in lines) | ||
| assert not any(line.startswith("? ") for line in lines) | ||
|
|
||
| def test_iterable_full_diff_ci( | ||
| self, monkeypatch: MonkeyPatch, pytester: Pytester | ||
| ) -> None: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We're summing everything here, we need to fast exit as soon as size become greater than NDIFF_MAX_INPUT_SIZE