Skip to content

Commit 844b799

Browse files
authored
Merge pull request #10847 from pradyunsg/better-html5lib-fallback
2 parents cc35c93 + a78845a commit 844b799

File tree

5 files changed

+49
-6
lines changed

5 files changed

+49
-6
lines changed

news/10847.removal.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Instead of failing on index pages that use non-compliant HTML 5, print a deprecation warning and fall back to ``html5lib``-based parsing for now. This simplifies the migration for non-compliant index pages by letting such indexes keep working, with a warning.

src/pip/_internal/index/collector.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from pip._internal.models.search_scope import SearchScope
3939
from pip._internal.network.session import PipSession
4040
from pip._internal.network.utils import raise_for_status
41+
from pip._internal.utils.deprecation import deprecated
4142
from pip._internal.utils.filetypes import is_archive_file
4243
from pip._internal.utils.misc import pairwise, redact_auth_from_url
4344
from pip._internal.vcs import vcs
@@ -342,12 +343,34 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
342343
"""
343344
Parse an HTML document, and yield its anchor elements as Link objects.
344345
"""
346+
encoding = page.encoding or "utf-8"
347+
348+
# Check if the page starts with a valid doctype, to decide whether to use
349+
# html.parser or (deprecated) html5lib for parsing -- unless explicitly
350+
# requested to use html5lib.
351+
if not use_deprecated_html5lib:
352+
expected_doctype = "<!doctype html>".encode(encoding)
353+
actual_start = page.content[: len(expected_doctype)]
354+
if actual_start.decode(encoding).lower() != "<!doctype html>":
355+
deprecated(
356+
reason=(
357+
f"The HTML index page being used ({page.url}) is not a proper "
358+
"HTML 5 document. This is in violation of PEP 503 which requires "
359+
"these pages to be well-formed HTML 5 documents. Please reach out "
360+
"to the owners of this index page, and ask them to update this "
361+
"index page to a valid HTML 5 document."
362+
),
363+
replacement=None,
364+
gone_in="22.2",
365+
issue=10825,
366+
)
367+
use_deprecated_html5lib = True
368+
345369
if use_deprecated_html5lib:
346370
yield from _parse_links_html5lib(page)
347371
return
348372

349373
parser = HTMLLinkParser()
350-
encoding = page.encoding or "utf-8"
351374
parser.feed(page.content.decode(encoding))
352375

353376
url = page.url

tests/functional/test_install_config.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
)
1717
from tests.lib.venv import VirtualEnvironment
1818

19+
TEST_PYPI_INITOOLS = "https://test.pypi.org/simple/initools/"
20+
1921

2022
def test_options_from_env_vars(script: PipTestEnvironment) -> None:
2123
"""
@@ -94,7 +96,7 @@ def test_command_line_append_flags(
9496
variables.
9597
9698
"""
97-
script.environ["PIP_FIND_LINKS"] = "https://test.pypi.org"
99+
script.environ["PIP_FIND_LINKS"] = TEST_PYPI_INITOOLS
98100
result = script.pip(
99101
"install",
100102
"-vvv",
@@ -133,7 +135,7 @@ def test_command_line_appends_correctly(
133135
Test multiple appending options set by environmental variables.
134136
135137
"""
136-
script.environ["PIP_FIND_LINKS"] = f"https://test.pypi.org {data.find_links}"
138+
script.environ["PIP_FIND_LINKS"] = f"{TEST_PYPI_INITOOLS} {data.find_links}"
137139
result = script.pip(
138140
"install",
139141
"-vvv",

tests/functional/test_new_resolver_hashes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def _create_find_links(script: PipTestEnvironment) -> _FindLinks:
3636
wheel_url=path_to_url(wheel_path),
3737
wheel_hash=wheel_hash,
3838
wheel_path=wheel_path,
39-
)
39+
).strip()
4040
)
4141

4242
return _FindLinks(index_html, sdist_hash, wheel_hash)

tests/unit/test_collector.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -540,9 +540,9 @@ def test_parse_links_caches_same_page_by_url() -> None:
540540

541541

542542
def test_parse_link_handles_deprecated_usage_properly() -> None:
543-
html = b'<a href="/pkg1-1.0.tar.gz"><a href="/pkg1-2.0.tar.gz">'
543+
html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
544544
url = "https://example.com/simple/"
545-
page = HTMLPage(html, encoding=None, url=url)
545+
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)
546546

547547
parsed_links = list(parse_links(page, use_deprecated_html5lib=True))
548548

@@ -551,6 +551,23 @@ def test_parse_link_handles_deprecated_usage_properly() -> None:
551551
assert "pkg1-2.0" in parsed_links[1].url
552552

553553

554+
@mock.patch("pip._internal.index.collector.deprecated")
555+
def test_parse_links_presents_deprecation_warning_on_non_html5_page(
556+
mock_deprecated: mock.Mock,
557+
) -> None:
558+
html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
559+
url = "https://example.com/simple/"
560+
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)
561+
562+
parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
563+
564+
assert len(parsed_links) == 2, parsed_links
565+
assert "pkg1-1.0" in parsed_links[0].url
566+
assert "pkg1-2.0" in parsed_links[1].url
567+
568+
mock_deprecated.assert_called_once()
569+
570+
554571
@mock.patch("pip._internal.index.collector.raise_for_status")
555572
def test_request_http_error(
556573
mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture

0 commit comments

Comments
 (0)