Commit af34057

Remove unnecessary html.unescape() calls in index/collector.py
The html5lib package (as well as the stdlib html.parser) already unescapes attributes, so there is no need to do so a second time. The extra calls have been unnecessary since cba4521.
1 parent 09103e8 commit af34057
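
For context, a minimal sketch (not pip code) of why the extra calls were harmful: the stdlib html.parser already resolves entity references in attribute values, so running html.unescape() on them again unescapes twice and mangles any value that legitimately contains an escaped entity. The AttrGrabber helper below is a made-up name, used only for illustration.

    # Sketch: the stdlib parser already unescapes attribute values once;
    # a second html.unescape() on the result unescapes twice.
    import html
    from html.parser import HTMLParser


    class AttrGrabber(HTMLParser):  # hypothetical helper, for illustration only
        def __init__(self):
            super().__init__()
            self.attrs = {}

        def handle_starttag(self, tag, attrs):
            if tag == "a":
                self.attrs = dict(attrs)


    parser = AttrGrabber()
    parser.feed('<a href="/pkg-1.0.tar.gz" data-requires-python="&amp;gt;=3.6"></a>')

    value = parser.attrs["data-requires-python"]
    print(value)                 # '&gt;=3.6' -- already unescaped once by the parser
    print(html.unescape(value))  # '>=3.6'   -- what the removed second unescape produced

The new tests below assert exactly this single-unescape behavior for the data-requires-python and data-yanked attributes.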

File tree (3 files changed, +46 -23 lines):

    news/10378.bugfix.rst
    src/pip/_internal/index/collector.py
    tests/unit/test_collector.py

news/10378.bugfix.rst

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Fix double unescape of HTML ``data-requires-python`` and ``data-yanked`` attributes.

src/pip/_internal/index/collector.py

Lines changed: 0 additions & 5 deletions

@@ -5,7 +5,6 @@
 import cgi
 import collections
 import functools
-import html
 import itertools
 import logging
 import os
@@ -248,11 +247,7 @@ def _create_link_from_element(

     url = _clean_link(urllib.parse.urljoin(base_url, href))
     pyrequire = anchor.get("data-requires-python")
-    pyrequire = html.unescape(pyrequire) if pyrequire else None
-
     yanked_reason = anchor.get("data-yanked")
-    if yanked_reason:
-        yanked_reason = html.unescape(yanked_reason)

     link = Link(
         url,

tests/unit/test_collector.py

Lines changed: 45 additions & 18 deletions

@@ -411,6 +411,45 @@ def test_clean_link(url, clean_url):
     assert _clean_link(url) == clean_url


+def _test_parse_links_data_attribute(anchor_html, attr, expected):
+    html = f'<html><head><meta charset="utf-8"><head><body>{anchor_html}</body></html>'
+    html_bytes = html.encode("utf-8")
+    page = HTMLPage(
+        html_bytes,
+        encoding=None,
+        # parse_links() is cached by url, so we inject a random uuid to ensure
+        # the page content isn't cached.
+        url=f"https://example.com/simple-{uuid.uuid4()}/",
+    )
+    links = list(parse_links(page))
+    (link,) = links
+    actual = getattr(link, attr)
+    assert actual == expected
+
+
+@pytest.mark.parametrize(
+    "anchor_html, expected",
+    [
+        # Test not present.
+        ('<a href="/pkg-1.0.tar.gz"></a>', None),
+        # Test present with no value.
+        ('<a href="/pkg-1.0.tar.gz" data-requires-python></a>', None),
+        # Test a value with an escaped character.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-requires-python="&gt;=3.6"></a>',
+            ">=3.6",
+        ),
+        # Test requires python is unescaped once.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-requires-python="&amp;gt;=3.6"></a>',
+            "&gt;=3.6",
+        ),
+    ],
+)
+def test_parse_links__requires_python(anchor_html, expected):
+    _test_parse_links_data_attribute(anchor_html, "requires_python", expected)
+
+
 @pytest.mark.parametrize(
     "anchor_html, expected",
     [
@@ -429,27 +468,15 @@ def test_clean_link(url, clean_url):
             '<a href="/pkg-1.0.tar.gz" data-yanked="curlyquote \u2018"></a>',
             "curlyquote \u2018",
         ),
+        # Test yanked reason is unescaped once.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-yanked="version &amp;lt; 1"></a>',
+            "version &lt; 1",
+        ),
     ],
 )
 def test_parse_links__yanked_reason(anchor_html, expected):
-    html = (
-        # Mark this as a unicode string for Python 2 since anchor_html
-        # can contain non-ascii.
-        '<html><head><meta charset="utf-8"><head>'
-        "<body>{}</body></html>"
-    ).format(anchor_html)
-    html_bytes = html.encode("utf-8")
-    page = HTMLPage(
-        html_bytes,
-        encoding=None,
-        # parse_links() is cached by url, so we inject a random uuid to ensure
-        # the page content isn't cached.
-        url=f"https://example.com/simple-{uuid.uuid4()}/",
-    )
-    links = list(parse_links(page))
-    (link,) = links
-    actual = link.yanked_reason
-    assert actual == expected
+    _test_parse_links_data_attribute(anchor_html, "yanked_reason", expected)


 def test_parse_links_caches_same_page_by_url():
