Commit af34057

Remove unnecessary html.unescape() calls in index/collector.py
The html5lib package (as well as the stdlib html.parser) already unescapes attributes, so there is no need to do so a second time. The extra calls have been unnecessary since cba4521.
1 parent 09103e8 commit af34057
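
For context, a minimal sketch (not pip code) of why the extra calls were harmful: the stdlib html.parser already resolves entity references in attribute values, so running html.unescape() on them again unescapes twice and mangles any value that legitimately contains an escaped entity. The AttrGrabber helper below is a made-up name, used only for illustration.

    # Sketch: the stdlib parser already unescapes attribute values once;
    # a second html.unescape() on the result unescapes twice.
    import html
    from html.parser import HTMLParser


    class AttrGrabber(HTMLParser):  # hypothetical helper, for illustration only
        def __init__(self):
            super().__init__()
            self.attrs = {}

        def handle_starttag(self, tag, attrs):
            if tag == "a":
                self.attrs = dict(attrs)


    parser = AttrGrabber()
    parser.feed('<a href="/pkg-1.0.tar.gz" data-requires-python="&amp;gt;=3.6"></a>')

    value = parser.attrs["data-requires-python"]
    print(value)                 # '&gt;=3.6' -- already unescaped once by the parser
    print(html.unescape(value))  # '>=3.6'   -- what the removed second unescape produced

The new tests below assert exactly this single-unescape behavior for the data-requires-python and data-yanked attributes.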

File tree (3 files changed, +46 -23 lines):

    news/10378.bugfix.rst
    src/pip/_internal/index/collector.py
    tests/unit/test_collector.py

news/10378.bugfix.rst

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Fix double unescape of HTML ``data-requires-python`` and ``data-yanked`` attributes.

src/pip/_internal/index/collector.py

Lines changed: 0 additions & 5 deletions

@@ -5,7 +5,6 @@
 import cgi
 import collections
 import functools
-import html
 import itertools
 import logging
 import os
@@ -248,11 +247,7 @@ def _create_link_from_element(

     url = _clean_link(urllib.parse.urljoin(base_url, href))
     pyrequire = anchor.get("data-requires-python")
-    pyrequire = html.unescape(pyrequire) if pyrequire else None
-
     yanked_reason = anchor.get("data-yanked")
-    if yanked_reason:
-        yanked_reason = html.unescape(yanked_reason)

     link = Link(
         url,

tests/unit/test_collector.py

Lines changed: 45 additions & 18 deletions

@@ -411,6 +411,45 @@ def test_clean_link(url, clean_url):
     assert _clean_link(url) == clean_url


+def _test_parse_links_data_attribute(anchor_html, attr, expected):
+    html = f'<html><head><meta charset="utf-8"><head><body>{anchor_html}</body></html>'
+    html_bytes = html.encode("utf-8")
+    page = HTMLPage(
+        html_bytes,
+        encoding=None,
+        # parse_links() is cached by url, so we inject a random uuid to ensure
+        # the page content isn't cached.
+        url=f"https://example.com/simple-{uuid.uuid4()}/",
+    )
+    links = list(parse_links(page))
+    (link,) = links
+    actual = getattr(link, attr)
+    assert actual == expected
+
+
+@pytest.mark.parametrize(
+    "anchor_html, expected",
+    [
+        # Test not present.
+        ('<a href="/pkg-1.0.tar.gz"></a>', None),
+        # Test present with no value.
+        ('<a href="/pkg-1.0.tar.gz" data-requires-python></a>', None),
+        # Test a value with an escaped character.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-requires-python="&gt;=3.6"></a>',
+            ">=3.6",
+        ),
+        # Test requires python is unescaped once.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-requires-python="&amp;gt;=3.6"></a>',
+            "&gt;=3.6",
+        ),
+    ],
+)
+def test_parse_links__requires_python(anchor_html, expected):
+    _test_parse_links_data_attribute(anchor_html, "requires_python", expected)
+
+
 @pytest.mark.parametrize(
     "anchor_html, expected",
     [
@@ -429,27 +468,15 @@ def test_clean_link(url, clean_url):
             '<a href="/pkg-1.0.tar.gz" data-yanked="curlyquote \u2018"></a>',
             "curlyquote \u2018",
         ),
+        # Test yanked reason is unescaped once.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-yanked="version &amp;lt; 1"></a>',
+            "version &lt; 1",
+        ),
     ],
 )
 def test_parse_links__yanked_reason(anchor_html, expected):
-    html = (
-        # Mark this as a unicode string for Python 2 since anchor_html
-        # can contain non-ascii.
-        '<html><head><meta charset="utf-8"><head>'
-        "<body>{}</body></html>"
-    ).format(anchor_html)
-    html_bytes = html.encode("utf-8")
-    page = HTMLPage(
-        html_bytes,
-        encoding=None,
-        # parse_links() is cached by url, so we inject a random uuid to ensure
-        # the page content isn't cached.
-        url=f"https://example.com/simple-{uuid.uuid4()}/",
-    )
-    links = list(parse_links(page))
-    (link,) = links
-    actual = link.yanked_reason
-    assert actual == expected
+    _test_parse_links_data_attribute(anchor_html, "yanked_reason", expected)


 def test_parse_links_caches_same_page_by_url():
