pythongh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (pythonGH-95215)

sissbruecker · miss-islington · commit 318884d21521 · 2025-05-07T15:50:15.000Z
According to the HTML5 spec, named character references in attribute values should only be processed if they are not followed by an ASCII alphanumeric, or an equals sign. (cherry picked from commit 77b14a6) Co-authored-by: Sascha Ißbrücker <sascha.issbruecker@googlemail.com> https: //html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
@@ -12,6 +12,7 @@
 import _markupbase
 
 from html import unescape
+from html.entities import html5 as html5_entities
 
 
 __all__ = ['HTMLParser']
@@ -23,6 +24,7 @@
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
 
 starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
@@ -57,6 +59,22 @@
 # </ and the tag name, so maybe this should be fixed
 endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
+# Character reference processing logic specific to attribute values
+# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+def _replace_attr_charref(match):
+    ref = match.group(0)
+    # Numeric / hex char refs must always be unescaped
+    if ref.startswith('&#'):
+        return unescape(ref)
+    # Named character / entity references must only be unescaped
+    # if they are an exact match, and they are not followed by an equals sign
+    if not ref.endswith('=') and ref[1:] in html5_entities:
+        return unescape(ref)
+    # Otherwise do not unescape
+    return ref
+
+def _unescape_attrvalue(s):
+    return attr_charref.sub(_replace_attr_charref, s)
 
 
 class HTMLParser(_markupbase.ParserBase):
@@ -323,7 +341,7 @@ def parse_starttag(self, i):
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
             if attrvalue:
-                attrvalue = unescape(attrvalue)
+                attrvalue = _unescape_attrvalue(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()
 
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
@@ -348,18 +348,16 @@ def test_convert_charrefs(self):
         collector = lambda: EventCollectorCharrefs()
         self.assertTrue(collector().convert_charrefs)
         charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
-        # check charrefs in the middle of the text/attributes
-        expected = [('starttag', 'a', [('href', 'foo"zar')]),
-                    ('data', 'a"z'), ('endtag', 'a')]
+        # check charrefs in the middle of the text
+        expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
         for charref in charrefs:
-            self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+            self._run_check('<a>a{0}z</a>'.format(charref),
                             expected, collector=collector())
-        # check charrefs at the beginning/end of the text/attributes
-        expected = [('data', '"'),
-                    ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+        # check charrefs at the beginning/end of the text
+        expected = [('data', '"'), ('starttag', 'a', []),
                     ('data', '"'), ('endtag', 'a'), ('data', '"')]
         for charref in charrefs:
-            self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+            self._run_check('{0}<a>'
                             '{0}</a>{0}'.format(charref),
                             expected, collector=collector())
         # check charrefs in <script>/<style> elements
@@ -382,6 +380,35 @@ def test_convert_charrefs(self):
         self._run_check('no charrefs here', [('data', 'no charrefs here')],
                         collector=collector())
 
+    def test_convert_charrefs_in_attribute_values(self):
+        # default value for convert_charrefs is now True
+        collector = lambda: EventCollectorCharrefs()
+        self.assertTrue(collector().convert_charrefs)
+
+        # always unescape terminated entity refs, numeric and hex char refs:
+        # - regardless whether they are at start, middle, end of attribute
+        # - or followed by alphanumeric, non-alphanumeric, or equals char
+        charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
+        expected = [('starttag', 'a',
+                     [('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
+                      ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
+                    ('endtag', 'a')]
+        for charref in charrefs:
+            self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
+                            '   x="z{0}z" x="{0} z" x="{0}=z"></a>'
+                            .format(charref), expected, collector=collector())
+
+        # only unescape unterminated entity matches if they are not followed by
+        # an alphanumeric or an equals sign
+        charref = '&cent'
+        expected = [('starttag', 'a',
+                     [('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
+                      ('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
+                    ('endtag', 'a')]
+        self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
+                        '   x="z{0}z" x="{0} z" x="{0}=z"></a>'
+                        .format(charref), expected, collector=collector())
+
     # the remaining tests were for the "tolerant" parser (which is now
     # the default), and check various kind of broken markup
     def test_tolerant_parsing(self):
diff --git a/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst b/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst
@@ -0,0 +1,3 @@
+Fix :class:`html.parser.HTMLParser` to not unescape character entities in
+attribute values if they are followed by an ASCII alphanumeric or an equals
+sign.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+Fix :class:`html.parser.HTMLParser` to not unescape character entities in
	`2`	`+attribute values if they are followed by an ASCII alphanumeric or an equals`
	`3`	`+sign.`