Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
{% set ver2 = '.'.join(version.split('.')[0:2]) %}
{% set ver2nd = ''.join(version.split('.')[0:2]) %}
{% set ver3nd = ''.join(version.split('.')[0:3]) %}
{% set build_number = 0 %}
{% set build_number = 1 %}

# this makes the linter happy
{% set channel_targets = channel_targets or 'conda-forge main' %}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
From 8da82ac397729a33bf005a6000458b4f832fc5ca Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <[email protected]>
Date: Fri, 13 Jun 2025 19:57:48 +0300
Subject: [PATCH 27/27] gh-135462: Fix quadratic complexity in processing
special input in HTMLParser (GH-135464)

End-of-file errors are now handled according to the HTML5 specs --
comments and declarations are automatically closed, tags are ignored.
(cherry picked from commit 6eb6c5dbfb528bd07d77b60fd71fd05d81d45c41)

Co-authored-by: Serhiy Storchaka <[email protected]>
---
Lib/html/parser.py | 41 +++++---
Lib/test/test_htmlparser.py | 95 ++++++++++++++++---
...-06-13-15-55-22.gh-issue-135462.KBeJpc.rst | 4 +
3 files changed, 117 insertions(+), 23 deletions(-)
create mode 100644 Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 58f6bb3b1e9..94f4aaecfc6 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -25,6 +25,7 @@
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
+endtagopen = re.compile('</[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
@@ -176,7 +177,7 @@ def goahead(self, end):
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
- elif (i + 1) < n:
+ elif (i + 1) < n or end:
self.handle_data("<")
k = i + 1
else:
@@ -184,17 +185,35 @@ def goahead(self, end):
if k < 0:
if not end:
break
- k = rawdata.find('>', i + 1)
- if k < 0:
- k = rawdata.find('<', i + 1)
- if k < 0:
- k = i + 1
- else:
- k += 1
- if self.convert_charrefs and not self.cdata_elem:
- self.handle_data(unescape(rawdata[i:k]))
+ if starttagopen.match(rawdata, i): # < + letter
+ pass
+ elif startswith("</", i):
+ if i + 2 == n:
+ self.handle_data("</")
+ elif endtagopen.match(rawdata, i): # </ + letter
+ pass
+ else:
+ # bogus comment
+ self.handle_comment(rawdata[i+2:])
+ elif startswith("<!--", i):
+ j = n
+ for suffix in ("--!", "--", "-"):
+ if rawdata.endswith(suffix, i+4):
+ j -= len(suffix)
+ break
+ self.handle_comment(rawdata[i+4:j])
+ elif startswith("<![CDATA[", i):
+ self.unknown_decl(rawdata[i+3:])
+ elif rawdata[i:i+9].lower() == '<!doctype':
+ self.handle_decl(rawdata[i+2:])
+ elif startswith("<!", i):
+ # bogus comment
+ self.handle_comment(rawdata[i+2:])
+ elif startswith("<?", i):
+ self.handle_pi(rawdata[i+2:])
else:
- self.handle_data(rawdata[i:k])
+ raise AssertionError("we should not get here!")
+ k = n
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 12917755a56..df775c11310 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -4,6 +4,8 @@
import pprint
import unittest

+from test import support
+

class EventCollector(html.parser.HTMLParser):

@@ -391,28 +393,34 @@ def test_tolerant_parsing(self):
('data', '<'),
('starttag', 'bc<', [('a', None)]),
('endtag', 'html'),
- ('data', '\n<img src="URL>'),
- ('comment', '/img'),
- ('endtag', 'html<')])
+ ('data', '\n')])

def test_starttag_junk_chars(self):
+ self._run_check("<", [('data', '<')])
+ self._run_check("<>", [('data', '<>')])
+ self._run_check("< >", [('data', '< >')])
+ self._run_check("< ", [('data', '< ')])
self._run_check("</>", [])
+ self._run_check("<$>", [('data', '<$>')])
self._run_check("</$>", [('comment', '$')])
self._run_check("</", [('data', '</')])
- self._run_check("</a", [('data', '</a')])
+ self._run_check("</a", [])
+ self._run_check("</ a>", [('endtag', 'a')])
+ self._run_check("</ a", [('comment', ' a')])
self._run_check("<a<a>", [('starttag', 'a<a', [])])
self._run_check("</a<a>", [('endtag', 'a<a')])
- self._run_check("<!", [('data', '<!')])
- self._run_check("<a", [('data', '<a')])
- self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
- self._run_check("<a foo='bar", [('data', "<a foo='bar")])
- self._run_check("<a foo='>'", [('data', "<a foo='>'")])
- self._run_check("<a foo='>", [('data', "<a foo='>")])
+ self._run_check("<!", [('comment', '')])
+ self._run_check("<a", [])
+ self._run_check("<a foo='bar'", [])
+ self._run_check("<a foo='bar", [])
+ self._run_check("<a foo='>'", [])
+ self._run_check("<a foo='>", [])
self._run_check("<a$>", [('starttag', 'a$', [])])
self._run_check("<a$b>", [('starttag', 'a$b', [])])
self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
self._run_check("<a$b >", [('starttag', 'a$b', [])])
self._run_check("<a$b />", [('startendtag', 'a$b', [])])
+ self._run_check("</a$b>", [('endtag', 'a$b')])

def test_slashes_in_starttag(self):
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
@@ -537,13 +545,56 @@ def test_EOF_in_charref(self):
for html, expected in data:
self._run_check(html, expected)

- def test_broken_comments(self):
- html = ('<! not really a comment >'
+ def test_eof_in_comments(self):
+ data = [
+ ('<!--', [('comment', '')]),
+ ('<!---', [('comment', '')]),
+ ('<!----', [('comment', '')]),
+ ('<!-----', [('comment', '-')]),
+ ('<!------', [('comment', '--')]),
+ ('<!----!', [('comment', '')]),
+ ('<!---!', [('comment', '-!')]),
+ ('<!---!>', [('comment', '-!>')]),
+ ('<!--foo', [('comment', 'foo')]),
+ ('<!--foo-', [('comment', 'foo')]),
+ ('<!--foo--', [('comment', 'foo')]),
+ ('<!--foo--!', [('comment', 'foo')]),
+ ('<!--<!--', [('comment', '<!')]),
+ ('<!--<!--!', [('comment', '<!')]),
+ ]
+ for html, expected in data:
+ self._run_check(html, expected)
+
+ def test_eof_in_declarations(self):
+ data = [
+ ('<!', [('comment', '')]),
+ ('<!-', [('comment', '-')]),
+ ('<![', [('comment', '[')]),
+ ('<![CDATA[', [('unknown decl', 'CDATA[')]),
+ ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
+ ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
+ ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
+ ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
+ ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
+ ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
+ ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
+ ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
+ ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
+ ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
+ [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
+ ]
+ for html, expected in data:
+ self._run_check(html, expected)
+
+ def test_bogus_comments(self):
+ html = ('<!ELEMENT br EMPTY>'
+ '<! not really a comment >'
'<! not a comment either -->'
'<! -- close enough -->'
'<!><!<-- this was an empty comment>'
'<!!! another bogus comment !!!>')
expected = [
+ ('comment', 'ELEMENT br EMPTY'),
('comment', ' not really a comment '),
('comment', ' not a comment either --'),
('comment', ' -- close enough --'),
@@ -598,6 +649,26 @@ def test_convert_charrefs_dropped_text(self):
('endtag', 'a'), ('data', ' bar & baz')]
)

+ @support.requires_resource('cpu')
+ def test_eof_no_quadratic_complexity(self):
+ # Each of these examples used to take about an hour.
+ # Now they take a fraction of a second.
+ def check(source):
+ parser = html.parser.HTMLParser()
+ parser.feed(source)
+ parser.close()
+ n = 120_000
+ check("<a " * n)
+ check("<a a=" * n)
+ check("</a " * 14 * n)
+ check("</a a=" * 11 * n)
+ check("<!--" * 4 * n)
+ check("<!" * 60 * n)
+ check("<?" * 19 * n)
+ check("</$" * 15 * n)
+ check("<![CDATA[" * 9 * n)
+ check("<!doctype" * 35 * n)
+

class AttributesTestCase(TestCaseBase):

diff --git a/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst b/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
new file mode 100644
index 00000000000..cf9aa8dbdf2
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
@@ -0,0 +1,4 @@
+Fix quadratic complexity in processing specially crafted input in
+:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
+to the HTML5 specs -- comments and declarations are automatically closed,
+tags are ignored.
Loading