From b7900cd4cbe129eb7d6b405f6e3507156cc35fd4 Mon Sep 17 00:00:00 2001
From: Timon Viola <44016238+timonviola@users.noreply.github.com>
Date: Mon, 9 Jun 2025 20:24:09 +0200
Subject: [PATCH 1/8] fix: fix html parser escapable raw text mode
---
Lib/html/parser.py | 20 ++++++++-----
Lib/test/test_htmlparser.py | 59 +++++++++++++++++++++++++++++++++++++
2 files changed, 72 insertions(+), 7 deletions(-)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 1e30956fe24f83..6a7a2d982aaba6 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -98,8 +98,8 @@ class HTMLParser(_markupbase.ParserBase):
containing respectively the named or numeric reference as the
argument.
"""
-
- CDATA_CONTENT_ELEMENTS = ("script", "style")
+ # For escapable raw text elements (textarea and title), CDATA mode is reused
+ CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title")
def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
@@ -117,6 +117,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
+ self._raw_escapable = False
super().reset()
def feed(self, data):
@@ -140,11 +141,16 @@ def get_starttag_text(self):
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
- self.interesting = re.compile(r'\s*%s\s*>' % self.cdata_elem, re.I)
+ if self.cdata_elem in ["textarea", "title"]:
+ self._raw_escapable = True
+ self.interesting = re.compile('[&]')
+ else:
+ self.interesting = re.compile(r'\s*%s\s*>' % self.cdata_elem, re.I)
def clear_cdata_mode(self):
self.interesting = interesting_normal
self.cdata_elem = None
+ self._raw_escapable = False
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
@@ -154,7 +160,7 @@ def goahead(self, end):
i = 0
n = len(rawdata)
while i < n:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
j = rawdata.find('<', i)
if j < 0:
# if we can't find the next <, either we are at the end
@@ -177,7 +183,7 @@ def goahead(self, end):
break
j = n
if i < j:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
@@ -210,7 +216,7 @@ def goahead(self, end):
k = i + 1
else:
k += 1
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
self.handle_data(unescape(rawdata[i:k]))
else:
self.handle_data(rawdata[i:k])
@@ -261,7 +267,7 @@ def goahead(self, end):
assert 0, "interesting.search() lied"
# end while
if end and i < n:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 61fa24fab574f2..9ae600c07b13cb 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -295,6 +295,65 @@ def test_cdata_content(self):
("data", content),
("endtag", element_lower)])
+ def test_raw_text_content(self):
+ # Tags should be treated as text in raw text and escapable raw text content.
+ content = """
tagshould be handled as text"""
+ elements = [
+ "script",
+ "style",
+ "title",
+ "textarea",
+ "SCRIPT",
+ "STYLE",
+ "TITLE",
+ "TEXTAREA",
+ "Script",
+ "Style",
+ "Title",
+ "Textarea",
+ ]
+ for element in elements:
+ source = f"<{element}>{content}"
+ self._run_check(source, [
+ ("starttag", element.lower(), []),
+ ("data", content)
+ ])
+
+ def test_escapable_raw_text_content(self):
+ # Charrefs should be escaped in esacapable raw text content.
+ class Collector(EventCollector):
+ pass
+
+ content = "Timon & Pumba"
+ expected = "Timon & Pumba"
+ elements = [
+ "title",
+ "textarea",
+ "TITLE",
+ "TEXTAREA",
+ "Title",
+ "Textarea",
+ ]
+ for element in elements:
+ source = f"<{element}>{content}"
+ self._run_check(
+ source, [
+ ("starttag", element.lower(), []),
+ ('data', expected),
+ ],
+ collector=Collector(convert_charrefs=True),
+ )
+ # test with convert_charrefs=False
+ self._run_check(
+ source, [
+ ("starttag", element.lower(), []),
+ ('data', 'Timon '),
+ ('entityref', 'amp'),
+ ('data', ' Pumba')
+ ],
+ )
+
+
def test_cdata_with_closing_tags(self):
# see issue #13358
# make sure that HTMLParser calls handle_data only once for each CDATA.
From 8f60744b68fdcc37f2a58c48a5bccb34fa07eb2a Mon Sep 17 00:00:00 2001
From: Timon Viola <44016238+timonviola@users.noreply.github.com>
Date: Mon, 9 Jun 2025 20:38:38 +0200
Subject: [PATCH 2/8] docs: add blurb new fragment
---
.../next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
diff --git a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
new file mode 100644
index 00000000000000..a9754421d5ac97
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
@@ -0,0 +1 @@
+Fix support of escapable raw text mode (elements "textarea" and "title") in :class:`html.parser.HTMLParser`.
From 7824ee88b82f86440bd9aedae5f7ef0ac815463d Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Mon, 14 Jul 2025 20:03:37 +0300
Subject: [PATCH 3/8] Fix errors and rewrite tests.
---
Lib/html/parser.py | 9 ++-
Lib/test/test_htmlparser.py | 150 ++++++++++++++++++++++--------------
2 files changed, 99 insertions(+), 60 deletions(-)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index b55f6c65900a83..92a25d62666451 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -169,9 +169,10 @@ def get_starttag_text(self):
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
- if self.cdata_elem in ["textarea", "title"]:
- self._raw_escapable = True
- self.interesting = re.compile('[&]')
+ self._raw_escapable = self.cdata_elem in ("textarea", "title")
+ if self._raw_escapable and not self.convert_charrefs:
+ self.interesting = re.compile(r'&|%s(?=[\t\n\r\f />])' % self.cdata_elem,
+ re.IGNORECASE|re.ASCII)
else:
self.interesting = re.compile(r'%s(?=[\t\n\r\f />])' % self.cdata_elem,
re.IGNORECASE|re.ASCII)
@@ -189,7 +190,7 @@ def goahead(self, end):
i = 0
n = len(rawdata)
while i < n:
- if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
+ if self.convert_charrefs and not self.cdata_elem:
j = rawdata.find('<', i)
if j < 0:
# if we can't find the next <, either we are at the end
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 62b662b666a169..6286b34f734911 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -317,63 +317,48 @@ def test_style_content(self, content):
("data", content),
("endtag", "style")])
- def test_raw_text_content(self):
- # Tags should be treated as text in raw text and escapable raw text content.
- content = """tagshould be handled as text"""
- elements = [
- "script",
- "style",
- "title",
- "textarea",
- "SCRIPT",
- "STYLE",
- "TITLE",
- "TEXTAREA",
- "Script",
- "Style",
- "Title",
- "Textarea",
- ]
- for element in elements:
- source = f"<{element}>{content}"
- self._run_check(source, [
- ("starttag", element.lower(), []),
- ("data", content)
- ])
+ @support.subTests('content', [
+ '',
+ "",
+ '',
+ '',
+ '',
+ '\u2603',
+ '< /title>',
+ ' title>',
+ '',
+ '',
+ '',
+ '',
+ ])
+ def test_title_content(self, content):
+ source = f"{content}"
+ self._run_check(source, [
+ ("starttag", "title", []),
+ ("data", content),
+ ("endtag", "title"),
+ ])
- def test_escapable_raw_text_content(self):
- # Charrefs should be escaped in esacapable raw text content.
- class Collector(EventCollector):
- pass
-
- content = "Timon & Pumba"
- expected = "Timon & Pumba"
- elements = [
- "title",
- "textarea",
- "TITLE",
- "TEXTAREA",
- "Title",
- "Textarea",
- ]
- for element in elements:
- source = f"<{element}>{content}"
- self._run_check(
- source, [
- ("starttag", element.lower(), []),
- ('data', expected),
- ],
- collector=Collector(convert_charrefs=True),
- )
- # test with convert_charrefs=False
- self._run_check(
- source, [
- ("starttag", element.lower(), []),
- ('data', 'Timon '),
- ('entityref', 'amp'),
- ('data', ' Pumba')
- ],
- )
+ @support.subTests('content', [
+ '',
+ "",
+ '',
+ '',
+ '',
+ '\u2603',
+ '< /textarea>',
+ ' textarea>',
+ '',
+ '',
+ '',
+ ])
+ def test_textarea_content(self, content):
+ source = f""
+ self._run_check(source, [
+ ("starttag", "textarea", []),
+ ("data", content),
+ ("endtag", "textarea"),
+ ])
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
'script/', 'script foo=bar', 'script foo=">"'])
@@ -404,6 +389,38 @@ def test_style_closing_tag(self, endtag):
("endtag", "style")],
collector=EventCollectorNoNormalize(convert_charrefs=False))
+ @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
+ 'title/', 'title foo=bar', 'title foo=">"'])
+ def test_title_closing_tag(self, endtag):
+ content = "Egg & Spam"
+ s = f'{content}{endtag}>'
+ self._run_check(s, [("starttag", "title", []),
+ ('data', 'Egg & Spam'),
+ ("endtag", "title")],
+ collector=EventCollectorNoNormalize(convert_charrefs=True))
+ self._run_check(s, [("starttag", "title", []),
+ ('data', 'Egg '),
+ ('entityref', 'amp'),
+ ('data', ' Spam'),
+ ("endtag", "title")],
+ collector=EventCollectorNoNormalize(convert_charrefs=False))
+
+ @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
+ 'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
+ def test_textarea_closing_tag(self, endtag):
+ content = "Egg & Spam"
+ s = f'