From b7900cd4cbe129eb7d6b405f6e3507156cc35fd4 Mon Sep 17 00:00:00 2001 From: Timon Viola <44016238+timonviola@users.noreply.github.com> Date: Mon, 9 Jun 2025 20:24:09 +0200 Subject: [PATCH 1/8] fix: fix html parser raw text escapable mode --- Lib/html/parser.py | 20 ++++++++----- Lib/test/test_htmlparser.py | 59 +++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 1e30956fe24f83..6a7a2d982aaba6 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -98,8 +98,8 @@ class HTMLParser(_markupbase.ParserBase): containing respectively the named or numeric reference as the argument. """ - - CDATA_CONTENT_ELEMENTS = ("script", "style") + # For escapable raw text elements (textarea and title), CDATA mode is reused + CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title") def __init__(self, *, convert_charrefs=True): """Initialize and reset this instance. @@ -117,6 +117,7 @@ def reset(self): self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None + self._raw_escapable = False super().reset() def feed(self, data): @@ -140,11 +141,16 @@ def get_starttag_text(self): def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) + if self.cdata_elem in ["textarea", "title"]: + self._raw_escapable = True + self.interesting = re.compile('[&]') + else: + self.interesting = re.compile(r'' % self.cdata_elem, re.I) def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None + self._raw_escapable = False # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is @@ -154,7 +160,7 @@ def goahead(self, end): i = 0 n = len(rawdata) while i < n: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): j = rawdata.find('<', i) if j < 0: # if we can't find the next <, either we are at the end @@ -177,7 +183,7 @@ def goahead(self, end): break j = n if i < j: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): self.handle_data(unescape(rawdata[i:j])) else: self.handle_data(rawdata[i:j]) @@ -210,7 +216,7 @@ def goahead(self, end): k = i + 1 else: k += 1 - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): self.handle_data(unescape(rawdata[i:k])) else: self.handle_data(rawdata[i:k]) @@ -261,7 +267,7 @@ def goahead(self, end): assert 0, "interesting.search() lied" # end while if end and i < n: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): self.handle_data(unescape(rawdata[i:n])) else: self.handle_data(rawdata[i:n]) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 61fa24fab574f2..9ae600c07b13cb 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -295,6 +295,65 @@ def test_cdata_content(self): ("data", content), ("endtag", element_lower)]) + def test_raw_text_content(self): + # Tags should be treated as text in raw text and escapable raw text content. + content = """

tagshould be handled as text""" + elements = [ + "script", + "style", + "title", + "textarea", + "SCRIPT", + "STYLE", + "TITLE", + "TEXTAREA", + "Script", + "Style", + "Title", + "Textarea", + ] + for element in elements: + source = f"<{element}>{content}" + self._run_check(source, [ + ("starttag", element.lower(), []), + ("data", content) + ]) + + def test_escapable_raw_text_content(self): + # Charrefs should be escaped in esacapable raw text content. + class Collector(EventCollector): + pass + + content = "Timon & Pumba" + expected = "Timon & Pumba" + elements = [ + "title", + "textarea", + "TITLE", + "TEXTAREA", + "Title", + "Textarea", + ] + for element in elements: + source = f"<{element}>{content}" + self._run_check( + source, [ + ("starttag", element.lower(), []), + ('data', expected), + ], + collector=Collector(convert_charrefs=True), + ) + # test with convert_charrefs=False + self._run_check( + source, [ + ("starttag", element.lower(), []), + ('data', 'Timon '), + ('entityref', 'amp'), + ('data', ' Pumba') + ], + ) + + def test_cdata_with_closing_tags(self): # see issue #13358 # make sure that HTMLParser calls handle_data only once for each CDATA. From 8f60744b68fdcc37f2a58c48a5bccb34fa07eb2a Mon Sep 17 00:00:00 2001 From: Timon Viola <44016238+timonviola@users.noreply.github.com> Date: Mon, 9 Jun 2025 20:38:38 +0200 Subject: [PATCH 2/8] docs: add blurb new fragment --- .../next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst diff --git a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst new file mode 100644 index 00000000000000..a9754421d5ac97 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst @@ -0,0 +1 @@ +Fix a bug in html parser related to escapable raw text mode. From 7824ee88b82f86440bd9aedae5f7ef0ac815463d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 14 Jul 2025 20:03:37 +0300 Subject: [PATCH 3/8] Fix errors and rewrite tests. --- Lib/html/parser.py | 9 ++- Lib/test/test_htmlparser.py | 150 ++++++++++++++++++++++-------------- 2 files changed, 99 insertions(+), 60 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index b55f6c65900a83..92a25d62666451 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -169,9 +169,10 @@ def get_starttag_text(self): def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() - if self.cdata_elem in ["textarea", "title"]: - self._raw_escapable = True - self.interesting = re.compile('[&]') + self._raw_escapable = self.cdata_elem in ("textarea", "title") + if self._raw_escapable and not self.convert_charrefs: + self.interesting = re.compile(r'&|])' % self.cdata_elem, + re.IGNORECASE|re.ASCII) else: self.interesting = re.compile(r'])' % self.cdata_elem, re.IGNORECASE|re.ASCII) @@ -189,7 +190,7 @@ def goahead(self, end): i = 0 n = len(rawdata) while i < n: - if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): + if self.convert_charrefs and not self.cdata_elem: j = rawdata.find('<', i) if j < 0: # if we can't find the next <, either we are at the end diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 62b662b666a169..6286b34f734911 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -317,63 +317,48 @@ def test_style_content(self, content): ("data", content), ("endtag", "style")]) - def test_raw_text_content(self): - # Tags should be treated as text in raw text and escapable raw text content. - content = """

tagshould be handled as text""" - elements = [ - "script", - "style", - "title", - "textarea", - "SCRIPT", - "STYLE", - "TITLE", - "TEXTAREA", - "Script", - "Style", - "Title", - "Textarea", - ] - for element in elements: - source = f"<{element}>{content}" - self._run_check(source, [ - ("starttag", element.lower(), []), - ("data", content) - ]) + @support.subTests('content', [ + '', + "", + '', + '', + '', + '\u2603', + '< /title>', + '', + '', + '', + '', + '', + ]) + def test_title_content(self, content): + source = f"{content}" + self._run_check(source, [ + ("starttag", "title", []), + ("data", content), + ("endtag", "title"), + ]) - def test_escapable_raw_text_content(self): - # Charrefs should be escaped in esacapable raw text content. - class Collector(EventCollector): - pass - - content = "Timon & Pumba" - expected = "Timon & Pumba" - elements = [ - "title", - "textarea", - "TITLE", - "TEXTAREA", - "Title", - "Textarea", - ] - for element in elements: - source = f"<{element}>{content}" - self._run_check( - source, [ - ("starttag", element.lower(), []), - ('data', expected), - ], - collector=Collector(convert_charrefs=True), - ) - # test with convert_charrefs=False - self._run_check( - source, [ - ("starttag", element.lower(), []), - ('data', 'Timon '), - ('entityref', 'amp'), - ('data', ' Pumba') - ], - ) + @support.subTests('content', [ + '', + "", + '', + '', + '', + '\u2603', + '< /textarea>', + '', + '', + '', + '', + ]) + def test_textarea_content(self, content): + source = f"" + self._run_check(source, [ + ("starttag", "textarea", []), + ("data", content), + ("endtag", "textarea"), + ]) @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n', 'script/', 'script foo=bar', 'script foo=">"']) @@ -404,6 +389,38 @@ def test_style_closing_tag(self, endtag): ("endtag", "style")], collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n', + 'title/', 'title foo=bar', 'title foo=">"']) + def test_title_closing_tag(self, endtag): + content = "Egg & Spam" + s = f'{content}</{endtag}>' + self._run_check(s, [("starttag", "title", []), + ('data', '<!-- not a comment --><i>Egg & Spam</i>'), + ("endtag", "title")], + collector=EventCollectorNoNormalize(convert_charrefs=True)) + self._run_check(s, [("starttag", "title", []), + ('data', '<!-- not a comment --><i>Egg '), + ('entityref', 'amp'), + ('data', ' Spam</i>'), + ("endtag", "title")], + collector=EventCollectorNoNormalize(convert_charrefs=False)) + + @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n', + 'textarea/', 'textarea foo=bar', 'textarea foo=">"']) + def test_textarea_closing_tag(self, endtag): + content = "<!-- not a comment --><i>Egg & Spam</i>" + s = f'<TexTarEa>{content}</{endtag}>' + self._run_check(s, [("starttag", "textarea", []), + ('data', '<!-- not a comment --><i>Egg & Spam</i>'), + ("endtag", "textarea")], + collector=EventCollectorNoNormalize(convert_charrefs=True)) + self._run_check(s, [("starttag", "textarea", []), + ('data', '<!-- not a comment --><i>Egg '), + ('entityref', 'amp'), + ('data', ' Spam</i>'), + ("endtag", "textarea")], + collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('tail,end', [ ('', False), ('<', False), @@ -421,6 +438,27 @@ def test_eof_in_script(self, tail, end): ("data", content if end else content + tail)], collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('tail,end', [ + ('', False), + ('<', False), + ('</', False), + ('</t', False), + ('</title', False), + ('" '' From 18c6ea80b2387103eff3ced2fb11e1baff8aba84 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 14 Jul 2025 20:11:56 +0300 Subject: [PATCH 4/8] Refactoring. --- Lib/html/parser.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 92a25d62666451..9e3507ced4e710 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -126,8 +126,8 @@ class HTMLParser(_markupbase.ParserBase): containing respectively the named or numeric reference as the argument. """ - # For escapable raw text elements (textarea and title), CDATA mode is reused - CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title") + CDATA_CONTENT_ELEMENTS = ("script", "style") + RCDATA_CONTENT_ELEMENTS = ("textarea", "title") def __init__(self, *, convert_charrefs=True): """Initialize and reset this instance. @@ -145,7 +145,7 @@ def reset(self): self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None - self._raw_escapable = False + self._escapable = True super().reset() def feed(self, data): @@ -167,10 +167,10 @@ def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text - def set_cdata_mode(self, elem): + def set_cdata_mode(self, elem, escapable=False): self.cdata_elem = elem.lower() - self._raw_escapable = self.cdata_elem in ("textarea", "title") - if self._raw_escapable and not self.convert_charrefs: + self._escapable = escapable + if escapable and not self.convert_charrefs: self.interesting = re.compile(r'&|])' % self.cdata_elem, re.IGNORECASE|re.ASCII) else: @@ -180,7 +180,7 @@ def set_cdata_mode(self, elem): def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None - self._raw_escapable = False + self._escapable = True # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is @@ -213,7 +213,7 @@ def goahead(self, end): break j = n if i < j: - if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): + if self.convert_charrefs and self._escapable: self.handle_data(unescape(rawdata[i:j])) else: self.handle_data(rawdata[i:j]) @@ -315,7 +315,7 @@ def goahead(self, end): assert 0, "interesting.search() lied" # end while if end and i < n: - if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): + if self.convert_charrefs and self._escapable: self.handle_data(unescape(rawdata[i:n])) else: self.handle_data(rawdata[i:n]) @@ -427,6 +427,8 @@ def parse_starttag(self, i): self.handle_starttag(tag, attrs) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) + elif tag in self.RCDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag, True) return endpos # Internal -- check to see if we have a complete starttag; return end From 051516af7b13b564df8c46139f237c4103b35f76 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 14 Jul 2025 20:13:39 +0300 Subject: [PATCH 5/8] Update the NEWS entry. --- .../Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst index a9754421d5ac97..f364c133813551 100644 --- a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst +++ b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst @@ -1 +1,2 @@ -Fix a bug in html parser related to escapable raw text mode. +Fix support of escapable raw text mode (elements "textarea" and "title") +in :class:`http.parser.HTMLParser`. From c93a7718608a21f2f926adcade8c6f43a4d87e8e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 14 Jul 2025 20:14:42 +0300 Subject: [PATCH 6/8] Reclassify --- .../2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Misc/NEWS.d/next/{Library => Security}/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst (100%) diff --git a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst similarity index 100% rename from Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst rename to Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst From a27159f4f923259c787c6028c2289401d955f8e0 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 15 Jul 2025 07:47:19 +0300 Subject: [PATCH 7/8] Update 2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst --- .../Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst index f364c133813551..6ad3caf33b2201 100644 --- a/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst +++ b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst @@ -1,2 +1,2 @@ Fix support of escapable raw text mode (elements "textarea" and "title") -in :class:`http.parser.HTMLParser`. +in :class:`html.parser.HTMLParser`. From 5681bbc7a2793eeaa58869f3f1d27add53a59f0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Tue, 22 Jul 2025 12:39:37 +0200 Subject: [PATCH 8/8] Make `escapable=` kwarg-only --- Lib/html/parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 9e3507ced4e710..9c06a42dc9eddf 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -126,6 +126,7 @@ class HTMLParser(_markupbase.ParserBase): containing respectively the named or numeric reference as the argument. """ + CDATA_CONTENT_ELEMENTS = ("script", "style") RCDATA_CONTENT_ELEMENTS = ("textarea", "title") @@ -167,7 +168,7 @@ def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text - def set_cdata_mode(self, elem, escapable=False): + def set_cdata_mode(self, elem, *, escapable=False): self.cdata_elem = elem.lower() self._escapable = escapable if escapable and not self.convert_charrefs: @@ -428,7 +429,7 @@ def parse_starttag(self, i): if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) elif tag in self.RCDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag, True) + self.set_cdata_mode(tag, escapable=True) return endpos # Internal -- check to see if we have a complete starttag; return end