From 408f770fda443801e24744793e9cc67a478fc1ac Mon Sep 17 00:00:00 2001 From: Timon Viola <44016238+timonviola@users.noreply.github.com> Date: Tue, 22 Jul 2025 13:27:13 +0200 Subject: [PATCH] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (cherry picked from commit 4d02f31cdd45d81b95540d9076222b709d4f2335) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Timon Viola <44016238+timonviola@users.noreply.github.com> Co-authored-by: Serhiy Storchaka Co-authored-by: Łukasz Langa --- Lib/html/parser.py | 20 +++- Lib/test/test_htmlparser.py | 96 +++++++++++++++++++ ...-06-09-20-38-25.gh-issue-118350.KgWCcP.rst | 2 + 3 files changed, 113 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 8278915ffd0542..b8ee81ce80d5b8 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -110,6 +110,7 @@ class HTMLParser(_markupbase.ParserBase): """ CDATA_CONTENT_ELEMENTS = ("script", "style") + RCDATA_CONTENT_ELEMENTS = ("textarea", "title") def __init__(self, *, convert_charrefs=True): """Initialize and reset this instance. @@ -127,6 +128,7 @@ def reset(self): self.lasttag = '???' 
self.interesting = interesting_normal self.cdata_elem = None + self._escapable = True super().reset() def feed(self, data): @@ -148,14 +150,20 @@ def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text - def set_cdata_mode(self, elem): + def set_cdata_mode(self, elem, *, escapable=False): self.cdata_elem = elem.lower() - self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem, - re.IGNORECASE|re.ASCII) + self._escapable = escapable + if escapable and not self.convert_charrefs: + self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem, + re.IGNORECASE|re.ASCII) + else: + self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem, + re.IGNORECASE|re.ASCII) def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None + self._escapable = True # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is @@ -188,7 +196,7 @@ def goahead(self, end): break j = n if i < j: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and self._escapable: self.handle_data(unescape(rawdata[i:j])) else: self.handle_data(rawdata[i:j]) @@ -290,7 +298,7 @@ def goahead(self, end): assert 0, "interesting.search() lied" # end while if end and i < n: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and self._escapable: self.handle_data(unescape(rawdata[i:n])) else: self.handle_data(rawdata[i:n]) @@ -402,6 +410,8 @@ def parse_starttag(self, i): self.handle_starttag(tag, attrs) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) + elif tag in self.RCDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag, escapable=True) return endpos # Internal -- check to see if we have a complete starttag; return end diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index c14ab9bb5be27d..15f9714c1d0c6f 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -317,6 +317,49 @@ 
def test_style_content(self, content): ("data", content), ("endtag", "style")]) + @support.subTests('content', [ + '', + "", + '', + '', + '', + '\u2603', + '< /title>', + '', + '', + '', + '', + '', + ]) + def test_title_content(self, content): + source = f"{content}" + self._run_check(source, [ + ("starttag", "title", []), + ("data", content), + ("endtag", "title"), + ]) + + @support.subTests('content', [ + '', + "", + '', + '', + '', + '\u2603', + '< /textarea>', + '', + '', + '', + '', + ]) + def test_textarea_content(self, content): + source = f"" + self._run_check(source, [ + ("starttag", "textarea", []), + ("data", content), + ("endtag", "textarea"), + ]) + @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n', 'script/', 'script foo=bar', 'script foo=">"']) def test_script_closing_tag(self, endtag): @@ -346,6 +389,38 @@ def test_style_closing_tag(self, endtag): ("endtag", "style")], collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n', + 'title/', 'title foo=bar', 'title foo=">"']) + def test_title_closing_tag(self, endtag): + content = "Egg & Spam" + s = f'{content}</{endtag}>' + self._run_check(s, [("starttag", "title", []), + ('data', '<!-- not a comment --><i>Egg & Spam</i>'), + ("endtag", "title")], + collector=EventCollectorNoNormalize(convert_charrefs=True)) + self._run_check(s, [("starttag", "title", []), + ('data', '<!-- not a comment --><i>Egg '), + ('entityref', 'amp'), + ('data', ' Spam</i>'), + ("endtag", "title")], + collector=EventCollectorNoNormalize(convert_charrefs=False)) + + @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n', + 'textarea/', 'textarea foo=bar', 'textarea foo=">"']) + def test_textarea_closing_tag(self, endtag): + content = "<!-- not a comment --><i>Egg & Spam</i>" + s = f'<TexTarEa>{content}</{endtag}>' + self._run_check(s, [("starttag", "textarea", []), + ('data', '<!-- not a comment --><i>Egg & 
Spam</i>'), + ("endtag", "textarea")], + collector=EventCollectorNoNormalize(convert_charrefs=True)) + self._run_check(s, [("starttag", "textarea", []), + ('data', '<!-- not a comment --><i>Egg '), + ('entityref', 'amp'), + ('data', ' Spam</i>'), + ("endtag", "textarea")], + collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('tail,end', [ ('', False), ('<', False), @@ -363,6 +438,27 @@ def test_eof_in_script(self, tail, end): ("data", content if end else content + tail)], collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('tail,end', [ + ('', False), + ('<', False), + ('</', False), + ('</t', False), + ('</title', False), + ('" '' diff --git a/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst new file mode 100644 index 00000000000000..6ad3caf33b2201 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst @@ -0,0 +1,2 @@ +Fix support of escapable raw text mode (elements "textarea" and "title") +in :class:`html.parser.HTMLParser`.