gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser

serhiy-storchaka · serhiy-storchaka · commit bb7b873ed1e1 · 2025-08-15T23:08:48.000+03:00
* the "plaintext" element
* the RAWTEXT elements "xmp", "iframe", "noembed" and "noframes"
* optionally RAWTEXT (if scripting=True) element "noscript"
diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
@@ -15,14 +15,17 @@
 This module defines a class :class:`HTMLParser` which serves as the basis for
 parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
 
-.. class:: HTMLParser(*, convert_charrefs=True)
+.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
 
    Create a parser instance able to parse invalid markup.
 
    If *convert_charrefs* is ``True`` (the default), all character
    references (except the ones in ``script``/``style`` elements) are
    automatically converted to the corresponding Unicode characters.
 
+   If *scripting* is true, the ``noscript`` element is parsed in the
+   RAWTEXT mode.
+
    An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
    when start tags, end tags, text, comments, and other markup elements are
    encountered.  The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +40,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
    .. versionchanged:: 3.5
       The default value for argument *convert_charrefs* is now ``True``.
 
+   .. versionchanged:: 3.13.8
+      Added the *scripting* parameter.
+
 
 Example HTML Parser Application
 -------------------------------
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
@@ -127,17 +127,23 @@ class HTMLParser(_markupbase.ParserBase):
     argument.
     """
 
-    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    # See the HTML5 specs section "13.4 Parsing HTML fragments".
+    # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+    CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
     RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
 
-    def __init__(self, *, convert_charrefs=True):
+    def __init__(self, *, convert_charrefs=True, scripting=False):
         """Initialize and reset this instance.
 
-        If convert_charrefs is True (the default), all character references
+        If convert_charrefs is true (the default), all character references
         are automatically converted to the corresponding Unicode characters.
+
+        If scripting is true, the noscript element is parsed in the
+        RAWTEXT mode.
         """
         super().__init__()
         self.convert_charrefs = convert_charrefs
+        self.scripting = scripting
         self.reset()
 
     def reset(self):
@@ -454,6 +460,11 @@ def parse_starttag(self, i):
                 self.set_cdata_mode(tag)
             elif tag in self.RCDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag, escapable=True)
+            elif self.scripting and tag == "noscript":
+                self.set_cdata_mode(tag)
+            elif tag == "plaintext":
+                self.set_cdata_mode(tag)
+                self.interesting = re.compile(r'\z')
         return endpos
 
     # Internal -- check to see if we have a complete starttag; return end
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
@@ -324,49 +324,138 @@ def test_style_content(self, content):
                             ("data", content),
                             ("endtag", "style")])
 
-    @support.subTests('content', [
-            '<!-- not a comment -->',
-            "<not a='start tag'>",
-            '<![CDATA[not a cdata]]>',
-            '<!not a bogus comment>',
-            '</not a bogus comment>',
-            '\u2603',
-            '< /title>',
-            '</ title>',
-            '</titled>',
-            '</title\v>',
-            '</title\xa0>',
-            '</tıtle>',
+    @support.subTests('tag', ['title', 'textarea'])
+    def test_rcdata_content(self, tag):
+        content = (
+            '<!-- not a comment -->'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            f'< /{tag}>'
+            f'</ {tag}>'
+            f'</{tag}x>'
+            f'</{tag}\v>'
+            f'</{tag}\xa0>'
+        )
+        source = f"<{tag}>{content}</{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", content),
+            ("endtag", tag),
         ])
-    def test_title_content(self, content):
-        source = f"<title>{content}</title>"
+        source = f"<{tag}>&amp;</{tag}>"
         self._run_check(source, [
-            ("starttag", "title", []),
+            ("starttag", tag, []),
+            ('entityref', 'amp'),
+            ("endtag", tag),
+        ])
+
+    @support.subTests('tag',
+            ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
+    def test_rawtext_content(self, tag):
+        content = (
+            '<!-- not a comment -->'
+            '&not-an-entity-ref;'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            f'< /{tag}>'
+            f'</ {tag}>'
+            f'</{tag}x>'
+            f'</{tag}\v>'
+            f'</{tag}\xa0>'
+        )
+        source = f"<{tag}>{content}</{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
             ("data", content),
-            ("endtag", "title"),
+            ("endtag", tag),
         ])
 
-    @support.subTests('content', [
-            '<!-- not a comment -->',
-            "<not a='start tag'>",
-            '<![CDATA[not a cdata]]>',
-            '<!not a bogus comment>',
-            '</not a bogus comment>',
-            '\u2603',
-            '< /textarea>',
-            '</ textarea>',
-            '</textareable>',
-            '</textarea\v>',
-            '</textarea\xa0>',
+    def test_noscript_content(self):
+        content = (
+            '<!-- not a comment -->'
+            '&not-an-entity-ref;'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            f'< /noscript>'
+            f'</ noscript>'
+            f'</noscriptx>'
+            f'</noscript\v>'
+            f'</noscript\xa0>'
+        )
+        source = f"<noscript>{content}</noscript>"
+        self._run_check(source, [
+            ('starttag', 'noscript', []),
+            ('comment', ' not a comment '),
+            ('entityref', 'not'),
+            ('data', '-an-entity-ref;'),
+            ('starttag', 'not', [('a', 'start tag')]),
+            ('unknown decl', 'CDATA[not a cdata'),
+            ('comment', 'not a bogus comment'),
+            ('endtag', 'not'),
+            ('data', '☃< /noscript>'),
+            ('comment', ' noscript'),
+            ('endtag', 'noscriptx'),
+            ('endtag', 'noscript\x0b'),
+            ('endtag', 'noscript\xa0'),
+            ('endtag', 'noscript')
         ])
-    def test_textarea_content(self, content):
-        source = f"<textarea>{content}</textarea>"
         self._run_check(source, [
-            ("starttag", "textarea", []),
+            ("starttag", "noscript", []),
+            ("data", content),
+            ("endtag", "noscript"),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
+    def test_plaintext_content(self):
+        content = (
+            '<!-- not a comment -->'
+            '&not-an-entity-ref;'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            '</plaintext>'
+        )
+        source = f"<plaintext>{content}"
+        self._run_check(source, [
+            ("starttag", "plaintext", []),
             ("data", content),
-            ("endtag", "textarea"),
         ])
 
+    @support.subTests('tag,endtag', [
+            ('title', 'tıtle'),
+            ('style', 'ſtyle'),
+            ('style', 'ﬅyle'),
+            ('style', 'ﬆyle'),
+            ('iframe', 'ıframe'),
+            ('noframes', 'noframeſ'),
+            ('noscript', 'noſcript'),
+            ('noscript', 'noscrıpt'),
+            ('script', 'ſcript'),
+            ('script', 'scrıpt'),
+        ])
+    def test_invalid_nonascii_closing_tag(self, tag, endtag):
+        source = f"<{tag}><a></{endtag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", f"<a></{endtag}>"),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+        source = f"<{tag}><a></{endtag}></{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", f"<a></{endtag}>"),
+            ("endtag", tag),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
     @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
                                  'script/', 'script foo=bar', 'script foo=">"'])
     def test_script_closing_tag(self, endtag):
@@ -428,6 +517,20 @@ def test_textarea_closing_tag(self, endtag):
                             ("endtag", "textarea")],
                         collector=EventCollectorNoNormalize(convert_charrefs=False))
 
+    @support.subTests('starttag', ['TitLe', 'TexTarEa', 'StyLE', 'XmP',
+                                   'iFraMe', 'noEmBed', 'noFraMes', 'noScrIPt',
+                                   'ScrIPt'])
+    def test_closing_tag(self, starttag):
+        tag = starttag.lower()
+        for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n',
+                       f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']:
+            content = "<!-- not a comment --><i>Spam</i>"
+            s = f'<{starttag}>{content}</{endtag}>'
+            self._run_check(s, [("starttag", tag, []),
+                                ('data', content),
+                                ("endtag", tag)],
+                            collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True))
+
     @support.subTests('tail,end', [
         ('', False),
         ('<', False),
diff --git a/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst
@@ -0,0 +1,3 @@
+Add support for the "plaintext" element, RAWTEXT elements "xmp", "iframe",
+"noembed" and "noframes", and optionally RAWTEXT element "noscript" in
+:class:`html.parser.HTMLParser`.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Add support of the "plaintext" element, RAWTEXT elements "xmp", "iframe",`
	`2`	`+"noembed" and "noframes", and optionally RAWTEXT element "noscript" in`
	`3`	+:class:`html.parser.HTMLParser`.