diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index dd67fc34e856f1..81b9239185aab1 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -15,7 +15,7 @@
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(*, convert_charrefs=True)
+.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
Create a parser instance able to parse invalid markup.
@@ -23,6 +23,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
references (except the ones in ``script``/``style`` elements) are
automatically converted to the corresponding Unicode characters.
+ If *scripting* is true, the content of the ``noscript`` element is
+ treated as raw text (RAWTEXT mode) instead of being parsed as markup.
+
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
encountered. The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +40,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. versionchanged:: 3.5
The default value for argument *convert_charrefs* is now ``True``.
+ .. versionchanged:: 3.13.8
+ Added the *scripting* parameter.
+
Example HTML Parser Application
-------------------------------
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 75bf8adae6d70a..79850fa6981d55 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -127,17 +127,23 @@ class HTMLParser(_markupbase.ParserBase):
argument.
"""
- CDATA_CONTENT_ELEMENTS = ("script", "style")
+ # See the HTML5 specs section "13.4 Parsing HTML fragments".
+ # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+ CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
- def __init__(self, *, convert_charrefs=True):
+ def __init__(self, *, convert_charrefs=True, scripting=False):
"""Initialize and reset this instance.
- If convert_charrefs is True (the default), all character references
+ If convert_charrefs is true (the default), all character references
are automatically converted to the corresponding Unicode characters.
+
+ If scripting is true, the noscript element is parsed in the
+ RAWTEXT mode.
"""
super().__init__()
self.convert_charrefs = convert_charrefs
+ self.scripting = scripting
self.reset()
def reset(self):
@@ -454,6 +460,11 @@ def parse_starttag(self, i):
self.set_cdata_mode(tag)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
+ elif self.scripting and tag == "noscript":
+ self.set_cdata_mode(tag)
+ elif tag == "plaintext":
+ self.set_cdata_mode(tag)
+ self.interesting = re.compile(r'\Z')  # end of input only; \z needs Python 3.14+
return endpos
# Internal -- check to see if we have a complete starttag; return end
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index fff41dab321acd..64cc6d8f1893f2 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -324,49 +324,138 @@ def test_style_content(self, content):
("data", content),
("endtag", "style")])
- @support.subTests('content', [
- '',
- "",
- '',
- '',
- '',
- '\u2603',
- '< /title>',
- ' title>',
- '',
- '',
- '',
- '',
+ @support.subTests('tag', ['title', 'textarea'])
+ def test_rcdata_content(self, tag):
+ content = (  # near-miss end tags below must be reported as data
+ ''
+ ""
+ ''
+ ''
+ ''
+ '\u2603'
+ f'< /{tag}>'
+ f'</ {tag}>'
+ f'</{tag}x>'
+ f'</{tag}\v>'
+ f'</{tag}\xa0>'
+ )
+ source = f"<{tag}>{content}</{tag}>"  # only this final end tag closes the element
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", content),
+ ("endtag", tag),
])
- def test_title_content(self, content):
- source = f"