Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion Doc/library/html.parser.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.

.. class:: HTMLParser(*, convert_charrefs=True)
.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)

Create a parser instance able to parse invalid markup.

If *convert_charrefs* is ``True`` (the default), all character
references (except the ones in ``script``/``style`` elements) are
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be updated now that the list has been expanded.

It might be easier to have a short section about parsing modes, listing each mode, which elements trigger it, whether charrefs are converted or not, and when the state is terminated.

Here we could then say

Suggested change
references (except the ones in ``script``/``style`` elements) are
references (except the ones in RAWTEXT tags) are

with RAWTEXT linking to that section.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to document this here? This is a part of the HTML5 specification. What will the user get from this information?

automatically converted to the corresponding Unicode characters.

If *scripting* is true, the ``noscript`` element is parsed in the
RAWTEXT mode.

An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
encountered. The user should subclass :class:`.HTMLParser` and override its
Expand All @@ -37,6 +40,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. versionchanged:: 3.5
The default value for argument *convert_charrefs* is now ``True``.

.. versionchanged:: 3.14.1
Added the *scripting* parameter.


Example HTML Parser Application
-------------------------------
Expand Down
17 changes: 14 additions & 3 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,17 +127,23 @@ class HTMLParser(_markupbase.ParserBase):
argument.
"""

CDATA_CONTENT_ELEMENTS = ("script", "style")
# See the HTML5 specs section "13.4 Parsing HTML fragments".
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")

def __init__(self, *, convert_charrefs=True):
def __init__(self, *, convert_charrefs=True, scripting=False):
"""Initialize and reset this instance.

If convert_charrefs is True (the default), all character references
If convert_charrefs is true (the default), all character references
are automatically converted to the corresponding Unicode characters.

If scripting is true, the noscript element is parsed in the
RAWTEXT mode.
"""
super().__init__()
self.convert_charrefs = convert_charrefs
self.scripting = scripting
self.reset()

def reset(self):
Expand Down Expand Up @@ -448,6 +454,11 @@ def parse_starttag(self, i):
self.set_cdata_mode(tag)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
elif self.scripting and tag == "noscript":
self.set_cdata_mode(tag)
elif tag == "plaintext":
self.set_cdata_mode(tag)
self.interesting = re.compile(r'\z')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be better to move this into set_cdata_mode by adding a third branch to the if/else that sets self.interesting.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered this option. But should we repeat the condition tag == "plaintext" in two places, or add "plaintext" to CDATA_CONTENT_ELEMENTS or RCDATA_CONTENT_ELEMENTS? In either case we would need to mention "plaintext" twice. It would also create an asymmetry with "noscript" if the special cases were handled in different places. That is why I settled on the current code.

Another option is to use a special value, escapable=None, to switch to the PLAINTEXT mode.

return endpos

# Internal -- check to see if we have a complete starttag; return end
Expand Down
169 changes: 136 additions & 33 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,49 +324,138 @@ def test_style_content(self, content):
("data", content),
("endtag", "style")])

@support.subTests('content', [
'<!-- not a comment -->',
"<not a='start tag'>",
'<![CDATA[not a cdata]]>',
'<!not a bogus comment>',
'</not a bogus comment>',
'\u2603',
'< /title>',
'</ title>',
'</titled>',
'</title\v>',
'</title\xa0>',
'</tıtle>',
@support.subTests('tag', ['title', 'textarea'])
def test_rcdata_content(self, tag):
content = (
'<!-- not a comment -->'
"<not a='start tag'>"
'<![CDATA[not a cdata]]>'
'<!not a bogus comment>'
'</not a bogus comment>'
'\u2603'
f'< /{tag}>'
f'</ {tag}>'
f'</{tag}x>'
f'</{tag}\v>'
f'</{tag}\xa0>'
)
source = f"<{tag}>{content}</{tag}>"
self._run_check(source, [
("starttag", tag, []),
("data", content),
("endtag", tag),
])
def test_title_content(self, content):
source = f"<title>{content}</title>"
source = f"<{tag}>&amp;</{tag}>"
self._run_check(source, [
("starttag", "title", []),
("starttag", tag, []),
('entityref', 'amp'),
("endtag", tag),
])

@support.subTests('tag',
['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
def test_rawtext_content(self, tag):
content = (
'<!-- not a comment -->'
'&not-an-entity-ref;'
"<not a='start tag'>"
'<![CDATA[not a cdata]]>'
'<!not a bogus comment>'
'</not a bogus comment>'
'\u2603'
f'< /{tag}>'
f'</ {tag}>'
f'</{tag}x>'
f'</{tag}\v>'
f'</{tag}\xa0>'
)
source = f"<{tag}>{content}</{tag}>"
self._run_check(source, [
("starttag", tag, []),
("data", content),
("endtag", "title"),
("endtag", tag),
])

@support.subTests('content', [
'<!-- not a comment -->',
"<not a='start tag'>",
'<![CDATA[not a cdata]]>',
'<!not a bogus comment>',
'</not a bogus comment>',
'\u2603',
'< /textarea>',
'</ textarea>',
'</textareable>',
'</textarea\v>',
'</textarea\xa0>',
def test_noscript_content(self):
content = (
'<!-- not a comment -->'
'&not-an-entity-ref;'
"<not a='start tag'>"
'<![CDATA[not a cdata]]>'
'<!not a bogus comment>'
'</not a bogus comment>'
'\u2603'
f'< /noscript>'
f'</ noscript>'
f'</noscriptx>'
f'</noscript\v>'
f'</noscript\xa0>'
)
source = f"<noscript>{content}</noscript>"
self._run_check(source, [
('starttag', 'noscript', []),
('comment', ' not a comment '),
('entityref', 'not'),
('data', '-an-entity-ref;'),
('starttag', 'not', [('a', 'start tag')]),
('unknown decl', 'CDATA[not a cdata'),
('comment', 'not a bogus comment'),
('endtag', 'not'),
('data', '☃< /noscript>'),
('comment', ' noscript'),
('endtag', 'noscriptx'),
('endtag', 'noscript\x0b'),
('endtag', 'noscript\xa0'),
('endtag', 'noscript')
])
def test_textarea_content(self, content):
source = f"<textarea>{content}</textarea>"
self._run_check(source, [
("starttag", "textarea", []),
("starttag", "noscript", []),
("data", content),
("endtag", "noscript"),
], collector=EventCollector(convert_charrefs=False, scripting=True))

def test_plaintext_content(self):
content = (
'<!-- not a comment -->'
'&not-an-entity-ref;'
"<not a='start tag'>"
'<![CDATA[not a cdata]]>'
'<!not a bogus comment>'
'</not a bogus comment>'
'\u2603'
'</plaintext>'
)
source = f"<plaintext>{content}"
self._run_check(source, [
("starttag", "plaintext", []),
("data", content),
("endtag", "textarea"),
])

@support.subTests('tag,endtag', [
('title', 'tıtle'),
('style', 'ſtyle'),
('style', 'ﬅyle'),
('style', 'ﬆyle'),
('iframe', 'ıframe'),
('noframes', 'noframeſ'),
('noscript', 'noſcript'),
('noscript', 'noscrıpt'),
('script', 'ſcript'),
('script', 'scrıpt'),
])
def test_invalid_nonascii_closing_tag(self, tag, endtag):
source = f"<{tag}><a></{endtag}>"
self._run_check(source, [
("starttag", tag, []),
("data", f"<a></{endtag}>"),
], collector=EventCollector(convert_charrefs=False, scripting=True))
source = f"<{tag}><a></{endtag}></{tag}>"
self._run_check(source, [
("starttag", tag, []),
("data", f"<a></{endtag}>"),
("endtag", tag),
], collector=EventCollector(convert_charrefs=False, scripting=True))

@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
'script/', 'script foo=bar', 'script foo=">"'])
def test_script_closing_tag(self, endtag):
Expand Down Expand Up @@ -428,6 +517,20 @@ def test_textarea_closing_tag(self, endtag):
("endtag", "textarea")],
collector=EventCollectorNoNormalize(convert_charrefs=False))

@support.subTests('starttag', ['TitLe', 'TexTarEa', 'StyLE', 'XmP',
'iFraMe', 'noEmBed', 'noFraMes', 'noScrIPt',
'ScrIPt'])
def test_closing_tag(self, starttag):
tag = starttag.lower()
for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n',
f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']:
content = "<!-- not a comment --><i>Spam</i>"
s = f'<{starttag}>{content}</{endtag}>'
self._run_check(s, [("starttag", tag, []),
('data', content),
("endtag", tag)],
collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True))

@support.subTests('tail,end', [
('', False),
('<', False),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add support for the "plaintext" element, the RAWTEXT elements "xmp", "iframe",
"noembed" and "noframes", and optionally the RAWTEXT element "noscript" in
:class:`html.parser.HTMLParser`.
Loading