Skip to content

Commit bb7b873

Browse files
gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser
Add support for:
* the "plaintext" element
* the RAWTEXT elements "xmp", "iframe", "noembed" and "noframes"
* optionally (if scripting=True), the RAWTEXT element "noscript"
1 parent 4e08a9f commit bb7b873

File tree

4 files changed

+160
-37
lines changed

4 files changed

+160
-37
lines changed

Doc/library/html.parser.rst

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,17 @@
1515
This module defines a class :class:`HTMLParser` which serves as the basis for
1616
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
1717

18-
.. class:: HTMLParser(*, convert_charrefs=True)
18+
.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
1919

2020
Create a parser instance able to parse invalid markup.
2121

2222
If *convert_charrefs* is ``True`` (the default), all character
2323
references (except the ones in ``script``/``style`` elements) are
2424
automatically converted to the corresponding Unicode characters.
2525

26+
If *scripting* is true, the ``noscript`` element is parsed in the
27+
RAWTEXT mode.
28+
2629
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
2730
when start tags, end tags, text, comments, and other markup elements are
2831
encountered. The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +40,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
3740
.. versionchanged:: 3.5
3841
The default value for argument *convert_charrefs* is now ``True``.
3942

43+
.. versionchanged:: 3.13.8
44+
Added the *scripting* parameter.
45+
4046

4147
Example HTML Parser Application
4248
-------------------------------

Lib/html/parser.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,17 +127,23 @@ class HTMLParser(_markupbase.ParserBase):
127127
argument.
128128
"""
129129

130-
CDATA_CONTENT_ELEMENTS = ("script", "style")
130+
# See the HTML5 specs section "13.4 Parsing HTML fragments".
131+
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
132+
CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
131133
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
132134

133-
def __init__(self, *, convert_charrefs=True):
135+
def __init__(self, *, convert_charrefs=True, scripting=False):
134136
"""Initialize and reset this instance.
135137
136-
If convert_charrefs is True (the default), all character references
138+
If convert_charrefs is true (the default), all character references
137139
are automatically converted to the corresponding Unicode characters.
140+
141+
If scripting is true, the noscript element is parsed in the
142+
RAWTEXT mode.
138143
"""
139144
super().__init__()
140145
self.convert_charrefs = convert_charrefs
146+
self.scripting = scripting
141147
self.reset()
142148

143149
def reset(self):
@@ -454,6 +460,11 @@ def parse_starttag(self, i):
454460
self.set_cdata_mode(tag)
455461
elif tag in self.RCDATA_CONTENT_ELEMENTS:
456462
self.set_cdata_mode(tag, escapable=True)
463+
elif self.scripting and tag == "noscript":
464+
self.set_cdata_mode(tag)
465+
elif tag == "plaintext":
466+
self.set_cdata_mode(tag)
467+
self.interesting = re.compile(r'\z')
457468
return endpos
458469

459470
# Internal -- check to see if we have a complete starttag; return end

Lib/test/test_htmlparser.py

Lines changed: 136 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -324,49 +324,138 @@ def test_style_content(self, content):
324324
("data", content),
325325
("endtag", "style")])
326326

327-
@support.subTests('content', [
328-
'<!-- not a comment -->',
329-
"<not a='start tag'>",
330-
'<![CDATA[not a cdata]]>',
331-
'<!not a bogus comment>',
332-
'</not a bogus comment>',
333-
'\u2603',
334-
'< /title>',
335-
'</ title>',
336-
'</titled>',
337-
'</title\v>',
338-
'</title\xa0>',
339-
'</tıtle>',
327+
@support.subTests('tag', ['title', 'textarea'])
328+
def test_rcdata_content(self, tag):
329+
content = (
330+
'<!-- not a comment -->'
331+
"<not a='start tag'>"
332+
'<![CDATA[not a cdata]]>'
333+
'<!not a bogus comment>'
334+
'</not a bogus comment>'
335+
'\u2603'
336+
f'< /{tag}>'
337+
f'</ {tag}>'
338+
f'</{tag}x>'
339+
f'</{tag}\v>'
340+
f'</{tag}\xa0>'
341+
)
342+
source = f"<{tag}>{content}</{tag}>"
343+
self._run_check(source, [
344+
("starttag", tag, []),
345+
("data", content),
346+
("endtag", tag),
340347
])
341-
def test_title_content(self, content):
342-
source = f"<title>{content}</title>"
348+
source = f"<{tag}>&amp;</{tag}>"
343349
self._run_check(source, [
344-
("starttag", "title", []),
350+
("starttag", tag, []),
351+
('entityref', 'amp'),
352+
("endtag", tag),
353+
])
354+
355+
@support.subTests('tag',
356+
['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
357+
def test_rawtext_content(self, tag):
358+
content = (
359+
'<!-- not a comment -->'
360+
'&not-an-entity-ref;'
361+
"<not a='start tag'>"
362+
'<![CDATA[not a cdata]]>'
363+
'<!not a bogus comment>'
364+
'</not a bogus comment>'
365+
'\u2603'
366+
f'< /{tag}>'
367+
f'</ {tag}>'
368+
f'</{tag}x>'
369+
f'</{tag}\v>'
370+
f'</{tag}\xa0>'
371+
)
372+
source = f"<{tag}>{content}</{tag}>"
373+
self._run_check(source, [
374+
("starttag", tag, []),
345375
("data", content),
346-
("endtag", "title"),
376+
("endtag", tag),
347377
])
348378

349-
@support.subTests('content', [
350-
'<!-- not a comment -->',
351-
"<not a='start tag'>",
352-
'<![CDATA[not a cdata]]>',
353-
'<!not a bogus comment>',
354-
'</not a bogus comment>',
355-
'\u2603',
356-
'< /textarea>',
357-
'</ textarea>',
358-
'</textareable>',
359-
'</textarea\v>',
360-
'</textarea\xa0>',
379+
def test_noscript_content(self):
380+
content = (
381+
'<!-- not a comment -->'
382+
'&not-an-entity-ref;'
383+
"<not a='start tag'>"
384+
'<![CDATA[not a cdata]]>'
385+
'<!not a bogus comment>'
386+
'</not a bogus comment>'
387+
'\u2603'
388+
f'< /noscript>'
389+
f'</ noscript>'
390+
f'</noscriptx>'
391+
f'</noscript\v>'
392+
f'</noscript\xa0>'
393+
)
394+
source = f"<noscript>{content}</noscript>"
395+
self._run_check(source, [
396+
('starttag', 'noscript', []),
397+
('comment', ' not a comment '),
398+
('entityref', 'not'),
399+
('data', '-an-entity-ref;'),
400+
('starttag', 'not', [('a', 'start tag')]),
401+
('unknown decl', 'CDATA[not a cdata'),
402+
('comment', 'not a bogus comment'),
403+
('endtag', 'not'),
404+
('data', '☃< /noscript>'),
405+
('comment', ' noscript'),
406+
('endtag', 'noscriptx'),
407+
('endtag', 'noscript\x0b'),
408+
('endtag', 'noscript\xa0'),
409+
('endtag', 'noscript')
361410
])
362-
def test_textarea_content(self, content):
363-
source = f"<textarea>{content}</textarea>"
364411
self._run_check(source, [
365-
("starttag", "textarea", []),
412+
("starttag", "noscript", []),
413+
("data", content),
414+
("endtag", "noscript"),
415+
], collector=EventCollector(convert_charrefs=False, scripting=True))
416+
417+
def test_plaintext_content(self):
418+
content = (
419+
'<!-- not a comment -->'
420+
'&not-an-entity-ref;'
421+
"<not a='start tag'>"
422+
'<![CDATA[not a cdata]]>'
423+
'<!not a bogus comment>'
424+
'</not a bogus comment>'
425+
'\u2603'
426+
'</plaintext>'
427+
)
428+
source = f"<plaintext>{content}"
429+
self._run_check(source, [
430+
("starttag", "plaintext", []),
366431
("data", content),
367-
("endtag", "textarea"),
368432
])
369433

434+
@support.subTests('tag,endtag', [
435+
('title', 'tıtle'),
436+
('style', 'ſtyle'),
437+
('style', 'ﬅyle'),
438+
('style', 'ﬆyle'),
439+
('iframe', 'ıframe'),
440+
('noframes', 'noframeſ'),
441+
('noscript', 'noſcript'),
442+
('noscript', 'noscrıpt'),
443+
('script', 'ſcript'),
444+
('script', 'scrıpt'),
445+
])
446+
def test_invalid_nonascii_closing_tag(self, tag, endtag):
447+
source = f"<{tag}><a></{endtag}>"
448+
self._run_check(source, [
449+
("starttag", tag, []),
450+
("data", f"<a></{endtag}>"),
451+
], collector=EventCollector(convert_charrefs=False, scripting=True))
452+
source = f"<{tag}><a></{endtag}></{tag}>"
453+
self._run_check(source, [
454+
("starttag", tag, []),
455+
("data", f"<a></{endtag}>"),
456+
("endtag", tag),
457+
], collector=EventCollector(convert_charrefs=False, scripting=True))
458+
370459
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
371460
'script/', 'script foo=bar', 'script foo=">"'])
372461
def test_script_closing_tag(self, endtag):
@@ -428,6 +517,20 @@ def test_textarea_closing_tag(self, endtag):
428517
("endtag", "textarea")],
429518
collector=EventCollectorNoNormalize(convert_charrefs=False))
430519

520+
@support.subTests('starttag', ['TitLe', 'TexTarEa', 'StyLE', 'XmP',
521+
'iFraMe', 'noEmBed', 'noFraMes', 'noScrIPt',
522+
'ScrIPt'])
523+
def test_closing_tag(self, starttag):
524+
tag = starttag.lower()
525+
for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n',
526+
f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']:
527+
content = "<!-- not a comment --><i>Spam</i>"
528+
s = f'<{starttag}>{content}</{endtag}>'
529+
self._run_check(s, [("starttag", tag, []),
530+
('data', content),
531+
("endtag", tag)],
532+
collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True))
533+
431534
@support.subTests('tail,end', [
432535
('', False),
433536
('<', False),
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add support for the "plaintext" element, RAWTEXT elements "xmp", "iframe",
2+
"noembed" and "noframes", and optionally RAWTEXT element "noscript" in
3+
:class:`html.parser.HTMLParser`.

0 commit comments

Comments
 (0)