Skip to content

Commit ed904d5

Browse files
miss-islingtonserhiy-storchakaambv
authored
[3.9] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (GH-137774) (GH-139661)
"] ]>" and "]] >" no longer end the CDATA section. Make CDATA section parsing context-dependent. Add private method HTMLParser._set_support_cdata() to change the context. If called with True, "<![CDATA[" starts a CDATA section which ends with "]]>". If called with False, "<![CDATA[" starts a bogus comment which ends with ">". (cherry picked from commit 0cbbfc4) (cherry picked from commit dcf2476) Co-authored-by: Serhiy Storchaka <[email protected]> Co-authored-by: Łukasz Langa <[email protected]>
1 parent f3d8338 commit ed904d5

File tree

3 files changed

+104
-29
lines changed

3 files changed

+104
-29
lines changed

Lib/html/parser.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ def reset(self):
127127
self.lasttag = '???'
128128
self.interesting = interesting_normal
129129
self.cdata_elem = None
130+
self._support_cdata = True
130131
self._escapable = True
131132
_markupbase.ParserBase.reset(self)
132133

@@ -164,6 +165,19 @@ def clear_cdata_mode(self):
164165
self.cdata_elem = None
165166
self._escapable = True
166167

168+
def _set_support_cdata(self, flag=True):
169+
"""Enable or disable support of the CDATA sections.
170+
If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
171+
If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
172+
173+
This method is not called by default. Its purpose is to be called
174+
in custom handle_starttag() and handle_endtag() methods, with
175+
a value that depends on the adjusted current node.
176+
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
177+
for details.
178+
"""
179+
self._support_cdata = flag
180+
167181
# Internal -- handle data as far as reasonable. May leave state
168182
# and data to be processed by a subsequent call. If 'end' is
169183
# true, force handling all data as if followed by EOF marker.
@@ -238,7 +252,7 @@ def goahead(self, end):
238252
j -= len(suffix)
239253
break
240254
self.handle_comment(rawdata[i+4:j])
241-
elif startswith("<![CDATA[", i):
255+
elif startswith("<![CDATA[", i) and self._support_cdata:
242256
self.unknown_decl(rawdata[i+3:])
243257
elif rawdata[i:i+9].lower() == '<!doctype':
244258
self.handle_decl(rawdata[i+2:])
@@ -314,15 +328,28 @@ def parse_html_declaration(self, i):
314328
if rawdata[i:i+4] == '<!--':
315329
# this case is actually already handled in goahead()
316330
return self.parse_comment(i)
317-
elif rawdata[i:i+3] == '<![':
318-
return self.parse_marked_section(i)
331+
elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
332+
j = rawdata.find(']]>', i+9)
333+
if j < 0:
334+
return -1
335+
self.unknown_decl(rawdata[i+3: j])
336+
return j + 3
319337
elif rawdata[i:i+9].lower() == '<!doctype':
320338
# find the closing >
321339
gtpos = rawdata.find('>', i+9)
322340
if gtpos == -1:
323341
return -1
324342
self.handle_decl(rawdata[i+2:gtpos])
325343
return gtpos+1
344+
elif rawdata[i:i+3] == '<![':
345+
j = rawdata.find('>', i+3)
346+
if j < 0:
347+
return -1
348+
if rawdata[j-1] == ']':
349+
self.unknown_decl(rawdata[i+3: j-1])
350+
else:
351+
self.handle_comment(rawdata[i+2: j])
352+
return j + 1
326353
else:
327354
return self.parse_bogus_comment(i)
328355

Lib/test/test_htmlparser.py

Lines changed: 69 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,13 @@
99

1010
class EventCollector(html.parser.HTMLParser):
1111

12-
def __init__(self, *args, **kw):
12+
def __init__(self, *args, autocdata=False, **kw):
13+
self.autocdata = autocdata
1314
self.events = []
1415
self.append = self.events.append
1516
html.parser.HTMLParser.__init__(self, *args, **kw)
17+
if autocdata:
18+
self._set_support_cdata(False)
1619

1720
def get_events(self):
1821
# Normalize the list of events so that buffer artefacts don't
@@ -33,12 +36,16 @@ def get_events(self):
3336

3437
def handle_starttag(self, tag, attrs):
3538
self.append(("starttag", tag, attrs))
39+
if self.autocdata and tag == 'svg':
40+
self._set_support_cdata(True)
3641

3742
def handle_startendtag(self, tag, attrs):
3843
self.append(("startendtag", tag, attrs))
3944

4045
def handle_endtag(self, tag):
4146
self.append(("endtag", tag))
47+
if self.autocdata and tag == 'svg':
48+
self._set_support_cdata(False)
4249

4350
# all other markup
4451

@@ -739,10 +746,6 @@ def test_eof_in_declarations(self):
739746
('<!', [('comment', '')]),
740747
('<!-', [('comment', '-')]),
741748
('<![', [('comment', '[')]),
742-
('<![CDATA[', [('unknown decl', 'CDATA[')]),
743-
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
744-
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
745-
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
746749
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
747750
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
748751
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@@ -755,6 +758,18 @@ def test_eof_in_declarations(self):
755758
for html, expected in data:
756759
self._run_check(html, expected)
757760

761+
@support.subTests('content', ['', 'x', 'x]', 'x]]'])
762+
def test_eof_in_cdata(self, content):
763+
self._run_check('<![CDATA[' + content,
764+
[('unknown decl', 'CDATA[' + content)])
765+
self._run_check('<![CDATA[' + content,
766+
[('comment', '[CDATA[' + content)],
767+
collector=EventCollector(autocdata=True))
768+
self._run_check('<svg><text y="100"><![CDATA[' + content,
769+
[('starttag', 'svg', []),
770+
('starttag', 'text', [('y', '100')]),
771+
('unknown decl', 'CDATA[' + content)])
772+
758773
def test_bogus_comments(self):
759774
html = ('<!ELEMENT br EMPTY>'
760775
'<! not really a comment >'
@@ -804,7 +819,56 @@ def test_broken_condcoms(self):
804819
('startendtag', 'img', [('src', 'mammoth.bmp')]),
805820
('unknown decl', 'endif')
806821
]
822+
823+
self._run_check(html, expected)
824+
825+
@support.subTests('content', [
826+
'just some plain text',
827+
'<!-- not a comment -->',
828+
'&not-an-entity-ref;',
829+
"<not a='start tag'>",
830+
'',
831+
'[[I have many brackets]]',
832+
'I have a > in the middle',
833+
'I have a ]] in the middle',
834+
'] ]>',
835+
']] >',
836+
('\n'
837+
' if (a < b && a > b) {\n'
838+
' printf("[<marquee>How?</marquee>]");\n'
839+
' }\n'),
840+
])
841+
def test_cdata_section_content(self, content):
842+
# See "13.2.5.42 Markup declaration open state",
843+
# "13.2.5.69 CDATA section state", and issue bpo-32876.
844+
html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
845+
expected = [
846+
('starttag', 'svg', []),
847+
('starttag', 'text', [('y', '100')]),
848+
('unknown decl', 'CDATA[' + content),
849+
('endtag', 'text'),
850+
('endtag', 'svg'),
851+
]
807852
self._run_check(html, expected)
853+
self._run_check(html, expected, collector=EventCollector(autocdata=True))
854+
855+
def test_cdata_section(self):
856+
# See "13.2.5.42 Markup declaration open state".
857+
html = ('<![CDATA[foo<br>bar]]>'
858+
'<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
859+
'<![CDATA[foo<br>bar]]>')
860+
expected = [
861+
('comment', '[CDATA[foo<br'),
862+
('data', 'bar]]>'),
863+
('starttag', 'svg', []),
864+
('starttag', 'text', [('y', '100')]),
865+
('unknown decl', 'CDATA[foo<br>bar'),
866+
('endtag', 'text'),
867+
('endtag', 'svg'),
868+
('comment', '[CDATA[foo<br'),
869+
('data', 'bar]]>'),
870+
]
871+
self._run_check(html, expected, collector=EventCollector(autocdata=True))
808872

809873
def test_convert_charrefs_dropped_text(self):
810874
# #23144: make sure that all the events are triggered when
@@ -1041,27 +1105,6 @@ def test_weird_chars_in_unquoted_attribute_values(self):
10411105
('starttag', 'form',
10421106
[('action', 'bogus|&#()value')])])
10431107

1044-
def test_invalid_keyword_error_exception(self):
1045-
# bpo-34480: check that subclasses that define an
1046-
# error method that raises an exception work
1047-
class InvalidMarkupException(Exception):
1048-
pass
1049-
class MyHTMLParser(html.parser.HTMLParser):
1050-
def error(self, message):
1051-
raise InvalidMarkupException(message)
1052-
parser = MyHTMLParser()
1053-
with self.assertRaises(InvalidMarkupException):
1054-
parser.feed('<![invalid>')
1055-
1056-
def test_invalid_keyword_error_pass(self):
1057-
# bpo-34480: check that subclasses that define an
1058-
# error method that doesn't raise an exception work
1059-
class MyHTMLParser(html.parser.HTMLParser):
1060-
def error(self, message):
1061-
pass
1062-
parser = MyHTMLParser()
1063-
self.assertEqual(parser.feed('<![invalid>'), None)
1064-
10651108

10661109
if __name__ == "__main__":
10671110
unittest.main()
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
2+
the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
3+
Add private method ``_set_support_cdata()`` which can be used to specify
4+
how to parse ``<![CDATA[`` --- as a CDATA section in foreign content
5+
(SVG or MathML) or as a bogus comment in the HTML namespace.

0 commit comments

Comments
 (0)