Skip to content

Commit 2706182

Browse files
gh-135661: Fix comment parsing in HTMLParser
* "--!>" now ends the comment. * "-- >" no longer ends the comment. * Support abnormally ended empty comments "<!-->" and "<!--->".
1 parent 1c7efaf commit 2706182

File tree

3 files changed

+46
-3
lines changed

3 files changed

+46
-3
lines changed

Lib/html/parser.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929
starttagopen = re.compile('<[a-zA-Z]')
3030
endtagopen = re.compile('</[a-zA-Z]')
3131
piclose = re.compile('>')
32-
commentclose = re.compile(r'--\s*>')
32+
commentclose = re.compile(r'--!?>')
33+
commentabruptclose = re.compile(r'-?>')
3334
# Note:
3435
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
3536
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
@@ -309,6 +310,21 @@ def parse_html_declaration(self, i):
309310
else:
310311
return self.parse_bogus_comment(i)
311312

313+
# Internal -- parse comment, return the index just past its end, or -1 if not terminated
314+
# see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
315+
def parse_comment(self, i, report=True):
316+
rawdata = self.rawdata
317+
assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
318+
match = commentclose.search(rawdata, i+4)
319+
if not match:
320+
match = commentabruptclose.match(rawdata, i+4)
321+
if not match:
322+
return -1
323+
if report:
324+
j = match.start()
325+
self.handle_comment(rawdata[i+4: j])
326+
return match.end()
327+
312328
# Internal -- parse bogus comment, return length or -1 if not terminated
313329
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
314330
def parse_bogus_comment(self, i, report=1):

Lib/test/test_htmlparser.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,17 +332,41 @@ def test_comments(self):
332332
html = ("<!-- I'm a valid comment -->"
333333
'<!--me too!-->'
334334
'<!------>'
335+
'<!----->'
335336
'<!---->'
337+
# abrupt-closing-of-empty-comment
338+
'<!--->'
339+
'<!-->'
336340
'<!----I have many hyphens---->'
337341
'<!-- I have a > in the middle -->'
338-
'<!-- and I have -- in the middle! -->')
342+
'<!-- and I have -- in the middle! -->'
343+
'<!--incorrectly-closed-comment--!>'
344+
'<!----!>'
345+
'<!----!-->'
346+
'<!---- >-->'
347+
# nested-comment
348+
'<!-- <!-- nested --> -->'
349+
'<!--<!-->'
350+
'<!--<!--!>'
351+
)
339352
expected = [('comment', " I'm a valid comment "),
340353
('comment', 'me too!'),
341354
('comment', '--'),
355+
('comment', '-'),
356+
('comment', ''),
357+
('comment', ''),
342358
('comment', ''),
343359
('comment', '--I have many hyphens--'),
344360
('comment', ' I have a > in the middle '),
345-
('comment', ' and I have -- in the middle! ')]
361+
('comment', ' and I have -- in the middle! '),
362+
('comment', 'incorrectly-closed-comment'),
363+
('comment', ''),
364+
('comment', '--!'),
365+
('comment', '-- >'),
366+
('comment', ' <!-- nested '), ('data', ' -->'),
367+
('comment', '<!'),
368+
('comment', '<!'),
369+
]
346370
self._run_check(html, expected)
347371

348372
def test_condcoms(self):
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix comment parsing in :class:`html.parser.HTMLParser`. ``--!>`` now
2+
ends the comment. ``-- >`` no longer ends the comment. Support abnormally
3+
ended empty comments ``<!-->`` and ``<!--->``.

0 commit comments

Comments
 (0)