55import unittest
66
77from unittest .mock import patch
8+ from test import support
89
910
1011class EventCollector (html .parser .HTMLParser ):
@@ -430,28 +431,34 @@ def test_tolerant_parsing(self):
430431 ('data' , '<' ),
431432 ('starttag' , 'bc<' , [('a' , None )]),
432433 ('endtag' , 'html' ),
433- ('data' , '\n <img src="URL>' ),
434- ('comment' , '/img' ),
435- ('endtag' , 'html<' )])
434+ ('data' , '\n ' )])
436435
437436 def test_starttag_junk_chars (self ):
437+ self ._run_check ("<" , [('data' , '<' )])
438+ self ._run_check ("<>" , [('data' , '<>' )])
439+ self ._run_check ("< >" , [('data' , '< >' )])
440+ self ._run_check ("< " , [('data' , '< ' )])
438441 self ._run_check ("</>" , [])
442+ self ._run_check ("<$>" , [('data' , '<$>' )])
439443 self ._run_check ("</$>" , [('comment' , '$' )])
440444 self ._run_check ("</" , [('data' , '</' )])
441- self ._run_check ("</a" , [('data' , '</a' )])
445+ self ._run_check ("</a" , [])
446+ self ._run_check ("</ a>" , [('endtag' , 'a' )])
447+ self ._run_check ("</ a" , [('comment' , ' a' )])
442448 self ._run_check ("<a<a>" , [('starttag' , 'a<a' , [])])
443449 self ._run_check ("</a<a>" , [('endtag' , 'a<a' )])
444- self ._run_check ("<!" , [('data ' , '<! ' )])
445- self ._run_check ("<a" , [( 'data' , '<a' ) ])
446- self ._run_check ("<a foo='bar'" , [( 'data' , "<a foo='bar'" ) ])
447- self ._run_check ("<a foo='bar" , [( 'data' , "<a foo='bar" ) ])
448- self ._run_check ("<a foo='>'" , [( 'data' , "<a foo='>'" ) ])
449- self ._run_check ("<a foo='>" , [( 'data' , "<a foo='>" ) ])
450+ self ._run_check ("<!" , [('comment ' , '' )])
451+ self ._run_check ("<a" , [])
452+ self ._run_check ("<a foo='bar'" , [])
453+ self ._run_check ("<a foo='bar" , [])
454+ self ._run_check ("<a foo='>'" , [])
455+ self ._run_check ("<a foo='>" , [])
450456 self ._run_check ("<a$>" , [('starttag' , 'a$' , [])])
451457 self ._run_check ("<a$b>" , [('starttag' , 'a$b' , [])])
452458 self ._run_check ("<a$b/>" , [('startendtag' , 'a$b' , [])])
453459 self ._run_check ("<a$b >" , [('starttag' , 'a$b' , [])])
454460 self ._run_check ("<a$b />" , [('startendtag' , 'a$b' , [])])
461+ self ._run_check ("</a$b>" , [('endtag' , 'a$b' )])
455462
456463 def test_slashes_in_starttag (self ):
457464 self ._run_check ('<a foo="var"/>' , [('startendtag' , 'a' , [('foo' , 'var' )])])
@@ -576,21 +583,50 @@ def test_EOF_in_charref(self):
576583 for html , expected in data :
577584 self ._run_check (html , expected )
578585
579- def test_EOF_in_comments_or_decls (self ):
586+ def test_eof_in_comments (self ):
580587 data = [
581- ('<!' , [('data' , '<!' )]),
582- ('<!-' , [('data' , '<!-' )]),
583- ('<!--' , [('data' , '<!--' )]),
584- ('<![' , [('data' , '<![' )]),
585- ('<![CDATA[' , [('data' , '<![CDATA[' )]),
586- ('<![CDATA[x' , [('data' , '<![CDATA[x' )]),
587- ('<!DOCTYPE' , [('data' , '<!DOCTYPE' )]),
588- ('<!DOCTYPE HTML' , [('data' , '<!DOCTYPE HTML' )]),
588+ ('<!--' , [('comment' , '' )]),
589+ ('<!---' , [('comment' , '' )]),
590+ ('<!----' , [('comment' , '' )]),
591+ ('<!-----' , [('comment' , '-' )]),
592+ ('<!------' , [('comment' , '--' )]),
593+ ('<!----!' , [('comment' , '' )]),
594+ ('<!---!' , [('comment' , '-!' )]),
595+ ('<!---!>' , [('comment' , '-!>' )]),
596+ ('<!--foo' , [('comment' , 'foo' )]),
597+ ('<!--foo-' , [('comment' , 'foo' )]),
598+ ('<!--foo--' , [('comment' , 'foo' )]),
599+ ('<!--foo--!' , [('comment' , 'foo' )]),
600+ ('<!--<!--' , [('comment' , '<!' )]),
601+ ('<!--<!--!' , [('comment' , '<!' )]),
589602 ]
590603 for html , expected in data :
591604 self ._run_check (html , expected )
605+
606+ def test_eof_in_declarations (self ):
607+ data = [
608+ ('<!' , [('comment' , '' )]),
609+ ('<!-' , [('comment' , '-' )]),
610+ ('<![' , [('comment' , '[' )]),
611+ ('<![CDATA[' , [('unknown decl' , 'CDATA[' )]),
612+ ('<![CDATA[x' , [('unknown decl' , 'CDATA[x' )]),
613+ ('<![CDATA[x]' , [('unknown decl' , 'CDATA[x]' )]),
614+ ('<![CDATA[x]]' , [('unknown decl' , 'CDATA[x]]' )]),
615+ ('<!DOCTYPE' , [('decl' , 'DOCTYPE' )]),
616+ ('<!DOCTYPE ' , [('decl' , 'DOCTYPE ' )]),
617+ ('<!DOCTYPE html' , [('decl' , 'DOCTYPE html' )]),
618+ ('<!DOCTYPE html ' , [('decl' , 'DOCTYPE html ' )]),
619+ ('<!DOCTYPE html PUBLIC' , [('decl' , 'DOCTYPE html PUBLIC' )]),
620+ ('<!DOCTYPE html PUBLIC "foo' , [('decl' , 'DOCTYPE html PUBLIC "foo' )]),
621+ ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo' ,
622+ [('decl' , 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo' )]),
623+ ]
624+ for html , expected in data :
625+ self ._run_check (html , expected )
626+
592627 def test_bogus_comments (self ):
593- html = ('<! not really a comment >'
628+ html = ('<!ELEMENT br EMPTY>'
629+ '<! not really a comment >'
594630 '<! not a comment either -->'
595631 '<! -- close enough -->'
596632 '<!><!<-- this was an empty comment>'
@@ -604,6 +640,7 @@ def test_bogus_comments(self):
604640 '<![CDATA]]>' # required '[' after CDATA
605641 )
606642 expected = [
643+ ('comment' , 'ELEMENT br EMPTY' ),
607644 ('comment' , ' not really a comment ' ),
608645 ('comment' , ' not a comment either --' ),
609646 ('comment' , ' -- close enough --' ),
@@ -684,6 +721,26 @@ def test_convert_charrefs_dropped_text(self):
684721 ('endtag' , 'a' ), ('data' , ' bar & baz' )]
685722 )
686723
724+ @support .requires_resource ('cpu' )
725+ def test_eof_no_quadratic_complexity (self ):
726+ # Each of these examples used to take about an hour.
727+ # Now they take a fraction of a second.
728+ def check (source ):
729+ parser = html .parser .HTMLParser ()
730+ parser .feed (source )
731+ parser .close ()
732+ n = 120_000
733+ check ("<a " * n )
734+ check ("<a a=" * n )
735+ check ("</a " * 14 * n )
736+ check ("</a a=" * 11 * n )
737+ check ("<!--" * 4 * n )
738+ check ("<!" * 60 * n )
739+ check ("<?" * 19 * n )
740+ check ("</$" * 15 * n )
741+ check ("<![CDATA[" * 9 * n )
742+ check ("<!doctype" * 35 * n )
743+
687744
688745class AttributesTestCase (TestCaseBase ):
689746
0 commit comments