|  | 
| 4 | 4 | import pprint | 
| 5 | 5 | import unittest | 
| 6 | 6 | 
 | 
|  | 7 | +from test import support | 
|  | 8 | + | 
| 7 | 9 | 
 | 
| 8 | 10 | class EventCollector(html.parser.HTMLParser): | 
| 9 | 11 | 
 | 
| @@ -391,28 +393,34 @@ def test_tolerant_parsing(self): | 
| 391 | 393 |                             ('data', '<'), | 
| 392 | 394 |                             ('starttag', 'bc<', [('a', None)]), | 
| 393 | 395 |                             ('endtag', 'html'), | 
| 394 |  | -                            ('data', '\n<img src="URL>'), | 
| 395 |  | -                            ('comment', '/img'), | 
| 396 |  | -                            ('endtag', 'html<')]) | 
|  | 396 | +                            ('data', '\n')]) | 
| 397 | 397 | 
 | 
| 398 | 398 |     def test_starttag_junk_chars(self): | 
|  | 399 | +        self._run_check("<", [('data', '<')]) | 
|  | 400 | +        self._run_check("<>", [('data', '<>')]) | 
|  | 401 | +        self._run_check("< >", [('data', '< >')]) | 
|  | 402 | +        self._run_check("< ", [('data', '< ')]) | 
| 399 | 403 |         self._run_check("</>", []) | 
|  | 404 | +        self._run_check("<$>", [('data', '<$>')]) | 
| 400 | 405 |         self._run_check("</$>", [('comment', '$')]) | 
| 401 | 406 |         self._run_check("</", [('data', '</')]) | 
| 402 |  | -        self._run_check("</a", [('data', '</a')]) | 
|  | 407 | +        self._run_check("</a", []) | 
|  | 408 | +        self._run_check("</ a>", [('endtag', 'a')]) | 
|  | 409 | +        self._run_check("</ a", [('comment', ' a')]) | 
| 403 | 410 |         self._run_check("<a<a>", [('starttag', 'a<a', [])]) | 
| 404 | 411 |         self._run_check("</a<a>", [('endtag', 'a<a')]) | 
| 405 |  | -        self._run_check("<!", [('data', '<!')]) | 
| 406 |  | -        self._run_check("<a", [('data', '<a')]) | 
| 407 |  | -        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")]) | 
| 408 |  | -        self._run_check("<a foo='bar", [('data', "<a foo='bar")]) | 
| 409 |  | -        self._run_check("<a foo='>'", [('data', "<a foo='>'")]) | 
| 410 |  | -        self._run_check("<a foo='>", [('data', "<a foo='>")]) | 
|  | 412 | +        self._run_check("<!", [('comment', '')]) | 
|  | 413 | +        self._run_check("<a", []) | 
|  | 414 | +        self._run_check("<a foo='bar'", []) | 
|  | 415 | +        self._run_check("<a foo='bar", []) | 
|  | 416 | +        self._run_check("<a foo='>'", []) | 
|  | 417 | +        self._run_check("<a foo='>", []) | 
| 411 | 418 |         self._run_check("<a$>", [('starttag', 'a$', [])]) | 
| 412 | 419 |         self._run_check("<a$b>", [('starttag', 'a$b', [])]) | 
| 413 | 420 |         self._run_check("<a$b/>", [('startendtag', 'a$b', [])]) | 
| 414 | 421 |         self._run_check("<a$b  >", [('starttag', 'a$b', [])]) | 
| 415 | 422 |         self._run_check("<a$b  />", [('startendtag', 'a$b', [])]) | 
|  | 423 | +        self._run_check("</a$b>", [('endtag', 'a$b')]) | 
| 416 | 424 | 
 | 
| 417 | 425 |     def test_slashes_in_starttag(self): | 
| 418 | 426 |         self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])]) | 
| @@ -537,13 +545,56 @@ def test_EOF_in_charref(self): | 
| 537 | 545 |         for html, expected in data: | 
| 538 | 546 |             self._run_check(html, expected) | 
| 539 | 547 | 
 | 
| 540 |  | -    def test_broken_comments(self): | 
| 541 |  | -        html = ('<! not really a comment >' | 
|  | 548 | +    def test_eof_in_comments(self): | 
|  | 549 | +        data = [ | 
|  | 550 | +            ('<!--', [('comment', '')]), | 
|  | 551 | +            ('<!---', [('comment', '')]), | 
|  | 552 | +            ('<!----', [('comment', '')]), | 
|  | 553 | +            ('<!-----', [('comment', '-')]), | 
|  | 554 | +            ('<!------', [('comment', '--')]), | 
|  | 555 | +            ('<!----!', [('comment', '')]), | 
|  | 556 | +            ('<!---!', [('comment', '-!')]), | 
|  | 557 | +            ('<!---!>', [('comment', '-!>')]), | 
|  | 558 | +            ('<!--foo', [('comment', 'foo')]), | 
|  | 559 | +            ('<!--foo-', [('comment', 'foo')]), | 
|  | 560 | +            ('<!--foo--', [('comment', 'foo')]), | 
|  | 561 | +            ('<!--foo--!', [('comment', 'foo')]), | 
|  | 562 | +            ('<!--<!--', [('comment', '<!')]), | 
|  | 563 | +            ('<!--<!--!', [('comment', '<!')]), | 
|  | 564 | +        ] | 
|  | 565 | +        for html, expected in data: | 
|  | 566 | +            self._run_check(html, expected) | 
|  | 567 | + | 
|  | 568 | +    def test_eof_in_declarations(self): | 
|  | 569 | +        data = [ | 
|  | 570 | +            ('<!', [('comment', '')]), | 
|  | 571 | +            ('<!-', [('comment', '-')]), | 
|  | 572 | +            ('<![', [('comment', '[')]), | 
|  | 573 | +            ('<![CDATA[', [('unknown decl', 'CDATA[')]), | 
|  | 574 | +            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]), | 
|  | 575 | +            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]), | 
|  | 576 | +            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]), | 
|  | 577 | +            ('<!DOCTYPE', [('decl', 'DOCTYPE')]), | 
|  | 578 | +            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]), | 
|  | 579 | +            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]), | 
|  | 580 | +            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]), | 
|  | 581 | +            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]), | 
|  | 582 | +            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]), | 
|  | 583 | +            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo', | 
|  | 584 | +             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]), | 
|  | 585 | +        ] | 
|  | 586 | +        for html, expected in data: | 
|  | 587 | +            self._run_check(html, expected) | 
|  | 588 | + | 
|  | 589 | +    def test_bogus_comments(self): | 
|  | 590 | +        html = ('<!ELEMENT br EMPTY>' | 
|  | 591 | +                '<! not really a comment >' | 
| 542 | 592 |                 '<! not a comment either -->' | 
| 543 | 593 |                 '<! -- close enough -->' | 
| 544 | 594 |                 '<!><!<-- this was an empty comment>' | 
| 545 | 595 |                 '<!!! another bogus comment !!!>') | 
| 546 | 596 |         expected = [ | 
|  | 597 | +            ('comment', 'ELEMENT br EMPTY'), | 
| 547 | 598 |             ('comment', ' not really a comment '), | 
| 548 | 599 |             ('comment', ' not a comment either --'), | 
| 549 | 600 |             ('comment', ' -- close enough --'), | 
| @@ -598,6 +649,26 @@ def test_convert_charrefs_dropped_text(self): | 
| 598 | 649 |              ('endtag', 'a'), ('data', ' bar & baz')] | 
| 599 | 650 |         ) | 
| 600 | 651 | 
 | 
|  | 652 | +    @support.requires_resource('cpu') | 
|  | 653 | +    def test_eof_no_quadratic_complexity(self): | 
|  | 654 | +        # Each of these examples used to take about an hour. | 
|  | 655 | +        # Now they take a fraction of a second. | 
|  | 656 | +        def check(source): | 
|  | 657 | +            parser = html.parser.HTMLParser() | 
|  | 658 | +            parser.feed(source) | 
|  | 659 | +            parser.close() | 
|  | 660 | +        n = 120_000 | 
|  | 661 | +        check("<a " * n) | 
|  | 662 | +        check("<a a=" * n) | 
|  | 663 | +        check("</a " * 14 * n) | 
|  | 664 | +        check("</a a=" * 11 * n) | 
|  | 665 | +        check("<!--" * 4 * n) | 
|  | 666 | +        check("<!" * 60 * n) | 
|  | 667 | +        check("<?" * 19 * n) | 
|  | 668 | +        check("</$" * 15 * n) | 
|  | 669 | +        check("<![CDATA[" * 9 * n) | 
|  | 670 | +        check("<!doctype" * 35 * n) | 
|  | 671 | + | 
| 601 | 672 | 
 | 
| 602 | 673 | class AttributesTestCase(TestCaseBase): | 
| 603 | 674 | 
 | 
|  | 
0 commit comments