|
4 | 4 | import pprint |
5 | 5 | import unittest |
6 | 6 |
|
| 7 | +from test import support |
| 8 | + |
7 | 9 |
|
8 | 10 | class EventCollector(html.parser.HTMLParser): |
9 | 11 |
|
@@ -391,28 +393,34 @@ def test_tolerant_parsing(self): |
391 | 393 | ('data', '<'), |
392 | 394 | ('starttag', 'bc<', [('a', None)]), |
393 | 395 | ('endtag', 'html'), |
394 | | - ('data', '\n<img src="URL>'), |
395 | | - ('comment', '/img'), |
396 | | - ('endtag', 'html<')]) |
| 396 | + ('data', '\n')]) |
397 | 397 |
|
398 | 398 | def test_starttag_junk_chars(self): |
| 399 | + self._run_check("<", [('data', '<')]) |
| 400 | + self._run_check("<>", [('data', '<>')]) |
| 401 | + self._run_check("< >", [('data', '< >')]) |
| 402 | + self._run_check("< ", [('data', '< ')]) |
399 | 403 | self._run_check("</>", []) |
| 404 | + self._run_check("<$>", [('data', '<$>')]) |
400 | 405 | self._run_check("</$>", [('comment', '$')]) |
401 | 406 | self._run_check("</", [('data', '</')]) |
402 | | - self._run_check("</a", [('data', '</a')]) |
| 407 | + self._run_check("</a", []) |
| 408 | + self._run_check("</ a>", [('endtag', 'a')]) |
| 409 | + self._run_check("</ a", [('comment', ' a')]) |
403 | 410 | self._run_check("<a<a>", [('starttag', 'a<a', [])]) |
404 | 411 | self._run_check("</a<a>", [('endtag', 'a<a')]) |
405 | | - self._run_check("<!", [('data', '<!')]) |
406 | | - self._run_check("<a", [('data', '<a')]) |
407 | | - self._run_check("<a foo='bar'", [('data', "<a foo='bar'")]) |
408 | | - self._run_check("<a foo='bar", [('data', "<a foo='bar")]) |
409 | | - self._run_check("<a foo='>'", [('data', "<a foo='>'")]) |
410 | | - self._run_check("<a foo='>", [('data', "<a foo='>")]) |
| 412 | + self._run_check("<!", [('comment', '')]) |
| 413 | + self._run_check("<a", []) |
| 414 | + self._run_check("<a foo='bar'", []) |
| 415 | + self._run_check("<a foo='bar", []) |
| 416 | + self._run_check("<a foo='>'", []) |
| 417 | + self._run_check("<a foo='>", []) |
411 | 418 | self._run_check("<a$>", [('starttag', 'a$', [])]) |
412 | 419 | self._run_check("<a$b>", [('starttag', 'a$b', [])]) |
413 | 420 | self._run_check("<a$b/>", [('startendtag', 'a$b', [])]) |
414 | 421 | self._run_check("<a$b >", [('starttag', 'a$b', [])]) |
415 | 422 | self._run_check("<a$b />", [('startendtag', 'a$b', [])]) |
| 423 | + self._run_check("</a$b>", [('endtag', 'a$b')]) |
416 | 424 |
|
417 | 425 | def test_slashes_in_starttag(self): |
418 | 426 | self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])]) |
@@ -537,13 +545,56 @@ def test_EOF_in_charref(self): |
537 | 545 | for html, expected in data: |
538 | 546 | self._run_check(html, expected) |
539 | 547 |
|
540 | | - def test_broken_comments(self): |
541 | | - html = ('<! not really a comment >' |
| 548 | + def test_eof_in_comments(self): |
| 549 | + data = [ |
| 550 | + ('<!--', [('comment', '')]), |
| 551 | + ('<!---', [('comment', '')]), |
| 552 | + ('<!----', [('comment', '')]), |
| 553 | + ('<!-----', [('comment', '-')]), |
| 554 | + ('<!------', [('comment', '--')]), |
| 555 | + ('<!----!', [('comment', '')]), |
| 556 | + ('<!---!', [('comment', '-!')]), |
| 557 | + ('<!---!>', [('comment', '-!>')]), |
| 558 | + ('<!--foo', [('comment', 'foo')]), |
| 559 | + ('<!--foo-', [('comment', 'foo')]), |
| 560 | + ('<!--foo--', [('comment', 'foo')]), |
| 561 | + ('<!--foo--!', [('comment', 'foo')]), |
| 562 | + ('<!--<!--', [('comment', '<!')]), |
| 563 | + ('<!--<!--!', [('comment', '<!')]), |
| 564 | + ] |
| 565 | + for html, expected in data: |
| 566 | + self._run_check(html, expected) |
| 567 | + |
| 568 | + def test_eof_in_declarations(self): |
| 569 | + data = [ |
| 570 | + ('<!', [('comment', '')]), |
| 571 | + ('<!-', [('comment', '-')]), |
| 572 | + ('<![', [('comment', '[')]), |
| 573 | + ('<![CDATA[', [('unknown decl', 'CDATA[')]), |
| 574 | + ('<![CDATA[x', [('unknown decl', 'CDATA[x')]), |
| 575 | + ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]), |
| 576 | + ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]), |
| 577 | + ('<!DOCTYPE', [('decl', 'DOCTYPE')]), |
| 578 | + ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]), |
| 579 | + ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]), |
| 580 | + ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]), |
| 581 | + ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]), |
| 582 | + ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]), |
| 583 | + ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo', |
| 584 | + [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]), |
| 585 | + ] |
| 586 | + for html, expected in data: |
| 587 | + self._run_check(html, expected) |
| 588 | + |
| 589 | + def test_bogus_comments(self): |
| 590 | + html = ('<!ELEMENT br EMPTY>' |
| 591 | + '<! not really a comment >' |
542 | 592 | '<! not a comment either -->' |
543 | 593 | '<! -- close enough -->' |
544 | 594 | '<!><!<-- this was an empty comment>' |
545 | 595 | '<!!! another bogus comment !!!>') |
546 | 596 | expected = [ |
| 597 | + ('comment', 'ELEMENT br EMPTY'), |
547 | 598 | ('comment', ' not really a comment '), |
548 | 599 | ('comment', ' not a comment either --'), |
549 | 600 | ('comment', ' -- close enough --'), |
@@ -598,6 +649,26 @@ def test_convert_charrefs_dropped_text(self): |
598 | 649 | ('endtag', 'a'), ('data', ' bar & baz')] |
599 | 650 | ) |
600 | 651 |
|
| 652 | + @support.requires_resource('cpu') |
| 653 | + def test_eof_no_quadratic_complexity(self): |
| 654 | + # Each of these examples used to take about an hour. |
| 655 | + # Now they take a fraction of a second. |
| 656 | + def check(source): |
| 657 | + parser = html.parser.HTMLParser() |
| 658 | + parser.feed(source) |
| 659 | + parser.close() |
| 660 | + n = 120_000 |
| 661 | + check("<a " * n) |
| 662 | + check("<a a=" * n) |
| 663 | + check("</a " * 14 * n) |
| 664 | + check("</a a=" * 11 * n) |
| 665 | + check("<!--" * 4 * n) |
| 666 | + check("<!" * 60 * n) |
| 667 | + check("<?" * 19 * n) |
| 668 | + check("</$" * 15 * n) |
| 669 | + check("<![CDATA[" * 9 * n) |
| 670 | + check("<!doctype" * 35 * n) |
| 671 | + |
601 | 672 |
|
602 | 673 | class AttributesTestCase(TestCaseBase): |
603 | 674 |
|
|
0 commit comments