Skip to content

Commit 454edc7

Browse files
committed
Python HTMLParser automatically converts &lt; and &gt; to '<' and '>'... sigh. Convert them back. With Tests.
1 parent a8b38e2 commit 454edc7

File tree

2 files changed

+29
-0
lines changed

2 files changed

+29
-0
lines changed

AdvancedHTMLParser/Parser.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,8 @@ def handle_data(self, data):
133133
Internal for parsing
134134
'''
135135
if data:
136+
# Python's HTMLParser hands us text in which &lt; and &gt; have already been decoded to '<' and '>'; re-escape them so they are retained in the output.
137+
data = data.replace('<', '&lt;').replace('>', '&gt;')
136138
if len(self.inTag) > 0:
137139
self.inTag[-1].appendText(data)
138140
elif data.strip(): #and not self.getRoot():
Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env GoodTests.py
2+
'''
3+
Test that we retain &lt; and &gt;
4+
'''
5+
6+
import sys
7+
import tempfile
8+
import subprocess
9+
10+
from AdvancedHTMLParser.Parser import AdvancedHTMLParser
11+
12+
13+
class TestRefTag(object):
    '''
        Checks that the escaped entities &lt; and &gt; inside text are still
        escaped after parsing a document and serializing it again.
    '''

    def test_refTag(self):
        '''
            Parse a paragraph whose text contains &lt;html&gt; and assert that
            getHTML() emits the entities rather than a literal <html>.
        '''
        source = """<html><body><p>This is &lt;html&gt;</p></body></html>"""

        htmlParser = AdvancedHTMLParser()
        htmlParser.parseStr(source)

        # Normalize the serialized output before matching: drop newlines and
        # turn every 'html ' into 'html'.
        rendered = htmlParser.getHTML()
        rendered = rendered.replace('\n', '')
        rendered = rendered.replace('html ', 'html')

        # Both checks report the same message, including the normalized output.
        failMessage = 'Expected to retain &lt; and &gt;, got %s' %(rendered,)

        assert 'This is <html>' not in rendered, failMessage
        assert 'This is &lt;html&gt;' in rendered, failMessage
24+
25+
26+
if __name__ == '__main__':
    # When executed directly, hand this file to the GoodTests.py runner.
    #
    # The command is passed as an argument list (no shell), so a script path
    # containing spaces, quotes or shell metacharacters reaches the runner
    # unchanged instead of being re-parsed by a shell.
    #
    # The runner's exit status becomes this script's exit status rather than
    # being stored in an unused variable and dropped.
    sys.exit(subprocess.call(['GoodTests.py', sys.argv[0]]))

0 commit comments

Comments
 (0)