Skip to content

Commit 0b14db4

Browse files
committed
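It turns out that only Python 3's HTMLParser automatically converts character references (charrefs); Python 2's does not. Set convert_charrefs = False so the parser stops converting them internally, and undo the previous workaround that re-escaped '<' and '>' in handle_data.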
1 parent 0f5f9ff commit 0b14db4

File tree

3 files changed

+28
-2
lines changed

3 files changed

+28
-2
lines changed

AdvancedHTMLParser/Formatter.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ def __init__(self, indent=' ', encoding='utf-8'):
4242
'''
4343
HTMLParser.__init__(self)
4444

45+
# Do not automatically convert charrefs in python3
46+
self.convert_charrefs = False
47+
4548
self.parsedData = []
4649
self.reset = self._reset
4750
self.decl = None

AdvancedHTMLParser/Parser.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ def __init__(self, filename=None, encoding='utf-8'):
4444
4545
'''
4646
HTMLParser.__init__(self)
47+
# Do not automatically convert charrefs in python3
48+
self.convert_charrefs = False
4749

4850
self.encoding = encoding
4951

@@ -128,13 +130,12 @@ def handle_endtag(self, tagName):
128130
except:
129131
pass
130132

133+
131134
def handle_data(self, data):
132135
'''
133136
Internal for parsing
134137
'''
135138
if data:
136-
# Python HTMLParser so helpfully automatically replaces &lt; with < and &gt; with >.... sigh.
137-
data = data.replace('<', '&lt;').replace('>', '&gt;')
138139
if len(self.inTag) > 0:
139140
self.inTag[-1].appendText(data)
140141
elif data.strip(): #and not self.getRoot():

tests/AdvancedHTMLParserTests/test_RefTag.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,28 @@ def test_refTag(self):
2121
assert 'This is <html>' not in html, 'Expected to retain &lt; and &gt;, got %s' %(html,)
2222
assert 'This is &lt;html&gt;' in html, 'Expected to retain &lt; and &gt;, got %s' %(html,)
2323

24+
def test_nbsp(self):
25+
html = """<html><body><p>Test&nbsp;One</p></body></html>"""
26+
parser = AdvancedHTMLParser()
27+
parser.parseStr(html)
28+
29+
html = parser.getHTML().replace('\n', '').replace('html ', 'html')
30+
assert '&nbsp;' in html, '(Will fail in python2..) Expected to retain &nbsp; got %s' %(html,)
31+
32+
html = """<html><body><p>Test One</p></body></html>"""
33+
parser = AdvancedHTMLParser()
34+
parser.parseStr(html)
35+
36+
html = parser.getHTML().replace('\n', '').replace('html ', 'html')
37+
assert '&nbsp;' not in html, '(Will fail in python2..) Expected not to insert &nbsp; got %s' %(html,)
38+
39+
html = """<html><body><p>Test&nbsp;&nbsp;One</p></body></html>"""
40+
parser = AdvancedHTMLParser()
41+
parser.parseStr(html)
42+
43+
html = parser.getHTML().replace('\n', '').replace('html ', 'html')
44+
assert 'Test&nbsp;&nbsp;One' in html, '(Will fail in python2..) Expected to retain original data with two &nbsp; got %s' %(html,)
45+
2446

2547
if __name__ == '__main__':
2648
pipe = subprocess.Popen('GoodTests.py "%s"' %(sys.argv[0],), shell=True).wait()

0 commit comments

Comments
 (0)