It turns out that only Python 3's HTMLParser automatically converts charrefs; Python 2's does not. Set convert_charrefs = False so the parser stops doing that internally, and undo the previous replace of '<' and '>' with '&lt;' and '&gt;' in handle_data.

kata198 · kata198 · commit 0b14db4bc3f8 · 2016-07-27T00:34:56.000-04:00
diff --git a/AdvancedHTMLParser/Formatter.py b/AdvancedHTMLParser/Formatter.py
@@ -42,6 +42,9 @@ def __init__(self, indent='  ', encoding='utf-8'):
         '''
         HTMLParser.__init__(self)
 
+        # Do not automatically convert charrefs in python3
+        self.convert_charrefs = False
+
         self.parsedData = []
         self.reset = self._reset
         self.decl = None
diff --git a/AdvancedHTMLParser/Parser.py b/AdvancedHTMLParser/Parser.py
@@ -44,6 +44,8 @@ def __init__(self, filename=None, encoding='utf-8'):
                                             
         '''
         HTMLParser.__init__(self)
+        # Do not automatically convert charrefs in python3
+        self.convert_charrefs = False
 
         self.encoding = encoding
 
@@ -128,13 +130,12 @@ def handle_endtag(self, tagName):
         except:
             pass
 
+
     def handle_data(self, data):
         '''
             Internal for parsing
         '''
         if data:
-            # Python HTMLParser so helpfully automatically replaces &lt; with < and &gt; with >.... sigh.
-            data = data.replace('<', '&lt;').replace('>', '&gt;')
             if len(self.inTag) > 0:
                 self.inTag[-1].appendText(data)
             elif data.strip(): #and not self.getRoot():
diff --git a/tests/AdvancedHTMLParserTests/test_RefTag.py b/tests/AdvancedHTMLParserTests/test_RefTag.py
@@ -21,6 +21,28 @@ def test_refTag(self):
         assert 'This is <html>' not in html, 'Expected to retain &lt; and &gt;, got %s' %(html,)
         assert 'This is &lt;html&gt;' in html, 'Expected to retain &lt; and &gt;, got %s' %(html,)
 
+    def test_nbsp(self):
+        html = """<html><body><p>Test&nbsp;One</p></body></html>"""
+        parser = AdvancedHTMLParser()
+        parser.parseStr(html)
+
+        html = parser.getHTML().replace('\n', '').replace('html ', 'html')
+        assert '&nbsp;' in html, '(Will fail in python2..) Expected to retain &nbsp; got %s' %(html,)
+
+        html = """<html><body><p>Test One</p></body></html>"""
+        parser = AdvancedHTMLParser()
+        parser.parseStr(html)
+
+        html = parser.getHTML().replace('\n', '').replace('html ', 'html')
+        assert '&nbsp;' not in html, '(Will fail in python2..) Expected not to insert &nbsp; got %s' %(html,)
+
+        html = """<html><body><p>Test&nbsp;&nbsp;One</p></body></html>"""
+        parser = AdvancedHTMLParser()
+        parser.parseStr(html)
+
+        html = parser.getHTML().replace('\n', '').replace('html ', 'html')
+        assert 'Test&nbsp;&nbsp;One' in html, '(Will fail in python2..) Expected to retain original data with two &nbsp; got %s' %(html,)
+
 
 if __name__ == '__main__':
     pipe  = subprocess.Popen('GoodTests.py "%s"' %(sys.argv[0],), shell=True).wait()