Skip to content

Commit 4eb09b1

Browse files
committed
Handle comments, char refs, etc. before and after root node.
1 parent 93bc290 commit 4eb09b1

File tree

2 files changed

+51
-0
lines changed

2 files changed

+51
-0
lines changed

AdvancedHTMLParser/Parser.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,20 +145,26 @@ def handle_entityref(self, entity):
145145
'''
146146
if len(self.inTag) > 0:
147147
self.inTag[-1].appendText('&%s;' %(entity,))
148+
else:
149+
raise MultipleRootNodeException()
148150

149151
def handle_charref(self, charRef):
150152
'''
151153
Internal for parsing
152154
'''
153155
if len(self.inTag) > 0:
154156
self.inTag[-1].appendText('&#%s;' %(charRef,))
157+
else:
158+
raise MultipleRootNodeException()
155159

156160
def handle_comment(self, comment):
157161
'''
158162
Internal for parsing
159163
'''
160164
if len(self.inTag) > 0:
161165
self.inTag[-1].appendText('<!-- %s -->' %(comment,))
166+
else:
167+
raise MultipleRootNodeException()
162168

163169
def handle_decl(self, decl):
164170
'''

tests/AdvancedHTMLParserTests/test_untaggedText.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,51 @@ def test_textBeforeAndAfterRoot(self):
7676
assert strippedHTML.startswith('Hello') , 'Expected text before root tag to be retained, got "%s"' %(strippedHTML,)
7777
assert strippedHTML.endswith('World') , 'Expected text after root tag to be retained, got "%s"' %(strippedHTML,)
7878

79+
def test_commentRetained(self):
80+
html = """<html>
81+
<!-- CommentX -->
82+
<body><span>Hello</span></body></html>"""
83+
84+
parser = AdvancedHTMLParser()
85+
parser.parseStr(html)
86+
87+
retHTML = parser.getHTML()
88+
89+
assert 'CommentX' in retHTML, 'Expected to find comment, "CommentX" in returned HTML: "%s"' %(retHTML,)
90+
91+
def test_commentRetainedPriorRoot(self):
92+
html = """<!-- CommentX --><html>
93+
<body><span>Hello</span></body></html>"""
94+
95+
parser = AdvancedHTMLParser()
96+
parser.parseStr(html)
97+
98+
retHTML = parser.getHTML()
99+
100+
assert 'CommentX' in retHTML, 'Expected to find comment, "CommentX" in returned HTML: "%s"' %(retHTML,)
101+
102+
def test_commentRetainedAfterRoot(self):
103+
html = """<html>
104+
<body><span>Hello</span></body></html><!-- CommentX -->"""
105+
106+
parser = AdvancedHTMLParser()
107+
parser.parseStr(html)
108+
109+
retHTML = parser.getHTML()
110+
111+
assert 'CommentX' in retHTML, 'Expected to find comment, "CommentX" in returned HTML: "%s"' %(retHTML,)
112+
113+
def test_commentRetainedBeforeAndAfterRoot(self):
114+
html = """<!-- CommentX --><html>
115+
<body><span>Hello</span></body></html><!-- CommentY -->"""
116+
117+
parser = AdvancedHTMLParser()
118+
parser.parseStr(html)
119+
120+
retHTML = parser.getHTML()
121+
122+
assert 'CommentX' in retHTML, 'Expected to find comment, "CommentX" in returned HTML: "%s"' %(retHTML,)
123+
assert 'CommentY' in retHTML, 'Expected to find comment, "CommentY" in returned HTML: "%s"' %(retHTML,)
79124

80125

81126

0 commit comments

Comments
 (0)