Skip to content

Commit b7b6bfa

Browse files
committed
Merge 'handle text that comes before or after root tag' and related changes into AdvancedHTMLFormatter
1 parent 0b14db4 commit b7b6bfa

File tree

1 file changed

+27
-9
lines changed

1 file changed

+27
-9
lines changed

AdvancedHTMLParser/Formatter.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,15 @@ def getHTML(self):
9191
else:
9292
doctypeStr = ''
9393

94-
return doctypeStr + ''.join([elem.outerHTML for elem in self.getRootNodes()])
94+
# 6.6.0: If we have a real root tag, print the outerHTML. If we have a fake root tag (for multiple root condition),
95+
# then print the innerHTML (skipping the outer root tag). Joining only the root nodes' outerHTML would miss
96+
# untagged text (between the multiple root nodes).
97+
rootNode = self.getRoot()
98+
if rootNode.tagName == INVISIBLE_ROOT_TAG:
99+
return doctypeStr + rootNode.innerHTML
100+
else:
101+
return doctypeStr + rootNode.outerHTML
102+
# return doctypeStr + ''.join([elem.outerHTML for elem in self.getRootNodes()])
95103

96104
def getRoot(self):
97105
'''
@@ -211,35 +219,45 @@ def handle_data(self, data):
211219
'''
212220
handle_data - Internal for parsing
213221
'''
214-
if data and len(self.inTag) > 0:
215-
if self.inTag[-1].tagName not in PRESERVE_CONTENTS_TAGS:
216-
data = data.replace('\t', ' ').strip('\r\n')
217-
if data.startswith(' '):
218-
data = ' ' + data.lstrip()
219-
if data.endswith(' '):
220-
data = data.rstrip() + ' '
221-
self.inTag[-1].appendText(data)
222+
if data:
223+
if len(self.inTag) > 0:
224+
if self.inTag[-1].tagName not in PRESERVE_CONTENTS_TAGS:
225+
data = data.replace('\t', ' ').strip('\r\n')
226+
if data.startswith(' '):
227+
data = ' ' + data.lstrip()
228+
if data.endswith(' '):
229+
data = data.rstrip() + ' '
230+
self.inTag[-1].appendText(data)
231+
elif data.strip():
232+
# No tag is open, so this non-whitespace text must come before or after the root node
233+
raise MultipleRootNodeException()
222234

223235
def handle_entityref(self, entity):
224236
'''
225237
Internal for parsing
226238
'''
227239
if len(self.inTag) > 0:
228240
self.inTag[-1].appendText('&%s;' %(entity,))
241+
else:
242+
raise MultipleRootNodeException()
229243

230244
def handle_charref(self, charRef):
231245
'''
232246
Internal for parsing
233247
'''
234248
if len(self.inTag) > 0:
235249
self.inTag[-1].appendText('&#%s;' %(charRef,))
250+
else:
251+
raise MultipleRootNodeException()
236252

237253
def handle_comment(self, comment):
238254
'''
239255
Internal for parsing
240256
'''
241257
if len(self.inTag) > 0:
242258
self.inTag[-1].appendText('<!-- %s -->' %(comment,))
259+
else:
260+
raise MultipleRootNodeException()
243261

244262
def handle_decl(self, decl):
245263
'''

0 commit comments

Comments
 (0)