@@ -91,7 +91,15 @@ def getHTML(self):
9191 else :
9292 doctypeStr = ''
9393
94- return doctypeStr + '' .join ([elem .outerHTML for elem in self .getRootNodes ()])
94+ # 6.6.0: If we have a real root tag, print the outerHTML. If we have a fake root tag (for multiple root condition),
95+ # then print the innerHTML (skipping the outer root tag). Otherwise, we will miss
96+ # untagged text (between the multiple root nodes).
97+ rootNode = self .getRoot ()
98+ if rootNode .tagName == INVISIBLE_ROOT_TAG :
99+ return doctypeStr + rootNode .innerHTML
100+ else :
101+ return doctypeStr + rootNode .outerHTML
102+ # return doctypeStr + ''.join([elem.outerHTML for elem in self.getRootNodes()])
95103
96104 def getRoot (self ):
97105 '''
@@ -211,35 +219,45 @@ def handle_data(self, data):
211219 '''
212220 handle_data - Internal for parsing
213221 '''
214- if data and len (self .inTag ) > 0 :
215- if self .inTag [- 1 ].tagName not in PRESERVE_CONTENTS_TAGS :
216- data = data .replace ('\t ' , ' ' ).strip ('\r \n ' )
217- if data .startswith (' ' ):
218- data = ' ' + data .lstrip ()
219- if data .endswith (' ' ):
220- data = data .rstrip () + ' '
221- self .inTag [- 1 ].appendText (data )
222+ if data :
223+ if len (self .inTag ) > 0 :
224+ if self .inTag [- 1 ].tagName not in PRESERVE_CONTENTS_TAGS :
225+ data = data .replace ('\t ' , ' ' ).strip ('\r \n ' )
226+ if data .startswith (' ' ):
227+ data = ' ' + data .lstrip ()
228+ if data .endswith (' ' ):
229+ data = data .rstrip () + ' '
230+ self .inTag [- 1 ].appendText (data )
231+ elif data .strip ():
232+ # Must be text prior to or after root node
233+ raise MultipleRootNodeException ()
222234
def handle_entityref(self, entity):
    '''
        handle_entityref - Internal for parsing

        Re-serializes a named entity reference as "&<entity>;" and appends
        it, as text, to the innermost open tag (the last item of self.inTag).

        @param entity <str> - Name of the entity; the "&" prefix and ";"
          suffix are added here.

        @raises MultipleRootNodeException - When self.inTag is empty, i.e.
          there is no open tag to receive the text (presumably the reference
          lies before or after the root node).
    '''
    if len(self.inTag) == 0:
        # Nothing is open, so there is no tag that could own this text.
        raise MultipleRootNodeException()

    innermostTag = self.inTag[-1]
    innermostTag.appendText('&%s;' % (entity,))
229243
def handle_charref(self, charRef):
    '''
        handle_charref - Internal for parsing

        Re-serializes a numeric character reference as "&#<charRef>;" and
        appends it, as text, to the innermost open tag (the last item of
        self.inTag).

        @param charRef <str> - The reference value; the "&#" prefix and ";"
          suffix are added here.

        @raises MultipleRootNodeException - When self.inTag is empty, i.e.
          there is no open tag to receive the text (presumably the reference
          lies before or after the root node).
    '''
    if len(self.inTag) == 0:
        # Nothing is open, so there is no tag that could own this text.
        raise MultipleRootNodeException()

    innermostTag = self.inTag[-1]
    innermostTag.appendText('&#%s;' % (charRef,))
236252
def handle_comment(self, comment):
    '''
        handle_comment - Internal for parsing

        Re-serializes a comment as "<!-- <comment> -->" (one space is placed
        on each side of the comment text) and appends it, as text, to the
        innermost open tag (the last item of self.inTag).

        @param comment <str> - The comment contents, without the "<!--" and
          "-->" delimiters.

        @raises MultipleRootNodeException - When self.inTag is empty, i.e.
          there is no open tag to receive the comment (presumably the comment
          lies before or after the root node).
    '''
    if len(self.inTag) == 0:
        # Nothing is open, so there is no tag that could own this comment.
        raise MultipleRootNodeException()

    innermostTag = self.inTag[-1]
    innermostTag.appendText('<!-- %s -->' % (comment,))
243261
244262 def handle_decl (self , decl ):
245263 '''
0 commit comments