Fix full HTML document parsing with HTML entities

ctoth · claude · ctoth · commit fcb681af46cf · 2025-10-03T20:42:15.000-06:00
Detect full HTML documents (starting with <!doctype or <html) and parse them with lxml.html.fromstring() instead of fragment_fromstring(). This preserves document structure for XHTML files containing HTML entities such as &nbsp;, which cause strict XML parsing to fail. Fragment parsing with a span wrapper is still used for actual fragments, preserving the platform-consistency fix from ea79811. Fixes EPUB parsing, where XHTML documents were being wrapped in a span, which broke body element detection. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/html_to_text.py b/html_to_text.py
@@ -523,11 +523,23 @@ def tree_from_string(html: str) -> _Element:
     # fragment_fromstring is more forgiving, so check for empty/whitespace first
     if not html or not html.strip():
         raise lxml.etree.ParserError("Document is empty")
-    # Use fragment_fromstring with explicit parent container to ensure
-    # consistent parsing behavior. lxml.html.fromstring() has unpredictable
-    # auto-correction that wraps fragments differently across platforms.
-    # Using 'span' as parent since it's inline and won't add extra spacing.
-    return lxml.html.fragment_fromstring(html, create_parent="span")
+
+    # Detect if this is a full HTML document vs a fragment
+    html_stripped = html.strip()
+    is_full_document = (
+        html_stripped.lower().startswith('<!doctype') or
+        html_stripped.lower().startswith('<html')
+    )
+
+    if is_full_document:
+        # Full HTML documents should be parsed as documents to preserve structure
+        return lxml.html.fromstring(html)
+    else:
+        # Use fragment_fromstring with explicit parent container to ensure
+        # consistent parsing behavior. lxml.html.fromstring() has unpredictable
+        # auto-correction that wraps fragments differently across platforms.
+        # Using 'span' as parent since it's inline and won't add extra spacing.
+        return lxml.html.fragment_fromstring(html, create_parent="span")
 
 
 def main() -> int: