Detect XHTML documents with XML declarations

ctoth · claude · ctoth · commit fef66a44be75 · 2025-10-03T20:49:50.000-06:00
Include `<?xml` declarations when detecting full HTML documents. XHTML files from EPUBs typically start with an XML declaration before the DOCTYPE and should be parsed as documents, not fragments. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/html_to_text.py b/html_to_text.py
@@ -529,12 +529,14 @@ def tree_from_string(html: Union[str, bytes]) -> _Element:
     if isinstance(html, bytes):
         html_stripped = html.strip()
         is_full_document = (
+            html_stripped.lower().startswith(b'<?xml') or
             html_stripped.lower().startswith(b'<!doctype') or
             html_stripped.lower().startswith(b'<html')
         )
     else:
         html_stripped = html.strip()
         is_full_document = (
+            html_stripped.lower().startswith('<?xml') or
             html_stripped.lower().startswith('<!doctype') or
             html_stripped.lower().startswith('<html')
         )