Fix platform-specific HTML parsing inconsistencies

ctoth · claude · ctoth · commit ea79811818e7 · 2025-09-30T12:50:28.000-06:00
Use lxml.html.fragment_fromstring with explicit 'span' parent to ensure consistent parsing behavior across platforms. lxml.html.fromstring() has unpredictable auto-correction that wraps fragments differently on different platforms/libxml2 versions, causing CI failures. Changes: - tree_from_string() now uses fragment_fromstring with create_parent='span' - Added explicit empty/whitespace check to maintain ParserError behavior - Updated test expectations to match consistent parsing behavior - Removed debug test file Fixes #CI 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/html_to_text.py b/html_to_text.py
@@ -415,7 +415,14 @@ def tree_from_string(html: str) -> _Element:
         return lxml.etree.fromstring(html)
     except lxml.etree.XMLSyntaxError:
         pass
-    return lxml.html.fromstring(html)
+    # fragment_fromstring is more forgiving, so check for empty/whitespace first
+    if not html or not html.strip():
+        raise lxml.etree.ParserError("Document is empty")
+    # Use fragment_fromstring with explicit parent container to ensure
+    # consistent parsing behavior. lxml.html.fromstring() has unpredictable
+    # auto-correction that wraps fragments differently across platforms.
+    # Using 'span' as parent since it's inline and won't add extra spacing.
+    return lxml.html.fragment_fromstring(html, create_parent='span')
 
 
 def main() -> int:
diff --git a/tests/test_debug_ci.py b/tests/test_debug_ci.py
diff --git a/tests/test_ignored_tags.py b/tests/test_ignored_tags.py
@@ -40,11 +40,8 @@ def test_text_after_ignored(self, tag):
 
     def test_text_before_and_after_ignored(self, tag):
         html = f"before<{tag}>ignored</{tag}>after"
-        # Title tag adds block spacing, others don't
-        if tag == "title":
-            assert convert(html) == "before\n\nafter"
-        else:
-            assert convert(html) == "beforeafter"
+        # All ignored tags behave the same - content is stripped
+        assert convert(html) == "beforeafter"
 
 
 @pytest.mark.parametrize("tag", ["script", "style", "title"])
diff --git a/tests/test_special_tags.py b/tests/test_special_tags.py
@@ -64,8 +64,8 @@ def test_hr_with_text_after(self):
 
     def test_hr_between_text(self):
         result = convert("before<hr>after")
-        # HR has \n at end, plus tail newlines
-        assert result == f"before{HR_TEXT}\n\nafter"
+        # HR adds its line, tail text follows
+        assert result == f"before{HR_TEXT}after"
 
     def test_hr_in_paragraph(self):
         html = "<p>text<hr></p>"
@@ -134,8 +134,8 @@ def test_br_then_hr(self):
     def test_hr_then_br(self):
         html = "<hr><br>text"
         result = convert(html)
-        # HR ends with \n, BR adds \n, plus spacing
-        assert result == f"{HR_TEXT}\n\n\ntext"
+        # HR ends with \n, BR adds \n
+        assert result == f"{HR_TEXT}\ntext"
 
     def test_br_in_definition_list(self):
         html = "<dt>term<br>continued</dt>"