security: Remove xslt_path and harden XML parsers in HTMLSectionSplitter: package: langchain-text-splitters (#31819)

ColeMurray · web-flow · commit 43eef435505a · 2025-07-02T15:24:08.000-04:00
## Summary
- Removes the `xslt_path` parameter from `HTMLSectionSplitter` to
eliminate the XXE attack vector
- Hardens XML/HTML parsers with secure configurations to prevent XXE
attacks
- Adds comprehensive security tests to ensure the vulnerability is fixed

  ## Context
This PR addresses a critical XXE vulnerability discovered in the
HTMLSectionSplitter component. The vulnerability allowed attackers to:
- Read sensitive local files (SSH keys, passwords, configuration files)
- Perform Server-Side Request Forgery (SSRF) attacks
- Exfiltrate data to attacker-controlled servers

  ## Changes Made
1. **Removed `xslt_path` parameter** - This eliminates the primary
attack vector where users could supply malicious XSLT files
2. **Hardened XML parsers** - Added security configurations to prevent
XXE attacks even with the default XSLT:
     - `no_network=True` - Blocks network access
     - `resolve_entities=False` - Prevents entity expansion
     - `load_dtd=False` - Disables DTD processing
     - `XSLTAccessControl.DENY_ALL` - Blocks all file/network I/O in XSLT
       transformations

3. **Added security tests** - New test file `test_html_security.py` with
comprehensive tests for various XXE attack vectors
4. **Updated existing tests** - Modified tests that were using the
removed `xslt_path` parameter

  ## Test Plan
  - [x] All existing tests pass
  - [x] New security tests verify XXE attacks are blocked
  - [x] Code passes linting and formatting checks
  - [x] Tested with both old and new versions of lxml


Twitter handle: @_colemurray
diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py
@@ -309,7 +309,6 @@ class HTMLSectionSplitter:
     def __init__(
         self,
         headers_to_split_on: List[Tuple[str, str]],
-        xslt_path: Optional[str] = None,
         **kwargs: Any,
     ) -> None:
         """Create a new HTMLSectionSplitter.
@@ -318,20 +317,13 @@ def __init__(
             headers_to_split_on: list of tuples of headers we want to track mapped to
                 (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                 h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2"].
-            xslt_path: path to xslt file for document transformation.
-            Uses a default if not passed.
-            Needed for html contents that using different format and layouts.
             **kwargs (Any): Additional optional arguments for customizations.
 
         """
         self.headers_to_split_on = dict(headers_to_split_on)
-
-        if xslt_path is None:
-            self.xslt_path = (
-                pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"
-            ).absolute()
-        else:
-            self.xslt_path = pathlib.Path(xslt_path).absolute()
+        self.xslt_path = (
+            pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"
+        ).absolute()
         self.kwargs = kwargs
 
     def split_documents(self, documents: Iterable[Document]) -> List[Document]:
@@ -457,11 +449,20 @@ def convert_possible_tags_to_header(self, html_content: str) -> str:
                 "Unable to import lxml, please install with `pip install lxml`."
             ) from e
         # use lxml library to parse html document and return xml ElementTree
-        parser = etree.HTMLParser()
-        tree = etree.parse(StringIO(html_content), parser)
+        # Create secure parsers to prevent XXE attacks
+        html_parser = etree.HTMLParser(no_network=True)
+        xslt_parser = etree.XMLParser(
+            resolve_entities=False, no_network=True, load_dtd=False
+        )
+
+        # Apply XSLT access control to prevent file/network access
+        # DENY_ALL is a predefined access control that blocks all file/network access
+        # Type ignore needed due to incomplete lxml type stubs
+        ac = etree.XSLTAccessControl.DENY_ALL  # type: ignore[attr-defined]
 
-        xslt_tree = etree.parse(self.xslt_path)
-        transform = etree.XSLT(xslt_tree)
+        tree = etree.parse(StringIO(html_content), html_parser)
+        xslt_tree = etree.parse(self.xslt_path, xslt_parser)
+        transform = etree.XSLT(xslt_tree, access_control=ac)
         result = transform(tree)
         return str(result)
 
diff --git a/libs/text-splitters/tests/unit_tests/test_html_security.py b/libs/text-splitters/tests/unit_tests/test_html_security.py
@@ -0,0 +1,130 @@
+"""Security tests for HTML splitters to prevent XXE attacks."""
+
+import pytest
+
+from langchain_text_splitters.html import HTMLSectionSplitter
+
+
+@pytest.mark.requires("lxml", "bs4")
+class TestHTMLSectionSplitterSecurity:
+    """Security tests for HTMLSectionSplitter to ensure XXE prevention."""
+
+    def test_xxe_entity_attack_blocked(self) -> None:
+        """Test that external entity attacks are blocked."""
+        # Create HTML content to process
+        html_content = """<html><body><p>Test content</p></body></html>"""
+
+        # Since xslt_path parameter is removed, this attack vector is eliminated
+        # The splitter should use only the default XSLT
+        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
+
+        # Process the HTML - should not contain any external entity content
+        result = splitter.split_text(html_content)
+
+        # Verify that no external entity content is present
+        all_content = " ".join([doc.page_content for doc in result])
+        assert "root:" not in all_content  # /etc/passwd content
+        assert "XXE Attack Result" not in all_content
+
+    def test_xxe_document_function_blocked(self) -> None:
+        """Test that XSLT document() function attacks are blocked."""
+        # Even if someone modifies the default XSLT internally,
+        # the secure parser configuration should block document() attacks
+
+        html_content = (
+            """<html><body><h1>Test Header</h1><p>Test content</p></body></html>"""
+        )
+
+        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
+
+        # Process the HTML safely
+        result = splitter.split_text(html_content)
+
+        # Should process normally without any security issues
+        assert len(result) > 0
+        assert any("Test content" in doc.page_content for doc in result)
+
+    def test_secure_parser_configuration(self) -> None:
+        """Test that parsers are configured with security settings."""
+        # This test verifies our security hardening is in place
+        html_content = """<html><body><h1>Test</h1></body></html>"""
+
+        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
+
+        # The convert_possible_tags_to_header method should use secure parsers
+        result = splitter.convert_possible_tags_to_header(html_content)
+
+        # Result should be valid transformed HTML
+        assert result is not None
+        assert isinstance(result, str)
+
+    def test_no_network_access(self) -> None:
+        """Test that network access is blocked in parsers."""
+        # Create HTML that might trigger network access
+        html_with_external_ref = """<?xml version="1.0"?>
+<!DOCTYPE html [
+  <!ENTITY external SYSTEM "http://attacker.com/xxe">
+]>
+<html>
+  <body>
+    <h1>Test</h1>
+    <p>&external;</p>
+  </body>
+</html>"""
+
+        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
+
+        # Process the HTML - should not make network requests
+        result = splitter.split_text(html_with_external_ref)
+
+        # Verify no external content is included
+        all_content = " ".join([doc.page_content for doc in result])
+        assert "attacker.com" not in all_content
+
+    def test_dtd_processing_disabled(self) -> None:
+        """Test that DTD processing is disabled."""
+        # HTML with DTD that attempts to define entities
+        html_with_dtd = """<!DOCTYPE html [
+  <!ELEMENT html (body)>
+  <!ELEMENT body (h1, p)>
+  <!ELEMENT h1 (#PCDATA)>
+  <!ELEMENT p (#PCDATA)>
+  <!ENTITY test "This is a test entity">
+]>
+<html>
+  <body>
+    <h1>Header</h1>
+    <p>&test;</p>
+  </body>
+</html>"""
+
+        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
+
+        # Process the HTML - entities should not be resolved
+        result = splitter.split_text(html_with_dtd)
+
+        # The entity should not be expanded
+        all_content = " ".join([doc.page_content for doc in result])
+        assert "This is a test entity" not in all_content
+
+    def test_safe_default_xslt_usage(self) -> None:
+        """Test that the default XSLT file is used safely."""
+        # Test with HTML that has font-size styling (what the default XSLT handles)
+        html_with_font_size = """<html>
+<body>
+    <span style="font-size: 24px;">Large Header</span>
+    <p>Content under large text</p>
+    <span style="font-size: 18px;">Small Header</span>
+    <p>Content under small text</p>
+</body>
+</html>"""
+
+        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
+
+        # Process the HTML using the default XSLT
+        result = splitter.split_text(html_with_font_size)
+
+        # Should successfully process the content
+        assert len(result) > 0
+        # Large font text should be converted to header
+        assert any("Large Header" in str(doc.metadata.values()) for doc in result)
diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -3,7 +3,6 @@
 import random
 import re
 import string
-from pathlib import Path
 from typing import Any, Callable, List, Tuple
 
 import pytest
@@ -2865,37 +2864,6 @@ def test_happy_path_splitting_based_on_header_with_whitespace_chars() -> None:
     assert docs[2].metadata["Header 2"] == "Baz"
 
 
-@pytest.mark.requires("bs4")
-@pytest.mark.requires("lxml")
-def test_section_splitter_accepts_a_relative_path() -> None:
-    html_string = """<html><body><p>Foo</p></body></html>"""
-    test_file = Path("tests/test_data/test_splitter.xslt")
-    assert test_file.is_file()
-
-    sec_splitter = HTMLSectionSplitter(
-        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
-        xslt_path=test_file.as_posix(),
-    )
-
-    sec_splitter.split_text(html_string)
-
-
-@pytest.mark.requires("bs4")
-@pytest.mark.requires("lxml")
-def test_section_splitter_accepts_an_absolute_path() -> None:
-    html_string = """<html><body><p>Foo</p></body></html>"""
-    test_file = Path("tests/test_data/test_splitter.xslt").absolute()
-    assert test_file.is_absolute()
-    assert test_file.is_file()
-
-    sec_splitter = HTMLSectionSplitter(
-        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
-        xslt_path=test_file.as_posix(),
-    )
-
-    sec_splitter.split_text(html_string)
-
-
 @pytest.mark.requires("bs4")
 @pytest.mark.requires("lxml")
 def test_happy_path_splitting_with_duplicate_header_tag() -> None: