Skip to content

Commit be8ec96

Browse files
committed
Remove ASCII control characters before reading the input
libxml2 removes them during serialization, which means that an input like <a href="java\x1bscript:alert()">Link</a> contains the control character during cleaning but not after it is serialized back to a string. This effectively bypasses the checks in lxml_html_clean, so the output might be malicious even if the input is not.
1 parent 5bb0e88 commit be8ec96

File tree

2 files changed

+22
-1
lines changed

2 files changed

+22
-1
lines changed

lxml_html_clean/clean.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from lxml import etree
1414
from lxml.html import defs
15-
from lxml.html import fromstring, XHTML_NAMESPACE
15+
from lxml.html import fromstring as lxml_fromstring, XHTML_NAMESPACE
1616
from lxml.html import xhtml_to_html, _transform_result
1717

1818

@@ -83,6 +83,21 @@ def _has_javascript_scheme(s):
8383
"descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
8484
namespaces={'x':XHTML_NAMESPACE})
8585

86+
# Regex to remove all ASCII control characters (00-1F,7F) except:
# - 09 - Horizontal tab
# - 0A - Line Feed
# - 0B - Vertical tab
# - 0D - Carriage Return
# libxml2 drops these characters when serializing, so they must be removed
# before parsing; otherwise the cleaner inspects a different string than the
# one that is finally emitted.
_ascii_control_characters = re.compile(r"[\x00-\x08\x0C\x0E-\x1F\x7F]")
# Same character class, compiled for bytes input.
_ascii_control_characters_bytes = re.compile(rb"[\x00-\x08\x0C\x0E-\x1F\x7F]")


def fromstring(string):
    """
    Enhanced fromstring function that removes ASCII control chars
    before passing the input to the original lxml.html.fromstring.

    Both ``str`` and ``bytes`` input are supported; the matching pattern
    is chosen by input type so that bytes input does not fail with a
    ``TypeError`` in ``re.sub`` before it ever reaches the parser.

    :param string: HTML source as ``str`` or ``bytes``.
    :return: whatever ``lxml.html.fromstring`` returns for the sanitized
        input.
    """
    if isinstance(string, bytes):
        string = _ascii_control_characters_bytes.sub(b"", string)
    else:
        string = _ascii_control_characters.sub("", string)
    return lxml_fromstring(string)
100+
86101

87102
class Cleaner:
88103
"""

tests/test_clean.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,3 +327,9 @@ def test_host_whitelist_sneaky_userinfo(self):
327327
expected = '<div></div>'
328328
cleaner = Cleaner(frames=False, host_whitelist=["example.com"])
329329
self.assertEqual(expected, cleaner.clean_html(html))
330+
331+
def test_ascii_control_chars_removed(self):
    """An ESC character hidden inside a javascript: scheme must not survive cleaning."""
    dirty = """<a href="java\x1bscript:alert()">Link</a>"""
    cleaned = Cleaner().clean_html(dirty)
    self.assertEqual("""<a href="">Link</a>""", cleaned)

0 commit comments

Comments
 (0)