Skip to content

Commit 91bafd3

Browse files
committed
Make our enhanced fromstring function accept bytes as well
Fixes: #21
1 parent b8a21a2 commit 91bafd3

File tree

2 files changed

+13
-3
lines changed

2 files changed

+13
-3
lines changed

lxml_html_clean/clean.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -90,15 +90,19 @@ def _has_javascript_scheme(s):
9090
# - 0A - Line Feed
9191
# - 0B - Vertical tab
9292
# - 0D - Carriage Return
93-
_ascii_control_characters = re.compile(r"[\x00-\x08\x0C\x0E-\x1F\x7F]")
93+
_ascii_control_characters_str = re.compile("[\x00-\x08\x0C\x0E-\x1F\x7F]")
94+
_ascii_control_characters_bytes = re.compile(b"[\x00-\x08\x0C\x0E-\x1F\x7F]")
9495

9596

96-
def fromstring(string):
97+
def fromstring(data):
9798
"""
9899
Enhanced fromstring function that removes ASCII control chars
99100
before passing the input to the original lxml.html.fromstring.
100101
"""
101-
return lxml_fromstring(_ascii_control_characters.sub("", string))
102+
if isinstance(data, bytes):
103+
return lxml_fromstring(_ascii_control_characters_bytes.sub(b"", data))
104+
else:
105+
return lxml_fromstring(_ascii_control_characters_str.sub("", data))
102106

103107

104108
# This regular expression is inspired by the one in urllib3.

tests/test_clean.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -355,6 +355,12 @@ def test_ascii_control_chars_removed(self):
355355
cleaner = Cleaner()
356356
self.assertEqual(expected, cleaner.clean_html(html))
357357

358+
def test_ascii_control_chars_removed_from_bytes(self):
359+
html = b"""<a href="java\x1bscript:alert()">Link</a>"""
360+
expected = b"""<a href="">Link</a>"""
361+
cleaner = Cleaner()
362+
self.assertEqual(expected, cleaner.clean_html(html))
363+
358364
def test_memory_usage_many_elements_with_long_tails(self):
359365
comment = "<!-- foo bar baz -->\n"
360366
empty_line = "\t" * 10 + "\n"

0 commit comments

Comments
 (0)