2 files changed: +13 -3 lines changed
@@ -90,15 +90,19 @@ def _has_javascript_scheme(s):
9090# - 0A - Line Feed
9191# - 0B - Vertical tab
9292# - 0D - Carriage Return
93- _ascii_control_characters = re .compile (r"[\x00-\x08\x0C\x0E-\x1F\x7F]" )
93+ _ascii_control_characters_str = re .compile ("[\x00-\x08\x0C\x0E-\x1F\x7F]" )
94+ _ascii_control_characters_bytes = re .compile (b"[\x00-\x08\x0C\x0E-\x1F\x7F]" )
9495
9596
96- def fromstring (string ):
97+ def fromstring (data ):
9798 """
9899 Enhanced fromstring function that removes ASCII control chars
99100 before passing the input to the original lxml.html.fromstring.
100101 """
101- return lxml_fromstring (_ascii_control_characters .sub ("" , string ))
102+ if isinstance (data , bytes ):
103+ return lxml_fromstring (_ascii_control_characters_bytes .sub (b"" , data ))
104+ else :
105+ return lxml_fromstring (_ascii_control_characters_str .sub ("" , data ))
102106
103107
104108# This regular expression is inspired by the one in urllib3.
@@ -355,6 +355,12 @@ def test_ascii_control_chars_removed(self):
355355 cleaner = Cleaner ()
356356 self .assertEqual (expected , cleaner .clean_html (html ))
357357
358+ def test_ascii_control_chars_removed_from_bytes (self ):
359+ html = b"""<a href="java\x1bscript:alert()">Link</a>"""
360+ expected = b"""<a href="">Link</a>"""
361+ cleaner = Cleaner ()
362+ self .assertEqual (expected , cleaner .clean_html (html ))
363+
358364 def test_memory_usage_many_elements_with_long_tails (self ):
359365 comment = "<!-- foo bar baz -->\n"
360366 empty_line = "\t" * 10 + "\n"
You can’t perform that action at this time.
0 commit comments