Skip to content

Commit e94e699

Browse files
authored
Merge pull request #179 from AngelikiBoura/master
Fix issue #162 add different regex pattern to search for meta tags
2 parents 821dfe5 + a0a0b0d commit e94e699

File tree

5 files changed

+40
-91
lines changed

pylintrc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ disable=bad-continuation,
77
consider-using-in,
88
expression-not-assigned,
99
fixme,
10+
implicit-str-concat,
1011
import-error,
1112
import-outside-toplevel,
1213
inconsistent-return-statements,
@@ -31,8 +32,10 @@ disable=bad-continuation,
3132
trailing-newlines,
3233
trailing-whitespace,
3334
unidiomatic-typecheck,
35+
unnecessary-lambda-assignment,
3436
unreachable,
3537
unused-argument,
3638
unused-variable,
39+
useless-option-value,
3740
wrong-import-order,
3841
wrong-import-position

tests/test_html.py

Lines changed: 27 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -73,54 +73,18 @@ def test_browser_hack(self):
7373

7474
def test_missing_semicolon(self):
7575
for entity, result in (
76-
(
77-
"&lt&lt!",
78-
"<<!",
79-
),
80-
(
81-
"&LT!",
82-
"<!",
83-
),
84-
(
85-
"&#X41 ",
86-
"A ",
87-
),
88-
(
89-
"&#x41!",
90-
"A!",
91-
),
92-
(
93-
"&#x41h",
94-
"Ah",
95-
),
96-
(
97-
"&#65!",
98-
"A!",
99-
),
100-
(
101-
"&#65x",
102-
"Ax",
103-
),
104-
(
105-
"&sup3!",
106-
"\u00B3!",
107-
),
108-
(
109-
"&Aacute!",
110-
"\u00C1!",
111-
),
112-
(
113-
"&#9731!",
114-
"\u2603!",
115-
),
116-
(
117-
"&#153",
118-
"\u2122",
119-
),
120-
(
121-
"&#x99",
122-
"\u2122",
123-
),
76+
("&lt&lt!", "<<!"),
77+
("&LT!", "<!"),
78+
("&#X41 ", "A "),
79+
("&#x41!", "A!"),
80+
("&#x41h", "Ah"),
81+
("&#65!", "A!"),
82+
("&#65x", "Ax"),
83+
("&sup3!", "\u00B3!"),
84+
("&Aacute!", "\u00C1!"),
85+
("&#9731!", "\u2603!"),
86+
("&#153", "\u2122"),
87+
("&#x99", "\u2122"),
12488
):
12589
self.assertEqual(replace_entities(entity, encoding="cp1252"), result)
12690
self.assertEqual(
@@ -203,16 +167,7 @@ def test_returns_unicode(self):
203167
def test_remove_tags_without_tags(self):
204168
# text without tags
205169
self.assertEqual(remove_tags("no tags"), "no tags")
206-
self.assertEqual(
207-
remove_tags(
208-
"no tags",
209-
which_ones=(
210-
"p",
211-
"b",
212-
),
213-
),
214-
"no tags",
215-
)
170+
self.assertEqual(remove_tags("no tags", which_ones=("p", "b")), "no tags")
216171

217172
def test_remove_tags(self):
218173
# text with tags
@@ -294,14 +249,7 @@ def test_without_tags(self):
294249
# text without tags
295250
self.assertEqual(remove_tags_with_content("no tags"), "no tags")
296251
self.assertEqual(
297-
remove_tags_with_content(
298-
"no tags",
299-
which_ones=(
300-
"p",
301-
"b",
302-
),
303-
),
304-
"no tags",
252+
remove_tags_with_content("no tags", which_ones=("p", "b")), "no tags"
305253
)
306254

307255
def test_with_tags(self):
@@ -340,28 +288,10 @@ def test_returns_unicode(self):
340288
assert isinstance(replace_escape_chars(b"no ec"), str)
341289
assert isinstance(replace_escape_chars(b"no ec", replace_by="str"), str)
342290
assert isinstance(replace_escape_chars(b"no ec", replace_by="str"), str)
343-
assert isinstance(
344-
replace_escape_chars(
345-
b"no ec",
346-
which_ones=(
347-
"\n",
348-
"\t",
349-
),
350-
),
351-
str,
352-
)
291+
assert isinstance(replace_escape_chars(b"no ec", which_ones=("\n", "\t")), str)
353292
assert isinstance(replace_escape_chars("no ec"), str)
354293
assert isinstance(replace_escape_chars("no ec", replace_by="str"), str)
355-
assert isinstance(
356-
replace_escape_chars(
357-
"no ec",
358-
which_ones=(
359-
"\n",
360-
"\t",
361-
),
362-
),
363-
str,
364-
)
294+
assert isinstance(replace_escape_chars("no ec", which_ones=("\n", "\t")), str)
365295

366296
def test_without_escape_chars(self):
367297
# text without escape chars
@@ -669,3 +599,14 @@ def test_inside_script(self):
669599
get_meta_refresh(body, baseurl, ignore_tags=()),
670600
(0.0, "http://example.org/foobar_required"),
671601
)
602+
603+
def test_redirections_in_different_ordering__in_meta_tag(self):
604+
baseurl = "http://localhost:8000"
605+
url1 = '<html><head><meta http-equiv="refresh" content="0;url=dummy.html"></head></html>'
606+
url2 = '<html><head><meta content="0;url=dummy.html" http-equiv="refresh"></head></html>'
607+
self.assertEqual(
608+
get_meta_refresh(url1, baseurl), (0.0, "http://localhost:8000/dummy.html")
609+
)
610+
self.assertEqual(
611+
get_meta_refresh(url2, baseurl), (0.0, "http://localhost:8000/dummy.html")
612+
)

tox.ini

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,13 @@ commands =
4242
[testenv:pylint]
4343
deps =
4444
{[testenv]deps}
45-
pylint
45+
pylint==2.14.2
4646
commands =
4747
pylint conftest.py docs setup.py tests w3lib
4848

4949
[testenv:black]
5050
deps =
51-
black
51+
black==22.3.0
5252
commands =
5353
black --check {posargs:conftest.py setup.py tests w3lib}
5454

w3lib/html.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@
2121
r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)',
2222
re.DOTALL | re.IGNORECASE,
2323
)
24+
_meta_refresh_re2 = re.compile(
25+
r'<meta\s[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)[^>]*?\shttp-equiv\s*=[^>]*refresh',
26+
re.DOTALL | re.IGNORECASE,
27+
)
28+
2429
_cdata_re = re.compile(
2530
r"((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))", re.DOTALL
2631
)
@@ -228,7 +233,7 @@ def remove_tags_with_content(
228233

229234
utext = to_unicode(text, encoding)
230235
if which_ones:
231-
tags = "|".join([fr"<{tag}\b.*?</{tag}>|<{tag}\s*/>" for tag in which_ones])
236+
tags = "|".join([rf"<{tag}\b.*?</{tag}>|<{tag}\s*/>" for tag in which_ones])
232237
retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
233238
utext = retags.sub("", utext)
234239
return utext
@@ -338,7 +343,7 @@ def get_meta_refresh(
338343
raise
339344
utext = remove_tags_with_content(utext, ignore_tags)
340345
utext = remove_comments(replace_entities(utext))
341-
m = _meta_refresh_re.search(utext)
346+
m = _meta_refresh_re.search(utext) or _meta_refresh_re2.search(utext)
342347
if m:
343348
interval = float(m.group("int"))
344349
url = safe_url_string(m.group("url").strip(" \"'"), encoding)

w3lib/url.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult:
446446
if m:
447447
attribute, value, value_quoted = m.groups()
448448
if value_quoted:
449-
value = re.sub(br"\\(.)", rb"\1", value_quoted)
449+
value = re.sub(rb"\\(.)", rb"\1", value_quoted)
450450
media_type_params[attribute.decode()] = value.decode()
451451
uri = uri[m.end() :]
452452
else:

0 commit comments

Comments
 (0)