Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions setup.cfg

This file was deleted.

1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
include_package_data=True,
zip_safe=False,
platforms=["Any"],
python_requires=">=3.6",
classifiers=[
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: BSD License",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def _assert_encoding(self, content_type, body, expected_encoding, expected_unico
else:
self.assertTrue(
body_unicode in expected_unicode,
"%s is not in %s" % (body_unicode, expected_unicode),
f"{body_unicode} is not in {expected_unicode}",
)

def test_content_type_and_conversion(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def test_missing_semicolon(self):
):
self.assertEqual(replace_entities(entity, encoding="cp1252"), result)
self.assertEqual(
replace_entities("x%sy" % entity, encoding="cp1252"), "x%sy" % result
replace_entities(f"x{entity}y", encoding="cp1252"), f"x{result}y"
)

def test_encoding(self):
Expand Down
18 changes: 5 additions & 13 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,12 +266,8 @@ def test_safe_url_idna_encoding_failure(self):

# DNS label too long
self.assertEqual(
safe_url_string(
"http://www.{label}.com/résumé?q=résumé".format(label="example" * 11)
),
"http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
label="example" * 11
),
safe_url_string(f"http://www.{'example' * 11}.com/résumé?q=résumé"),
f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
)

def test_safe_url_port_number(self):
Expand Down Expand Up @@ -971,12 +967,8 @@ def test_canonicalize_url_idna_exceptions(self):

# DNS label too long
self.assertEqual(
canonicalize_url(
"http://www.{label}.com/résumé?q=résumé".format(label="example" * 11)
),
"http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
label="example" * 11
),
canonicalize_url(f"http://www.{'example' * 11}.com/résumé?q=résumé"),
f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
)

def test_preserve_nonfragment_hash(self):
Expand Down Expand Up @@ -1033,7 +1025,7 @@ def test_bytes_uri(self):

def test_unicode_uri(self):
result = parse_data_uri("data:,é")
self.assertEqual(result.data, "é".encode("utf-8"))
self.assertEqual(result.data, "é".encode())

def test_default_mediatype(self):
result = parse_data_uri("data:;charset=iso-8859-7,%be%d3%be")
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# and then run "tox" from this directory.

[tox]
envlist = py27, pypy, py35, py36, py37, py38, pypy3, docs, security, flake8, pylint, black
envlist = py36, py37, py38, pypy3, docs, security, flake8, pylint, black

[testenv]
deps =
Expand Down
12 changes: 5 additions & 7 deletions w3lib/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
Functions for handling encoding of web pages
"""
import re, codecs, encodings
from sys import version_info
from typing import Callable, Match, Optional, Tuple, Union, cast
from w3lib._types import AnyUnicodeError, StrOrBytes
from w3lib.util import to_native_str
import w3lib.util

_HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I)

Expand Down Expand Up @@ -46,6 +45,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
_XML_ENCODING_RE = _TEMPLATE % ("encoding", r"(?P<xmlcharset>[\w-]+)")

# check for meta tags, or xml decl. and stop search if a body tag is encountered
# pylint: disable=consider-using-f-string
_BODY_ENCODING_PATTERN = (
r"<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)"
% (_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
Expand Down Expand Up @@ -93,7 +93,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
or match.group("xmlcharset")
)
if encoding:
return resolve_encoding(to_native_str(encoding))
return resolve_encoding(w3lib.util.to_unicode(encoding))

return None

Expand Down Expand Up @@ -163,7 +163,7 @@ def resolve_encoding(encoding_alias: str) -> Optional[str]:
(codecs.BOM_UTF16_LE, "utf-16-le"),
(codecs.BOM_UTF8, "utf-8"),
]
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)
_FIRST_CHARS = {c[0] for (c, _) in _BOM_TABLE}


def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]:
Expand Down Expand Up @@ -208,9 +208,7 @@ def to_unicode(data_str: bytes, encoding: str) -> str:
Characters that cannot be converted will be converted to ``\\ufffd`` (the
unicode replacement character).
"""
return data_str.decode(
encoding, "replace" if version_info[0:2] >= (3, 3) else "w3lib_replace"
)
return data_str.decode(encoding, "replace")


def html_to_unicode(
Expand Down
4 changes: 1 addition & 3 deletions w3lib/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,7 @@ def remove_tags_with_content(

utext = to_unicode(text, encoding)
if which_ones:
tags = "|".join(
[r"<%s\b.*?</%s>|<%s\s*/>" % (tag, tag, tag) for tag in which_ones]
)
tags = "|".join([fr"<{tag}\b.*?</{tag}>|<{tag}\s*/>" for tag in which_ones])
retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
utext = retags.sub("", utext)
return utext
Expand Down
4 changes: 2 additions & 2 deletions w3lib/http.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from base64 import urlsafe_b64encode
from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping
from w3lib.util import to_bytes, to_native_str
from w3lib.util import to_bytes, to_unicode

HeadersDictInput = Mapping[bytes, Union[Any, Sequence]]
HeadersDictOutput = MutableMapping[bytes, List[bytes]]
Expand Down Expand Up @@ -97,7 +97,7 @@ def basic_auth_header(

"""

auth = "%s:%s" % (to_native_str(username), to_native_str(password))
auth = f"{to_unicode(username)}:{to_unicode(password)}"
# XXX: RFC 2617 doesn't define encoding, but ISO-8859-1
# seems to be the most widely used encoding here. See also:
# http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html
Expand Down
4 changes: 3 additions & 1 deletion w3lib/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def path_to_file_uri(path: str) -> str:
x = pathname2url(os.path.abspath(path))
if os.name == "nt":
x = x.replace("|", ":") # http://bugs.python.org/issue5861
return "file:///%s" % x.lstrip("/")
return f"file:///{x.lstrip('/')}"


def file_uri_to_path(uri: str) -> str:
Expand All @@ -344,6 +344,7 @@ def any_to_uri(uri_or_path: str) -> str:
_char = set(map(chr, range(127)))

# RFC 2045 token.
# pylint: disable=consider-using-f-string
_token = r"[{}]+".format(
re.escape(
"".join(
Expand All @@ -359,6 +360,7 @@ def any_to_uri(uri_or_path: str) -> str:
)

# RFC 822 quoted-string, without surrounding quotation marks.
# pylint: disable=consider-using-f-string
_quoted_string = r"(?:[{}]|(?:\\[{}]))*".format(
re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char))
)
Expand Down