Commit c2878b7

Update tool versions, add pyupgrade, cleanup pylint (#234)
1 parent: fbe12ed

14 files changed (+116, -149 lines)


.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - python-version: "3.12" # Keep in sync with .readthedocs.yml
+          - python-version: "3.13" # Keep in sync with .readthedocs.yml
             env:
               TOXENV: docs
           - python-version: "3.13"

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion

@@ -32,4 +32,4 @@ jobs:
         tox -e py
 
     - name: Upload coverage report
-      run: bash <(curl -s https://codecov.io/bash)
+      uses: codecov/codecov-action@v5

.pre-commit-config.yaml

Lines changed: 6 additions & 1 deletion

@@ -1,6 +1,6 @@
 repos:
 - repo: https://github.com/PyCQA/bandit
-  rev: 1.7.10
+  rev: 1.8.2
   hooks:
   - id: bandit
     args: [-r, -c, .bandit.yml]
@@ -16,3 +16,8 @@ repos:
   rev: 5.13.2
   hooks:
   - id: isort
+- repo: https://github.com/asottile/pyupgrade
+  rev: v3.19.1
+  hooks:
+  - id: pyupgrade
+    args: [--py39-plus]
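
The new pyupgrade hook runs with --py39-plus, which rewrites syntax that is only needed on Python versions older than 3.9. Most of the churn in the test files and in w3lib/encoding.py below is exactly this kind of rewrite: trimmed typing imports, PEP 585 builtin generics, and PEP 604 unions used behind the future import. A minimal sketch of the before/after, using a hypothetical function that is not part of w3lib:

    # Hypothetical example of what pyupgrade --py39-plus produces; not part of this commit.
    from __future__ import annotations  # allows the X | None annotation syntax on Python 3.9

    # Before pyupgrade:
    #     from typing import List, Optional
    #     def first_charset(candidates: List[str]) -> Optional[str]: ...
    # After pyupgrade:
    def first_charset(candidates: list[str]) -> str | None:
        # Return the first candidate charset, or None when the list is empty.
        return candidates[0] if candidates else None

    print(first_charset(["utf-8", "latin-1"]))  # prints: utf-8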

.readthedocs.yml

Lines changed: 2 additions & 2 deletions

@@ -4,11 +4,11 @@ sphinx:
   configuration: docs/conf.py
   fail_on_warning: true
 build:
-  os: ubuntu-22.04
+  os: ubuntu-24.04
   tools:
     # For available versions, see:
     # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
-    python: "3.12" # Keep in sync with .github/workflows/build.yml
+    python: "3.13" # Keep in sync with .github/workflows/build.yml
 python:
   install:
   - requirements: docs/requirements.txt

pylintrc

Lines changed: 3 additions & 22 deletions

@@ -2,25 +2,15 @@
 persistent=no
 
 [MESSAGES CONTROL]
-disable=bad-continuation,
-    bad-whitespace,
-    consider-using-in,
-    expression-not-assigned,
-    fixme,
-    implicit-str-concat,
+enable=useless-suppression
+disable=fixme,
     import-error,
     import-outside-toplevel,
-    inconsistent-return-statements,
     invalid-name,
-    len-as-condition,
     line-too-long,
     missing-class-docstring,
     missing-function-docstring,
     missing-module-docstring,
-    multiple-imports,
-    no-else-continue,
-    no-else-return,
-    no-self-use,
     raise-missing-from,
     redefined-builtin,
     redefined-outer-name,
@@ -29,14 +19,5 @@ disable=bad-continuation,
     too-many-lines,
     too-many-positional-arguments,
     too-many-public-methods,
-    trailing-comma-tuple,
-    trailing-newlines,
-    trailing-whitespace,
-    unidiomatic-typecheck,
-    unnecessary-lambda-assignment,
-    unreachable,
     unused-argument,
-    unused-variable,
-    useless-option-value,
-    wrong-import-order,
-    wrong-import-position
+    unused-variable
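
Enabling useless-suppression makes pylint flag "# pylint: disable" comments that no longer suppress anything, which lines up with the removal of the stale consider-using-f-string disable in w3lib/encoding.py further down. A small, hypothetical illustration (not from w3lib):

    # With enable=useless-suppression in pylintrc, pylint reports a useless-suppression
    # message for the comment below: a plain string literal never triggers
    # consider-using-f-string, so the disable does nothing.
    GREETING = "hello"  # pylint: disable=consider-using-f-string
    print(GREETING)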

tests/test_encoding.py

Lines changed: 7 additions & 5 deletions

@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import codecs
 import unittest
-from typing import Any, List, Optional, Union
+from typing import Any
 
 from w3lib.encoding import (
     html_body_declared_encoding,
@@ -122,7 +124,7 @@ def test_invalid_utf8(self):
         self.assertEqual(to_unicode(b"\xc2\xc2\xa3", "utf-8"), "\ufffd\xa3")
 
 
-def ct(charset: Optional[str]) -> Optional[str]:
+def ct(charset: str | None) -> str | None:
     return "Content-Type: text/html; charset=" + charset if charset else None
 
 
@@ -141,10 +143,10 @@ def test_unicode_body(self):
 
     def _assert_encoding(
         self,
-        content_type: Optional[str],
+        content_type: str | None,
         body: bytes,
         expected_encoding: str,
-        expected_unicode: Union[str, List[str]],
+        expected_unicode: str | list[str],
     ) -> None:
         assert not isinstance(body, str)
         encoding, body_unicode = html_to_unicode(ct(content_type), body)
@@ -218,7 +220,7 @@ def test_replace_wrong_encoding(self):
 
     def _assert_encoding_detected(
         self,
-        content_type: Optional[str],
+        content_type: str | None,
         expected_encoding: str,
         body: bytes,
         **kwargs: Any,

tests/test_url.py

Lines changed: 15 additions & 15 deletions

@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 import os
 import sys
 import unittest
 from inspect import isclass
-from typing import Callable, List, Optional, Tuple, Type, Union
+from typing import Callable
 from urllib.parse import urlparse
 
 import pytest
@@ -35,9 +37,7 @@
 # input parameters.
 #
 # (encoding, input URL, output URL or exception)
-SAFE_URL_ENCODING_CASES: List[
-    Tuple[Optional[str], StrOrBytes, Union[str, Type[Exception]]]
-] = [
+SAFE_URL_ENCODING_CASES: list[tuple[str | None, StrOrBytes, str | type[Exception]]] = [
     (None, "", ValueError),
     (None, "https://example.com", "https://example.com"),
     (None, "https://example.com/©", "https://example.com/%C2%A9"),
@@ -319,8 +319,8 @@
 def _test_safe_url_func(
     url: StrOrBytes,
     *,
-    encoding: Optional[str] = None,
-    output: Union[str, Type[Exception]],
+    encoding: str | None = None,
+    output: str | type[Exception],
     func: Callable[..., str],
 ) -> None:
     kwargs = {}
@@ -338,8 +338,8 @@ def _test_safe_url_func(
 def _test_safe_url_string(
     url: StrOrBytes,
     *,
-    encoding: Optional[str] = None,
-    output: Union[str, Type[Exception]],
+    encoding: str | None = None,
+    output: str | type[Exception],
 ) -> None:
     return _test_safe_url_func(
         url,
@@ -373,7 +373,7 @@ def _test_safe_url_string(
     ),
 )
 def test_safe_url_string_encoding(
-    encoding: Optional[str], url: StrOrBytes, output: Union[str, Type[Exception]]
+    encoding: str | None, url: StrOrBytes, output: str | type[Exception]
 ) -> None:
     _test_safe_url_string(url, encoding=encoding, output=output)
 
@@ -439,9 +439,7 @@ def test_safe_url_string_encoding(
         for case in SAFE_URL_URL_CASES
     ),
 )
-def test_safe_url_string_url(
-    url: StrOrBytes, output: Union[str, Type[Exception]]
-) -> None:
+def test_safe_url_string_url(url: StrOrBytes, output: str | type[Exception]) -> None:
     _test_safe_url_string(url, output=output)
 
 
@@ -858,6 +856,7 @@ def test_url_query_parameter(self):
             url_query_parameter("product.html?id=", "id", keep_blank_values=1), ""
         )
 
+    @pytest.mark.xfail
     def test_url_query_parameter_2(self):
         """
         This problem was seen several times in the feeds. Sometime affiliate URLs contains
@@ -873,14 +872,14 @@ def test_url_query_parameter_2(self):
         and the URL extraction will fail, current workaround was made in the spider,
         just a replace for &#39; to %27
         """
-        return  # FIXME: this test should pass but currently doesnt
         # correct case
         aff_url1 = "http://www.anrdoezrs.net/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EGarden+table+and+chair+sets%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357199%2526langId%253D-1"
         aff_url2 = url_query_parameter(aff_url1, "url")
         self.assertEqual(
             aff_url2,
             "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Garden table and chair sets&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357199%26langId%3D-1",
         )
+        assert aff_url2 is not None
         prod_url = url_query_parameter(aff_url2, "referredURL")
         self.assertEqual(
             prod_url,
@@ -893,6 +892,7 @@ def test_url_query_parameter_2(self):
             aff_url2,
             "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children&#39;s garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1",
         )
+        assert aff_url2 is not None
         prod_url = url_query_parameter(aff_url2, "referredURL")
         # fails, prod_url is None now
         self.assertEqual(
@@ -1574,7 +1574,7 @@ def test_mediatype_parameters(self):
         self.assertEqual(result.data, b"\xce\x8e\xce\xa3\xce\x8e")
 
     def test_base64(self):
-        result = parse_data_uri("data:text/plain;base64," "SGVsbG8sIHdvcmxkLg%3D%3D")
+        result = parse_data_uri("data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D")
         self.assertEqual(result.media_type, "text/plain")
         self.assertEqual(result.data, b"Hello, world.")
 
@@ -1587,7 +1587,7 @@ def test_base64_spaces(self):
         self.assertEqual(result.data, b"Hello, world.")
 
         result = parse_data_uri(
-            "data:text/plain;base64,SGVsb G8sIH\n " "dvcm xk Lg%3D\n%3D"
+            "data:text/plain;base64,SGVsb G8sIH\n dvcm xk Lg%3D\n%3D"
         )
         self.assertEqual(result.media_type, "text/plain")
         self.assertEqual(result.data, b"Hello, world.")

tox.ini

Lines changed: 4 additions & 4 deletions

@@ -21,14 +21,14 @@ basepython = python3
 deps =
     # mypy would error if pytest (or its stub) not found
     pytest
-    mypy==1.11.2
+    mypy==1.14.1
 commands =
     mypy --strict {posargs: w3lib tests}
 
 [testenv:pylint]
 deps =
     {[testenv]deps}
-    pylint==3.3.1
+    pylint==3.3.3
 commands =
     pylint conftest.py docs setup.py tests w3lib
 
@@ -46,8 +46,8 @@ skip_install = true
 [testenv:twinecheck]
 basepython = python3
 deps =
-    twine==5.1.1
-    build==1.2.2
+    twine==6.1.0
+    build==1.2.2.post1
 commands =
     python -m build --sdist
     twine check dist/*

w3lib/_types.py

Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from typing import Union
 
 # the base class UnicodeError doesn't have attributes like start / end

w3lib/encoding.py

Lines changed: 13 additions & 11 deletions

@@ -2,18 +2,21 @@
 Functions for handling encoding of web pages
 """
 
+from __future__ import annotations
+
 import codecs
 import encodings
 import re
-from typing import Callable, Match, Optional, Tuple, Union, cast
+from re import Match
+from typing import Callable, cast
 
 import w3lib.util
 from w3lib._types import AnyUnicodeError, StrOrBytes
 
 _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I)
 
 
-def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
+def http_content_type_encoding(content_type: str | None) -> str | None:
     """Extract the encoding in the content-type header
 
     >>> import w3lib.encoding
@@ -49,7 +52,6 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
 _XML_ENCODING_RE = _TEMPLATE % ("encoding", r"(?P<xmlcharset>[\w-]+)")
 
 # check for meta tags, or xml decl. and stop search if a body tag is encountered
-# pylint: disable=consider-using-f-string
 _BODY_ENCODING_PATTERN = (
     r"<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)"
     % (_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
@@ -60,7 +62,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
 )
 
 
-def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
+def html_body_declared_encoding(html_body_str: StrOrBytes) -> str | None:
     '''Return the encoding specified in meta tags in the html body,
     or ``None`` if no suitable encoding was found
 
@@ -84,7 +86,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
 
     # html5 suggests the first 1024 bytes are sufficient, we allow for more
     chunk = html_body_str[:4096]
-    match: Union[Optional[Match[bytes]], Optional[Match[str]]]
+    match: Match[bytes] | Match[str] | None
     if isinstance(chunk, bytes):
         match = _BODY_ENCODING_BYTES_RE.search(chunk)
     else:
@@ -140,7 +142,7 @@ def _c18n_encoding(encoding: str) -> str:
     return cast(str, encodings.aliases.aliases.get(normed, normed))
 
 
-def resolve_encoding(encoding_alias: str) -> Optional[str]:
+def resolve_encoding(encoding_alias: str) -> str | None:
     """Return the encoding that `encoding_alias` maps to, or ``None``
     if the encoding cannot be interpreted
 
@@ -170,7 +172,7 @@ def resolve_encoding(encoding_alias: str) -> Optional[str]:
 _FIRST_CHARS = {c[0] for (c, _) in _BOM_TABLE}
 
 
-def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]:
+def read_bom(data: bytes) -> tuple[None, None] | tuple[str, bytes]:
     r"""Read the byte order mark in the text, if present, and
     return the encoding represented by the BOM and the BOM.
 
@@ -216,11 +218,11 @@ def to_unicode(data_str: bytes, encoding: str) -> str:
 
 
 def html_to_unicode(
-    content_type_header: Optional[str],
+    content_type_header: str | None,
     html_body_str: bytes,
     default_encoding: str = "utf8",
-    auto_detect_fun: Optional[Callable[[bytes], Optional[str]]] = None,
-) -> Tuple[str, str]:
+    auto_detect_fun: Callable[[bytes], str | None] | None = None,
+) -> tuple[str, str]:
     r'''Convert raw html bytes to unicode
 
     This attempts to make a reasonable guess at the content encoding of the
@@ -289,7 +291,7 @@ def html_to_unicode(
 
     enc = http_content_type_encoding(content_type_header)
     if enc is not None:
-        if enc == "utf-16" or enc == "utf-32":
+        if enc in {"utf-16", "utf-32"}:
             enc += "-be"
         return enc, to_unicode(html_body_str, enc)
     enc = html_body_declared_encoding(html_body_str)
