
Commit 3352cbc

Use ruff

1 parent f1ecb67 commit 3352cbc

7 files changed: +306 additions, -213 deletions

.pre-commit-config.yaml
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.7
+    hooks:
+      - id: ruff
+        args: [ --fix ]
+      - id: ruff-format

pyproject.toml
Lines changed: 105 additions & 0 deletions

@@ -0,0 +1,105 @@
+[tool.ruff.lint]
+extend-select = [
+    # flake8-bugbear
+    "B",
+    # flake8-comprehensions
+    "C4",
+    # pydocstyle
+    "D",
+    # flake8-future-annotations
+    "FA",
+    # flynt
+    "FLY",
+    # refurb
+    "FURB",
+    # isort
+    "I",
+    # flake8-implicit-str-concat
+    "ISC",
+    # flake8-logging
+    "LOG",
+    # Perflint
+    "PERF",
+    # pygrep-hooks
+    "PGH",
+    # flake8-pie
+    "PIE",
+    # pylint
+    "PL",
+    # flake8-pytest-style
+    "PT",
+    # flake8-use-pathlib
+    "PTH",
+    # flake8-pyi
+    "PYI",
+    # flake8-quotes
+    "Q",
+    # flake8-return
+    "RET",
+    # flake8-raise
+    "RSE",
+    # Ruff-specific rules
+    "RUF",
+    # flake8-bandit
+    "S",
+    # flake8-simplify
+    "SIM",
+    # flake8-slots
+    "SLOT",
+    # flake8-debugger
+    "T10",
+    # flake8-type-checking
+    "TC",
+    # pyupgrade
+    "UP",
+    # pycodestyle warnings
+    "W",
+    # flake8-2020
+    "YTT",
+]
+ignore = [
+    # Missing docstring in public module
+    "D100",
+    # Missing docstring in public class
+    "D101",
+    # Missing docstring in public function
+    "D103",
+    # Missing docstring in public package
+    "D104",
+    # Missing docstring in magic method
+    "D105",
+    # Missing docstring in __init__
+    "D107",
+    # One-line docstring should fit on one line with quotes
+    "D200",
+    # No blank lines allowed after function docstring
+    "D202",
+    # 1 blank line required between summary line and description
+    "D205",
+    # Multi-line docstring closing quotes should be on a separate line
+    "D209",
+    # First line should end with a period
+    "D400",
+    # First line should be in imperative mood; try rephrasing
+    "D401",
+    # First line should not be the function's "signature"
+    "D402",
+    # Too many return statements
+    "PLR0911",
+    # Too many branches
+    "PLR0912",
+    # Too many arguments in function definition
+    "PLR0913",
+    # Too many statements
+    "PLR0915",
+    # Magic value used in comparison
+    "PLR2004",
+    # Mutable class attributes should be annotated with `typing.ClassVar`
+    "RUF012",
+    # Use of `assert` detected
+    "S101",
+]
+
+[tool.ruff.lint.per-file-ignores]
+# D102: Missing docstring in public method
+"tests/**" = ["D102"]

scrapy_deltafetch/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 from .middleware import DeltaFetch
 
-__all__ = ['DeltaFetch']
+__all__ = ["DeltaFetch"]
 __version__ = "2.0.1"

scrapy_deltafetch/middleware.py
Lines changed: 32 additions & 33 deletions

@@ -1,24 +1,22 @@
+import dbm
 import logging
-import os
 import time
-import dbm
+from pathlib import Path
 
+from scrapy import signals
+from scrapy.exceptions import NotConfigured
 from scrapy.http import Request
 from scrapy.item import Item
-from scrapy.utils.request import request_fingerprint
 from scrapy.utils.project import data_path
 from scrapy.utils.python import to_bytes
-from scrapy.exceptions import NotConfigured
-from scrapy import signals
-
+from scrapy.utils.request import request_fingerprint
 
 logger = logging.getLogger(__name__)
 
 
-class DeltaFetch(object):
-    """
-    This is a spider middleware to ignore requests to pages containing items
-    seen in previous crawls of the same spider, thus producing a "delta crawl"
+class DeltaFetch:
+    """Spider middleware to ignore requests to pages containing items seen in
+    previous crawls of the same spider, thus producing a "delta crawl"
     containing only new items.
 
     This also speeds up the crawl, by reducing the number of requests that need
@@ -32,56 +30,57 @@ def __init__(self, dir, reset=False, stats=None):
         self.stats = stats
 
     @classmethod
-    def from_crawler(cls, crawler):
+    def from_crawler(cls, crawler):  # noqa: D102
        s = crawler.settings
-        if not s.getbool('DELTAFETCH_ENABLED'):
+        if not s.getbool("DELTAFETCH_ENABLED"):
            raise NotConfigured
-        dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
-        reset = s.getbool('DELTAFETCH_RESET')
+        dir = data_path(s.get("DELTAFETCH_DIR", "deltafetch"))
+        reset = s.getbool("DELTAFETCH_RESET")
         o = cls(dir, reset, crawler.stats)
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
         return o
 
-    def spider_opened(self, spider):
-        if not os.path.exists(self.dir):
-            os.makedirs(self.dir)
+    def spider_opened(self, spider):  # noqa: D102
+        dir = Path(self.dir)
+        dir.mkdir(parents=True, exist_ok=True)
        # TODO may be tricky, as there may be different paths on systems
-        dbpath = os.path.join(self.dir, '%s.db' % spider.name)
-        reset = self.reset or getattr(spider, 'deltafetch_reset', False)
-        flag = 'n' if reset else 'c'
+        dbpath = dir / f"{spider.name}.db"
+        reset = self.reset or getattr(spider, "deltafetch_reset", False)
+        flag = "n" if reset else "c"
        try:
-            self.db = dbm.open(dbpath, flag=flag)
+            self.db = dbm.open(dbpath, flag=flag)  # noqa: SIM115
        except Exception:
-            logger.warning("Failed to open DeltaFetch database at %s, "
-                           "trying to recreate it" % dbpath)
-            if os.path.exists(dbpath):
-                os.remove(dbpath)
-            self.db = dbm.open(dbpath, 'c')
+            logger.warning(
+                f"Failed to open DeltaFetch database at {dbpath}, trying to recreate it"
+            )
+            if dbpath.exists():
+                dbpath.unlink()
+            self.db = dbm.open(dbpath, "c")  # noqa: SIM115
 
-    def spider_closed(self, spider):
+    def spider_closed(self, spider):  # noqa: D102
        self.db.close()
 
-    def process_spider_output(self, response, result, spider):
+    def process_spider_output(self, response, result, spider):  # noqa: D102
        for r in result:
            if isinstance(r, Request):
                key = self._get_key(r)
                if key in self.db and self._is_enabled_for_request(r):
-                    logger.info("Ignoring already visited: %s" % r)
+                    logger.info(f"Ignoring already visited: {r}")
                    if self.stats:
-                        self.stats.inc_value('deltafetch/skipped', spider=spider)
+                        self.stats.inc_value("deltafetch/skipped", spider=spider)
                    continue
            elif isinstance(r, (Item, dict)):
                key = self._get_key(response.request)
                self.db[key] = str(time.time())
                if self.stats:
-                    self.stats.inc_value('deltafetch/stored', spider=spider)
+                    self.stats.inc_value("deltafetch/stored", spider=spider)
            yield r
 
     def _get_key(self, request):
-        key = request.meta.get('deltafetch_key') or request_fingerprint(request)
+        key = request.meta.get("deltafetch_key") or request_fingerprint(request)
        return to_bytes(key)
 
     def _is_enabled_for_request(self, request):
        # Gives you option to disable deltafetch for some requests
-        return request.meta.get('deltafetch_enabled', True)
+        return request.meta.get("deltafetch_enabled", True)
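For reference, a minimal sketch of how this middleware is enabled from a Scrapy project's settings.py. The setting names (DELTAFETCH_ENABLED, DELTAFETCH_DIR, DELTAFETCH_RESET) are the ones from_crawler() reads above; the middleware order value 100 is an illustrative choice, not something this commit prescribes.

# settings.py -- minimal sketch; the order value 100 is an arbitrary example.
SPIDER_MIDDLEWARES = {
    "scrapy_deltafetch.DeltaFetch": 100,
}
DELTAFETCH_ENABLED = True        # checked via s.getbool("DELTAFETCH_ENABLED")
DELTAFETCH_DIR = "deltafetch"    # passed to data_path(); one <spider.name>.db per spider
DELTAFETCH_RESET = False         # True (or spider.deltafetch_reset) reopens the db with flag "n"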

setup.py
Lines changed: 23 additions & 21 deletions

@@ -1,27 +1,29 @@
+from pathlib import Path
+
 from setuptools import setup
 
 setup(
-    name='scrapy-deltafetch',
-    version='2.0.1',
-    license='BSD',
-    description='Scrapy middleware to ignore previously crawled pages',
-    long_description=open('README.rst').read(),
-    author='Zyte',
-    author_email='[email protected]',
-    url='http://github.com/scrapy-plugins/scrapy-deltafetch',
-    packages=['scrapy_deltafetch'],
-    platforms=['Any'],
+    name="scrapy-deltafetch",
+    version="2.0.1",
+    license="BSD",
+    description="Scrapy middleware to ignore previously crawled pages",
+    long_description=Path("README.rst").read_text(encoding="utf-8"),
+    author="Zyte",
+    author_email="[email protected]",
+    url="http://github.com/scrapy-plugins/scrapy-deltafetch",
+    packages=["scrapy_deltafetch"],
+    platforms=["Any"],
     classifiers=[
-        'Development Status :: 4 - Beta',
-        'License :: OSI Approved :: BSD License',
-        'Operating System :: OS Independent',
-        'Programming Language :: Python',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
+        "Development Status :: 4 - Beta",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
     ],
-    install_requires=['Scrapy>=1.1.0']
+    install_requires=["Scrapy>=1.1.0"],
 )

tests/benchmark.py
Lines changed: 6 additions & 6 deletions

@@ -1,6 +1,6 @@
 import tempfile
+from unittest import mock
 
-import mock
 from scrapy import Request, Spider
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.test import get_crawler
@@ -9,23 +9,23 @@
 
 
 def benchmark_middleware(result):
-    spider_name = 'df_tests'
+    spider_name = "df_tests"
     spider = Spider(spider_name)
     temp_dir = tempfile.gettempdir()
     crawler = get_crawler(Spider)
     stats = StatsCollector(crawler)
     mw = DeltaFetch(temp_dir, reset=False, stats=stats)
     mw.spider_opened(spider)
     response = mock.Mock()
-    response.request = Request('http://url',
-                               meta={'deltafetch_key': 'key'})
+    response.request = Request("http://url", meta={"deltafetch_key": "key"})
 
-    for x in mw.process_spider_output(response, result, spider):
+    for _x in mw.process_spider_output(response, result, spider):
        pass
 
+
 def test_middleware(benchmark):
     result = []
     for x in range(50000):
-        request = Request(f'https://{x}')
+        request = Request(f"https://{x}")
        result.append(request)
     result = benchmark(benchmark_middleware, result)
