
Commit 875310e

Scrapy 2.14 compatibility (#100)
* Bump minimum versions
* Update DiskQuota middlewares
* Update HubstorageDownloaderMiddleware
* Update HubstorageSpiderMiddleware, rename flag
* Update HubStorageStatsCollector
* Hardcode version in setup.py, add it to bumpversion.cfg
* Remove async
* Multiline function def
* CI: update pinned reqs env
* Update tests
* Remove py39, add py314
* Update argument type hint
* Replace typing.Optional with pipe syntax
* Update DiskQuota middlewares again
* Remove py314
* Remove unnecessary type annotations
* Update tests
* async for in spider middleware
  Avoid warning: [scrapy.core.spidermw] sh_scrapy.middlewares.HubstorageSpiderMiddleware.process_spider_output_async is not an async generator function, skipping this method.
* Annotate classmethods without quotes
1 parent: 3df69be

13 files changed (+275 / -92 lines)

.bumpversion.cfg

Lines changed: 2 additions & 0 deletions
@@ -5,3 +5,5 @@ tag = True
 tag_name = v{new_version}
 
 [bumpversion:file:sh_scrapy/__init__.py]
+
+[bumpversion:file:setup.py]

.github/workflows/publish.yml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: "3.10"
 
       - name: Publish to PyPI
         run: |

.github/workflows/tests.yml

Lines changed: 2 additions & 10 deletions
@@ -10,18 +10,10 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - python-version: "3.8"
+          - python-version: "3.10"
             os: ubuntu-22.04
             env:
-              TOXENV: py-scrapy16
-          - python-version: "3.8"
-            os: ubuntu-24.04
-            env:
-              TOXENV: py
-          - python-version: "3.9"
-            os: ubuntu-24.04
-            env:
-              TOXENV: py
+              TOXENV: py-scrapy27
           - python-version: "3.10"
             os: ubuntu-24.04
             env:

setup.py

Lines changed: 4 additions & 8 deletions
@@ -1,18 +1,16 @@
 from setuptools import setup, find_packages
 
-from sh_scrapy import __version__
-
 
 setup(
     name='scrapinghub-entrypoint-scrapy',
-    version=__version__,
+    version='0.17.7',
     license='BSD',
     description='Scrapy entrypoint for Scrapinghub job runner',
     long_description=open('README.md').read(),
     packages=find_packages(),
     install_requires=[
-        'Scrapy>=1.6',
-        'scrapinghub>=2.1.0',
+        'Scrapy>=2.7',
+        'scrapinghub>=2.4.0',
     ],
     entry_points={
         'console_scripts': [
@@ -21,7 +19,7 @@
             'shub-image-info = sh_scrapy.crawl:shub_image_info',
         ],
     },
-    python_requires='>=3.8',
+    python_requires='>=3.10',
     classifiers=[
         'Framework :: Scrapy',
         'Development Status :: 5 - Production/Stable',
@@ -30,8 +28,6 @@
         'Operating System :: OS Independent',
         'Programming Language :: Python',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
         'Programming Language :: Python :: 3.12',
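A plausible motivation for hardcoding the version string (the commit message does not spell it out, so this is an assumption): setup.py used to import `__version__` from `sh_scrapy`, and `sh_scrapy/__init__.py` now imports Scrapy (see the next file), so reading the version at build time would require Scrapy to already be installed. A minimal sketch of the failure the old import could hit:

# Hypothetical sketch: what `pip install .` could run into with the old setup.py
# in an environment where Scrapy is not installed yet.
try:
    from sh_scrapy import __version__  # executes sh_scrapy/__init__.py
except ModuleNotFoundError as exc:
    # sh_scrapy/__init__.py now runs `from scrapy import version_info`,
    # so the version cannot be read before the dependencies are installed.
    print(f"build-time import failed: {exc}")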

sh_scrapy/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -1 +1,7 @@
 __version__ = "0.17.7"
+
+
+from scrapy import version_info as scrapy_version_info
+
+
+_SCRAPY_NO_SPIDER_ARG = scrapy_version_info >= (2, 14, 0)
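`scrapy.version_info` is a tuple of integers, so the flag is a plain tuple comparison evaluated once at import time. A standalone sketch of how such a flag can then select method signatures at class-definition time, which is the pattern the files below use (the class and method names here are illustrative, not part of this package):

from scrapy import version_info

# Tuple comparison is element-wise: (2, 13, 3) >= (2, 14, 0) is False,
# (2, 14, 1) >= (2, 14, 0) is True.
_NO_SPIDER_ARG = version_info >= (2, 14, 0)


class ExampleMiddleware:  # hypothetical middleware, for illustration only
    # The class body runs once at import time, so only one definition of
    # process_request ends up on the class for any given Scrapy version.
    if _NO_SPIDER_ARG:

        def process_request(self, request):
            return self._handle(request)

    else:

        def process_request(self, request, spider):
            return self._handle(request)

    def _handle(self, request):
        # shared, version-independent logic would go here
        return None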

sh_scrapy/diskquota.py

Lines changed: 47 additions & 11 deletions
@@ -3,33 +3,69 @@
 The goal is to catch disk quota errors and stop spider gently.
 """
 
+from __future__ import annotations
+
+import asyncio
+
+from scrapy import Spider
+from scrapy.crawler import Crawler
 from scrapy.exceptions import NotConfigured
+from scrapy.http import Request, Response
+
+from sh_scrapy import _SCRAPY_NO_SPIDER_ARG
 
 
-class DiskQuota(object):
+class DiskQuota:
 
-    def __init__(self, crawler):
-        if not crawler.settings.getbool('DISK_QUOTA_STOP_ON_ERROR'):
+    def __init__(self, crawler: Crawler):
+        if not crawler.settings.getbool("DISK_QUOTA_STOP_ON_ERROR"):
             raise NotConfigured
         self.crawler = crawler
 
     @classmethod
-    def from_crawler(cls, crawler):
+    def from_crawler(cls, crawler: Crawler) -> DiskQuota:
         return cls(crawler)
 
-    def _is_disk_quota_error(self, error):
+    def _is_disk_quota_error(self, error: Exception) -> bool:
         return isinstance(error, (OSError, IOError)) and error.errno == 122
 
 
 class DiskQuotaDownloaderMiddleware(DiskQuota):
 
-    def process_exception(self, request, exception, spider):
-        if self._is_disk_quota_error(exception):
-            self.crawler.engine.close_spider(spider, 'diskusage_exceeded')
+    if _SCRAPY_NO_SPIDER_ARG:
+
+        async def process_exception(self, request: Request, exception: Exception) -> None:
+            if self._is_disk_quota_error(exception):
+                await self.crawler.engine.close_spider_async(reason="diskusage_exceeded")
+
+    else:
+
+        def process_exception(
+            self, request: Request, exception: Exception, spider: Spider
+        ) -> None:
+            if self._is_disk_quota_error(exception):
+                self.crawler.engine.close_spider(spider, "diskusage_exceeded")
 
 
 class DiskQuotaSpiderMiddleware(DiskQuota):
 
-    def process_spider_exception(self, response, exception, spider):
-        if self._is_disk_quota_error(exception):
-            self.crawler.engine.close_spider(spider, 'diskusage_exceeded')
+    def __init__(self, crawler: Crawler):
+        super().__init__(crawler)
+        self._tasks: set[asyncio.Task] = set()
+
+    if _SCRAPY_NO_SPIDER_ARG:
+
+        def process_spider_exception(self, response: Response, exception: Exception) -> None:
+            if self._is_disk_quota_error(exception):
+                coro = self.crawler.engine.close_spider_async(reason="diskusage_exceeded")
+                task = asyncio.create_task(coro)
+                self._tasks.add(task)
+                task.add_done_callback(self._tasks.discard)
+
+    else:
+
+        def process_spider_exception(
+            self, response: Response, exception: Exception, spider: Spider
+        ) -> None:
+            if self._is_disk_quota_error(exception):
+                self.crawler.engine.close_spider(spider, "diskusage_exceeded")
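For reference, a minimal sketch of enabling these middlewares in a project; only the `DISK_QUOTA_STOP_ON_ERROR` setting and the import paths come from the code above, the priority values are arbitrary placeholders:

# settings.py -- illustrative sketch; priorities are placeholder values
DISK_QUOTA_STOP_ON_ERROR = True  # without it, both middlewares raise NotConfigured

DOWNLOADER_MIDDLEWARES = {
    "sh_scrapy.diskquota.DiskQuotaDownloaderMiddleware": 0,
}
SPIDER_MIDDLEWARES = {
    "sh_scrapy.diskquota.DiskQuotaSpiderMiddleware": 0,
}

The `self._tasks` set in DiskQuotaSpiderMiddleware follows the usual asyncio fire-and-forget pattern: it keeps a strong reference to the `close_spider_async` task so it cannot be garbage-collected before it runs, and the done-callback drops the reference afterwards.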

sh_scrapy/middlewares.py

Lines changed: 66 additions & 17 deletions
@@ -1,63 +1,96 @@
 # -*- coding: utf-8 -*-
+from __future__ import annotations
+
 import itertools
+from typing import AsyncIterable, AsyncGenerator, Iterable
 from warnings import warn
 from weakref import WeakKeyDictionary
 
-from scrapy import Request
+from scrapy import Spider
+from scrapy.crawler import Crawler
+from scrapy.http import Request, Response
 
+from sh_scrapy import _SCRAPY_NO_SPIDER_ARG
 from sh_scrapy.writer import pipe_writer
 
+
 HS_REQUEST_ID_KEY = '_hsid'
 HS_PARENT_ID_KEY = '_hsparent'
 request_id_sequence = itertools.count()
 seen_requests = WeakKeyDictionary()
 
 
-class HubstorageSpiderMiddleware(object):
+class HubstorageSpiderMiddleware:
     """Hubstorage spider middleware.
-
+
     What it does:
-
+
     - Sets parent request ids to the requests coming out of the spider.
-
+
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self._seen_requests = seen_requests
 
-    def process_spider_output(self, response, result, spider):
+    if _SCRAPY_NO_SPIDER_ARG:
+
+        def process_spider_output(self, response: Response, result: Iterable) -> Iterable:
+            return self._process_spider_output(response, result)
+
+        async def process_spider_output_async(
+            self, response: Response, result: Iterable
+        ) -> AsyncGenerator:
+            async for x in self._process_spider_output_async(response, result):
+                yield x
+
+    else:
+
+        def process_spider_output(
+            self, response: Response, result: Iterable, spider: Spider
+        ) -> Iterable:
+            return self._process_spider_output(response, result)
+
+        async def process_spider_output_async(
+            self, response: Response, result: Iterable, spider: Spider
+        ) -> AsyncGenerator:
+            async for x in self._process_spider_output_async(response, result):
+                yield x
+
+    def _process_spider_output(self, response: Response, result: Iterable) -> Iterable:
         parent = self._seen_requests.pop(response.request, None)
         for x in result:
             if isinstance(x, Request):
                 self._process_request(x, parent)
             yield x
 
-    async def process_spider_output_async(self, response, result, spider):
+    async def _process_spider_output_async(
+        self, response: Response, result: AsyncIterable
+    ) -> AsyncGenerator:
         parent = self._seen_requests.pop(response.request, None)
         async for x in result:
             if isinstance(x, Request):
                 self._process_request(x, parent)
             yield x
 
-    def _process_request(self, request, parent):
+    def _process_request(self, request: Request, parent: int | None) -> None:
         request.meta[HS_PARENT_ID_KEY] = parent
         # Remove request id if it was for some reason set in the request coming from Spider.
         request.meta.pop(HS_REQUEST_ID_KEY, None)
 
 
 class HubstorageDownloaderMiddleware:
     """Hubstorage dowloader middleware.
-
+
     What it does:
-
+
     - Generates request ids for all downloaded requests.
     - Sets parent request ids for requests generated in downloader middlewares.
     - Stores all downloaded requests into Hubstorage.
-
+
     """
 
     @classmethod
-    def from_crawler(cls, crawler):
+    def from_crawler(cls, crawler: Crawler) -> HubstorageDownloaderMiddleware:
         try:
             result = cls(crawler)
         except TypeError:
@@ -74,29 +107,45 @@ def from_crawler(cls, crawler):
         result._load_fingerprinter()
         return result
 
-    def __init__(self, crawler):
+    def __init__(self, crawler: Crawler):
         self._crawler = crawler
         self._seen_requests = seen_requests
         self.pipe_writer = pipe_writer
         self.request_id_sequence = request_id_sequence
         self._load_fingerprinter()
 
-    def _load_fingerprinter(self):
+    def _load_fingerprinter(self) -> None:
         if hasattr(self._crawler, "request_fingerprinter"):
             self._fingerprint = lambda request: self._crawler.request_fingerprinter.fingerprint(request).hex()
         else:
             from scrapy.utils.request import request_fingerprint
             self._fingerprint = request_fingerprint
 
-    def process_request(self, request, spider):
+    if _SCRAPY_NO_SPIDER_ARG:
+
+        def process_request(self, request: Request) -> None:
+            return self._process_request(request)
+
+        def process_response(self, request: Request, response: Response) -> Response:
+            return self._process_response(request, response)
+
+    else:
+
+        def process_request(self, request: Request, spider: Spider) -> None:
+            return self._process_request(request)
+
+        def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
+            return self._process_response(request, response)
+
+    def _process_request(self, request: Request) -> None:
         # Check if request id is set, which usually happens for retries or redirects because
         # those requests are usually copied from the original one.
         request_id = request.meta.pop(HS_REQUEST_ID_KEY, None)
         if request_id is not None:
             # Set original request id or None as a parent request id.
             request.meta[HS_PARENT_ID_KEY] = request_id
 
-    def process_response(self, request, response, spider):
+    def _process_response(self, request: Request, response: Response) -> Response:
         # This class of response check is intended to fix the bug described here
         # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/112
         if type(response).__name__ == "DummyResponse" and type(response).__module__.startswith("scrapy_poet"):
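On the `async for` change called out in the commit message: delegating by simply returning the inner async iterator would leave `process_spider_output_async` as a plain coroutine function rather than an async generator function, which is what triggers the Scrapy warning quoted above. A minimal, self-contained sketch of the difference (the names here are illustrative, not part of this package):

import inspect


async def _inner(items):
    for item in items:
        yield item


class Delegating:  # hypothetical class, for illustration only
    # Not an async generator function: it only returns the inner iterator.
    async def returns_iterator(self, items):
        return _inner(items)

    # An async generator function: `async for ... yield` re-yields each item.
    async def re_yields(self, items):
        async for item in _inner(items):
            yield item


print(inspect.isasyncgenfunction(Delegating.returns_iterator))  # False
print(inspect.isasyncgenfunction(Delegating.re_yields))         # True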

sh_scrapy/stats.py

Lines changed: 27 additions & 10 deletions
@@ -1,32 +1,49 @@
-from twisted.internet import task
+from scrapy import Spider
+from scrapy.crawler import Crawler
 from scrapy.statscollectors import StatsCollector
+from twisted.internet import task
 
-from sh_scrapy import hsref
+from sh_scrapy import hsref, _SCRAPY_NO_SPIDER_ARG
 from sh_scrapy.writer import pipe_writer
 
 
 class HubStorageStatsCollector(StatsCollector):
 
     INTERVAL = 30
 
-    def __init__(self, crawler):
+    def __init__(self, crawler: Crawler):
         super(HubStorageStatsCollector, self).__init__(crawler)
         self.hsref = hsref.hsref
         self.pipe_writer = pipe_writer
 
-    def _upload_stats(self):
+    def _upload_stats(self) -> None:
         self.pipe_writer.write_stats(self._stats)
 
-    def open_spider(self, spider):
-        self._setup_looping_call(now=True)
-
-    def _setup_looping_call(self, _ignored=None, **kwargs):
+    def _setup_looping_call(self, _ignored=None, **kwargs) -> None:
         self._samplestask = task.LoopingCall(self._upload_stats)
         d = self._samplestask.start(self.INTERVAL, **kwargs)
         d.addErrback(self._setup_looping_call, now=False)
 
-    def close_spider(self, spider, reason):
-        super(HubStorageStatsCollector, self).close_spider(spider, reason)
+    def _close_spider(self, spider: Spider | None = None, reason: str | None = None) -> None:
+        super().close_spider(spider=spider, reason=reason)
         if self._samplestask.running:
             self._samplestask.stop()
         self._upload_stats()
+
+    if _SCRAPY_NO_SPIDER_ARG:
+
+        def open_spider(self) -> None:
+            self._setup_looping_call(now=True)
+
+        def close_spider(self, reason: str | None = None) -> None:
+            self._close_spider(reason=reason)
+
+    else:
+
+        def open_spider(self, spider: Spider | None = None) -> None:
+            self._setup_looping_call(now=True)
+
+        def close_spider(
+            self, spider: Spider | None = None, reason: str | None = None
+        ) -> None:
+            self._close_spider(spider=spider, reason=reason)
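For context, this collector uploads the stats dictionary every `INTERVAL` (30) seconds via a Twisted `LoopingCall` and once more when the spider closes. Scrapy selects its stats collector through the `STATS_CLASS` setting; a one-line sketch of pointing a project at this class (on Scrapy Cloud the entrypoint is expected to wire this up itself, so the explicit override here is purely illustrative):

# settings.py -- illustrative only
STATS_CLASS = "sh_scrapy.stats.HubStorageStatsCollector"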
