Commit 8e393e5

Support Scrapy 2.14 (#269)
1 parent ce6965a commit 8e393e5

22 files changed (+1115, -847 lines)

.github/workflows/test.yml

Lines changed: 9 additions & 9 deletions
@@ -17,23 +17,23 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - python-version: '3.9'
+          - python-version: '3.10'
             toxenv: min-scrapy-2x0
-          - python-version: '3.9'
+          - python-version: '3.10'
             toxenv: min-scrapy-2x1
-          - python-version: '3.9'
+          - python-version: '3.10'
             toxenv: min-scrapy-2x3
-          - python-version: '3.9'
+          - python-version: '3.10'
             toxenv: min-scrapy-2x4
-          - python-version: '3.9'
+          - python-version: '3.10'
             toxenv: min-scrapy-2x5
-          - python-version: '3.9'
+          - python-version: '3.10'
             toxenv: min-scrapy-2x6
-          - python-version: '3.9'
+          - python-version: '3.10'
             toxenv: min-scrapy-2x7
-          - python-version: '3.9'
+          - python-version: '3.10'
             toxenv: min-extra
-          - python-version: '3.9'
+          - python-version: '3.10'
             toxenv: min-provider
           - python-version: '3.10'
             toxenv: min-x402

docs/setup.rst

Lines changed: 3 additions & 6 deletions
@@ -18,14 +18,12 @@ You need at least:
 - A :ref:`Zyte API <zyte-api>` subscription (there’s a :ref:`free trial
   <zapi-trial>`).

-- Python 3.9+
+- Python 3.10+

 - Scrapy 2.0.1+

 :doc:`scrapy-poet <scrapy-poet:index>` integration requires Scrapy 2.6+.

-:ref:`x402 support <x402>` requires Python 3.10+.
-

 .. _install:

@@ -38,14 +36,13 @@ For a basic installation:

     pip install scrapy-zyte-api

-For :ref:`scrapy-poet integration <scrapy-poet>`:
+For :ref:`scrapy-poet integration <scrapy-poet>`, install the ``provider`` extra:

 .. code-block:: shell

     pip install scrapy-zyte-api[provider]

-For :ref:`x402 support <x402>`, make sure you have Python 3.10+ and install
-the ``x402`` extra:
+For :ref:`x402 support <x402>`, install the ``x402`` extra:

 .. code-block:: shell


docs/usage/automap.rst

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ For example:
     class SampleQuotesSpider(scrapy.Spider):
         name = "sample_quotes"

-        def start_requests(self):
+        async def start(self):
             yield scrapy.Request(
                 url="https://quotes.toscrape.com/",
                 meta={

docs/usage/manual.rst

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ For example:
     class SampleQuotesSpider(scrapy.Spider):
         name = "sample_quotes"

-        def start_requests(self):
+        async def start(self):
             yield scrapy.Request(
                 url="https://quotes.toscrape.com/",
                 meta={

@@ -48,7 +48,7 @@ remember to also request :http:`request:httpResponseHeaders`:
     class SampleQuotesSpider(scrapy.Spider):
         name = "sample_quotes"

-        def start_requests(self):
+        async def start(self):
             yield scrapy.Request(
                 url="https://quotes.toscrape.com/",
                 meta={
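
Note: both documentation diffs above migrate the example spiders from start_requests() to the coroutine-based start() method introduced in Scrapy 2.13. Since scrapy-zyte-api still declares support for Scrapy 2.0.1+, a project that must run on both sides of that transition can define both entry points. A minimal sketch, assuming Scrapy 2.13+ calls start() while older versions fall back to start_requests(); the zyte_api_automap meta value is illustrative, not the exact value from the docs:

    import scrapy


    class SampleQuotesSpider(scrapy.Spider):
        name = "sample_quotes"

        async def start(self):  # used by Scrapy 2.13+
            # Delegate to the legacy method so both code paths share
            # one request definition.
            for request in self.start_requests():
                yield request

        def start_requests(self):  # used by older Scrapy versions
            yield scrapy.Request(
                url="https://quotes.toscrape.com/",
                meta={"zyte_api_automap": True},  # illustrative meta
            )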

pyproject.toml

Lines changed: 10 additions & 3 deletions
@@ -16,13 +16,12 @@ classifiers = [
     "Operating System :: OS Independent",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
 ]
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 # Sync with [pinned] @ tox.ini
 dependencies = [
     "packaging>=20.0",

@@ -120,5 +119,13 @@ testpaths = [
 ]
 minversion = "6.0"
 filterwarnings = [
-    "ignore::DeprecationWarning:twisted.web.http",
+    "ignore::DeprecationWarning:twisted\\.web\\.http",
+    "ignore::DeprecationWarning:scrapy\\.core\\.downloader\\.contextfactory", # https://github.com/scrapy/scrapy/issues/3288
+
+    # scrapy-poet warnings for Scrapy 2.14:
+    "ignore:CollectorPipeline\\.:scrapy.exceptions.ScrapyDeprecationWarning",
+    "ignore:DownloaderStatsMiddleware\\.:scrapy.exceptions.ScrapyDeprecationWarning",
+    "ignore:.*?InjectionMiddleware\\.:scrapy.exceptions.ScrapyDeprecationWarning",
+    "ignore:RetryMiddleware\\.process_spider_exception\\(\\):scrapy.exceptions.ScrapyDeprecationWarning",
+    "ignore::scrapy.exceptions.ScrapyDeprecationWarning:scrapy_poet",
 ]
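
The escaping added above matters because the module field of a pytest filterwarnings entry is a regular expression: unescaped, "twisted.web.http" would also match module names where the dots are any character. The entry format (action:message:category:module) mirrors Python's own warnings filters; a small sketch of the equivalent call in plain Python:

    import warnings

    # Equivalent of the "ignore::DeprecationWarning:twisted\\.web\\.http"
    # entry: the module argument is a regex matched against the module
    # name, so literal dots must be escaped.
    warnings.filterwarnings(
        "ignore",
        category=DeprecationWarning,
        module=r"twisted\.web\.http",
    )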

scrapy_zyte_api/_middlewares.py

Lines changed: 75 additions & 46 deletions
@@ -1,12 +1,20 @@
 from logging import getLogger
-from typing import cast
+from warnings import warn

-from scrapy import Request
-from scrapy.exceptions import IgnoreRequest
+from scrapy import Request, Spider
+from scrapy.exceptions import IgnoreRequest, ScrapyDeprecationWarning
+from scrapy.utils.python import global_object_name
 from zyte_api import RequestError

 from ._params import _ParamParser
-from .utils import _AUTOTHROTTLE_DONT_ADJUST_DELAY_SUPPORT
+from .utils import (
+    _AUTOTHROTTLE_DONT_ADJUST_DELAY_SUPPORT,
+    _GET_SLOT_NEEDS_SPIDER,
+    _LOG_DEFERRED_IS_DEPRECATED,
+    _close_spider,
+    _schedule_coro,
+    maybe_deferred_to_future,
+)

 logger = getLogger(__name__)
 _start_requests_processed = object()
@@ -27,7 +35,19 @@ def __init__(self, crawler):
             not crawler.settings.getbool("AUTOTHROTTLE_ENABLED"),
         )

-    def slot_request(self, request, spider, force=False):
+    def slot_request(
+        self, request: Request, spider: Spider | None = None, force: bool = False
+    ):
+        if spider is not None:
+            warn(
+                f"Passing a 'spider' argument to "
+                f"{global_object_name(self.__class__)}.slot_request() is "
+                f"deprecated and the argument will be removed in a future "
+                f"scrapy-zyte-api version.",
+                category=ScrapyDeprecationWarning,
+                stacklevel=2,
+            )
+
         if not force and self._param_parser.parse(request) is None:
             return

@@ -38,12 +58,13 @@ def slot_request(self, request, spider, force=False):
         try:
             slot_id = downloader.get_slot_key(request)
         except AttributeError:  # Scrapy < 2.12
-            slot_id = downloader._get_slot_key(request, spider)
+            slot_id = downloader._get_slot_key(request, self._crawler.spider)
         if not isinstance(slot_id, str) or not slot_id.startswith(self._slot_prefix):
             slot_id = f"{self._slot_prefix}{slot_id}"
         request.meta["download_slot"] = slot_id
         if not self._preserve_delay:
-            _, slot = downloader._get_slot(request, spider)
+            args = (self._crawler.spider,) if _GET_SLOT_NEEDS_SPIDER else ()
+            _, slot = downloader._get_slot(request, *args)
             slot.delay = 0

@@ -65,6 +86,7 @@ def __init__(self, crawler) -> None:
         crawler.signals.connect(
             self._start_requests_processed, signal=_start_requests_processed
         )
+        self._crawler = crawler

     def _get_spm_mw(self):
         spm_mw_classes = []
@@ -89,15 +111,15 @@ def _get_spm_mw(self):
             return middleware
         return None

-    def _check_spm_conflict(self, spider):
+    def _check_spm_conflict(self):
         checked = getattr(self, "_checked_spm_conflict", False)
         if checked:
             return
         self._checked_spm_conflict = True
         settings = self._crawler.settings
         in_transparent_mode = settings.getbool("ZYTE_API_TRANSPARENT_MODE", False)
         spm_mw = self._get_spm_mw()
-        spm_is_enabled = spm_mw and spm_mw.is_enabled(spider)
+        spm_is_enabled = spm_mw and spm_mw.is_enabled(self._crawler.spider)
         if not in_transparent_mode or not spm_is_enabled:
             return
         logger.error(
@@ -114,35 +136,31 @@ def _check_spm_conflict(self, spider):
             "request.meta to set dont_proxy to True and zyte_api_automap "
             "either to True or to a dictionary of extra request fields."
         )
-        from twisted.internet import reactor
-        from twisted.internet.interfaces import IReactorCore
-
-        reactor = cast(IReactorCore, reactor)
-        reactor.callLater(
-            0, self._crawler.engine.close_spider, spider, "plugin_conflict"
-        )
+        _close_spider(self._crawler, "plugin_conflict")

     def _start_requests_processed(self, count):
         self._total_start_request_count = count
         self._maybe_close()

-    def process_request(self, request, spider):
-        self._check_spm_conflict(spider)
+    def process_request(self, request: Request, spider: Spider | None = None):
+        self._check_spm_conflict()

         if self._param_parser.parse(request) is None:
             return

         self._request_count += 1
         if self._max_requests and self._request_count > self._max_requests:
-            self._crawler.engine.close_spider(spider, "closespider_max_zapi_requests")
+            _close_spider(self._crawler, "closespider_max_zapi_requests")
             raise IgnoreRequest(
                 f"The request {request} is skipped as {self._max_requests} max "
                 f"Zyte API requests have been reached."
             )

-        self.slot_request(request, spider, force=True)
+        self.slot_request(request, force=True)

-    def process_exception(self, request, exception, spider):
+    def process_exception(
+        self, request: Request, exception: Exception, spider: Spider | None = None
+    ):
         if (
             not request.meta.get("is_start_request")
             or not isinstance(exception, RequestError)
@@ -162,60 +180,69 @@ def _maybe_close(self):
             "Stopping the spider, all start requests failed because they "
             "were pointing to a domain forbidden by Zyte API."
         )
-        self._crawler.engine.close_spider(
-            self._crawler.spider, "failed_forbidden_domain"
-        )
+        _close_spider(self._crawler, "failed_forbidden_domain")


 class ScrapyZyteAPISpiderMiddleware(_BaseMiddleware):
     def __init__(self, crawler):
         super().__init__(crawler)
-        self._send_signal = crawler.signals.send_catch_log
+        if _LOG_DEFERRED_IS_DEPRECATED:
+            self._send_signal = crawler.signals.send_catch_log_async
+        else:
+
+            async def _send_signal(signal, **kwargs):
+                await maybe_deferred_to_future(
+                    crawler.signals.send_catch_log_deferred(signal, **kwargs)
+                )
+
+            self._send_signal = _send_signal

     @staticmethod
     def _get_header_set(request):
         return {header.strip().lower() for header in request.headers}

-    async def process_start(self, start):
+    async def process_start(self, start, spider: Spider | None = None):
         # Mark start requests and reports to the downloader middleware the
         # number of them once all have been processed.
         count = 0
         async for item_or_request in start:
             if isinstance(item_or_request, Request):
                 count += 1
                 item_or_request.meta["is_start_request"] = True
-                self._process_output_request(item_or_request, None)
+                self._process_output_request(item_or_request)
             yield item_or_request
-        self._send_signal(_start_requests_processed, count=count)
+        await self._send_signal(_start_requests_processed, count=count)

-    def process_start_requests(self, start_requests, spider):
+    def process_start_requests(self, start_requests, spider: Spider):
         count = 0
         for item_or_request in start_requests:
             if isinstance(item_or_request, Request):
                 count += 1
                 item_or_request.meta["is_start_request"] = True
-                self._process_output_request(item_or_request, spider)
+                self._process_output_request(item_or_request)
             yield item_or_request
-        self._send_signal(_start_requests_processed, count=count)
+        _schedule_coro(self._send_signal(_start_requests_processed, count=count))

-    def _process_output_request(self, request, spider):
+    def _process_output_request(self, request: Request):
         if "_pre_mw_headers" not in request.meta:
             request.meta["_pre_mw_headers"] = self._get_header_set(request)
-        self.slot_request(request, spider)
+        self.slot_request(request)

-    def _process_output_item_or_request(self, item_or_request, spider):
+    def _process_output_item_or_request(self, item_or_request):
         if not isinstance(item_or_request, Request):
             return
-        self._process_output_request(item_or_request, spider)
+        self._process_output_request(item_or_request)

-    def process_spider_output(self, response, result, spider):
+    def process_spider_output(self, response, result, spider: Spider | None = None):
         for item_or_request in result:
-            self._process_output_item_or_request(item_or_request, spider)
+            self._process_output_item_or_request(item_or_request)
             yield item_or_request

-    async def process_spider_output_async(self, response, result, spider):
+    async def process_spider_output_async(
+        self, response, result, spider: Spider | None = None
+    ):
         async for item_or_request in result:
-            self._process_output_item_or_request(item_or_request, spider)
+            self._process_output_item_or_request(item_or_request)
             yield item_or_request

@@ -230,22 +257,24 @@ def __init__(self, crawler):
         )
         self._param_parser = _ParamParser(crawler, cookies_enabled=False)

-    def process_spider_output(self, response, result, spider):
+    def process_spider_output(self, response, result, spider: Spider | None = None):
         for item_or_request in result:
-            self._process_output_item_or_request(item_or_request, spider)
+            self._process_output_item_or_request(item_or_request)
             yield item_or_request

-    async def process_spider_output_async(self, response, result, spider):
+    async def process_spider_output_async(
+        self, response, result, spider: Spider | None = None
+    ):
         async for item_or_request in result:
-            self._process_output_item_or_request(item_or_request, spider)
+            self._process_output_item_or_request(item_or_request)
             yield item_or_request

-    def _process_output_item_or_request(self, item_or_request, spider):
+    def _process_output_item_or_request(self, item_or_request):
         if not isinstance(item_or_request, Request):
             return
-        self._process_output_request(item_or_request, spider)
+        self._process_output_request(item_or_request)

-    def _process_output_request(self, request, spider):
+    def _process_output_request(self, request: Request):
         if self._is_zyte_api_request(request):
             request.meta.setdefault("referrer_policy", self._default_policy)
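
The helpers imported from .utils in this diff (_close_spider, _schedule_coro, _GET_SLOT_NEEDS_SPIDER, _LOG_DEFERRED_IS_DEPRECATED, maybe_deferred_to_future) are defined in scrapy_zyte_api/utils.py, which is not part of this excerpt. A rough sketch of what such version-gated helpers might look like; the thresholds and bodies below are assumptions inferred from the helper names and from the reactor.callLater() code this commit removes, not the actual implementation:

    # Hypothetical sketch; the real helpers live in scrapy_zyte_api/utils.py.
    import asyncio

    import scrapy
    from packaging.version import Version

    _SCRAPY_VERSION = Version(scrapy.__version__)

    # Assumed version gates (exact thresholds are guesses):
    _LOG_DEFERRED_IS_DEPRECATED = _SCRAPY_VERSION >= Version("2.14")
    _GET_SLOT_NEEDS_SPIDER = _SCRAPY_VERSION < Version("2.14")

    # Keep strong references so fire-and-forget tasks are not
    # garbage-collected before they finish.
    _BACKGROUND_TASKS: set = set()


    def _schedule_coro(coro) -> None:
        # Schedule a coroutine from synchronous code without awaiting it.
        task = asyncio.ensure_future(coro)
        _BACKGROUND_TASKS.add(task)
        task.add_done_callback(_BACKGROUND_TASKS.discard)


    def _close_spider(crawler, reason: str) -> None:
        # Mirrors the reactor.callLater(0, engine.close_spider, ...) call
        # removed from _middlewares.py, deriving the spider from the
        # crawler so callers no longer pass one explicitly.
        from twisted.internet import reactor

        reactor.callLater(0, crawler.engine.close_spider, crawler.spider, reason)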
