Skip to content

Commit 5856129

Browse files
GeorgeA92Georgiy ZatserklianyiGallaecio
authored
Implement ZYTE_API_PRESERVE_DELAY (#204)
Co-authored-by: Georgiy Zatserklianyi <[email protected]> Co-authored-by: Adrián Chaves <[email protected]>
1 parent beaf8ca commit 5856129

File tree

3 files changed

+53
-12
lines changed

3 files changed

+53
-12
lines changed

docs/reference/settings.rst

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,31 @@ Note that requests with error responses that cannot be retried or exceed their
205205
retry limit also count here.
206206

207207

208+
.. setting:: ZYTE_API_PRESERVE_DELAY
209+
210+
ZYTE_API_PRESERVE_DELAY
211+
=======================
212+
213+
Default: ``False if`` :setting:`AUTOTHROTTLE_ENABLED
214+
<scrapy:AUTOTHROTTLE_ENABLED>` ``else True``
215+
216+
By default, requests for which use of scrapy-zyte-api is enabled get
217+
``zyte-api@`` prepended to their download slot ID, and if
218+
:setting:`AUTOTHROTTLE_ENABLED <scrapy:AUTOTHROTTLE_ENABLED>` is ``True``, the
219+
corresponding download slot gets its download delay reset to 0. This nullifies
220+
the effects of the :ref:`AutoThrottle extension <topics-autothrottle>` for Zyte
221+
API requests, delegating throttling management to Zyte API.
222+
223+
If :setting:`AUTOTHROTTLE_ENABLED <scrapy:AUTOTHROTTLE_ENABLED>` is ``False``,
224+
but you have a download delay set through :setting:`DOWNLOAD_DELAY
225+
<scrapy:DOWNLOAD_DELAY>` and you do not want that delay to affect Zyte API
226+
requests, set this setting to ``False``.
227+
228+
If :setting:`AUTOTHROTTLE_ENABLED <scrapy:AUTOTHROTTLE_ENABLED>` is ``True``
229+
and you want AutoThrottle to also apply to Zyte API requests, set this setting
230+
to ``True``.
231+
232+
208233
.. setting:: ZYTE_API_PROVIDER_PARAMS
209234

210235
ZYTE_API_PROVIDER_PARAMS

scrapy_zyte_api/_middlewares.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ def from_crawler(cls, crawler):
2121
def __init__(self, crawler):
2222
self._param_parser = _ParamParser(crawler, cookies_enabled=False)
2323
self._crawler = crawler
24+
self._preserve_delay = crawler.settings.getbool(
25+
"ZYTE_API_PRESERVE_DELAY",
26+
not crawler.settings.getbool("AUTOTHROTTLE_ENABLED"),
27+
)
2428

2529
def slot_request(self, request, spider, force=False):
2630
if not force and self._param_parser.parse(request) is None:
@@ -31,8 +35,9 @@ def slot_request(self, request, spider, force=False):
3135
if not isinstance(slot_id, str) or not slot_id.startswith(self._slot_prefix):
3236
slot_id = f"{self._slot_prefix}{slot_id}"
3337
request.meta["download_slot"] = slot_id
34-
_, slot = downloader._get_slot(request, spider)
35-
slot.delay = 0
38+
if not self._preserve_delay:
39+
_, slot = downloader._get_slot(request, spider)
40+
slot.delay = 0
3641

3742

3843
class ScrapyZyteAPIDownloaderMiddleware(_BaseMiddleware):

tests/test_middlewares.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,26 @@ def spider_output_processor(middleware, request, spider):
3939

4040

4141
@pytest.mark.parametrize(
42-
"mw_cls,processor",
42+
["mw_cls", "processor"],
4343
[
4444
(ScrapyZyteAPIDownloaderMiddleware, request_processor),
4545
(ScrapyZyteAPISpiderMiddleware, start_request_processor),
4646
(ScrapyZyteAPISpiderMiddleware, spider_output_processor),
4747
],
4848
)
49+
@pytest.mark.parametrize(
50+
["settings", "preserve"],
51+
[
52+
({}, True),
53+
({"ZYTE_API_PRESERVE_DELAY": False}, False),
54+
({"ZYTE_API_PRESERVE_DELAY": True}, True),
55+
({"AUTOTHROTTLE_ENABLED": True}, False),
56+
({"AUTOTHROTTLE_ENABLED": True, "ZYTE_API_PRESERVE_DELAY": True}, True),
57+
],
58+
)
4959
@ensureDeferred
50-
async def test_autothrottle_handling(mw_cls, processor):
51-
crawler = get_crawler()
60+
async def test_preserve_delay(mw_cls, processor, settings, preserve):
61+
crawler = get_crawler(settings_dict=settings)
5262
await crawler.crawl("a")
5363
spider = crawler.spider
5464

@@ -64,13 +74,13 @@ async def test_autothrottle_handling(mw_cls, processor):
6474
_, slot = crawler.engine.downloader._get_slot(request, spider)
6575
assert slot.delay == spider.download_delay
6676

67-
# On Zyte API requests, the download slot is changed, and its delay is set
68-
# to 0.
77+
# On Zyte API requests, the download slot is changed, and its delay may be
78+
# set to 0 depending on settings.
6979
request = Request("https://example.com", meta={"zyte_api": {}})
7080
processor(middleware, request, spider)
7181
assert request.meta["download_slot"] == "zyte-api@example.com"
7282
_, slot = crawler.engine.downloader._get_slot(request, spider)
73-
assert slot.delay == 0
83+
assert slot.delay == (5 if preserve else 0)
7484

7585
# Requests that happen to already have the right download slot assigned
7686
# work the same.
@@ -79,17 +89,18 @@ async def test_autothrottle_handling(mw_cls, processor):
7989
processor(middleware, request, spider)
8090
assert request.meta["download_slot"] == "zyte-api@example.com"
8191
_, slot = crawler.engine.downloader._get_slot(request, spider)
82-
assert slot.delay == 0
92+
assert slot.delay == (5 if preserve else 0)
8393

84-
# The slot delay is set to 0 every time a request for the slot is
94+
# The slot delay is taken into account every time a request for the slot is
8595
# processed, so even if it gets changed later on somehow, the downloader
86-
# middleware will reset it to 0 again the next time it processes a request.
96+
# middleware may reset it to 0 again the next time it processes a request
97+
# depending on settings.
8798
slot.delay = 10
8899
request = Request("https://example.com", meta={"zyte_api": {}})
89100
processor(middleware, request, spider)
90101
assert request.meta["download_slot"] == "zyte-api@example.com"
91102
_, slot = crawler.engine.downloader._get_slot(request, spider)
92-
assert slot.delay == 0
103+
assert slot.delay == (10 if preserve else 0)
93104

94105
await crawler.stop()
95106

0 commit comments

Comments
 (0)