Skip to content

Commit 98fb441

Browse files
authored
Count Zyte API requests from the downloader middleware itself (#228)
1 parent b08c735 commit 98fb441

File tree

2 files changed

+55
-21
lines changed

2 files changed

+55
-21
lines changed

scrapy_zyte_api/_middlewares.py

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def __init__(self, crawler) -> None:
5353
f"{self._max_requests}. The spider will close when it's "
5454
f"reached."
5555
)
56+
self._request_count = 0
5657

5758
crawler.signals.connect(
5859
self._start_requests_processed, signal=_start_requests_processed
@@ -124,29 +125,15 @@ def process_request(self, request, spider):
124125
if self._param_parser.parse(request) is None:
125126
return
126127

127-
self.slot_request(request, spider, force=True)
128-
129-
if self._max_requests_reached(self._crawler.engine.downloader):
128+
self._request_count += 1
129+
if self._max_requests and self._request_count > self._max_requests:
130130
self._crawler.engine.close_spider(spider, "closespider_max_zapi_requests")
131131
raise IgnoreRequest(
132132
f"The request {request} is skipped as {self._max_requests} max "
133133
f"Zyte API requests have been reached."
134134
)
135135

136-
def _max_requests_reached(self, downloader) -> bool:
137-
if not self._max_requests:
138-
return False
139-
140-
zapi_req_count = self._crawler.stats.get_value("scrapy-zyte-api/processed", 0)
141-
download_req_count = sum(
142-
[
143-
len(slot.transferring)
144-
for slot_id, slot in downloader.slots.items()
145-
if slot_id.startswith(self._slot_prefix)
146-
]
147-
)
148-
total_requests = zapi_req_count + download_req_count
149-
return total_requests >= self._max_requests
136+
self.slot_request(request, spider, force=True)
150137

151138
def process_exception(self, request, exception, spider):
152139
if (

tests/test_middlewares.py

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ def start_requests(self):
136136
for i in range(spider_requests):
137137
meta = {"zyte_api": {"browserHtml": True}}
138138

139-
# Alternating requests between ZAPI and non-ZAPI tests if
140-
# ZYTE_API_MAX_REQUESTS solely limits ZAPI Requests.
139+
# Alternating requests between ZAPI and non-ZAPI verifies
140+
# that ZYTE_API_MAX_REQUESTS solely limits ZAPI requests.
141141

142142
if i % 2:
143143
yield Request(
@@ -166,8 +166,8 @@ def parse(self, response):
166166
f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}"
167167
in caplog.text
168168
)
169-
assert crawler.stats.get_value("scrapy-zyte-api/success") <= zapi_max_requests
170-
assert crawler.stats.get_value("scrapy-zyte-api/processed") <= zapi_max_requests
169+
assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests
170+
assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests
171171
assert crawler.stats.get_value("item_scraped_count") <= zapi_max_requests + 6
172172
assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests"
173173
assert (
@@ -178,6 +178,53 @@ def parse(self, response):
178178
)
179179

180180

181+
@ensureDeferred
182+
async def test_max_requests_race_condition(caplog):
183+
spider_requests = 8
184+
zapi_max_requests = 1
185+
186+
with MockServer(DelayedResource) as server:
187+
188+
class TestSpider(Spider):
189+
name = "test_spider"
190+
191+
def start_requests(self):
192+
for i in range(spider_requests):
193+
meta = {"zyte_api": {"browserHtml": True}}
194+
yield Request("https://example.com", meta=meta, dont_filter=True)
195+
196+
def parse(self, response):
197+
yield Item()
198+
199+
settings = {
200+
"DOWNLOADER_MIDDLEWARES": {
201+
"scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633
202+
},
203+
"ZYTE_API_MAX_REQUESTS": zapi_max_requests,
204+
"ZYTE_API_URL": server.urljoin("/"),
205+
**SETTINGS,
206+
}
207+
208+
crawler = get_crawler(TestSpider, settings_dict=settings)
209+
with caplog.at_level("INFO"):
210+
await crawler.crawl()
211+
212+
assert (
213+
f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}"
214+
in caplog.text
215+
)
216+
assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests
217+
assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests
218+
assert crawler.stats.get_value("item_scraped_count") == zapi_max_requests
219+
assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests"
220+
assert (
221+
crawler.stats.get_value(
222+
"downloader/exception_type_count/scrapy.exceptions.IgnoreRequest"
223+
)
224+
> 0
225+
)
226+
227+
181228
@ensureDeferred
182229
async def test_forbidden_domain_start_url():
183230
class TestSpider(Spider):

0 commit comments

Comments (0)