11from logging import getLogger
2- from typing import cast
2+ from warnings import warn
33
4- from scrapy import Request
5- from scrapy .exceptions import IgnoreRequest
4+ from scrapy import Request , Spider
5+ from scrapy .exceptions import IgnoreRequest , ScrapyDeprecationWarning
6+ from scrapy .utils .python import global_object_name
67from zyte_api import RequestError
78
89from ._params import _ParamParser
9- from .utils import _AUTOTHROTTLE_DONT_ADJUST_DELAY_SUPPORT
10+ from .utils import (
11+ _AUTOTHROTTLE_DONT_ADJUST_DELAY_SUPPORT ,
12+ _GET_SLOT_NEEDS_SPIDER ,
13+ _LOG_DEFERRED_IS_DEPRECATED ,
14+ _close_spider ,
15+ _schedule_coro ,
16+ maybe_deferred_to_future ,
17+ )
1018
1119logger = getLogger (__name__ )
1220_start_requests_processed = object ()
@@ -27,7 +35,19 @@ def __init__(self, crawler):
2735 not crawler .settings .getbool ("AUTOTHROTTLE_ENABLED" ),
2836 )
2937
30- def slot_request (self , request , spider , force = False ):
38+ def slot_request (
39+ self , request : Request , spider : Spider | None = None , force : bool = False
40+ ):
41+ if spider is not None :
42+ warn (
43+ f"Passing a 'spider' argument to "
44+ f"{ global_object_name (self .__class__ )} .slot_request() is "
45+ f"deprecated and the argument will be removed in a future "
46+ f"scrapy-zyte-api version." ,
47+ category = ScrapyDeprecationWarning ,
48+ stacklevel = 2 ,
49+ )
50+
3151 if not force and self ._param_parser .parse (request ) is None :
3252 return
3353
@@ -38,12 +58,13 @@ def slot_request(self, request, spider, force=False):
3858 try :
3959 slot_id = downloader .get_slot_key (request )
4060 except AttributeError : # Scrapy < 2.12
41- slot_id = downloader ._get_slot_key (request , spider )
61+ slot_id = downloader ._get_slot_key (request , self . _crawler . spider )
4262 if not isinstance (slot_id , str ) or not slot_id .startswith (self ._slot_prefix ):
4363 slot_id = f"{ self ._slot_prefix } { slot_id } "
4464 request .meta ["download_slot" ] = slot_id
4565 if not self ._preserve_delay :
46- _ , slot = downloader ._get_slot (request , spider )
66+ args = (self ._crawler .spider ,) if _GET_SLOT_NEEDS_SPIDER else ()
67+ _ , slot = downloader ._get_slot (request , * args )
4768 slot .delay = 0
4869
4970
@@ -65,6 +86,7 @@ def __init__(self, crawler) -> None:
6586 crawler .signals .connect (
6687 self ._start_requests_processed , signal = _start_requests_processed
6788 )
89+ self ._crawler = crawler
6890
6991 def _get_spm_mw (self ):
7092 spm_mw_classes = []
@@ -89,15 +111,15 @@ def _get_spm_mw(self):
89111 return middleware
90112 return None
91113
92- def _check_spm_conflict (self , spider ):
114+ def _check_spm_conflict (self ):
93115 checked = getattr (self , "_checked_spm_conflict" , False )
94116 if checked :
95117 return
96118 self ._checked_spm_conflict = True
97119 settings = self ._crawler .settings
98120 in_transparent_mode = settings .getbool ("ZYTE_API_TRANSPARENT_MODE" , False )
99121 spm_mw = self ._get_spm_mw ()
100- spm_is_enabled = spm_mw and spm_mw .is_enabled (spider )
122+ spm_is_enabled = spm_mw and spm_mw .is_enabled (self . _crawler . spider )
101123 if not in_transparent_mode or not spm_is_enabled :
102124 return
103125 logger .error (
@@ -114,35 +136,31 @@ def _check_spm_conflict(self, spider):
114136 "request.meta to set dont_proxy to True and zyte_api_automap "
115137 "either to True or to a dictionary of extra request fields."
116138 )
117- from twisted .internet import reactor
118- from twisted .internet .interfaces import IReactorCore
119-
120- reactor = cast (IReactorCore , reactor )
121- reactor .callLater (
122- 0 , self ._crawler .engine .close_spider , spider , "plugin_conflict"
123- )
139+ _close_spider (self ._crawler , "plugin_conflict" )
124140
125141 def _start_requests_processed (self , count ):
126142 self ._total_start_request_count = count
127143 self ._maybe_close ()
128144
129- def process_request (self , request , spider ):
130- self ._check_spm_conflict (spider )
145+ def process_request (self , request : Request , spider : Spider | None = None ):
146+ self ._check_spm_conflict ()
131147
132148 if self ._param_parser .parse (request ) is None :
133149 return
134150
135151 self ._request_count += 1
136152 if self ._max_requests and self ._request_count > self ._max_requests :
137- self ._crawler . engine . close_spider ( spider , "closespider_max_zapi_requests" )
153+ _close_spider ( self ._crawler , "closespider_max_zapi_requests" )
138154 raise IgnoreRequest (
139155 f"The request { request } is skipped as { self ._max_requests } max "
140156 f"Zyte API requests have been reached."
141157 )
142158
143- self .slot_request (request , spider , force = True )
159+ self .slot_request (request , force = True )
144160
145- def process_exception (self , request , exception , spider ):
161+ def process_exception (
162+ self , request : Request , exception : Exception , spider : Spider | None = None
163+ ):
146164 if (
147165 not request .meta .get ("is_start_request" )
148166 or not isinstance (exception , RequestError )
@@ -162,60 +180,69 @@ def _maybe_close(self):
162180 "Stopping the spider, all start requests failed because they "
163181 "were pointing to a domain forbidden by Zyte API."
164182 )
165- self ._crawler .engine .close_spider (
166- self ._crawler .spider , "failed_forbidden_domain"
167- )
183+ _close_spider (self ._crawler , "failed_forbidden_domain" )
168184
169185
170186class ScrapyZyteAPISpiderMiddleware (_BaseMiddleware ):
171187 def __init__ (self , crawler ):
172188 super ().__init__ (crawler )
173- self ._send_signal = crawler .signals .send_catch_log
189+ if _LOG_DEFERRED_IS_DEPRECATED :
190+ self ._send_signal = crawler .signals .send_catch_log_async
191+ else :
192+
193+ async def _send_signal (signal , ** kwargs ):
194+ await maybe_deferred_to_future (
195+ crawler .signals .send_catch_log_deferred (signal , ** kwargs )
196+ )
197+
198+ self ._send_signal = _send_signal
174199
175200 @staticmethod
176201 def _get_header_set (request ):
177202 return {header .strip ().lower () for header in request .headers }
178203
179- async def process_start (self , start ):
204+ async def process_start (self , start , spider : Spider | None = None ):
180205 # Mark start requests and reports to the downloader middleware the
181206 # number of them once all have been processed.
182207 count = 0
183208 async for item_or_request in start :
184209 if isinstance (item_or_request , Request ):
185210 count += 1
186211 item_or_request .meta ["is_start_request" ] = True
187- self ._process_output_request (item_or_request , None )
212+ self ._process_output_request (item_or_request )
188213 yield item_or_request
189- self ._send_signal (_start_requests_processed , count = count )
214+ await self ._send_signal (_start_requests_processed , count = count )
190215
191- def process_start_requests (self , start_requests , spider ):
216+ def process_start_requests (self , start_requests , spider : Spider ):
192217 count = 0
193218 for item_or_request in start_requests :
194219 if isinstance (item_or_request , Request ):
195220 count += 1
196221 item_or_request .meta ["is_start_request" ] = True
197- self ._process_output_request (item_or_request , spider )
222+ self ._process_output_request (item_or_request )
198223 yield item_or_request
199- self ._send_signal (_start_requests_processed , count = count )
224+ _schedule_coro ( self ._send_signal (_start_requests_processed , count = count ) )
200225
201- def _process_output_request (self , request , spider ):
226+ def _process_output_request (self , request : Request ):
202227 if "_pre_mw_headers" not in request .meta :
203228 request .meta ["_pre_mw_headers" ] = self ._get_header_set (request )
204- self .slot_request (request , spider )
229+ self .slot_request (request )
205230
206- def _process_output_item_or_request (self , item_or_request , spider ):
231+ def _process_output_item_or_request (self , item_or_request ):
207232 if not isinstance (item_or_request , Request ):
208233 return
209- self ._process_output_request (item_or_request , spider )
234+ self ._process_output_request (item_or_request )
210235
211- def process_spider_output (self , response , result , spider ):
236+ def process_spider_output (self , response , result , spider : Spider | None = None ):
212237 for item_or_request in result :
213- self ._process_output_item_or_request (item_or_request , spider )
238+ self ._process_output_item_or_request (item_or_request )
214239 yield item_or_request
215240
216- async def process_spider_output_async (self , response , result , spider ):
241+ async def process_spider_output_async (
242+ self , response , result , spider : Spider | None = None
243+ ):
217244 async for item_or_request in result :
218- self ._process_output_item_or_request (item_or_request , spider )
245+ self ._process_output_item_or_request (item_or_request )
219246 yield item_or_request
220247
221248
@@ -230,22 +257,24 @@ def __init__(self, crawler):
230257 )
231258 self ._param_parser = _ParamParser (crawler , cookies_enabled = False )
232259
233- def process_spider_output (self , response , result , spider ):
260+ def process_spider_output (self , response , result , spider : Spider | None = None ):
234261 for item_or_request in result :
235- self ._process_output_item_or_request (item_or_request , spider )
262+ self ._process_output_item_or_request (item_or_request )
236263 yield item_or_request
237264
238- async def process_spider_output_async (self , response , result , spider ):
265+ async def process_spider_output_async (
266+ self , response , result , spider : Spider | None = None
267+ ):
239268 async for item_or_request in result :
240- self ._process_output_item_or_request (item_or_request , spider )
269+ self ._process_output_item_or_request (item_or_request )
241270 yield item_or_request
242271
243- def _process_output_item_or_request (self , item_or_request , spider ):
272+ def _process_output_item_or_request (self , item_or_request ):
244273 if not isinstance (item_or_request , Request ):
245274 return
246- self ._process_output_request (item_or_request , spider )
275+ self ._process_output_request (item_or_request )
247276
248- def _process_output_request (self , request , spider ):
277+ def _process_output_request (self , request : Request ):
249278 if self ._is_zyte_api_request (request ):
250279 request .meta .setdefault ("referrer_policy" , self ._default_policy )
251280