@@ -86,7 +86,7 @@ def _make_auth_url(self, spider):
8686 auth = self .get_proxyauth (spider )
8787 if not auth .startswith (b'Basic ' ):
8888 raise ValueError (
89- 'Zyte Smart Proxy Manager only supports HTTP basic access '
89+ 'Zyte proxy services only support HTTP basic access '
9090 'authentication, but %s.%s.get_proxyauth() returned %r'
9191 % (self .__module__ , self .__class__ .__name__ , auth )
9292 )
@@ -111,7 +111,7 @@ def open_spider(self, spider):
111111
112112 if not self .apikey :
113113 logger .warning (
114- "Zyte Smart Proxy Manager cannot be used without an API key" ,
114+ "Zyte proxy services cannot be used without an API key" ,
115115 extra = {'spider' : spider },
116116 )
117117 return
@@ -120,7 +120,7 @@ def open_spider(self, spider):
120120 self ._authless_url = _remove_auth (self ._auth_url )
121121
122122 logger .info (
123- "Using Zyte Smart Proxy Manager at %s (apikey: %s) " % (
123+ "Using Zyte proxy service %s with an API key starting with %s " % (
124124 self .url , self .apikey [:7 ]
125125 ),
126126 extra = {'spider' : spider },
@@ -131,8 +131,8 @@ def open_spider(self, spider):
131131 spider .download_delay = 0
132132 logger .info (
133133 "ZyteSmartProxyMiddleware: disabling download delays in "
134- "Scrapy to optimize delays introduced by Zyte Smart Proxy "
135- "Manager. To avoid this behaviour you can use the "
134+ "Scrapy to optimize delays introduced by Zyte proxy services. "
135+ "To avoid this behaviour you can use the "
136136 "ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind "
137137 "that this may slow down the crawl significantly" ,
138138 extra = {'spider' : spider },
@@ -196,7 +196,9 @@ def get_proxyauth(self, spider):
196196 return basic_auth_header (self .apikey , '' )
197197
198198 def _targets_zyte_api (self , request ):
199- auth_url = request .meta ["proxy" ]
199+ if self ._auth_url is None :
200+ return False
201+ auth_url = request .meta .get ("proxy" , self ._auth_url )
200202 targets_zyte_api = self ._targets .get (auth_url , None )
201203 if targets_zyte_api is None :
202204 targets_zyte_api = urlparse (auth_url ).hostname == "api.zyte.com"
@@ -220,6 +222,10 @@ def _translate_headers(self, request, targets_zyte_api):
220222 request ,
221223 )
222224
225+ def _inc_stat (self , stat , targets_zyte_api , value = 1 ):
226+ prefix = "zyte_api_proxy" if targets_zyte_api else "zyte_smartproxy"
227+ self .crawler .stats .inc_value ("{}/{}" .format (prefix , stat ), value )
228+
223229 def process_request (self , request , spider ):
224230 if self ._is_enabled_for_request (request ):
225231 if 'proxy' not in request .meta :
@@ -246,8 +252,8 @@ def process_request(self, request, spider):
246252 user_agent_header = "Zyte-Client" if targets_zyte_api else "X-Crawlera-Client"
247253 from scrapy_zyte_smartproxy import __version__
248254 request .headers [user_agent_header ] = 'scrapy-zyte-smartproxy/%s' % __version__
249- self .crawler . stats . inc_value ( 'zyte_smartproxy/ request' )
250- self .crawler . stats . inc_value ( 'zyte_smartproxy/ request/method/%s' % request .method )
255+ self ._inc_stat ( " request" , targets_zyte_api = targets_zyte_api )
256+ self ._inc_stat ( " request/method/{}" . format ( request .method ), targets_zyte_api = targets_zyte_api )
251257 self ._translate_headers (request , targets_zyte_api = targets_zyte_api )
252258 self ._clean_zyte_smartproxy_headers (request , targets_zyte_api = targets_zyte_api )
253259 else :
@@ -285,8 +291,10 @@ def _process_error(self, response):
285291 def process_response (self , request , response , spider ):
286292 zyte_smartproxy_error = self ._process_error (response )
287293
294+ targets_zyte_api = self ._targets_zyte_api (request )
295+
288296 if not self ._is_enabled_for_request (request ):
289- return self ._handle_not_enabled_response (request , response )
297+ return self ._handle_not_enabled_response (request , response , targets_zyte_api = targets_zyte_api )
290298
291299 if not self ._is_zyte_smartproxy_or_zapi_response (response ):
292300 return response
@@ -299,19 +307,19 @@ def process_response(self, request, response, spider):
299307 reason = 'noslaves'
300308 else :
301309 reason = 'autherror'
302- self ._set_custom_delay (request , next (self .exp_backoff ), reason = reason )
310+ self ._set_custom_delay (request , next (self .exp_backoff ), reason = reason , targets_zyte_api = targets_zyte_api )
303311 else :
304- self .crawler . stats . inc_value ( 'zyte_smartproxy/ delay/reset_backoff' )
312+ self ._inc_stat ( " delay/reset_backoff" , targets_zyte_api = targets_zyte_api )
305313 self .exp_backoff = exp_backoff (self .backoff_step , self .backoff_max )
306314
307315 if self ._is_auth_error (response ):
308316 # When Zyte Smart Proxy Manager has issues it might not be able to
309317 # authenticate users we must retry
310318 retries = request .meta .get ('zyte_smartproxy_auth_retry_times' , 0 )
311319 if retries < self .max_auth_retry_times :
312- return self ._retry_auth (response , request , spider )
320+ return self ._retry_auth (response , request , spider , targets_zyte_api = targets_zyte_api )
313321 else :
314- self .crawler . stats . inc_value ( 'zyte_smartproxy/ retries/auth/max_reached' )
322+ self ._inc_stat ( " retries/auth/max_reached" , targets_zyte_api = targets_zyte_api )
315323 logger .warning (
316324 "Max retries for authentication issues reached, please check auth"
317325 " information settings" ,
@@ -325,17 +333,17 @@ def process_response(self, request, response, spider):
325333 else :
326334 after = response .headers .get ('retry-after' )
327335 if after :
328- self ._set_custom_delay (request , float (after ), reason = 'banned' )
329- self .crawler . stats . inc_value ( 'zyte_smartproxy/ response/banned' )
336+ self ._set_custom_delay (request , float (after ), reason = 'banned' , targets_zyte_api = targets_zyte_api )
337+ self ._inc_stat ( " response/banned" , targets_zyte_api = targets_zyte_api )
330338 else :
331339 self ._bans [key ] = 0
332340 # If placed behind `RedirectMiddleware`, it would not count 3xx responses
333- self .crawler . stats . inc_value ( 'zyte_smartproxy/ response' )
334- self .crawler . stats . inc_value ( 'zyte_smartproxy/ response/status/%s' % response .status )
341+ self ._inc_stat ( " response" , targets_zyte_api = targets_zyte_api )
342+ self ._inc_stat ( " response/status/{}" . format ( response .status ), targets_zyte_api = targets_zyte_api )
335343 if zyte_smartproxy_error :
336- self .crawler . stats . inc_value ( 'zyte_smartproxy/ response/error' )
337- self . crawler . stats . inc_value (
338- 'zyte_smartproxy/ response/error/%s' % zyte_smartproxy_error . decode ( 'utf8' ) )
344+ self ._inc_stat ( " response/error" , targets_zyte_api = targets_zyte_api )
345+ error_msg = zyte_smartproxy_error . decode ( 'utf8' )
346+ self . _inc_stat ( " response/error/{}" . format ( error_msg ), targets_zyte_api = targets_zyte_api )
339347 return response
340348
341349 def process_exception (self , request , exception , spider ):
@@ -344,30 +352,33 @@ def process_exception(self, request, exception, spider):
344352 if isinstance (exception , (ConnectionRefusedError , ConnectionDone )):
345353 # Handle Zyte Smart Proxy Manager downtime
346354 self ._clear_dns_cache ()
347- self ._set_custom_delay (request , self .connection_refused_delay , reason = 'conn_refused' )
355+ targets_zyte_api = self ._targets_zyte_api (request )
356+ self ._set_custom_delay (request , self .connection_refused_delay , reason = 'conn_refused' , targets_zyte_api = targets_zyte_api )
348357
349- def _handle_not_enabled_response (self , request , response ):
358+ def _handle_not_enabled_response (self , request , response , targets_zyte_api ):
350359 if self ._should_enable_for_response (response ):
351360 domain = self ._get_url_domain (request .url )
352361 self .enabled_for_domain [domain ] = True
353362
354363 retryreq = request .copy ()
355364 retryreq .dont_filter = True
356- self .crawler . stats . inc_value ( 'zyte_smartproxy/ retries/should_have_been_enabled' )
365+ self ._inc_stat ( " retries/should_have_been_enabled" , targets_zyte_api = targets_zyte_api )
357366 return retryreq
358367 return response
359368
360- def _retry_auth (self , response , request , spider ):
369+ def _retry_auth (self , response , request , spider , targets_zyte_api ):
361370 logger .warning (
362- "Retrying a Zyte Smart Proxy Manager request due to an "
363- "authentication issue" ,
371+ (
372+ "Retrying a request due to an authentication issue with "
373+ "the configured Zyte proxy service"
374+ ),
364375 extra = {'spider' : self .spider },
365376 )
366377 retries = request .meta .get ('zyte_smartproxy_auth_retry_times' , 0 ) + 1
367378 retryreq = request .copy ()
368379 retryreq .meta ['zyte_smartproxy_auth_retry_times' ] = retries
369380 retryreq .dont_filter = True
370- self .crawler . stats . inc_value ( 'zyte_smartproxy/ retries/auth' )
381+ self ._inc_stat ( " retries/auth" , targets_zyte_api = targets_zyte_api )
371382 return retryreq
372383
373384 def _clear_dns_cache (self ):
@@ -402,7 +413,7 @@ def _get_slot(self, request):
402413 key = self ._get_slot_key (request )
403414 return key , self .crawler .engine .downloader .slots .get (key )
404415
405- def _set_custom_delay (self , request , delay , reason = None ):
416+ def _set_custom_delay (self , request , delay , targets_zyte_api , reason = None ):
406417 """Set custom delay for slot and save original one."""
407418 key , slot = self ._get_slot (request )
408419 if not slot :
@@ -411,8 +422,8 @@ def _set_custom_delay(self, request, delay, reason=None):
411422 self ._saved_delays [key ] = slot .delay
412423 slot .delay = delay
413424 if reason is not None :
414- self .crawler . stats . inc_value ( 'zyte_smartproxy/ delay/%s' % reason )
415- self .crawler . stats . inc_value ( 'zyte_smartproxy/ delay/%s /total' % reason , delay )
425+ self ._inc_stat ( " delay/{}" . format ( reason ), targets_zyte_api = targets_zyte_api )
426+ self ._inc_stat ( " delay/{} /total" . format ( reason ), value = delay , targets_zyte_api = targets_zyte_api )
416427
417428 def _restore_original_delay (self , request ):
418429 """Restore original delay for slot if it was changed."""
0 commit comments