@@ -37,35 +37,13 @@ class ZyteSmartProxyMiddleware(object):
3737 connection_refused_delay = 90
3838 preserve_delay = False
3939 header_prefix = "X-Crawlera-" # Deprecated
40- header_lowercase_prefixes = ("zyte-" , "x-crawlera-" )
40+ header_lowercase_prefixes = (b "zyte-" , b "x-crawlera-" )
4141 conflicting_headers = ("X-Crawlera-Profile" , "X-Crawlera-UA" )
4242 backoff_step = 15
4343 backoff_max = 180
4444 exp_backoff = None
45- force_enable_on_http_codes = [] # type: List[int]
4645 max_auth_retry_times = 10
47- enabled_for_domain = {} # type: Dict[str, bool]
4846 apikey = ""
49- zyte_api_to_spm_translations = {
50- b"zyte-device" : b"x-crawlera-profile" ,
51- b"zyte-geolocation" : b"x-crawlera-region" ,
52- b"zyte-jobid" : b"x-crawlera-jobid" ,
53- b"zyte-override-headers" : b"x-crawlera-profile-pass" ,
54- }
55- spm_to_zyte_api_translations = {
56- v : k for k , v in zyte_api_to_spm_translations .items ()
57- }
58-
59- _settings = [
60- ("apikey" , str ),
61- ("url" , str ),
62- ("maxbans" , int ),
63- ("download_timeout" , int ),
64- ("preserve_delay" , bool ),
65- ("backoff_step" , int ),
66- ("backoff_max" , int ),
67- ("force_enable_on_http_codes" , list ),
68- ]
6947
7048 def __init__ (self , crawler ):
7149 self .crawler = crawler
@@ -74,9 +52,40 @@ def __init__(self, crawler):
7452 self ._bans = defaultdict (int )
7553 self ._saved_delays = defaultdict (lambda : None )
7654 self ._auth_url = None
55+ self .enabled_for_domain = {} # type: Dict[str, bool]
56+ self .force_enable_on_http_codes = [] # type: List[int]
57+ self .zyte_api_to_spm_translations = {
58+ b"zyte-device" : b"x-crawlera-profile" ,
59+ b"zyte-geolocation" : b"x-crawlera-region" ,
60+ b"zyte-jobid" : b"x-crawlera-jobid" ,
61+ b"zyte-override-headers" : b"x-crawlera-profile-pass" ,
62+ }
63+ self ._settings = [
64+ ("apikey" , str ),
65+ ("url" , str ),
66+ ("maxbans" , int ),
67+ ("download_timeout" , int ),
68+ ("preserve_delay" , bool ),
69+ ("backoff_step" , int ),
70+ ("backoff_max" , int ),
71+ ("force_enable_on_http_codes" , list ),
72+ ]
7773 # Keys are proxy URLs, values are booleans (True means Zyte API, False
7874 # means Zyte Smart Proxy Manager).
7975 self ._targets = {}
76+ # SPM headers that can be used with Zyte API proxy mode
77+ # https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping
78+ self .spm_bc_headers = [
79+ b"x-crawlera-cookies" ,
80+ b"x-crawlera-jobid" ,
81+ b"x-crawlera-profile" ,
82+ b"x-crawlera-profile-pass" ,
83+ b"x-crawlera-region" ,
84+ b"x-crawlera-session" ,
85+ ]
86+ self ._keep_headers = crawler .settings .getbool (
87+ "ZYTE_SMARTPROXY_KEEP_HEADERS" , False
88+ )
8089
8190 @classmethod
8291 def from_crawler (cls , crawler ):
@@ -230,15 +239,14 @@ def _targets_zyte_api(self, request):
230239 return targets_zyte_api
231240
232241 def _translate_headers (self , request , targets_zyte_api ):
233- translation_dict = (
234- self .spm_to_zyte_api_translations
235- if targets_zyte_api
236- else self .zyte_api_to_spm_translations
237- )
238- for header , translation in translation_dict .items ():
242+ if targets_zyte_api :
243+ return
244+ for header , translation in self .zyte_api_to_spm_translations .items ():
239245 if header not in request .headers :
240246 continue
241- request .headers [translation ] = value = request .headers .pop (header )
247+ values = request .headers .pop (header )
248+ value = b"" .join (values )
249+ request .headers [translation ] = value
242250 logger .warning (
243251 "Translating header %r (%r) to %r on request %r" ,
244252 header ,
@@ -287,11 +295,12 @@ def process_request(self, request, spider):
287295 "request/method/{}" .format (request .method ),
288296 targets_zyte_api = targets_zyte_api ,
289297 )
290- self ._translate_headers (request , targets_zyte_api = targets_zyte_api )
291- self ._clean_zyte_smartproxy_headers (
292- request , targets_zyte_api = targets_zyte_api
293- )
294- else :
298+ if not self ._keep_headers :
299+ self ._translate_headers (request , targets_zyte_api = targets_zyte_api )
300+ self ._clean_zyte_smartproxy_headers (
301+ request , targets_zyte_api = targets_zyte_api
302+ )
303+ elif not self ._keep_headers :
295304 self ._clean_zyte_smartproxy_headers (request )
296305
297306 def _is_banned (self , response ):
@@ -311,7 +320,7 @@ def _throttle_error(self, response):
311320 "X-Crawlera-Error"
312321 )
313322 if response .status in {429 , 503 } and error and error != b"banned" :
314- return error .decode ()
323+ return error .decode ("utf-8" )
315324 return None
316325
317326 def _process_error (self , response ):
@@ -513,16 +522,15 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None):
513522 if targets_zyte_api is None :
514523 prefixes = self .header_lowercase_prefixes
515524 elif targets_zyte_api :
516- prefixes = ("x-crawlera-" ,)
525+ prefixes = (b "x-crawlera-" ,)
517526 else :
518- prefixes = ("zyte-" ,)
527+ prefixes = (b "zyte-" ,)
519528 targets = [
520- header
521- for header in request .headers
522- if self ._is_zyte_smartproxy_header (header , prefixes )
529+ header for header in request .headers if self ._drop_header (header , prefixes )
523530 ]
524531 for header in targets :
525- value = request .headers .pop (header , None )
532+ values = request .headers .pop (header , None )
533+ value = b"" .join (values )
526534 if targets_zyte_api is not None :
527535 actual_target , header_target = (
528536 ("Zyte API" , "Zyte Smart Proxy Manager" )
@@ -546,12 +554,43 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None):
546554 actual_target ,
547555 header_target ,
548556 )
557+ else :
558+ logger .warning (
559+ (
560+ "Dropping header {header!r} ({value!r}) from request "
561+ "{request!r}, as this request is not handled by "
562+ "scrapy-zyte-smartproxy. If you are sure that you need "
563+ "to send this header in a request not handled by "
564+ "scrapy-zyte-smartproxy, use the "
565+ "ZYTE_SMARTPROXY_KEEP_HEADERS setting."
566+ ).format (
567+ header = header ,
568+ value = value ,
569+ request = request ,
570+ )
571+ )
549572
550- def _is_zyte_smartproxy_header (self , header_name , prefixes ):
573+ def _drop_header (self , header_name , prefixes ):
551574 if not header_name :
552575 return False
553- header_name = header_name .decode ("utf-8" ).lower ()
554- return any (header_name .startswith (prefix ) for prefix in prefixes )
576+ header_name_lowercase = header_name .lower ()
577+ has_drop_prefix = any (
578+ header_name_lowercase .startswith (prefix ) for prefix in prefixes
579+ )
580+ if (
581+ has_drop_prefix
582+ # When dropping all prefixes, always drop matching headers, i.e.
583+ # ignore self.spm_bc_headers.
584+ and len (prefixes ) <= 1
585+ and header_name_lowercase in self .spm_bc_headers
586+ ):
587+ logger .warning (
588+ "Keeping deprecated header {header_name!r}." .format (
589+ header_name = header_name
590+ )
591+ )
592+ return False
593+ return has_drop_prefix
555594
556595 def _set_zyte_smartproxy_default_headers (self , request ):
557596 for header , value in self ._headers :
0 commit comments