Skip to content

Commit 614cb03

Browse files
authored
Keep backward-compatible headers, allow disabling header translation (#128)
1 parent 93b1469 commit 614cb03

File tree

4 files changed

+323
-251
lines changed

4 files changed

+323
-251
lines changed

docs/settings.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,10 @@ service for the corresponding domain.
8585
When a response with one of these HTTP status codes is received after an
8686
unproxied request, the request is retried with your Zyte proxy service, and any
8787
new request to the same domain is also proxied.
88+
89+
ZYTE_SMARTPROXY_KEEP_HEADERS
90+
----------------------------
91+
92+
Default: ``False``
93+
94+
If ``True``, header dropping and translation are disabled.

scrapy_zyte_smartproxy/middleware.py

Lines changed: 84 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -37,35 +37,13 @@ class ZyteSmartProxyMiddleware(object):
3737
connection_refused_delay = 90
3838
preserve_delay = False
3939
header_prefix = "X-Crawlera-" # Deprecated
40-
header_lowercase_prefixes = ("zyte-", "x-crawlera-")
40+
header_lowercase_prefixes = (b"zyte-", b"x-crawlera-")
4141
conflicting_headers = ("X-Crawlera-Profile", "X-Crawlera-UA")
4242
backoff_step = 15
4343
backoff_max = 180
4444
exp_backoff = None
45-
force_enable_on_http_codes = [] # type: List[int]
4645
max_auth_retry_times = 10
47-
enabled_for_domain = {} # type: Dict[str, bool]
4846
apikey = ""
49-
zyte_api_to_spm_translations = {
50-
b"zyte-device": b"x-crawlera-profile",
51-
b"zyte-geolocation": b"x-crawlera-region",
52-
b"zyte-jobid": b"x-crawlera-jobid",
53-
b"zyte-override-headers": b"x-crawlera-profile-pass",
54-
}
55-
spm_to_zyte_api_translations = {
56-
v: k for k, v in zyte_api_to_spm_translations.items()
57-
}
58-
59-
_settings = [
60-
("apikey", str),
61-
("url", str),
62-
("maxbans", int),
63-
("download_timeout", int),
64-
("preserve_delay", bool),
65-
("backoff_step", int),
66-
("backoff_max", int),
67-
("force_enable_on_http_codes", list),
68-
]
6947

7048
def __init__(self, crawler):
7149
self.crawler = crawler
@@ -74,9 +52,40 @@ def __init__(self, crawler):
7452
self._bans = defaultdict(int)
7553
self._saved_delays = defaultdict(lambda: None)
7654
self._auth_url = None
55+
self.enabled_for_domain = {} # type: Dict[str, bool]
56+
self.force_enable_on_http_codes = [] # type: List[int]
57+
self.zyte_api_to_spm_translations = {
58+
b"zyte-device": b"x-crawlera-profile",
59+
b"zyte-geolocation": b"x-crawlera-region",
60+
b"zyte-jobid": b"x-crawlera-jobid",
61+
b"zyte-override-headers": b"x-crawlera-profile-pass",
62+
}
63+
self._settings = [
64+
("apikey", str),
65+
("url", str),
66+
("maxbans", int),
67+
("download_timeout", int),
68+
("preserve_delay", bool),
69+
("backoff_step", int),
70+
("backoff_max", int),
71+
("force_enable_on_http_codes", list),
72+
]
7773
# Keys are proxy URLs, values are booleans (True means Zyte API, False
7874
# means Zyte Smart Proxy Manager).
7975
self._targets = {}
76+
# SPM headers that can be used with Zyte API proxy mode
77+
# https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping
78+
self.spm_bc_headers = [
79+
b"x-crawlera-cookies",
80+
b"x-crawlera-jobid",
81+
b"x-crawlera-profile",
82+
b"x-crawlera-profile-pass",
83+
b"x-crawlera-region",
84+
b"x-crawlera-session",
85+
]
86+
self._keep_headers = crawler.settings.getbool(
87+
"ZYTE_SMARTPROXY_KEEP_HEADERS", False
88+
)
8089

8190
@classmethod
8291
def from_crawler(cls, crawler):
@@ -230,15 +239,14 @@ def _targets_zyte_api(self, request):
230239
return targets_zyte_api
231240

232241
def _translate_headers(self, request, targets_zyte_api):
233-
translation_dict = (
234-
self.spm_to_zyte_api_translations
235-
if targets_zyte_api
236-
else self.zyte_api_to_spm_translations
237-
)
238-
for header, translation in translation_dict.items():
242+
if targets_zyte_api:
243+
return
244+
for header, translation in self.zyte_api_to_spm_translations.items():
239245
if header not in request.headers:
240246
continue
241-
request.headers[translation] = value = request.headers.pop(header)
247+
values = request.headers.pop(header)
248+
value = b"".join(values)
249+
request.headers[translation] = value
242250
logger.warning(
243251
"Translating header %r (%r) to %r on request %r",
244252
header,
@@ -287,11 +295,12 @@ def process_request(self, request, spider):
287295
"request/method/{}".format(request.method),
288296
targets_zyte_api=targets_zyte_api,
289297
)
290-
self._translate_headers(request, targets_zyte_api=targets_zyte_api)
291-
self._clean_zyte_smartproxy_headers(
292-
request, targets_zyte_api=targets_zyte_api
293-
)
294-
else:
298+
if not self._keep_headers:
299+
self._translate_headers(request, targets_zyte_api=targets_zyte_api)
300+
self._clean_zyte_smartproxy_headers(
301+
request, targets_zyte_api=targets_zyte_api
302+
)
303+
elif not self._keep_headers:
295304
self._clean_zyte_smartproxy_headers(request)
296305

297306
def _is_banned(self, response):
@@ -311,7 +320,7 @@ def _throttle_error(self, response):
311320
"X-Crawlera-Error"
312321
)
313322
if response.status in {429, 503} and error and error != b"banned":
314-
return error.decode()
323+
return error.decode("utf-8")
315324
return None
316325

317326
def _process_error(self, response):
@@ -513,16 +522,15 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None):
513522
if targets_zyte_api is None:
514523
prefixes = self.header_lowercase_prefixes
515524
elif targets_zyte_api:
516-
prefixes = ("x-crawlera-",)
525+
prefixes = (b"x-crawlera-",)
517526
else:
518-
prefixes = ("zyte-",)
527+
prefixes = (b"zyte-",)
519528
targets = [
520-
header
521-
for header in request.headers
522-
if self._is_zyte_smartproxy_header(header, prefixes)
529+
header for header in request.headers if self._drop_header(header, prefixes)
523530
]
524531
for header in targets:
525-
value = request.headers.pop(header, None)
532+
values = request.headers.pop(header, None)
533+
value = b"".join(values)
526534
if targets_zyte_api is not None:
527535
actual_target, header_target = (
528536
("Zyte API", "Zyte Smart Proxy Manager")
@@ -546,12 +554,43 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None):
546554
actual_target,
547555
header_target,
548556
)
557+
else:
558+
logger.warning(
559+
(
560+
"Dropping header {header!r} ({value!r}) from request "
561+
"{request!r}, as this request is not handled by "
562+
"scrapy-zyte-smartproxy. If you are sure that you need "
563+
"to send this header in a request not handled by "
564+
"scrapy-zyte-smartproxy, use the "
565+
"ZYTE_SMARTPROXY_KEEP_HEADERS setting."
566+
).format(
567+
header=header,
568+
value=value,
569+
request=request,
570+
)
571+
)
549572

550-
def _is_zyte_smartproxy_header(self, header_name, prefixes):
573+
def _drop_header(self, header_name, prefixes):
    """Return whether *header_name* must be removed from a request.

    A header is a drop candidate when its lowercased name starts with one
    of *prefixes*. A candidate is nevertheless kept when at most one prefix
    is given and the lowercased name is listed in ``self.spm_bc_headers``;
    in that case a warning about the deprecated header is logged.

    :param header_name: header name to check; an empty or ``None`` name is
        never dropped. Presumably ``bytes``, like the prefixes and the
        entries of ``self.spm_bc_headers`` -- confirm against callers.
    :param prefixes: sized collection of lowercase prefixes to match.
    :return: ``True`` if the header should be dropped, ``False`` otherwise.
    """
    if not header_name:
        return False
    header_name_lowercase = header_name.lower()
    # startswith() accepts a tuple of candidates, so a single call covers
    # every prefix (and yields False for an empty collection).
    if not header_name_lowercase.startswith(tuple(prefixes)):
        return False
    # The backward-compatible header list is only honored when a single
    # prefix is being dropped; when dropping all prefixes (more than one
    # given), matching headers are always dropped.
    if len(prefixes) <= 1 and header_name_lowercase in self.spm_bc_headers:
        # Lazy %-style arguments: the message is only rendered if the
        # record is actually emitted.
        logger.warning("Keeping deprecated header %r.", header_name)
        return False
    return True
555594

556595
def _set_zyte_smartproxy_default_headers(self, request):
557596
for header, value in self._headers:

0 commit comments

Comments
 (0)