Commit cbe4a8e

Fix compatibility with Zyte API proxy mode in response handling (#117)
1 parent 5a29d34 commit cbe4a8e

2 files changed, +83 -18 lines

scrapy_zyte_smartproxy/middleware.py

Lines changed: 7 additions & 3 deletions
@@ -288,7 +288,7 @@ def process_response(self, request, response, spider):
         if not self._is_enabled_for_request(request):
             return self._handle_not_enabled_response(request, response)

-        if not self._is_zyte_smartproxy_response(response):
+        if not self._is_zyte_smartproxy_or_zapi_response(response):
             return response

         key = self._get_slot_key(request)
@@ -388,8 +388,12 @@ def _get_url_domain(self, url):
         parsed = urlparse(url)
         return parsed.netloc

-    def _is_zyte_smartproxy_response(self, response):
-        return bool("X-Crawlera-Version" in response.headers)
+    def _is_zyte_smartproxy_or_zapi_response(self, response):
+        return (
+            "X-Crawlera-Version" in response.headers
+            or "Zyte-Request-Id" in response.headers
+            or "zyte-error-type" in response.headers
+        )

     def _get_slot_key(self, request):
         return request.meta.get('download_slot')
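
For context (not part of the commit): with the change above, a response is handled by the middleware whenever it carries the Smart Proxy Manager header X-Crawlera-Version or one of the Zyte API proxy-mode headers Zyte-Request-Id / zyte-error-type. A minimal standalone sketch of that check, assuming only Scrapy is installed; looks_like_zyte_response is a hypothetical helper name used for illustration, not the middleware's API:

from scrapy.http import Response

def looks_like_zyte_response(response):
    # Smart Proxy Manager responses carry X-Crawlera-Version; Zyte API
    # proxy-mode responses carry Zyte-Request-Id (or zyte-error-type on errors).
    return (
        "X-Crawlera-Version" in response.headers
        or "Zyte-Request-Id" in response.headers
        or "zyte-error-type" in response.headers
    )

spm = Response("https://example.com", headers={"X-Crawlera-Version": "1.36.3-cd5e44"})
zapi = Response("https://example.com", headers={"Zyte-Request-Id": "123456789"})
plain = Response("https://example.com")
print(looks_like_zyte_response(spm), looks_like_zyte_response(zapi), looks_like_zyte_response(plain))
# -> True True False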

tests/test_all.py

Lines changed: 76 additions & 15 deletions
@@ -21,6 +21,15 @@
 from scrapy_zyte_smartproxy.utils import exp_backoff


+RESPONSE_IDENTIFYING_HEADERS = (
+    ("X-Crawlera-Version", None),
+    ("X-Crawlera-Version", ""),
+    ("X-Crawlera-Version", "1.36.3-cd5e44"),
+    ("Zyte-Request-Id", "123456789"),
+    ("zyte-error-type", "foo"),
+)
+
+
 class MockedSlot(object):

     def __init__(self, delay=0.0):
@@ -45,11 +54,10 @@ def Response_init_new(self, *args, **kwargs):
         Response.__init__ = Response_init_new

     def _mock_zyte_smartproxy_response(self, url, headers=None, **kwargs):
-        zyte_smartproxy_version = choice(("1.36.3-cd5e44", "", None))
-        zyte_smartproxy_headers = {"X-Crawlera-Version": zyte_smartproxy_version}
-        if headers:
-            zyte_smartproxy_headers.update(headers)
-        return Response(url, headers=zyte_smartproxy_headers, **kwargs)
+        headers = headers or {}
+        k, v = choice(RESPONSE_IDENTIFYING_HEADERS)
+        headers[k] = v
+        return Response(url, headers=headers, **kwargs)

     def _mock_crawler(self, spider, settings=None):

@@ -93,25 +101,34 @@ def _assert_enabled(self, spider,
         crawler = self._mock_crawler(spider, settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(spider)
+        httpproxy = HttpProxyMiddleware.from_crawler(crawler)
         assert mw.url == proxyurl
         req = Request('http://example.com')
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta.get('proxy'), proxyurlcreds)
         self.assertEqual(req.meta.get('download_timeout'), download_timeout)
         self.assertNotIn(b'Proxy-Authorization', req.headers)
+
         res = self._mock_zyte_smartproxy_response(req.url)
         assert mw.process_response(req, res, spider) is res

         # disabled if 'dont_proxy=True' is set
         req = Request('http://example.com')
         req.meta['dont_proxy'] = True
         assert mw.process_request(req, spider) is None
+        assert httpproxy.process_request(req, spider) is None
         self.assertEqual(req.meta.get('proxy'), None)
         self.assertEqual(req.meta.get('download_timeout'), None)
-        self.assertEqual(req.headers.get('Proxy-Authorization'), None)
+        self.assertNotIn(b'Proxy-Authorization', req.headers)
         res = self._mock_zyte_smartproxy_response(req.url)
         assert mw.process_response(req, res, spider) is res
+
         del req.meta['dont_proxy']
+        assert mw.process_request(req, spider) is None
+        assert httpproxy.process_request(req, spider) is None
+        self.assertEqual(req.meta.get('proxy'), proxyurl)
+        self.assertEqual(req.meta.get('download_timeout'), download_timeout)
+        self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)

         if maxbans > 0:
             # assert ban count is reseted after a succesful response
@@ -177,7 +194,9 @@ def test_apikey(self):
         proxyauth = basic_auth_header(apikey, '')
         self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://apikey:@proxy.zyte.com:8011')

-        self.spider.zyte_smartproxy_apikey = 'notfromsettings'
+        apikey = 'notfromsettings'
+        proxyauth = basic_auth_header(apikey, '')
+        self.spider.zyte_smartproxy_apikey = apikey
         self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://notfromsettings:@proxy.zyte.com:8011')

     def test_proxyurl(self):
@@ -262,6 +281,7 @@ def test_delay_adjustment(self):
         self.spider.download_delay = delay
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(self.spider)
+        httpproxy = HttpProxyMiddleware.from_crawler(crawler)
         self.assertEqual(self.spider.download_delay, 0)

         # preserve original delay
@@ -276,6 +296,8 @@ def test_delay_adjustment(self):

         # ban without retry-after
         req = Request(url, meta={'download_slot': slot_key})
+        assert mw.process_request(req, self.spider) is None
+        assert httpproxy.process_request(req, self.spider) is None
         headers = {'X-Crawlera-Error': 'banned'}
         res = self._mock_zyte_smartproxy_response(
             ban_url,
@@ -376,7 +398,7 @@ def test_jobid_header(self):
         req3 = Request(
             'http://example.com',
             meta={
-                "proxy": "http://apikey@api.zyte.com:8011",
+                "proxy": "http://apikey:@api.zyte.com:8011",
             },
         )
         self.assertEqual(mw3.process_request(req3, self.spider), None)
@@ -389,9 +411,11 @@ def test_stats(self):
         crawler = self._mock_crawler(spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(spider)
+        httpproxy = HttpProxyMiddleware.from_crawler(crawler)

         req = Request('http://example.com')
         assert mw.process_request(req, spider) is None
+        assert httpproxy.process_request(req, spider) is None
         self.assertEqual(crawler.stats.get_value('zyte_smartproxy/request'), 1)
         self.assertEqual(crawler.stats.get_value('zyte_smartproxy/request/method/GET'), 1)

@@ -402,6 +426,7 @@ def test_stats(self):

         req = Request('http://example.com/other', method='POST')
         assert mw.process_request(req, spider) is None
+        assert httpproxy.process_request(req, spider) is None
         self.assertEqual(crawler.stats.get_value('zyte_smartproxy/request'), 2)
         self.assertEqual(crawler.stats.get_value('zyte_smartproxy/request/method/POST'), 1)

@@ -433,6 +458,7 @@ def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs):
         crawler = self._mock_crawler(spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(spider)
+        httpproxy = HttpProxyMiddleware.from_crawler(crawler)
         headers = {
             'X-Crawlera-Debug': True,
             'X-Crawlera-Foo': "foo",
@@ -444,7 +470,8 @@ def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs):
             'Zyte-Geolocation': 'foo',
         }
         req = Request('http://example.com', headers=headers, **kwargs)
-        out = mw.process_request(req, spider)
+        mw.process_request(req, spider)
+        httpproxy.process_request(req, spider)
         return req

     def test_clean_headers_when_disabled(self):
@@ -470,7 +497,7 @@ def test_clean_headers_when_enabled_spm(self):
         self.assertIn(b'User-Agent', req.headers)

     def test_clean_headers_when_enabled_zyte_api(self):
-        meta = {"proxy": "http://apikey@api.zyte.com:8011"}
+        meta = {"proxy": "http://apikey:@api.zyte.com:8011"}
         req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=True, meta=meta)
         self.assertNotIn(b'X-Crawlera-Debug', req.headers)
         self.assertNotIn(b'X-Crawlera-Foo', req.headers)
@@ -499,7 +526,7 @@ def test_zyte_smartproxy_default_headers(self):
         # Header translation
         req = Request(
             'http://example.com/other',
-            meta={"proxy": "http://apikey@api.zyte.com:8011"},
+            meta={"proxy": "http://apikey:@api.zyte.com:8011"},
         )
         assert mw.process_request(req, spider) is None
         self.assertNotIn('X-Crawlera-Profile', req.headers)
@@ -620,11 +647,14 @@ def test_noslaves_delays(self, random_uniform_patch):
         crawler = self._mock_crawler(self.spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(self.spider)
+        httpproxy = HttpProxyMiddleware.from_crawler(crawler)

         slot = MockedSlot()
         crawler.engine.downloader.slots[slot_key] = slot

         noslaves_req = Request(url, meta={'download_slot': slot_key})
+        assert mw.process_request(noslaves_req, self.spider) is None
+        assert httpproxy.process_request(noslaves_req, self.spider) is None
         headers = {'X-Crawlera-Error': 'noslaves'}
         noslaves_res = self._mock_zyte_smartproxy_response(
             ban_url,
@@ -647,6 +677,8 @@ def test_noslaves_delays(self, random_uniform_patch):

         # other responses reset delay
         ban_req = Request(url, meta={'download_slot': slot_key})
+        assert mw.process_request(ban_req, self.spider) is None
+        assert httpproxy.process_request(ban_req, self.spider) is None
         ban_headers = {'X-Crawlera-Error': 'banned'}
         ban_res = self._mock_zyte_smartproxy_response(
             ban_url,
@@ -660,6 +692,8 @@ def test_noslaves_delays(self, random_uniform_patch):
         self.assertEqual(slot.delay, backoff_step)

         good_req = Request(url, meta={'download_slot': slot_key})
+        assert mw.process_request(good_req, self.spider) is None
+        assert httpproxy.process_request(good_req, self.spider) is None
         good_res = self._mock_zyte_smartproxy_response(
             url,
             status=200,
@@ -687,11 +721,14 @@ def test_auth_error_retries(self, random_uniform_patch):
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(self.spider)
         mw.max_auth_retry_times = 4
+        httpproxy = HttpProxyMiddleware.from_crawler(crawler)

         slot = MockedSlot()
         crawler.engine.downloader.slots[slot_key] = slot

         auth_error_req = Request(url, meta={'download_slot': slot_key})
+        assert mw.process_request(auth_error_req, self.spider) is None
+        assert httpproxy.process_request(auth_error_req, self.spider) is None
         auth_error_headers = {'X-Crawlera-Error': 'bad_proxy_auth'}
         auth_error_response = self._mock_zyte_smartproxy_response(
             ban_url,
@@ -978,7 +1015,7 @@ def test_client_header(self):
         req2 = Request(
             'http://example.com',
             meta={
-                "proxy": "http://apikey@api.zyte.com:8011",
+                "proxy": "http://apikey:@api.zyte.com:8011",
             },
         )
         self.assertEqual(mw.process_request(req2, self.spider), None)
@@ -1081,7 +1118,7 @@ def test_header_translation(self):
         request = Request(
             "https://example.com",
             headers={header: value},
-            meta={"proxy": "http://apikey@api.zyte.com:8011"},
+            meta={"proxy": "http://apikey:@api.zyte.com:8011"},
         )
         self.assertEqual(mw.process_request(request, self.spider), None)
         self.assertNotIn(header, request.headers)
@@ -1111,7 +1148,7 @@ def test_header_drop_warnings(self, mock_logger):
         request = Request(
             "https://example.com",
             headers={"X-Crawlera-Profile": "desktop"},
-            meta={"proxy": "http://apikey@api.zyte.com:8011"},
+            meta={"proxy": "http://apikey:@api.zyte.com:8011"},
         )
         self.assertEqual(mw.process_request(request, self.spider), None)
         mock_logger.warning.assert_called_with(
@@ -1149,7 +1186,7 @@ def test_header_drop_warnings(self, mock_logger):
         request = Request(
             "https://example.com",
             headers={"X-Crawlera-Foo": "bar"},
-            meta={"proxy": "http://apikey@api.zyte.com:8011"},
+            meta={"proxy": "http://apikey:@api.zyte.com:8011"},
         )
         self.assertEqual(mw.process_request(request, self.spider), None)
         mock_logger.warning.assert_called_with(
@@ -1180,6 +1217,30 @@ def test_header_drop_warnings(self, mock_logger):
         self.assertEqual(mw.process_request(request, self.spider), None)
         mock_logger.warning.assert_not_called()  # No warnings for "drop all" scenarios

+    def test_header_based_handling(self):
+        self.spider.zyte_smartproxy_enabled = True
+        spider = self.spider
+        crawler = self._mock_crawler(spider, self.settings)
+        mw = self.mwcls.from_crawler(crawler)
+        mw.open_spider(spider)
+        httpproxy = HttpProxyMiddleware.from_crawler(crawler)
+
+        req = Request('http://example.com')
+        assert mw.process_request(req, spider) is None
+        assert httpproxy.process_request(req, spider) is None
+
+        count = 0
+        res = Response(req.url)
+        assert mw.process_response(req, res, spider) is res
+        self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), None)
+
+        for k, v in RESPONSE_IDENTIFYING_HEADERS:
+            count += 1
+            res = Response(req.url, headers={k: v})
+            assert mw.process_response(req, res, spider) is res
+            self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), count)
+
+
     def test_meta_copy(self):
         """Warn when users copy the proxy key from one response to the next."""
         self.spider.zyte_smartproxy_enabled = True
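
A second bit of context, also not from the commit: the test changes above run Scrapy's built-in HttpProxyMiddleware right after the Smart Proxy middleware because, in a real crawl, HttpProxyMiddleware is what strips the credentials embedded in meta['proxy'] (hence the switch to apikey:@ proxy URLs) and turns them into a Proxy-Authorization header. A rough sketch of that behaviour on its own, assuming a recent Scrapy version:

from scrapy import Request, Spider
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

httpproxy = HttpProxyMiddleware()
spider = Spider(name="demo")  # placeholder spider; process_request does not use it

req = Request(
    "https://example.com",
    meta={"proxy": "http://apikey:@proxy.zyte.com:8011"},
)
httpproxy.process_request(req, spider)

print(req.meta["proxy"])                      # http://proxy.zyte.com:8011 (credentials stripped)
print(b"Proxy-Authorization" in req.headers)  # True (basic auth built from "apikey:")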
