2121from scrapy_zyte_smartproxy .utils import exp_backoff
2222
2323
# Header (name, value) pairs used by the tests in this module:
# ``_mock_zyte_smartproxy_response`` adds one of them, picked at random, to
# the response it builds, and ``test_header_based_handling`` expects every
# pair to bump the ``zyte_smartproxy/response`` stat. Note the ``None`` and
# empty-string values for ``X-Crawlera-Version``.
24+ RESPONSE_IDENTIFYING_HEADERS = (
25+ ("X-Crawlera-Version" , None ),
26+ ("X-Crawlera-Version" , "" ),
27+ ("X-Crawlera-Version" , "1.36.3-cd5e44" ),
28+ ("Zyte-Request-Id" , "123456789" ),
29+ ("zyte-error-type" , "foo" ),
30+ )
31+
32+
2433class MockedSlot (object ):
2534
2635 def __init__ (self , delay = 0.0 ):
@@ -45,11 +54,10 @@ def Response_init_new(self, *args, **kwargs):
4554 Response .__init__ = Response_init_new
4655
def _mock_zyte_smartproxy_response(self, url, headers=None, **kwargs):
    """Build a ``Response`` carrying one randomly chosen identifying header.

    :param url: URL passed through to ``Response``.
    :param headers: optional mapping of extra response headers. It is
        copied, never modified in place.
    :param kwargs: forwarded unchanged to ``Response`` (e.g. ``status``).
    :return: a ``Response`` whose headers are ``headers`` plus one
        ``(name, value)`` pair drawn from ``RESPONSE_IDENTIFYING_HEADERS``.
        The drawn pair is written last, so it replaces a caller-supplied
        entry stored under the same key.
    """
    # Work on a copy: inserting into the caller's dict would leak the random
    # identifying header into any later use of that same dict.
    response_headers = dict(headers) if headers else {}
    name, value = choice(RESPONSE_IDENTIFYING_HEADERS)
    response_headers[name] = value
    return Response(url, headers=response_headers, **kwargs)
5361
5462 def _mock_crawler (self , spider , settings = None ):
5563
@@ -93,25 +101,34 @@ def _assert_enabled(self, spider,
93101 crawler = self ._mock_crawler (spider , settings )
94102 mw = self .mwcls .from_crawler (crawler )
95103 mw .open_spider (spider )
104+ httpproxy = HttpProxyMiddleware .from_crawler (crawler )
96105 assert mw .url == proxyurl
97106 req = Request ('http://example.com' )
98107 assert mw .process_request (req , spider ) is None
99108 self .assertEqual (req .meta .get ('proxy' ), proxyurlcreds )
100109 self .assertEqual (req .meta .get ('download_timeout' ), download_timeout )
101110 self .assertNotIn (b'Proxy-Authorization' , req .headers )
111+
102112 res = self ._mock_zyte_smartproxy_response (req .url )
103113 assert mw .process_response (req , res , spider ) is res
104114
105115 # disabled if 'dont_proxy=True' is set
106116 req = Request ('http://example.com' )
107117 req .meta ['dont_proxy' ] = True
108118 assert mw .process_request (req , spider ) is None
119+ assert httpproxy .process_request (req , spider ) is None
109120 self .assertEqual (req .meta .get ('proxy' ), None )
110121 self .assertEqual (req .meta .get ('download_timeout' ), None )
111- self .assertEqual ( req . headers . get ( 'Proxy-Authorization' ), None )
122+ self .assertNotIn ( b 'Proxy-Authorization', req . headers )
112123 res = self ._mock_zyte_smartproxy_response (req .url )
113124 assert mw .process_response (req , res , spider ) is res
125+
114126 del req .meta ['dont_proxy' ]
127+ assert mw .process_request (req , spider ) is None
128+ assert httpproxy .process_request (req , spider ) is None
129+ self .assertEqual (req .meta .get ('proxy' ), proxyurl )
130+ self .assertEqual (req .meta .get ('download_timeout' ), download_timeout )
131+ self .assertEqual (req .headers .get ('Proxy-Authorization' ), proxyauth )
115132
116133 if maxbans > 0 :
117134 # assert ban count is reseted after a succesful response
@@ -177,7 +194,9 @@ def test_apikey(self):
177194 proxyauth = basic_auth_header (apikey , '' )
178195 self ._assert_enabled (self .spider , self .settings , proxyauth = proxyauth , proxyurlcreds = 'http://apikey:@proxy.zyte.com:8011' )
179196
180- self .spider .zyte_smartproxy_apikey = 'notfromsettings'
197+ apikey = 'notfromsettings'
198+ proxyauth = basic_auth_header (apikey , '' )
199+ self .spider .zyte_smartproxy_apikey = apikey
181200 self ._assert_enabled (self .spider , self .settings , proxyauth = proxyauth , proxyurlcreds = 'http://notfromsettings:@proxy.zyte.com:8011' )
182201
183202 def test_proxyurl (self ):
@@ -262,6 +281,7 @@ def test_delay_adjustment(self):
262281 self .spider .download_delay = delay
263282 mw = self .mwcls .from_crawler (crawler )
264283 mw .open_spider (self .spider )
284+ httpproxy = HttpProxyMiddleware .from_crawler (crawler )
265285 self .assertEqual (self .spider .download_delay , 0 )
266286
267287 # preserve original delay
@@ -276,6 +296,8 @@ def test_delay_adjustment(self):
276296
277297 # ban without retry-after
278298 req = Request (url , meta = {'download_slot' : slot_key })
299+ assert mw .process_request (req , self .spider ) is None
300+ assert httpproxy .process_request (req , self .spider ) is None
279301 headers = {'X-Crawlera-Error' : 'banned' }
280302 res = self ._mock_zyte_smartproxy_response (
281303 ban_url ,
@@ -376,7 +398,7 @@ def test_jobid_header(self):
376398 req3 = Request (
377399 'http://example.com' ,
378400 meta = {
379- "proxy" :
"http://[email protected] :8011" ,
401+ "proxy" : "http://apikey: @api.zyte.com:8011" ,
380402 },
381403 )
382404 self .assertEqual (mw3 .process_request (req3 , self .spider ), None )
@@ -389,9 +411,11 @@ def test_stats(self):
389411 crawler = self ._mock_crawler (spider , self .settings )
390412 mw = self .mwcls .from_crawler (crawler )
391413 mw .open_spider (spider )
414+ httpproxy = HttpProxyMiddleware .from_crawler (crawler )
392415
393416 req = Request ('http://example.com' )
394417 assert mw .process_request (req , spider ) is None
418+ assert httpproxy .process_request (req , spider ) is None
395419 self .assertEqual (crawler .stats .get_value ('zyte_smartproxy/request' ), 1 )
396420 self .assertEqual (crawler .stats .get_value ('zyte_smartproxy/request/method/GET' ), 1 )
397421
@@ -402,6 +426,7 @@ def test_stats(self):
402426
403427 req = Request ('http://example.com/other' , method = 'POST' )
404428 assert mw .process_request (req , spider ) is None
429+ assert httpproxy .process_request (req , spider ) is None
405430 self .assertEqual (crawler .stats .get_value ('zyte_smartproxy/request' ), 2 )
406431 self .assertEqual (crawler .stats .get_value ('zyte_smartproxy/request/method/POST' ), 1 )
407432
@@ -433,6 +458,7 @@ def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs):
433458 crawler = self ._mock_crawler (spider , self .settings )
434459 mw = self .mwcls .from_crawler (crawler )
435460 mw .open_spider (spider )
461+ httpproxy = HttpProxyMiddleware .from_crawler (crawler )
436462 headers = {
437463 'X-Crawlera-Debug' : True ,
438464 'X-Crawlera-Foo' : "foo" ,
@@ -444,7 +470,8 @@ def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs):
444470 'Zyte-Geolocation' : 'foo' ,
445471 }
446472 req = Request ('http://example.com' , headers = headers , ** kwargs )
447- out = mw .process_request (req , spider )
473+ mw .process_request (req , spider )
474+ httpproxy .process_request (req , spider )
448475 return req
449476
450477 def test_clean_headers_when_disabled (self ):
@@ -470,7 +497,7 @@ def test_clean_headers_when_enabled_spm(self):
470497 self .assertIn (b'User-Agent' , req .headers )
471498
472499 def test_clean_headers_when_enabled_zyte_api (self ):
473- meta = {
"proxy" :
"http://[email protected] :8011" }
500+ meta = {"proxy" : "http://apikey: @api.zyte.com:8011" }
474501 req = self ._make_fake_request (self .spider , zyte_smartproxy_enabled = True , meta = meta )
475502 self .assertNotIn (b'X-Crawlera-Debug' , req .headers )
476503 self .assertNotIn (b'X-Crawlera-Foo' , req .headers )
@@ -499,7 +526,7 @@ def test_zyte_smartproxy_default_headers(self):
499526 # Header translation
500527 req = Request (
501528 'http://example.com/other' ,
502- meta = {
"proxy" :
"http://[email protected] :8011" },
529+ meta = {"proxy" : "http://apikey: @api.zyte.com:8011" },
503530 )
504531 assert mw .process_request (req , spider ) is None
505532 self .assertNotIn ('X-Crawlera-Profile' , req .headers )
@@ -620,11 +647,14 @@ def test_noslaves_delays(self, random_uniform_patch):
620647 crawler = self ._mock_crawler (self .spider , self .settings )
621648 mw = self .mwcls .from_crawler (crawler )
622649 mw .open_spider (self .spider )
650+ httpproxy = HttpProxyMiddleware .from_crawler (crawler )
623651
624652 slot = MockedSlot ()
625653 crawler .engine .downloader .slots [slot_key ] = slot
626654
627655 noslaves_req = Request (url , meta = {'download_slot' : slot_key })
656+ assert mw .process_request (noslaves_req , self .spider ) is None
657+ assert httpproxy .process_request (noslaves_req , self .spider ) is None
628658 headers = {'X-Crawlera-Error' : 'noslaves' }
629659 noslaves_res = self ._mock_zyte_smartproxy_response (
630660 ban_url ,
@@ -647,6 +677,8 @@ def test_noslaves_delays(self, random_uniform_patch):
647677
648678 # other responses reset delay
649679 ban_req = Request (url , meta = {'download_slot' : slot_key })
680+ assert mw .process_request (ban_req , self .spider ) is None
681+ assert httpproxy .process_request (ban_req , self .spider ) is None
650682 ban_headers = {'X-Crawlera-Error' : 'banned' }
651683 ban_res = self ._mock_zyte_smartproxy_response (
652684 ban_url ,
@@ -660,6 +692,8 @@ def test_noslaves_delays(self, random_uniform_patch):
660692 self .assertEqual (slot .delay , backoff_step )
661693
662694 good_req = Request (url , meta = {'download_slot' : slot_key })
695+ assert mw .process_request (good_req , self .spider ) is None
696+ assert httpproxy .process_request (good_req , self .spider ) is None
663697 good_res = self ._mock_zyte_smartproxy_response (
664698 url ,
665699 status = 200 ,
@@ -687,11 +721,14 @@ def test_auth_error_retries(self, random_uniform_patch):
687721 mw = self .mwcls .from_crawler (crawler )
688722 mw .open_spider (self .spider )
689723 mw .max_auth_retry_times = 4
724+ httpproxy = HttpProxyMiddleware .from_crawler (crawler )
690725
691726 slot = MockedSlot ()
692727 crawler .engine .downloader .slots [slot_key ] = slot
693728
694729 auth_error_req = Request (url , meta = {'download_slot' : slot_key })
730+ assert mw .process_request (auth_error_req , self .spider ) is None
731+ assert httpproxy .process_request (auth_error_req , self .spider ) is None
695732 auth_error_headers = {'X-Crawlera-Error' : 'bad_proxy_auth' }
696733 auth_error_response = self ._mock_zyte_smartproxy_response (
697734 ban_url ,
@@ -978,7 +1015,7 @@ def test_client_header(self):
9781015 req2 = Request (
9791016 'http://example.com' ,
9801017 meta = {
981- "proxy" :
"http://[email protected] :8011" ,
1018+ "proxy" : "http://apikey: @api.zyte.com:8011" ,
9821019 },
9831020 )
9841021 self .assertEqual (mw .process_request (req2 , self .spider ), None )
@@ -1081,7 +1118,7 @@ def test_header_translation(self):
10811118 request = Request (
10821119 "https://example.com" ,
10831120 headers = {header : value },
1084- meta = {
"proxy" :
"http://[email protected] :8011" },
1121+ meta = {"proxy" : "http://apikey: @api.zyte.com:8011" },
10851122 )
10861123 self .assertEqual (mw .process_request (request , self .spider ), None )
10871124 self .assertNotIn (header , request .headers )
@@ -1111,7 +1148,7 @@ def test_header_drop_warnings(self, mock_logger):
11111148 request = Request (
11121149 "https://example.com" ,
11131150 headers = {"X-Crawlera-Profile" : "desktop" },
1114- meta = {
"proxy" :
"http://[email protected] :8011" },
1151+ meta = {"proxy" : "http://apikey: @api.zyte.com:8011" },
11151152 )
11161153 self .assertEqual (mw .process_request (request , self .spider ), None )
11171154 mock_logger .warning .assert_called_with (
@@ -1149,7 +1186,7 @@ def test_header_drop_warnings(self, mock_logger):
11491186 request = Request (
11501187 "https://example.com" ,
11511188 headers = {"X-Crawlera-Foo" : "bar" },
1152- meta = {
"proxy" :
"http://[email protected] :8011" },
1189+ meta = {"proxy" : "http://apikey: @api.zyte.com:8011" },
11531190 )
11541191 self .assertEqual (mw .process_request (request , self .spider ), None )
11551192 mock_logger .warning .assert_called_with (
@@ -1180,6 +1217,30 @@ def test_header_drop_warnings(self, mock_logger):
11801217 self .assertEqual (mw .process_request (request , self .spider ), None )
11811218 mock_logger .warning .assert_not_called () # No warnings for "drop all" scenarios
11821219
def test_header_based_handling(self):
    """Check that only responses with an identifying header are counted.

    A bare ``Response`` must leave the ``zyte_smartproxy/response`` stat
    unset, while each pair in ``RESPONSE_IDENTIFYING_HEADERS`` must
    increase it by one.
    """
    self.spider.zyte_smartproxy_enabled = True
    spider = self.spider
    crawler = self._mock_crawler(spider, self.settings)
    mw = self.mwcls.from_crawler(crawler)
    mw.open_spider(spider)
    httpproxy = HttpProxyMiddleware.from_crawler(crawler)

    req = Request('http://example.com')
    assert mw.process_request(req, spider) is None
    assert httpproxy.process_request(req, spider) is None

    # No identifying header: the response passes through and is not counted.
    res = Response(req.url)
    assert mw.process_response(req, res, spider) is res
    self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), None)

    # Each identifying header, taken alone, must bump the stat by one.
    for count, (k, v) in enumerate(RESPONSE_IDENTIFYING_HEADERS, start=1):
        res = Response(req.url, headers={k: v})
        assert mw.process_response(req, res, spider) is res
        self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), count)
1242+
1243+
11831244 def test_meta_copy (self ):
11841245 """Warn when users copy the proxy key from one response to the next."""
11851246 self .spider .zyte_smartproxy_enabled = True
0 commit comments