Skip to content

Commit aa31b37

Browse files
authored
Merge pull request #74 from scrapy-plugins/retry_on_407
Retry 407 crawlera errors with exp backoff up to 10 times
2 parents 87f8de1 + 5043f75 commit aa31b37

File tree

2 files changed

+98
-2
lines changed

2 files changed

+98
-2
lines changed

scrapy_crawlera/middleware.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class CrawleraMiddleware(object):
2727
backoff_step = 15
2828
backoff_max = 180
2929
exp_backoff = None
30+
max_auth_retry_times = 10
3031

3132
_settings = [
3233
('apikey', str),
@@ -156,17 +157,33 @@ def _is_no_available_proxies(self, response):
156157
response.headers.get('X-Crawlera-Error') == b'noslaves'
157158
)
158159

160+
def _is_auth_error(self, response):
161+
return (
162+
response.status == 407 and
163+
response.headers.get('X-Crawlera-Error') == b'bad_proxy_auth'
164+
)
165+
159166
def process_response(self, request, response, spider):
160167
if not self._is_enabled_for_request(request):
161168
return response
162169
key = self._get_slot_key(request)
163170
self._restore_original_delay(request)
164171

165-
if self._is_no_available_proxies(response):
172+
if self._is_no_available_proxies(response) or self._is_auth_error(response):
166173
self._set_custom_delay(request, next(self.exp_backoff))
167174
else:
168175
self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
169176

177+
if self._is_auth_error(response):
178+
# When crawlera has issues it might not be able to authenticate users
179+
# we must retry
180+
retries = response.meta.get('crawlera_auth_retry_times', 0)
181+
if retries < self.max_auth_retry_times:
182+
return self._retry_auth(response, request)
183+
else:
184+
logging.warning("Max retries for authentication issues reached, please check"
185+
" auth information settings")
186+
170187
if self._is_banned(response):
171188
self._bans[key] += 1
172189
if self._bans[key] > self.maxbans:
@@ -196,6 +213,14 @@ def process_exception(self, request, exception, spider):
196213
self._clear_dns_cache()
197214
self._set_custom_delay(request, self.connection_refused_delay)
198215

216+
def _retry_auth(self, response, request):
217+
logging.warning("Retrying crawlera request for authentication issue")
218+
retries = response.meta.get('crawlera_auth_retry_times', 0) + 1
219+
retryreq = request.copy()
220+
retryreq.meta['crawlera_auth_retry_times'] = retries
221+
retryreq.dont_filter = True
222+
return retryreq
223+
199224
def _clear_dns_cache(self):
200225
# Scrapy doesn't expire dns records by default, so we force it here,
201226
# so client can reconnect through DNS failover.

tests/test_crawlera.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class CrawleraMiddlewareTestCase(TestCase):
2727

2828
mwcls = CrawleraMiddleware
2929
bancode = 503
30+
auth_error_code = 407
3031

3132
def setUp(self):
3233
self.spider = Spider('foo')
@@ -474,7 +475,6 @@ def test_noslaves_delays(self, random_uniform_patch):
474475
crawler = self._mock_crawler(self.spider, self.settings)
475476
mw = self.mwcls.from_crawler(crawler)
476477
mw.open_spider(self.spider)
477-
mw.noslaves_max_delay = max_delay
478478

479479
slot = MockedSlot()
480480
crawler.engine.downloader.slots[slot_key] = slot
@@ -514,6 +514,77 @@ def test_noslaves_delays(self, random_uniform_patch):
514514
mw.process_response(good_req, good_res, self.spider)
515515
self.assertEqual(slot.delay, default_delay)
516516

517+
@patch('random.uniform')
518+
def test_auth_error_retries(self, random_uniform_patch):
519+
# mock random.uniform to just return the max delay
520+
random_uniform_patch.side_effect = lambda x, y: y
521+
522+
slot_key = 'www.scrapytest.org'
523+
url = 'http://www.scrapytest.org'
524+
ban_url = 'http://auth.error'
525+
max_delay = 70
526+
backoff_step = 15
527+
default_delay = 0
528+
529+
self.settings['CRAWLERA_BACKOFF_STEP'] = backoff_step
530+
self.settings['CRAWLERA_BACKOFF_MAX'] = max_delay
531+
532+
self.spider.crawlera_enabled = True
533+
crawler = self._mock_crawler(self.spider, self.settings)
534+
mw = self.mwcls.from_crawler(crawler)
535+
mw.open_spider(self.spider)
536+
mw.max_auth_retry_times = 4
537+
538+
slot = MockedSlot()
539+
crawler.engine.downloader.slots[slot_key] = slot
540+
541+
auth_error_req = Request(url, meta={'download_slot': slot_key})
542+
auth_error_headers = {'X-Crawlera-Error': 'bad_proxy_auth'}
543+
auth_error_response = Response(
544+
ban_url,
545+
status=self.auth_error_code,
546+
request=auth_error_req,
547+
headers=auth_error_headers
548+
)
549+
550+
# delays grow exponentially, retry times increase accordingly
551+
req = mw.process_response(auth_error_req, auth_error_response, self.spider)
552+
self.assertEqual(slot.delay, backoff_step)
553+
retry_times = req.meta["crawlera_auth_retry_times"]
554+
self.assertEqual(retry_times, 1)
555+
556+
auth_error_response.meta["crawlera_auth_retry_times"] = retry_times
557+
req = mw.process_response(auth_error_req, auth_error_response, self.spider)
558+
self.assertEqual(slot.delay, backoff_step * 2 ** 1)
559+
retry_times = req.meta["crawlera_auth_retry_times"]
560+
self.assertEqual(retry_times, 2)
561+
562+
auth_error_response.meta["crawlera_auth_retry_times"] = retry_times
563+
req = mw.process_response(auth_error_req, auth_error_response, self.spider)
564+
self.assertEqual(slot.delay, backoff_step * 2 ** 2)
565+
retry_times = req.meta["crawlera_auth_retry_times"]
566+
self.assertEqual(retry_times, 3)
567+
568+
auth_error_response.meta["crawlera_auth_retry_times"] = retry_times
569+
req = mw.process_response(auth_error_req, auth_error_response, self.spider)
570+
self.assertEqual(slot.delay, max_delay)
571+
retry_times = req.meta["crawlera_auth_retry_times"]
572+
self.assertEqual(retry_times, 4)
573+
574+
# Should return a response after max number of retries
575+
auth_error_response.meta["crawlera_auth_retry_times"] = retry_times
576+
res = mw.process_response(auth_error_req, auth_error_response, self.spider)
577+
self.assertIsInstance(res, Response)
578+
579+
# non crawlera 407 is not retried
580+
non_crawlera_407_response = Response(
581+
ban_url,
582+
status=self.auth_error_code,
583+
request=auth_error_req,
584+
)
585+
res = mw.process_response(auth_error_req, non_crawlera_407_response, self.spider)
586+
self.assertIsInstance(res, Response)
587+
517588
@patch('scrapy_crawlera.middleware.logging')
518589
def test_open_spider_logging(self, mock_logger):
519590
spider = self.spider

0 commit comments

Comments
 (0)