
Commit 65c1377

Merge pull request #70 from scrapy-plugins/exponential-backoff-noproxies

Add exponential backoff when No Available Proxies

2 parents e3b0c1d + 673a14f commit 65c1377

File tree

4 files changed: +114 −12 lines changed

docs/settings.rst

Lines changed: 16 additions & 2 deletions

@@ -64,6 +64,20 @@ CRAWLERA_DEFAULT_HEADERS

 Default: ``{}``

-Default headers added only to crawlera requests. Headers defined on ``DEFAULT_REQUEST_HEADERS`` will take precedence as long as the ``CrawleraMiddleware`` is placed after the ``DefaultHeadersMiddleware``*. Headers set on the requests have precedence over the two settings.
+Default headers added only to crawlera requests. Headers defined on ``DEFAULT_REQUEST_HEADERS`` will take precedence as long as the ``CrawleraMiddleware`` is placed after the ``DefaultHeadersMiddleware``. Headers set on the requests have precedence over the two settings.

-*This is the default behavior, ``DefaultHeadersMiddleware`` default priority is ``400`` and we recommend ``CrawleraMiddleware`` priority to be ``610``
+* This is the default behavior, ``DefaultHeadersMiddleware`` default priority is ``400`` and we recommend ``CrawleraMiddleware`` priority to be ``610``
+
+CRAWLERA_BACKOFF_STEP
+---------------------
+
+Default: ``15``
+
+Step size used for calculating exponential backoff according to the formula: ``random.uniform(0, min(max, step * 2 ** attempt))``.
+
+CRAWLERA_BACKOFF_MAX
+--------------------
+
+Default: ``180``
+
+Max value for exponential backoff as shown in the formula above.
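For intuition, with the defaults (``step`` of 15, ``max`` of 180) the upper bound of the jittered delay doubles on each attempt until it hits the cap. A minimal sketch of the documented formula; the helper name here is ours, for illustration only:

import random

def backoff_upper_bound(step, max_delay, attempt):
    # upper bound of the documented formula:
    # random.uniform(0, min(max, step * 2 ** attempt))
    return min(max_delay, step * 2 ** attempt)

print([backoff_upper_bound(15, 180, a) for a in range(6)])
# -> [15, 30, 60, 120, 180, 180]

# an actual delay for attempt 0 falls anywhere in [0, 15] seconds
print(random.uniform(0, backoff_upper_bound(15, 180, 0)))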

scrapy_crawlera/middleware.py

Lines changed: 20 additions & 0 deletions

@@ -10,6 +10,8 @@
 from scrapy.exceptions import ScrapyDeprecationWarning
 from twisted.internet.error import ConnectionRefusedError, ConnectionDone

+from scrapy_crawlera.utils import exp_backoff
+

 class CrawleraMiddleware(object):

@@ -22,6 +24,9 @@ class CrawleraMiddleware(object):
     preserve_delay = False
     header_prefix = 'X-Crawlera-'
     conflicting_headers = ('X-Crawlera-Profile', 'X-Crawlera-UA')
+    backoff_step = 15
+    backoff_max = 180
+    exp_backoff = None

     _settings = [
         ('apikey', str),

@@ -31,6 +36,8 @@ class CrawleraMiddleware(object):
         ('maxbans', int),
         ('download_timeout', int),
         ('preserve_delay', bool),
+        ('backoff_step', int),
+        ('backoff_max', int),
     ]

     def __init__(self, crawler):

@@ -66,6 +73,7 @@ def open_spider(self, spider):
                 "To avoid this behaviour you can use the CRAWLERA_PRESERVE_DELAY setting but keep in mind that this may slow down the crawl significantly")

         self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
+        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)

     def _settings_get(self, type_, *a, **kw):
         if type_ is int:

@@ -140,11 +148,23 @@ def _is_banned(self, response):
             response.headers.get('X-Crawlera-Error') == b'banned'
         )

+    def _is_no_available_proxies(self, response):
+        return (
+            response.status == self.ban_code and
+            response.headers.get('X-Crawlera-Error') == b'noslaves'
+        )
+
     def process_response(self, request, response, spider):
         if not self._is_enabled_for_request(request):
             return response
         key = self._get_slot_key(request)
         self._restore_original_delay(request)
+
+        if self._is_no_available_proxies(response):
+            self._set_custom_delay(request, next(self.exp_backoff))
+        else:
+            self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
+
         if self._is_banned(response):
             self._bans[key] += 1
             if self._bans[key] > self.maxbans:
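With the middleware changes above, the new behaviour is driven entirely by two plain Scrapy settings. A minimal ``settings.py`` sketch, assuming the plugin's usual ``CRAWLERA_ENABLED``/``CRAWLERA_APIKEY`` setup and the middleware priority of ``610`` recommended in the docs:

# settings.py -- a sketch, not taken from this commit
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 610,  # recommended priority per the docs
}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<your apikey>'  # placeholder

# New in this commit: backoff on "noslaves" (no available proxies) responses.
CRAWLERA_BACKOFF_STEP = 15   # first delay drawn from [0, 15] seconds
CRAWLERA_BACKOFF_MAX = 180   # jittered delays never exceed 180 seconds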

scrapy_crawlera/utils.py

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+import math
+import random
+
+from itertools import count
+
+
+def exp_backoff(step, max):
+    """ Exponential backoff time with Full Jitter """
+    # this is a numerically stable version of
+    # random.uniform(0, min(max, step * 2 ** attempt))
+    max_attempts = math.log(max / step, 2)
+    for attempt in count(0, 1):
+        if attempt <= max_attempts:
+            yield random.uniform(0, step * 2 ** attempt)
+        else:
+            yield random.uniform(0, max)
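Precomputing ``max_attempts = log2(max / step)`` is what makes this "numerically stable": on a long ``noslaves`` streak, evaluating ``step * 2 ** attempt`` directly would produce ever-larger integers and eventually overflow the float conversion inside ``random.uniform``, whereas the generator simply switches to the capped branch. A quick usage sketch; the printed values are illustrative, since the jitter is random:

from itertools import islice

from scrapy_crawlera.utils import exp_backoff

# Draw the first six jittered delays with the defaults (step=15, max=180).
# Upper bounds double per attempt -- 15, 30, 60, 120 -- then stay capped
# at 180 once attempt exceeds log2(180 / 15) ~= 3.58.
delays = list(islice(exp_backoff(15, 180), 6))
print(delays)  # e.g. [7.3, 22.1, 41.0, 96.5, 170.2, 12.8]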

tests/test_crawlera.py

Lines changed: 62 additions & 10 deletions

@@ -14,6 +14,8 @@
 from scrapy_crawlera import CrawleraMiddleware
 import os

+from scrapy_crawlera.utils import exp_backoff
+

 class MockedSlot(object):

@@ -223,15 +225,6 @@ def test_delay_adjustment(self):
         slot = MockedSlot(self.spider.download_delay)
         crawler.engine.downloader.slots[slot_key] = slot

-        # no ban
-        req = Request(url, meta={'download_slot': slot_key})
-        headers = {'X-Crawlera-Error': 'no_proxies'}
-        res = Response(
-            ban_url, status=self.bancode, headers=headers, request=req)
-        mw.process_response(req, res, self.spider)
-        self.assertEqual(slot.delay, delay)
-        self.assertEqual(self.spider.download_delay, delay)
-
         # ban without retry-after
         req = Request(url, meta={'download_slot': slot_key})
         headers = {'X-Crawlera-Error': 'banned'}

@@ -473,7 +466,66 @@ def test_is_banned(self):
         req = self._make_fake_request(self.spider, crawlera_enabled=True)
         res = Response(req.url, status=200)
         self.assertFalse(mw._is_banned(res))
-        res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'no_proxies'})
+        res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'noslaves'})
         self.assertFalse(mw._is_banned(res))
         res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'banned'})
         self.assertTrue(mw._is_banned(res))
+
+    @patch('random.uniform')
+    def test_noslaves_delays(self, random_uniform_patch):
+        # mock random.uniform to just return the max delay
+        random_uniform_patch.side_effect = lambda x, y: y
+
+        slot_key = 'www.scrapytest.org'
+        url = 'http://www.scrapytest.org'
+        ban_url = 'http://ban.me'
+        max_delay = 70
+        backoff_step = 15
+        default_delay = 0
+
+        self.settings['CRAWLERA_BACKOFF_STEP'] = backoff_step
+        self.settings['CRAWLERA_BACKOFF_MAX'] = max_delay
+
+        self.spider.crawlera_enabled = True
+        crawler = self._mock_crawler(self.spider, self.settings)
+        mw = self.mwcls.from_crawler(crawler)
+        mw.open_spider(self.spider)
+        mw.noslaves_max_delay = max_delay
+
+        slot = MockedSlot()
+        crawler.engine.downloader.slots[slot_key] = slot
+
+        noslaves_req = Request(url, meta={'download_slot': slot_key})
+        headers = {'X-Crawlera-Error': 'noslaves'}
+        noslaves_res = Response(
+            ban_url, status=self.bancode, headers=headers, request=noslaves_req)
+
+        # delays grow exponentially
+        mw.process_response(noslaves_req, noslaves_res, self.spider)
+        self.assertEqual(slot.delay, backoff_step)
+
+        mw.process_response(noslaves_req, noslaves_res, self.spider)
+        self.assertEqual(slot.delay, backoff_step * 2 ** 1)
+
+        mw.process_response(noslaves_req, noslaves_res, self.spider)
+        self.assertEqual(slot.delay, backoff_step * 2 ** 2)
+
+        mw.process_response(noslaves_req, noslaves_res, self.spider)
+        self.assertEqual(slot.delay, max_delay)
+
+        # other responses reset delay
+        ban_req = Request(url, meta={'download_slot': slot_key})
+        ban_headers = {'X-Crawlera-Error': 'banned'}
+        ban_res = Response(
+            ban_url, status=self.bancode, headers=ban_headers, request=ban_req)
+        mw.process_response(ban_req, ban_res, self.spider)
+        self.assertEqual(slot.delay, default_delay)
+
+        mw.process_response(noslaves_req, noslaves_res, self.spider)
+        self.assertEqual(slot.delay, backoff_step)
+
+        good_req = Request(url, meta={'download_slot': slot_key})
+        good_res = Response(
+            url, status=200, request=good_req)
+        mw.process_response(good_req, good_res, self.spider)
+        self.assertEqual(slot.delay, default_delay)
