Skip to content

Commit c51c565

Browse files
VMRuiz authored and eLRuLL committed
Check Crawlera Error Header when checking bans (#64)
* Check Crawlera Error Header when checking bans
* Fix tests for new feature
* Added trailing comma and removed empty line
1 parent 28b38e4 commit c51c565

File tree

2 files changed

+48
-6
lines changed

2 files changed

+48
-6
lines changed

scrapy_crawlera/middleware.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,12 +134,18 @@ def process_request(self, request, spider):
134134
else:
135135
self._clean_crawlera_headers(request)
136136

137+
def _is_banned(self, response):
138+
return (
139+
response.status == self.ban_code and
140+
response.headers.get('X-Crawlera-Error') == b'banned'
141+
)
142+
137143
def process_response(self, request, response, spider):
138144
if not self._is_enabled_for_request(request):
139145
return response
140146
key = self._get_slot_key(request)
141147
self._restore_original_delay(request)
142-
if response.status == self.ban_code:
148+
if self._is_banned(response):
143149
self._bans[key] += 1
144150
if self._bans[key] > self.maxbans:
145151
self.crawler.engine.close_spider(spider, 'banned')

tests/test_crawlera.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,11 @@ def _assert_enabled(self, spider,
102102
# check for not banning before maxbans for bancode
103103
for x in range(maxbans + 1):
104104
self.assertEqual(crawler.engine.fake_spider_closed_result, None)
105-
res = Response('http://ban.me/%d' % x, status=self.bancode)
105+
res = Response(
106+
'http://ban.me/%d' % x,
107+
status=self.bancode,
108+
headers={'X-Crawlera-Error': 'banned'},
109+
)
106110
assert mw.process_response(req, res, spider) is res
107111

108112
# max bans reached and close_spider called
@@ -219,15 +223,30 @@ def test_delay_adjustment(self):
219223
slot = MockedSlot(self.spider.download_delay)
220224
crawler.engine.downloader.slots[slot_key] = slot
221225

222-
# ban
226+
# no ban
223227
req = Request(url, meta={'download_slot': slot_key})
224-
res = Response(ban_url, status=self.bancode, request=req)
228+
headers = {'X-Crawlera-Error': 'no_proxies'}
229+
res = Response(
230+
ban_url, status=self.bancode, headers=headers, request=req)
225231
mw.process_response(req, res, self.spider)
226232
self.assertEqual(slot.delay, delay)
227233
self.assertEqual(self.spider.download_delay, delay)
228234

235+
# ban without retry-after
236+
req = Request(url, meta={'download_slot': slot_key})
237+
headers = {'X-Crawlera-Error': 'banned'}
238+
res = Response(
239+
ban_url, status=self.bancode, headers=headers, request=req)
240+
mw.process_response(req, res, self.spider)
241+
self.assertEqual(slot.delay, delay)
242+
self.assertEqual(self.spider.download_delay, delay)
243+
244+
# ban with retry-after
229245
retry_after = 1.5
230-
headers = {'retry-after': str(retry_after)}
246+
headers = {
247+
'retry-after': str(retry_after),
248+
'X-Crawlera-Error': 'banned'
249+
}
231250
res = Response(
232251
ban_url, status=self.bancode, headers=headers, request=req)
233252
mw.process_response(req, res, self.spider)
@@ -335,8 +354,12 @@ def test_stats(self):
335354
assert mw.process_response(req, res, spider) is res
336355
self.assertEqual(crawler.stats.get_value('crawlera/response'), 2)
337356
self.assertEqual(crawler.stats.get_value('crawlera/response/status/{}'.format(mw.ban_code)), 1)
338-
self.assertEqual(crawler.stats.get_value('crawlera/response/banned'), 1)
339357
self.assertEqual(crawler.stats.get_value('crawlera/response/error/somethingbad'), 1)
358+
res = Response(req.url, status=mw.ban_code, headers={'X-Crawlera-Error': 'banned'})
359+
assert mw.process_response(req, res, spider) is res
360+
self.assertEqual(crawler.stats.get_value('crawlera/response'), 3)
361+
self.assertEqual(crawler.stats.get_value('crawlera/response/status/{}'.format(mw.ban_code)), 2)
362+
self.assertEqual(crawler.stats.get_value('crawlera/response/banned'), 1)
340363

341364
def _make_fake_request(self, spider, crawlera_enabled):
342365
spider.crawlera_enabled = crawlera_enabled
@@ -441,3 +464,16 @@ def test_dont_proxy_false_does_nothing(self):
441464
req.meta['dont_proxy'] = False
442465
assert mw.process_request(req, spider) is None
443466
self.assertIsNotNone(req.meta.get('proxy'))
467+
468+
def test_is_banned(self):
469+
self.spider.crawlera_enabled = True
470+
crawler = self._mock_crawler(self.spider, self.settings)
471+
mw = self.mwcls.from_crawler(crawler)
472+
mw.open_spider(self.spider)
473+
req = self._make_fake_request(self.spider, crawlera_enabled=True)
474+
res = Response(req.url, status=200)
475+
self.assertFalse(mw._is_banned(res))
476+
res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'no_proxies'})
477+
self.assertFalse(mw._is_banned(res))
478+
res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'banned'})
479+
self.assertTrue(mw._is_banned(res))

0 commit comments

Comments
 (0)