Skip to content

Commit 63650a6

Browse files
authored
Merge pull request #55 from stummjr/clean-headers
[MRG+2] clean X-Crawlera-* headers when crawlera is disabled
2 parents 4c0418c + 88c4132 commit 63650a6

File tree

3 files changed

+54
-0
lines changed

3 files changed

+54
-0
lines changed

docs/index.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,12 @@ Here we have an example of specifying a Crawlera header into a Scrapy request::
7373
Remember that you could also set which headers to use by default by all
7474
requests with `DEFAULT_REQUEST_HEADERS <http://doc.scrapy.org/en/1.0/topics/settings.html#default-request-headers>`_
7575

76+
.. note:: When the middleware is active but Crawlera itself is disabled — for example, you set
   ``crawlera_enabled = False`` on a spider but its requests still carry ``X-Crawlera-*``
   headers — those headers are stripped from the request before it is sent.
80+
81+
7682
This Middleware also adds some configurable Scrapy Settings, check :ref:`the complete list here <settings>`.
7783

7884
All the rest

scrapy_crawlera/middleware.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class CrawleraMiddleware(object):
2020
# Handle crawlera server failures
2121
connection_refused_delay = 90
2222
preserve_delay = False
23+
header_prefix = 'X-Crawlera-'
2324

2425
_settings = [
2526
('apikey', str),
@@ -126,6 +127,8 @@ def process_request(self, request, spider):
126127
request.headers['X-Crawlera-Jobid'] = self.job_id
127128
self.crawler.stats.inc_value('crawlera/request')
128129
self.crawler.stats.inc_value('crawlera/request/method/%s' % request.method)
130+
else:
131+
self._clean_crawlera_headers(request)
129132

130133
def process_response(self, request, response, spider):
131134
if not self._is_enabled_for_request(request):
@@ -192,3 +195,19 @@ def _restore_original_delay(self, request):
192195
return
193196
if self._saved_delays[key] is not None:
194197
slot.delay, self._saved_delays[key] = self._saved_delays[key], None
198+
199+
def _clean_crawlera_headers(self, request):
200+
"""Remove X-Crawlera-* headers from the request."""
201+
targets = [
202+
header
203+
for header in request.headers
204+
if self._is_crawlera_header(header)
205+
]
206+
for header in targets:
207+
request.headers.pop(header, None)
208+
209+
def _is_crawlera_header(self, header_name):
210+
if not header_name:
211+
return False
212+
header_name = header_name.decode('utf-8').lower()
213+
return header_name.startswith(self.header_prefix.lower())

tests/test_crawlera.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,32 @@ def test_stats(self):
333333
self.assertEqual(crawler.stats.get_value('crawlera/response/status/{}'.format(mw.ban_code)), 1)
334334
self.assertEqual(crawler.stats.get_value('crawlera/response/banned'), 1)
335335
self.assertEqual(crawler.stats.get_value('crawlera/response/error/somethingbad'), 1)
336+
337+
def _make_fake_request(self, spider, crawlera_enabled):
    """Run a request carrying X-Crawlera-* headers through the middleware.

    Sets ``crawlera_enabled`` on *spider*, builds the middleware from a
    mocked crawler, sends one request (including an empty header name as
    an edge case) through ``process_request``, and returns the request so
    callers can inspect which headers survived.
    """
    spider.crawlera_enabled = crawlera_enabled
    crawler = self._mock_crawler(spider, self.settings)
    mw = self.mwcls.from_crawler(crawler)
    mw.open_spider(spider)
    headers = {
        'X-Crawlera-Debug': True,
        'X-Crawlera-Profile': 'desktop',
        'User-Agent': 'Scrapy',
        '': None,  # empty header name must not break the cleanup
    }
    req = Request('http://www.scrapytest.org', headers=headers)
    # process_request returns None for downloader middlewares; the
    # original bound it to an unused local, which is dropped here.
    mw.process_request(req, spider)
    return req
351+
352+
def test_clean_headers_when_disabled(self):
    """With crawlera disabled, X-Crawlera-* headers are removed; others kept."""
    req = self._make_fake_request(self.spider, crawlera_enabled=False)

    for header in (b'X-Crawlera-Debug', b'X-Crawlera-Profile'):
        self.assertNotIn(header, req.headers)
    self.assertIn(b'User-Agent', req.headers)
358+
359+
def test_clean_headers_when_enabled(self):
    """With crawlera enabled, all headers — Crawlera ones included — survive."""
    req = self._make_fake_request(self.spider, crawlera_enabled=True)

    for header in (b'X-Crawlera-Debug', b'X-Crawlera-Profile', b'User-Agent'):
        self.assertIn(header, req.headers)

0 commit comments

Comments
 (0)