Skip to content

Commit c934fee

Browse files
authored
Merge pull request #58 from hcoura/default-crawlera-headers
[MRG + 1] Add DEFAULT_CRAWLERA_HEADERS settings
2 parents 63650a6 + 250ceb8 commit c934fee

File tree

5 files changed

+104
-2
lines changed

5 files changed

+104
-2
lines changed

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ htmlcov/
5353
nosetests.xml
5454
coverage.xml
5555
*,cover
56+
.pytest_cache
5657

5758
# Translations
5859
*.mo
@@ -67,4 +68,7 @@ docs/_build/
6768
target/
6869

6970
# IDEA
70-
.idea/
71+
.idea/
72+
73+
# Pipenv
74+
Pipfile*

docs/settings.rst

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,4 +57,13 @@ CRAWLERA_PRESERVE_DELAY
5757
Default: ``False``
5858

5959
If ``False``, sets Scrapy's ``DOWNLOAD_DELAY`` to ``0``, making the spider crawl faster. If set to ``True``, it will
60-
respect the provided ``DOWNLOAD_DELAY`` from Scrapy.
60+
respect the provided ``DOWNLOAD_DELAY`` from Scrapy.
61+
62+
CRAWLERA_DEFAULT_HEADERS
63+
------------------------
64+
65+
Default: ``{}``
66+
67+
Default headers added only to crawlera requests. Headers defined on ``DEFAULT_REQUEST_HEADERS`` will take precedence as long as the ``CrawleraMiddleware`` is placed after the ``DefaultHeadersMiddleware``*. Headers set on the requests have precedence over the two settings.
68+
69+
*This is the default behavior, ``DefaultHeadersMiddleware`` default priority is ``400`` and we recommend ``CrawleraMiddleware`` priority to be ``610``

scrapy_crawlera/middleware.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class CrawleraMiddleware(object):
2121
connection_refused_delay = 90
2222
preserve_delay = False
2323
header_prefix = 'X-Crawlera-'
24+
conflicting_headers = ('X-Crawlera-Profile', 'X-Crawlera-UA')
2425

2526
_settings = [
2627
('apikey', str),
@@ -64,6 +65,8 @@ def open_spider(self, spider):
6465
"CrawleraMiddleware: disabling download delays on Scrapy side to optimize delays introduced by Crawlera. "
6566
"To avoid this behaviour you can use the CRAWLERA_PRESERVE_DELAY setting but keep in mind that this may slow down the crawl significantly")
6667

68+
self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
69+
6770
def _settings_get(self, type_, *a, **kw):
6871
if type_ is int:
6972
return self.crawler.settings.getint(*a, **kw)
@@ -120,6 +123,7 @@ def get_proxyauth(self, spider):
120123

121124
def process_request(self, request, spider):
122125
if self._is_enabled_for_request(request):
126+
self._set_crawlera_default_headers(request)
123127
request.meta['proxy'] = self.url
124128
request.meta['download_timeout'] = self.download_timeout
125129
request.headers['Proxy-Authorization'] = self._proxyauth
@@ -211,3 +215,19 @@ def _is_crawlera_header(self, header_name):
211215
return False
212216
header_name = header_name.decode('utf-8').lower()
213217
return header_name.startswith(self.header_prefix.lower())
218+
219+
def _set_crawlera_default_headers(self, request):
    """Apply the CRAWLERA_DEFAULT_HEADERS setting to *request*, then warn
    when every header listed in ``conflicting_headers`` is present.

    Headers already set on the request take precedence over the configured
    defaults; entries whose configured value is None are skipped entirely.
    """
    for name, default in self._headers:
        if default is not None:
            request.headers.setdefault(name, default)
    # Header names on the request are bytes; compare case-insensitively.
    present = {
        raw_name.decode('utf-8').lower() for raw_name in request.headers
    }
    if all(h.lower() in present for h in self.conflicting_headers):
        # NOTE(review): logging.warn is a deprecated alias of
        # logging.warning; the test suite patches and asserts `.warn`,
        # so migrating must update both sides together.
        logging.warn(
            'The headers %s are conflicting on request %s. X-Crawlera-UA '
            'will be ignored. Please check https://doc.scrapinghub.com/cr'
            'awlera.html for more information'
            % (str(self.conflicting_headers), request.url)
        )

tests/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pytest
22
pytest-cov
3+
mock; python_version == '2.7'

tests/test_crawlera.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
from unittest import TestCase
2+
try:
3+
from unittest.mock import patch
4+
except ImportError:
5+
from mock import patch
26

37
from w3lib.http import basic_auth_header
48
from scrapy.http import Request, Response
@@ -362,3 +366,67 @@ def test_clean_headers_when_enabled(self):
362366
self.assertIn(b'X-Crawlera-Debug', req.headers)
363367
self.assertIn(b'X-Crawlera-Profile', req.headers)
364368
self.assertIn(b'User-Agent', req.headers)
369+
370+
def test_crawlera_default_headers(self):
    """CRAWLERA_DEFAULT_HEADERS are applied to outgoing requests and
    entries with a None value are dropped."""
    spider = self.spider
    spider.crawlera_enabled = True

    def middleware_for(default_headers):
        # Build a fresh middleware configured with the given defaults.
        self.settings['CRAWLERA_DEFAULT_HEADERS'] = default_headers
        crawler = self._mock_crawler(spider, self.settings)
        middleware = self.mwcls.from_crawler(crawler)
        middleware.open_spider(spider)
        return middleware

    middleware = middleware_for({'X-Crawlera-Profile': 'desktop'})
    request = Request('http://www.scrapytest.org/other')
    self.assertIsNone(middleware.process_request(request, spider))
    self.assertEqual(request.headers['X-Crawlera-Profile'], b'desktop')

    # A default header mapped to None must not reach the request.
    middleware = middleware_for({
        'X-Crawlera-Profile': None,
        'X-Crawlera-Cookies': 'disable',
    })
    request = Request('http://www.scrapytest.org/other')
    self.assertIsNone(middleware.process_request(request, spider))
    self.assertEqual(request.headers['X-Crawlera-Cookies'], b'disable')
    self.assertNotIn('X-Crawlera-Profile', request.headers)
396+
397+
@patch('scrapy_crawlera.middleware.logging')
def test_crawlera_default_headers_conflicting_headers(self, mock_logger):
    """A request carrying X-Crawlera-UA alongside the default
    X-Crawlera-Profile triggers the conflict warning, regardless of the
    header-name casing used by the caller."""
    spider = self.spider
    spider.crawlera_enabled = True

    self.settings['CRAWLERA_DEFAULT_HEADERS'] = {
        'X-Crawlera-Profile': 'desktop'
    }
    crawler = self._mock_crawler(spider, self.settings)
    middleware = self.mwcls.from_crawler(crawler)
    middleware.open_spider(spider)

    expected_warning = (
        "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are conflictin"
        "g on request http://www.scrapytest.org/other. X-Crawlera-UA will b"
        "e ignored. Please check https://doc.scrapinghub.com/crawlera.html "
        "for more information"
    )

    # Second iteration checks that the conflict detection ignores case.
    for ua_header_name in ('X-Crawlera-UA', 'x-crawlera-ua'):
        request = Request('http://www.scrapytest.org/other',
                          headers={ua_header_name: 'desktop'})
        self.assertIsNone(middleware.process_request(request, spider))
        self.assertEqual(request.headers['X-Crawlera-UA'], b'desktop')
        self.assertEqual(request.headers['X-Crawlera-Profile'], b'desktop')
        mock_logger.warn.assert_called_with(expected_warning)

0 commit comments

Comments
 (0)