diff --git a/docs/index.rst b/docs/index.rst index 3dc1d51..50593c6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -89,6 +89,47 @@ requests with `DEFAULT_REQUEST_HEADERS `. + +Reusing sessions +================ + +To create a request in a callback and have that request reuse the same Crawlera +session as the callback response, you have to write something like:: + + def callback(self, response): + session = response.headers.get('X-Crawlera-Session') + # … + headers = {} + if session: + headers = {'X-Crawlera-Session': session} + yield Request(url, callback=self.callback, headers=headers) + +scrapy-crawlera provides an optional spider middleware that, if enabled, allows +setting ``crawlera_session_reuse`` to ``True`` in your request to reuse the +Crawlera session from the source response:: + + def callback(self, response): + meta = {'crawlera_session_reuse': True} + yield Request(url, callback=self.callback, meta=meta) + +To enable the Crawlera session reuse spider middleware, add it to your +``SPIDER_MIDDLEWARES`` setting:: + + SPIDER_MIDDLEWARES = { + 'scrapy_crawlera.CrawleraSessionReuseMiddleware': 1000, + } + +By default, ``CrawleraSessionReuseMiddleware`` removes ``X-Crawlera-Session`` +from the request headers if the source response did not use a Crawlera session, +or the source Crawlera session ID was bad. Use the +``CRAWLERA_SESSION_REUSE_DEFAULT_SESSION`` setting to set a fallback Crawlera +session value instead. 
from scrapy import Request


class CrawleraSessionReuseMiddleware(object):
    """Spider middleware that propagates a response's Crawlera session to
    requests yielded from its callback.

    A request opts in by setting ``crawlera_session_reuse: True`` in its
    ``meta``. For opted-in requests:

    - If the source response carries a usable ``X-Crawlera-Session`` (i.e.
      the response does not report a ``bad_session_id`` error), that session
      ID is copied onto the request.
    - Otherwise, if the ``CRAWLERA_SESSION_REUSE_DEFAULT_SESSION`` setting is
      set (e.g. to ``'create'``), its value is used instead.
    - Otherwise any ``X-Crawlera-Session`` header already present on the
      request is removed, so a stale session ID is never reused.

    Items and requests without the opt-in meta key pass through untouched.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy factory hook.
        return cls(crawler)

    def __init__(self, crawler):
        # Fallback session value used when the source response has no
        # reusable session (unset or reported as bad). ``None`` means
        # "remove the header instead".
        setting = 'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION'
        self._default_session = crawler.settings.get(setting)

    def process_spider_output(self, response, result, spider):
        def _set_session(request_or_item):
            # Items (and anything that is not a Request) pass through as-is.
            if not isinstance(request_or_item, Request):
                return request_or_item

            request = request_or_item
            header = b'X-Crawlera-Session'
            meta_key = 'crawlera_session_reuse'

            # Only requests that explicitly opt in (meta value is the
            # boolean True, not merely truthy) are touched.
            if request.meta.get(meta_key) is not True:
                return request

            session = response.headers.get(header)
            error = response.headers.get(b'X-Crawlera-Error')
            session_is_bad = error == b'bad_session_id'

            if session is not None and not session_is_bad:
                request.headers[header] = session
            elif self._default_session:
                request.headers[header] = self._default_session
            else:
                # Documented behavior: when the source response has no
                # reusable session and no fallback is configured, drop any
                # stale session header rather than silently keeping it.
                request.headers.pop(header, None)
            return request

        # Lazily wrap the callback output; ``result`` may be None.
        return (_set_session(request_or_item)
                for request_or_item in result or ())
import pytest
from scrapy import Spider as _Spider
from scrapy.http import Response, Request
from scrapy.item import Item
from scrapy.utils.reqser import request_to_dict
from scrapy.utils.test import get_crawler

from scrapy_crawlera.spidermiddlewares import CrawleraSessionReuseMiddleware


SESSION = '1'


def compare_requests(request1, request2):
    """Assert that two requests serialize to identical dicts."""
    assert request_to_dict(request1) == request_to_dict(request2)


def process_output(response, result, settings=None):
    """Run ``result`` through the middleware and return the first output."""
    crawler = get_crawler(Spider, settings)
    mw = CrawleraSessionReuseMiddleware.from_crawler(crawler)
    generator = mw.process_spider_output(response, [result], Spider())
    return list(generator)[0]


def get_request(reuse=False, session=None):
    """Build a test request, optionally opted in to session reuse."""
    headers = {}
    if session is not None:
        headers['X-Crawlera-Session'] = session
    meta = {}
    if reuse is True:
        meta['crawlera_session_reuse'] = True
    return Request('https://example.com', headers=headers, meta=meta)


def get_response(session=None, error=None):
    """Build a test response with optional Crawlera session/error headers."""
    headers = {}
    if session is not None:
        headers['X-Crawlera-Session'] = session
    if error is not None:
        headers['X-Crawlera-Error'] = error
    return Response('https://example.com', headers=headers)


class Spider(_Spider):
    name = 'spider'


# NOTE: parameter values are passed bare. Wrapping each value in a 1-tuple
# (as in ``[({},)]``) makes pytest pass the tuple itself as the parameter
# when there is a single argname, so the test would exercise tuples rather
# than the intended dict/Item values.
@pytest.mark.parametrize('item', [{}, Item()])
def test_item(item):
    # Non-request output must pass through the middleware untouched.
    response = get_response(session=SESSION)
    assert process_output(response, item) == item


def test_no_session(): 
    # No opt-in, no session: request is unchanged.
    response = get_response()
    input_request = get_request()
    processed_request = process_output(response, input_request)
    expected_request = get_request()
    compare_requests(processed_request, expected_request)


def test_bad_session_id():
    # A session reported as bad must not be copied onto the request.
    response = get_response(session=SESSION, error='bad_session_id')
    input_request = get_request(reuse=True)
    processed_request = process_output(response, input_request)
    expected_request = get_request(reuse=True)
    compare_requests(processed_request, expected_request)


def test_bad_session_id_default_session():
    # With a default session configured, a bad session falls back to it.
    response = get_response(session=SESSION, error='bad_session_id')
    input_request = get_request(reuse=True)
    settings = {'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION': 'create'}
    processed_request = process_output(response, input_request, settings)
    expected_request = get_request(reuse=True, session='create')
    compare_requests(processed_request, expected_request)


def test_user_session_limit():
    # This session error is only expected to come from a response that has no
    # ``X-Crawlera-Session`` value, caused by a request with ``create`` as
    # ``X-Crawlera-Session`` value.
    response = get_response(error='user_session_limit')
    input_request = get_request(reuse=True)
    processed_request = process_output(response, input_request)
    expected_request = get_request(reuse=True)
    compare_requests(processed_request, expected_request)


@pytest.mark.parametrize(
    'error',
    [
        # https://doc.scrapinghub.com/crawlera.html#errors
        'bad_proxy_auth',
        'too_many_conns',
        'header_auth',
        '',
        'nxdomain',
        'ehostunreach',
        'econnrefused',
        'econnreset',
        'socket_closed_remotely',
        'client_conn_closed',
        'noslaves',
        'banned',
        'serverbusy',
        'timeout',
        'msgtimeout',
        'domain_forbidden',
        'bad_header',
        'data_error',
    ]
)
def test_non_session_error(error):
    # Errors unrelated to sessions must not prevent session reuse.
    session = SESSION
    response = get_response(session=session, error=error)
    input_request = get_request(reuse=True)
    processed_request = process_output(response, input_request)
    expected_request = get_request(reuse=True, session=SESSION)
    compare_requests(processed_request, expected_request)


def test_session():
    # Happy path: a valid session is copied onto the opted-in request.
    session = SESSION
    response = get_response(session=session)
    input_request = get_request(reuse=True)
    processed_request = process_output(response, input_request)
    expected_request = get_request(reuse=True, session=SESSION)
    compare_requests(processed_request, expected_request)


def test_create_on_sessionless_reuse():
    # No session on the response, default configured: use the default.
    response = get_response()
    input_request = get_request(reuse=True)
    settings = {'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION': 'create'}
    processed_request = process_output(response, input_request, settings)
    expected_request = get_request(reuse=True, session='create')
    compare_requests(processed_request, expected_request)


def test_dont_create_on_sessionless_reuse():
    # No session on the response, no default: request stays sessionless.
    response = get_response()
    input_request = get_request(reuse=True)
    processed_request = process_output(response, input_request)
    expected_request = get_request(reuse=True)
    compare_requests(processed_request, expected_request)


@pytest.mark.parametrize('session', [SESSION, 'create'])
def test_header_without_reuse(session):
    # Requests that do not opt in keep whatever header they already set.
    response = get_response()
    input_request = get_request(session=session)
    processed_request = process_output(response, input_request)
    expected_request = get_request(session=session)
    compare_requests(processed_request, expected_request)