Commit 0ceeed0

don't send cookies to Splash itself

1 parent: 869f955

File tree: 2 files changed (+146 −22 lines)

scrapy_splash/middleware.py

Lines changed: 6 additions & 3 deletions
@@ -38,10 +38,11 @@ class SlotPolicy(object):
 
 class SplashCookiesMiddleware(object):
     """
-    This middleware maintains cookiejars for Splash requests.
+    This downloader middleware maintains cookiejars for Splash requests.
 
     It gets cookies from 'cookies' field in Splash JSON responses
-    and sends current cookies in 'cookies' JSON POST argument.
+    and sends current cookies in 'cookies' JSON POST argument instead of
+    sending them in http headers.
 
     It should process requests before SplashMiddleware, and process responses
     after SplashMiddleware.
@@ -57,12 +58,14 @@ def from_crawler(cls, crawler):
     def process_request(self, request, spider):
         """
         For Splash requests add 'cookies' key with current
-        cookies to request.meta['splash']['args']
+        cookies to ``request.meta['splash']['args']`` and remove cookie
+        headers sent to Splash itself.
         """
        if 'splash' not in request.meta:
            return
 
        if request.meta.get('_splash_processed'):
+            request.headers.pop('Cookie', None)
            return
 
        splash_options = request.meta['splash']
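
For context (not part of this commit): SplashCookiesMiddleware only takes effect when it is enabled ahead of SplashMiddleware in the downloader middleware chain, which is what gives it the request-before/response-after ordering the docstring requires. A minimal settings sketch following the scrapy-splash README; the Splash URL is a placeholder:

# settings.py -- a sketch; the order values (723/725/810) follow the
# scrapy-splash README. Lower order runs process_request earlier, so
# SplashCookiesMiddleware sees requests before SplashMiddleware and
# sees responses after it.
SPLASH_URL = 'http://localhost:8050'  # placeholder for your Splash instance

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}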

tests/test_integration.py

Lines changed: 140 additions & 19 deletions
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import scrapy
 from pytest_twisted import inlineCallbacks
+from twisted.web.resource import Resource
 
 from scrapy_splash import SplashRequest
 from .utils import crawl_items, requires_splash, HtmlResource
@@ -35,7 +36,20 @@ class HelloWorld(HtmlResource):
     html = """
     <html><body><script>document.write('hello world!');</script></body></html>
     """
-    extra_headers = {'X-MyHeader': 'my value'}
+    extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}
+
+
+class ManyCookies(Resource, object):
+
+    class SetMyCookie(HtmlResource):
+        html = "hello!"
+        extra_headers = {'Set-Cookie': 'login=1'}
+
+    def __init__(self):
+        super(ManyCookies, self).__init__()
+        self.putChild(b'', HelloWorld())
+        self.putChild(b'login', self.SetMyCookie())
 
 
 class ResponseSpider(scrapy.Spider):
@@ -49,24 +63,6 @@ def parse(self, response):
         yield {'response': response}
 
 
-class ReloadSpider(ResponseSpider):
-    """ Make two requests to URL, store both responses.
-    This spider activates both start_requests and parse methods,
-    and checks that dupefilter takes fragment into account. """
-
-    def parse(self, response):
-        yield {'response': response}
-        yield SplashRequest(self.url + '#foo')
-
-
-class LuaScriptSpider(ResponseSpider):
-    """ Make a request using a Lua script similar to the one from README """
-
-    def start_requests(self):
-        yield SplashRequest(self.url + "#foo", endpoint='execute',
-                            args={'lua_source': DEFAULT_SCRIPT, 'foo': 'bar'})
-
-
 @requires_splash
 @inlineCallbacks
 def test_basic(settings):
@@ -81,6 +77,16 @@ def test_basic(settings):
 @requires_splash
 @inlineCallbacks
 def test_reload(settings):
+
+    class ReloadSpider(ResponseSpider):
+        """ Make two requests to URL, store both responses.
+        This spider activates both start_requests and parse methods,
+        and checks that dupefilter takes fragment into account. """
+
+        def parse(self, response):
+            yield {'response': response}
+            yield SplashRequest(self.url + '#foo')
+
     items, url, crawler = yield crawl_items(ReloadSpider, HelloWorld, settings)
     assert len(items) == 2
     assert crawler.stats.get_value('dupefilter/filtered') == 1
@@ -97,6 +103,15 @@ def test_reload(settings):
 @requires_splash
 @inlineCallbacks
 def test_basic_lua(settings):
+
+    class LuaScriptSpider(ResponseSpider):
+        """ Make a request using a Lua script similar to the one from README
+        """
+        def start_requests(self):
+            yield SplashRequest(self.url + "#foo", endpoint='execute',
+                                args={'lua_source': DEFAULT_SCRIPT, 'foo': 'bar'})
+
+
     items, url, crawler = yield crawl_items(LuaScriptSpider, HelloWorld,
                                             settings)
     assert len(items) == 1
@@ -106,3 +121,109 @@ def test_basic_lua(settings):
     assert resp.data['jsvalue'] == 3
     assert resp.headers['X-MyHeader'] == b'my value'
     assert resp.data['args']['foo'] == 'bar'
+
+
+@requires_splash
+@inlineCallbacks
+def test_cookies(settings):
+    BOMB = 'x' * 64000
+    class LuaScriptSpider(ResponseSpider):
+        """ Cookies must be sent to website, not to Splash """
+        custom_settings = {
+            'SPLASH_COOKIES_DEBUG': True,
+            'COOKIES_DEBUG': True,
+        }
+
+        def start_requests(self):
+            # cookies set without Splash should be still
+            # sent to a remote website. FIXME: this is not the case.
+            yield scrapy.Request(self.url + "/login", self.parse,
+                                 cookies={'x-set-scrapy': '1'})
+
+        def parse(self, response):
+            yield SplashRequest(self.url + "#egg", self.parse_1,
+                                endpoint='execute',
+                                args={'lua_source': DEFAULT_SCRIPT},
+                                cookies={'x-set-splash': '1'})
+
+        def parse_1(self, response):
+            yield {'response': response}
+            yield SplashRequest(self.url + "#foo", self.parse_2,
+                                endpoint='execute',
+                                args={'lua_source': DEFAULT_SCRIPT})
+
+        def parse_2(self, response):
+            yield {'response': response}
+            yield scrapy.Request(self.url, self.parse_3)
+
+        def parse_3(self, response):
+            # Splash (Twisted) drops requests with huge http headers,
+            # but this one should work, as cookies are not sent
+            # to Splash itself.
+            yield {'response': response}
+            yield SplashRequest(self.url + "#bar", self.parse_4,
+                                endpoint='execute',
+                                args={'lua_source': DEFAULT_SCRIPT},
+                                cookies={'bomb': BOMB})
+
+        def parse_4(self, response):
+            yield {'response': response}
+
+    def _cookie_dict(har_cookies):
+        return {c['name']: c['value'] for c in har_cookies}
+
+    items, url, crawler = yield crawl_items(LuaScriptSpider, ManyCookies,
+                                            settings)
+    assert len(items) == 4
+
+    # cookie should be sent to remote website, not to Splash
+    resp = items[0]['response']
+    splash_headers = resp.request.headers
+    cookies = resp.data['args']['cookies']
+    print(splash_headers)
+    print(cookies)
+    assert _cookie_dict(cookies) == {
+        # 'login': '1',  # FIXME
+        'x-set-splash': '1'
+    }
+    assert splash_headers.get(b'Cookie') is None
+
+    # new cookie should be also sent to remote website, not to Splash
+    resp2 = items[1]['response']
+    splash_headers = resp2.request.headers
+    headers = resp2.data['args']['headers']
+    cookies = resp2.data['args']['cookies']
+    assert headers['Referer'].strip('/') == url.strip('/')
+    assert _cookie_dict(cookies) == {
+        # 'login': '1',
+        'x-set-splash': '1',
+        'sessionid': 'ABCD'
+    }
+    print(splash_headers)
+    print(headers)
+    print(cookies)
+    assert splash_headers.get(b'Cookie') is None
+
+    # TODO/FIXME: Cookies fetched when working with Splash should be picked up
+    # by Scrapy
+    resp3 = items[2]['response']
+    splash_headers = resp3.request.headers
+    cookie_header = splash_headers.get(b'Cookie')
+    assert b'x-set-scrapy=1' in cookie_header
+    assert b'login=1' in cookie_header
+    assert b'x-set-splash=1' in cookie_header
+    # assert b'sessionid=ABCD' in cookie_header  # FIXME
+
+    # cookie bomb shouldn't cause problems
+    resp4 = items[3]['response']
+    splash_headers = resp4.request.headers
+    cookies = resp4.data['args']['cookies']
+    assert _cookie_dict(cookies) == {
+        # 'login': '1',
+        'x-set-splash': '1',
+        'sessionid': 'ABCD',
+        'bomb': BOMB,
+    }
+    assert splash_headers.get(b'Cookie') is None
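
To make the data flow concrete (this note is not part of the diff): after this change, cookies reach Splash only inside the JSON POST body, as the HAR-style name/value records that _cookie_dict consumes above, while the request to Splash itself carries no Cookie header. A hedged sketch of such a payload; the url value is illustrative, and DEFAULT_SCRIPT refers to the script defined elsewhere in this test module:

# Illustrative shape of the body POSTed to Splash's /execute endpoint;
# the cookie records mirror the fields asserted in test_cookies above.
payload = {
    'lua_source': DEFAULT_SCRIPT,     # defined elsewhere in the test module
    'url': 'http://localhost:8998/',  # hypothetical site under test
    'cookies': [
        {'name': 'x-set-splash', 'value': '1'},
        {'name': 'sessionid', 'value': 'ABCD'},
    ],
}
# On the wire to Splash the middleware has stripped the header, so:
# request.headers.get(b'Cookie') is None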
