Commit 0ceeed0

don't send cookies to Splash itself

1 parent: 869f955

File tree: 2 files changed (+146 −22 lines)

scrapy_splash/middleware.py

Lines changed: 6 additions & 3 deletions
@@ -38,10 +38,11 @@ class SlotPolicy(object):
 
 class SplashCookiesMiddleware(object):
     """
-    This middleware maintains cookiejars for Splash requests.
+    This downloader middleware maintains cookiejars for Splash requests.
 
     It gets cookies from 'cookies' field in Splash JSON responses
-    and sends current cookies in 'cookies' JSON POST argument.
+    and sends current cookies in 'cookies' JSON POST argument instead of
+    sending them in http headers.
 
     It should process requests before SplashMiddleware, and process responses
     after SplashMiddleware.
@@ -57,12 +58,14 @@ def from_crawler(cls, crawler):
     def process_request(self, request, spider):
         """
         For Splash requests add 'cookies' key with current
-        cookies to request.meta['splash']['args']
+        cookies to ``request.meta['splash']['args']`` and remove cookie
+        headers sent to Splash itself.
         """
        if 'splash' not in request.meta:
            return
 
        if request.meta.get('_splash_processed'):
+            request.headers.pop('Cookie', None)
            return
 
        splash_options = request.meta['splash']
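
For context (not part of this commit): SplashCookiesMiddleware only takes effect when it is enabled ahead of SplashMiddleware in the downloader middleware chain, which is what gives it the request-before/response-after ordering the docstring requires. A minimal settings sketch following the scrapy-splash README; the Splash URL is a placeholder:

# settings.py -- a sketch; the order values (723/725/810) follow the
# scrapy-splash README. Lower order runs process_request earlier, so
# SplashCookiesMiddleware sees requests before SplashMiddleware and
# sees responses after it.
SPLASH_URL = 'http://localhost:8050'  # placeholder for your Splash instance

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}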

tests/test_integration.py

Lines changed: 140 additions & 19 deletions
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import scrapy
 from pytest_twisted import inlineCallbacks
+from twisted.web.resource import Resource
 
 from scrapy_splash import SplashRequest
 from .utils import crawl_items, requires_splash, HtmlResource
@@ -35,7 +36,20 @@ class HelloWorld(HtmlResource):
     html = """
     <html><body><script>document.write('hello world!');</script></body></html>
     """
-    extra_headers = {'X-MyHeader': 'my value'}
+    extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}
+
+
+class ManyCookies(Resource, object):
+
+    class SetMyCookie(HtmlResource):
+        html = "hello!"
+        extra_headers = {'Set-Cookie': 'login=1'}
+
+    def __init__(self):
+        super(ManyCookies, self).__init__()
+        self.putChild(b'', HelloWorld())
+        self.putChild(b'login', self.SetMyCookie())
 
 
 class ResponseSpider(scrapy.Spider):
@@ -49,24 +63,6 @@ def parse(self, response):
         yield {'response': response}
 
 
-class ReloadSpider(ResponseSpider):
-    """ Make two requests to URL, store both responses.
-    This spider activates both start_requests and parse methods,
-    and checks that dupefilter takes fragment into account. """
-
-    def parse(self, response):
-        yield {'response': response}
-        yield SplashRequest(self.url + '#foo')
-
-
-class LuaScriptSpider(ResponseSpider):
-    """ Make a request using a Lua script similar to the one from README """
-
-    def start_requests(self):
-        yield SplashRequest(self.url + "#foo", endpoint='execute',
-                            args={'lua_source': DEFAULT_SCRIPT, 'foo': 'bar'})
-
-
 @requires_splash
 @inlineCallbacks
 def test_basic(settings):
@@ -81,6 +77,16 @@ def test_basic(settings):
 @requires_splash
 @inlineCallbacks
 def test_reload(settings):
+
+    class ReloadSpider(ResponseSpider):
+        """ Make two requests to URL, store both responses.
+        This spider activates both start_requests and parse methods,
+        and checks that dupefilter takes fragment into account. """
+
+        def parse(self, response):
+            yield {'response': response}
+            yield SplashRequest(self.url + '#foo')
+
     items, url, crawler = yield crawl_items(ReloadSpider, HelloWorld, settings)
     assert len(items) == 2
     assert crawler.stats.get_value('dupefilter/filtered') == 1
@@ -97,6 +103,15 @@ def test_reload(settings):
 @requires_splash
 @inlineCallbacks
 def test_basic_lua(settings):
+
+    class LuaScriptSpider(ResponseSpider):
+        """ Make a request using a Lua script similar to the one from README
+        """
+        def start_requests(self):
+            yield SplashRequest(self.url + "#foo", endpoint='execute',
+                                args={'lua_source': DEFAULT_SCRIPT, 'foo': 'bar'})
+
+
     items, url, crawler = yield crawl_items(LuaScriptSpider, HelloWorld,
                                             settings)
     assert len(items) == 1
@@ -106,3 +121,109 @@ def test_basic_lua(settings):
     assert resp.data['jsvalue'] == 3
     assert resp.headers['X-MyHeader'] == b'my value'
     assert resp.data['args']['foo'] == 'bar'
+
+
+@requires_splash
+@inlineCallbacks
+def test_cookies(settings):
+    BOMB = 'x' * 64000
+    class LuaScriptSpider(ResponseSpider):
+        """ Cookies must be sent to website, not to Splash """
+        custom_settings = {
+            'SPLASH_COOKIES_DEBUG': True,
+            'COOKIES_DEBUG': True,
+        }
+
+        def start_requests(self):
+            # cookies set without Splash should be still
+            # sent to a remote website. FIXME: this is not the case.
+            yield scrapy.Request(self.url + "/login", self.parse,
+                                 cookies={'x-set-scrapy': '1'})
+
+        def parse(self, response):
+            yield SplashRequest(self.url + "#egg", self.parse_1,
+                                endpoint='execute',
+                                args={'lua_source': DEFAULT_SCRIPT},
+                                cookies={'x-set-splash': '1'})
+
+        def parse_1(self, response):
+            yield {'response': response}
+            yield SplashRequest(self.url + "#foo", self.parse_2,
+                                endpoint='execute',
+                                args={'lua_source': DEFAULT_SCRIPT})
+
+        def parse_2(self, response):
+            yield {'response': response}
+            yield scrapy.Request(self.url, self.parse_3)
+
+        def parse_3(self, response):
+            # Splash (Twisted) drops requests with huge http headers,
+            # but this one should work, as cookies are not sent
+            # to Splash itself.
+            yield {'response': response}
+            yield SplashRequest(self.url + "#bar", self.parse_4,
+                                endpoint='execute',
+                                args={'lua_source': DEFAULT_SCRIPT},
+                                cookies={'bomb': BOMB})
+
+        def parse_4(self, response):
+            yield {'response': response}
+
+    def _cookie_dict(har_cookies):
+        return {c['name']: c['value'] for c in har_cookies}
+
+    items, url, crawler = yield crawl_items(LuaScriptSpider, ManyCookies,
+                                            settings)
+    assert len(items) == 4
+
+    # cookie should be sent to remote website, not to Splash
+    resp = items[0]['response']
+    splash_headers = resp.request.headers
+    cookies = resp.data['args']['cookies']
+    print(splash_headers)
+    print(cookies)
+    assert _cookie_dict(cookies) == {
+        # 'login': '1',  # FIXME
+        'x-set-splash': '1'
+    }
+    assert splash_headers.get(b'Cookie') is None
+
+    # new cookie should be also sent to remote website, not to Splash
+    resp2 = items[1]['response']
+    splash_headers = resp2.request.headers
+    headers = resp2.data['args']['headers']
+    cookies = resp2.data['args']['cookies']
+    assert headers['Referer'].strip('/') == url.strip('/')
+    assert _cookie_dict(cookies) == {
+        # 'login': '1',
+        'x-set-splash': '1',
+        'sessionid': 'ABCD'
+    }
+    print(splash_headers)
+    print(headers)
+    print(cookies)
+    assert splash_headers.get(b'Cookie') is None
+
+    # TODO/FIXME: Cookies fetched when working with Splash should be picked up
+    # by Scrapy
+    resp3 = items[2]['response']
+    splash_headers = resp3.request.headers
+    cookie_header = splash_headers.get(b'Cookie')
+    assert b'x-set-scrapy=1' in cookie_header
+    assert b'login=1' in cookie_header
+    assert b'x-set-splash=1' in cookie_header
+    # assert b'sessionid=ABCD' in cookie_header  # FIXME
+
+    # cookie bomb shouldn't cause problems
+    resp4 = items[3]['response']
+    splash_headers = resp4.request.headers
+    cookies = resp4.data['args']['cookies']
+    assert _cookie_dict(cookies) == {
+        # 'login': '1',
+        'x-set-splash': '1',
+        'sessionid': 'ABCD',
+        'bomb': BOMB,
+    }
+    assert splash_headers.get(b'Cookie') is None
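
To make the data flow concrete (this note is not part of the diff): after this change, cookies reach Splash only inside the JSON POST body, as the HAR-style name/value records that _cookie_dict consumes above, while the request to Splash itself carries no Cookie header. A hedged sketch of such a payload; the url value is illustrative, and DEFAULT_SCRIPT refers to the script defined elsewhere in this test module:

# Illustrative shape of the body POSTed to Splash's /execute endpoint;
# the cookie records mirror the fields asserted in test_cookies above.
payload = {
    'lua_source': DEFAULT_SCRIPT,     # defined elsewhere in the test module
    'url': 'http://localhost:8998/',  # hypothetical site under test
    'cookies': [
        {'name': 'x-set-splash', 'value': '1'},
        {'name': 'sessionid', 'value': 'ABCD'},
    ],
}
# On the wire to Splash the middleware has stripped the header, so:
# request.headers.get(b'Cookie') is None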
