11# -*- coding: utf-8 -*-
22import scrapy
33from pytest_twisted import inlineCallbacks
4+ from twisted .web .resource import Resource
45
56from scrapy_splash import SplashRequest
67from .utils import crawl_items , requires_splash , HtmlResource
@@ -35,7 +36,20 @@ class HelloWorld(HtmlResource):
3536 html = """
3637 <html><body><script>document.write('hello world!');</script></body></html>
3738 """
38- extra_headers = {'X-MyHeader' : 'my value' }
39+ extra_headers = {'X-MyHeader' : 'my value' , 'Set-Cookie' : 'sessionid=ABCD' }
40+
41+
42+
class ManyCookies(Resource, object):
    """Twisted site with two pages:

    * ``/`` serves :class:`HelloWorld` (which also sets a ``sessionid``
      cookie via its ``Set-Cookie`` extra header);
    * ``/login`` serves a trivial page whose only purpose is to set a
      ``login=1`` cookie.

    Used by the cookie-handling integration tests.
    """

    class SetMyCookie(HtmlResource):
        # Minimal page: the response body is irrelevant, only the
        # Set-Cookie header matters to the tests.
        html = "hello!"
        extra_headers = {'Set-Cookie': 'login=1'}

    def __init__(self):
        # ``object`` is in the MRO (see class bases) so that super() works
        # with Twisted's old-style-compatible Resource on Python 2.
        super(ManyCookies, self).__init__()
        self.putChild(b'', HelloWorld())
        self.putChild(b'login', self.SetMyCookie())
52+
3953
4054
4155class ResponseSpider (scrapy .Spider ):
@@ -49,24 +63,6 @@ def parse(self, response):
4963 yield {'response' : response }
5064
5165
52- class ReloadSpider (ResponseSpider ):
53- """ Make two requests to URL, store both responses.
54- This spider activates both start_requests and parse methods,
55- and checks that dupefilter takes fragment into account. """
56-
57- def parse (self , response ):
58- yield {'response' : response }
59- yield SplashRequest (self .url + '#foo' )
60-
61-
62- class LuaScriptSpider (ResponseSpider ):
63- """ Make a request using a Lua script similar to the one from README """
64-
65- def start_requests (self ):
66- yield SplashRequest (self .url + "#foo" , endpoint = 'execute' ,
67- args = {'lua_source' : DEFAULT_SCRIPT , 'foo' : 'bar' })
68-
69-
7066@requires_splash
7167@inlineCallbacks
7268def test_basic (settings ):
@@ -81,6 +77,16 @@ def test_basic(settings):
8177@requires_splash
8278@inlineCallbacks
8379def test_reload (settings ):
80+
81+ class ReloadSpider (ResponseSpider ):
82+ """ Make two requests to URL, store both responses.
83+ This spider activates both start_requests and parse methods,
84+ and checks that dupefilter takes fragment into account. """
85+
86+ def parse (self , response ):
87+ yield {'response' : response }
88+ yield SplashRequest (self .url + '#foo' )
89+
8490 items , url , crawler = yield crawl_items (ReloadSpider , HelloWorld , settings )
8591 assert len (items ) == 2
8692 assert crawler .stats .get_value ('dupefilter/filtered' ) == 1
@@ -97,6 +103,15 @@ def test_reload(settings):
97103@requires_splash
98104@inlineCallbacks
99105def test_basic_lua (settings ):
106+
107+ class LuaScriptSpider (ResponseSpider ):
108+ """ Make a request using a Lua script similar to the one from README
109+ """
110+ def start_requests (self ):
111+ yield SplashRequest (self .url + "#foo" , endpoint = 'execute' ,
112+ args = {'lua_source' : DEFAULT_SCRIPT , 'foo' : 'bar' })
113+
114+
100115 items , url , crawler = yield crawl_items (LuaScriptSpider , HelloWorld ,
101116 settings )
102117 assert len (items ) == 1
@@ -106,3 +121,109 @@ def test_basic_lua(settings):
106121 assert resp .data ['jsvalue' ] == 3
107122 assert resp .headers ['X-MyHeader' ] == b'my value'
108123 assert resp .data ['args' ]['foo' ] == 'bar'
124+
125+
@requires_splash
@inlineCallbacks
def test_cookies(settings):
    """Cookies must be forwarded to the remote website, never sent to the
    Splash HTTP API itself.  Walks a chain of Scrapy and Splash requests
    against :class:`ManyCookies` and inspects both the cookies the website
    saw (via the 'args' echoed back by DEFAULT_SCRIPT) and the headers of
    the requests Scrapy actually made to Splash.
    """
    # Over-sized cookie value: Twisted (and hence Splash) drops requests
    # with huge HTTP headers, so this only works if the cookie is *not*
    # carried in the request headers sent to Splash.
    BOMB = 'x' * 64000

    class LuaScriptSpider(ResponseSpider):
        """ Cookies must be sent to website, not to Splash """
        custom_settings = {
            'SPLASH_COOKIES_DEBUG': True,
            'COOKIES_DEBUG': True,
        }

        def start_requests(self):
            # cookies set without Splash should be still
            # sent to a remote website. FIXME: this is not the case.
            yield scrapy.Request(self.url + "/login", self.parse,
                                 cookies={'x-set-scrapy': '1'})

        def parse(self, response):
            yield SplashRequest(self.url + "#egg", self.parse_1,
                                endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT},
                                cookies={'x-set-splash': '1'})

        def parse_1(self, response):
            yield {'response': response}
            yield SplashRequest(self.url + "#foo", self.parse_2,
                                endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT})

        def parse_2(self, response):
            yield {'response': response}
            yield scrapy.Request(self.url, self.parse_3)

        def parse_3(self, response):
            # Splash (Twisted) drops requests with huge http headers,
            # but this one should work, as cookies are not sent
            # to Splash itself.
            yield {'response': response}
            yield SplashRequest(self.url + "#bar", self.parse_4,
                                endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT},
                                cookies={'bomb': BOMB})

        def parse_4(self, response):
            yield {'response': response}

    def _cookie_dict(har_cookies):
        # HAR represents cookies as a list of {'name': ..., 'value': ...}
        # dicts; flatten to a plain name -> value mapping for comparison.
        return {c['name']: c['value'] for c in har_cookies}

    items, url, crawler = yield crawl_items(LuaScriptSpider, ManyCookies,
                                            settings)
    assert len(items) == 4

    # cookie should be sent to remote website, not to Splash
    resp = items[0]['response']
    splash_headers = resp.request.headers
    cookies = resp.data['args']['cookies']
    print(splash_headers)
    print(cookies)
    assert _cookie_dict(cookies) == {
        # 'login': '1',  # FIXME
        'x-set-splash': '1'
    }
    assert splash_headers.get(b'Cookie') is None

    # new cookie should be also sent to remote website, not to Splash
    resp2 = items[1]['response']
    splash_headers = resp2.request.headers
    headers = resp2.data['args']['headers']
    cookies = resp2.data['args']['cookies']
    assert headers['Referer'].strip('/') == url.strip('/')
    assert _cookie_dict(cookies) == {
        # 'login': '1',
        'x-set-splash': '1',
        'sessionid': 'ABCD'
    }
    print(splash_headers)
    print(headers)
    print(cookies)
    assert splash_headers.get(b'Cookie') is None

    # TODO/FIXME: Cookies fetched when working with Splash should be picked up
    # by Scrapy
    resp3 = items[2]['response']
    splash_headers = resp3.request.headers
    cookie_header = splash_headers.get(b'Cookie')
    assert b'x-set-scrapy=1' in cookie_header
    assert b'login=1' in cookie_header
    assert b'x-set-splash=1' in cookie_header
    # assert b'sessionid=ABCD' in cookie_header  # FIXME

    # cookie bomb shouldn't cause problems
    resp4 = items[3]['response']
    splash_headers = resp4.request.headers
    cookies = resp4.data['args']['cookies']
    assert _cookie_dict(cookies) == {
        # 'login': '1',
        'x-set-splash': '1',
        'sessionid': 'ABCD',
        'bomb': BOMB,
    }
    assert splash_headers.get(b'Cookie') is None
0 commit comments