Skip to content

Commit cde6b3f

Browse files
authored
Merge pull request #158 from scrapy-plugins/expose-original-info
allow to get original response information
2 parents eb0b291 + 74b2d6e commit cde6b3f

File tree

6 files changed

+132
-25
lines changed

6 files changed

+132
-25
lines changed

README.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,9 @@ to set ``meta['splash']['args']`` use ``SplashRequest(..., args=myargs)``.
271271
and ``assert(splash:go(..))`` fails with an HTTP error
272272
response.status is also set to HTTP error code.
273273

274+
Original URL, status and headers are available as ``response.real_url``,
275+
``response.splash_response_status`` and ``response.splash_response_headers``.
276+
274277
This option is set to True by default if you use SplashRequest.
275278
``render.json`` and ``execute`` endpoints may not have all the necessary
276279
keys/values in the response.
@@ -631,7 +634,9 @@ aware of:
631634

632635
3. As seen by Scrapy, response.url is a URL of the Splash server.
633636
scrapy-splash fixes it to be a URL of the requested page.
634-
"Real" URL is still available as ``response.real_url``.
637+
"Real" URL is still available as ``response.real_url``. scrapy-splash also
638+
allows handling ``response.status`` and ``response.headers`` transparently
639+
on Scrapy side.
635640

636641
4. Some options depend on each other - for example, if you use timeout_
637642
Splash option then you may want to set ``download_timeout``

scrapy_splash/middleware.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
json_based_hash,
2424
parse_x_splash_saved_arguments_header,
2525
)
26+
from scrapy_splash.response import get_splash_status, get_splash_headers
2627

2728

2829
logger = logging.getLogger(__name__)
@@ -379,7 +380,7 @@ def process_response(self, request, response, spider):
379380

380381
# handle save_args/load_args
381382
self._process_x_splash_saved_arguments(request, response)
382-
if response.status == 498:
383+
if get_splash_status(response) == 498:
383384
logger.debug("Got HTTP 498 response for {}; "
384385
"sending arguments again.".format(request),
385386
extra={'spider': spider})
@@ -390,7 +391,7 @@ def process_response(self, request, response, spider):
390391

391392
response = self._change_response_class(request, response)
392393

393-
if self.log_400 and response.status == 400:
394+
if self.log_400 and get_splash_status(response) == 400:
394395
self._log_400(request, response, spider)
395396

396397
return response
@@ -423,7 +424,7 @@ def _log_400(self, request, response, spider):
423424

424425
def _process_x_splash_saved_arguments(self, request, response):
425426
""" Keep track of arguments saved by Splash. """
426-
saved_args = response.headers.get(b'X-Splash-Saved-Arguments')
427+
saved_args = get_splash_headers(response).get(b'X-Splash-Saved-Arguments')
427428
if not saved_args:
428429
return
429430
saved_args = parse_x_splash_saved_arguments_header(saved_args)

scrapy_splash/response.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@
1111
from scrapy_splash.utils import headers_to_scrapy
1212

1313

14+
def get_splash_status(resp):
15+
return getattr(resp, 'splash_response_status', resp.status)
16+
17+
18+
def get_splash_headers(resp):
19+
return getattr(resp, 'splash_response_headers', resp.headers)
20+
21+
1422
class _SplashResponseMixin(object):
1523
"""
1624
This mixin fixes response.url and adds response.real_url
@@ -30,14 +38,23 @@ def __init__(self, url, *args, **kwargs):
3038
if _url is not None:
3139
self.real_url = url
3240
url = _url
41+
self.splash_response_status = kwargs.pop('splash_response_status',
42+
None)
43+
self.splash_response_headers = kwargs.pop('splash_response_headers',
44+
None)
3345
super(_SplashResponseMixin, self).__init__(url, *args, **kwargs)
46+
if self.splash_response_status is None:
47+
self.splash_response_status = self.status
48+
if self.splash_response_headers is None:
49+
self.splash_response_headers = self.headers.copy()
3450

3551
def replace(self, *args, **kwargs):
3652
"""Create a new Response with the same attributes except for those
3753
given new values.
3854
"""
3955
for x in ['url', 'status', 'headers', 'body', 'request', 'flags',
40-
'real_url']:
56+
'real_url', 'splash_response_status',
57+
'splash_response_headers']:
4158
kwargs.setdefault(x, getattr(self, x))
4259
cls = kwargs.pop('cls', self.__class__)
4360
return cls(*args, **kwargs)
@@ -80,11 +97,14 @@ class SplashJsonResponse(SplashResponse):
8097
(['splash']['magic_response'] is not False), several other response
8198
attributes (headers, body, url, status code) are set automatically:
8299
83-
* response.headers are filled from 'headers' keys;
84-
* response.url is set to the value of 'url' key;
100+
* response.url is set to the value of 'url' key, original url is
101+
available as ``response.real_url``;
102+
* response.headers are filled from 'headers' keys; original headers are
103+
available as ``response.splash_response_headers``;
104+
* response.status is set from the value of 'http_status' key; original
105+
status is available as ``response.splash_response_status``;
85106
* response.body is set to the value of 'html' key,
86107
or to base64-decoded value of 'body' key;
87-
* response.status is set from the value of 'http_status' key.
88108
"""
89109
def __init__(self, *args, **kwargs):
90110
self.cookiejar = None

tests/test_integration.py

Lines changed: 88 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,14 @@
1010
DEFAULT_SCRIPT = """
1111
function main(splash)
1212
splash:init_cookies(splash.args.cookies)
13-
assert(splash:go{
13+
splash:go{
1414
splash.args.url,
1515
headers=splash.args.headers,
1616
http_method=splash.args.http_method,
1717
body=splash.args.body,
18-
})
19-
assert(splash:wait(0.5))
18+
}
19+
local wait = tonumber(splash.args.wait or 0.5)
20+
assert(splash:wait(wait))
2021
2122
local entries = splash:history()
2223
local last_response = entries[#entries].response
@@ -40,6 +41,11 @@ class HelloWorld(HtmlResource):
4041
extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}
4142

4243

44+
class Http400Resource(HtmlResource):
45+
status_code = 400
46+
html = "Website returns HTTP 400 error"
47+
48+
4349

4450
class ManyCookies(Resource, object):
4551
class SetMyCookie(HtmlResource):
@@ -94,6 +100,9 @@ def parse(self, response):
94100
resp = items[0]['response']
95101
assert resp.url == url
96102
assert resp.css('body::text').get().strip() == "hello world!"
103+
assert resp.status == resp.splash_response_status == 200
104+
assert resp.headers == resp.splash_response_headers
105+
assert resp.splash_response_headers['Content-Type'] == b"text/html; charset=utf-8"
97106

98107
resp2 = items[1]['response']
99108
assert resp2.body == resp.body
@@ -118,12 +127,78 @@ def start_requests(self):
118127
assert len(items) == 1
119128
resp = items[0]['response']
120129
assert resp.url == url + "/#foo"
130+
assert resp.status == resp.splash_response_status == 200
121131
assert resp.css('body::text').get().strip() == "hello world!"
122132
assert resp.data['jsvalue'] == 3
123133
assert resp.headers['X-MyHeader'] == b'my value'
134+
assert resp.headers['Content-Type'] == b'text/html'
135+
assert resp.splash_response_headers['Content-Type'] == b'application/json'
124136
assert resp.data['args']['foo'] == 'bar'
125137

126138

139+
@requires_splash
140+
@inlineCallbacks
141+
def test_bad_request(settings):
142+
class BadRequestSpider(ResponseSpider):
143+
custom_settings = {'HTTPERROR_ALLOW_ALL': True}
144+
145+
def start_requests(self):
146+
yield SplashRequest(self.url, endpoint='execute',
147+
args={'lua_source': DEFAULT_SCRIPT, 'wait': 'bar'})
148+
149+
class GoodRequestSpider(ResponseSpider):
150+
custom_settings = {'HTTPERROR_ALLOW_ALL': True}
151+
152+
def start_requests(self):
153+
yield SplashRequest(self.url, endpoint='execute',
154+
args={'lua_source': DEFAULT_SCRIPT})
155+
156+
157+
items, url, crawler = yield crawl_items(BadRequestSpider, HelloWorld,
158+
settings)
159+
resp = items[0]['response']
160+
assert resp.status == 400
161+
assert resp.splash_response_status == 400
162+
163+
items, url, crawler = yield crawl_items(GoodRequestSpider, Http400Resource,
164+
settings)
165+
resp = items[0]['response']
166+
assert resp.status == 400
167+
assert resp.splash_response_status == 200
168+
169+
170+
@requires_splash
171+
@inlineCallbacks
172+
def test_cache_args(settings):
173+
174+
class CacheArgsSpider(ResponseSpider):
175+
def _request(self, url):
176+
return SplashRequest(url, endpoint='execute',
177+
args={'lua_source': DEFAULT_SCRIPT, 'x': 'yy'},
178+
cache_args=['lua_source'])
179+
180+
def start_requests(self):
181+
yield self._request(self.url)
182+
183+
def parse(self, response):
184+
yield {'response': response}
185+
yield self._request(self.url + "#foo")
186+
187+
188+
items, url, crawler = yield crawl_items(CacheArgsSpider, HelloWorld,
189+
settings)
190+
assert len(items) == 2
191+
resp = items[0]['response']
192+
assert b"function main(splash)" in resp.request.body
193+
assert b"yy" in resp.request.body
194+
print(resp.body, resp.request.body)
195+
196+
resp = items[1]['response']
197+
assert b"function main(splash)" not in resp.request.body
198+
assert b"yy" in resp.request.body
199+
print(resp.body, resp.request.body)
200+
201+
127202
@requires_splash
128203
@inlineCallbacks
129204
def test_cookies(settings):
@@ -171,7 +246,6 @@ def parse_3(self, response):
171246
args={'lua_source': DEFAULT_SCRIPT},
172247
cookies={'bomb': BOMB})
173248

174-
175249
def parse_4(self, response):
176250
yield {'response': response}
177251

@@ -185,19 +259,19 @@ def _cookie_dict(har_cookies):
185259

186260
# cookie should be sent to remote website, not to Splash
187261
resp = items[0]['response']
188-
splash_headers = resp.request.headers
262+
splash_request_headers = resp.request.headers
189263
cookies = resp.data['args']['cookies']
190-
print(splash_headers)
264+
print(splash_request_headers)
191265
print(cookies)
192266
assert _cookie_dict(cookies) == {
193267
# 'login': '1', # FIXME
194268
'x-set-splash': '1'
195269
}
196-
assert splash_headers.get(b'Cookie') is None
270+
assert splash_request_headers.get(b'Cookie') is None
197271

198272
# new cookie should be also sent to remote website, not to Splash
199273
resp2 = items[1]['response']
200-
splash_headers = resp2.request.headers
274+
splash_request_headers = resp2.request.headers
201275
headers = resp2.data['args']['headers']
202276
cookies = resp2.data['args']['cookies']
203277
assert canonicalize_url(headers['Referer']) == canonicalize_url(url)
@@ -206,29 +280,29 @@ def _cookie_dict(har_cookies):
206280
'x-set-splash': '1',
207281
'sessionid': 'ABCD'
208282
}
209-
print(splash_headers)
283+
print(splash_request_headers)
210284
print(headers)
211285
print(cookies)
212-
assert splash_headers.get(b'Cookie') is None
286+
assert splash_request_headers.get(b'Cookie') is None
213287

214288
# TODO/FIXME: Cookies fetched when working with Splash should be picked up
215289
# by Scrapy
216290
resp3 = items[2]['response']
217-
splash_headers = resp3.request.headers
218-
cookie_header = splash_headers.get(b'Cookie')
291+
splash_request_headers = resp3.request.headers
292+
cookie_header = splash_request_headers.get(b'Cookie')
219293
assert b'x-set-scrapy=1' in cookie_header
220294
assert b'login=1' in cookie_header
221295
assert b'x-set-splash=1' in cookie_header
222296
# assert b'sessionid=ABCD' in cookie_header # FIXME
223297

224298
# cookie bomb shouldn't cause problems
225299
resp4 = items[3]['response']
226-
splash_headers = resp4.request.headers
300+
splash_request_headers = resp4.request.headers
227301
cookies = resp4.data['args']['cookies']
228302
assert _cookie_dict(cookies) == {
229303
# 'login': '1',
230304
'x-set-splash': '1',
231305
'sessionid': 'ABCD',
232306
'bomb': BOMB,
233307
}
234-
assert splash_headers.get(b'Cookie') is None
308+
assert splash_request_headers.get(b'Cookie') is None

tests/test_middleware.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,8 @@ def cb():
188188
assert response2.text == response2.body_as_unicode() == res_body
189189
assert response2.encoding == 'utf8'
190190
assert response2.headers == {b'Content-Type': [b'application/json']}
191-
assert response2.status == 200
191+
assert response2.splash_response_headers == response2.headers
192+
assert response2.status == response2.splash_response_status == 200
192193

193194

194195
def test_magic_response():
@@ -233,7 +234,9 @@ def test_magic_response():
233234
b'X-My-Header': [b'foo'],
234235
b'Set-Cookie': [b'bar=baz'],
235236
}
237+
assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']}
236238
assert resp2.status == 404
239+
assert resp2.splash_response_status == 200
237240
assert resp2.url == "http://exmaple.com/#id42"
238241
assert len(resp2.cookiejar) == 3
239242
cookies = [c for c in resp2.cookiejar]
@@ -359,7 +362,8 @@ def test_magic_response2():
359362
assert resp2.data == resp_data
360363
assert resp2.body == b'binary data'
361364
assert resp2.headers == {b'Content-Type': [b'text/plain']}
362-
assert resp2.status == 200
365+
assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']}
366+
assert resp2.status == resp2.splash_response_status == 200
363367
assert resp2.url == "http://example.com/"
364368

365369

@@ -397,12 +401,13 @@ def test_magic_response_http_error():
397401
"error": 400,
398402
"type": "ScriptError"
399403
}
400-
resp = TextResponse("http://mysplash.example.com/execute",
404+
resp = TextResponse("http://mysplash.example.com/execute", status=400,
401405
headers={b'Content-Type': b'application/json'},
402406
body=json.dumps(resp_data).encode('utf8'))
403407
resp = mw.process_response(req, resp, None)
404408
assert resp.data == resp_data
405409
assert resp.status == 404
410+
assert resp.splash_response_status == 400
406411
assert resp.url == "http://example.com/foo"
407412

408413

tests/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,13 @@ class HtmlResource(Resource):
2020
content_type = 'text/html'
2121
html = ''
2222
extra_headers = {}
23+
status_code = 200
2324

2425
def render_GET(self, request):
2526
request.setHeader(b'content-type', to_bytes(self.content_type))
2627
for name, value in self.extra_headers.items():
2728
request.setHeader(to_bytes(name), to_bytes(value))
29+
request.setResponseCode(self.status_code)
2830
return to_bytes(self.html)
2931

3032

0 commit comments

Comments
 (0)