Skip to content

Commit 99066e2

Browse files
committed
Don't log the response content when it is the website, not Splash, that returns HTTP 400.
This only works with a proper Lua script.
1 parent bfba9ad commit 99066e2

File tree

5 files changed

+50
-2
lines changed

5 files changed

+50
-2
lines changed

scrapy_splash/middleware.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
json_based_hash,
2424
parse_x_splash_saved_arguments_header,
2525
)
26+
from scrapy_splash.response import get_splash_status, get_splash_headers
2627

2728

2829
logger = logging.getLogger(__name__)
@@ -390,7 +391,7 @@ def process_response(self, request, response, spider):
390391

391392
response = self._change_response_class(request, response)
392393

393-
if self.log_400 and response.status == 400:
394+
if self.log_400 and get_splash_status(response) == 400:
394395
self._log_400(request, response, spider)
395396

396397
return response

scrapy_splash/response.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@
1111
from scrapy_splash.utils import headers_to_scrapy
1212

1313

14+
def get_splash_status(resp):
15+
return getattr(resp, 'splash_response_status', resp.status)
16+
17+
18+
def get_splash_headers(resp):
19+
return getattr(resp, 'splash_response_headers', resp.headers)
20+
21+
1422
class _SplashResponseMixin(object):
1523
"""
1624
This mixin fixes response.url and adds response.real_url

tests/test_integration.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ class HelloWorld(HtmlResource):
4141
extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}
4242

4343

44+
class Http400Resource(HtmlResource):
45+
status_code = 400
46+
html = "Website returns HTTP 400 error"
47+
48+
4449

4550
class ManyCookies(Resource, object):
4651
class SetMyCookie(HtmlResource):
@@ -131,6 +136,37 @@ def start_requests(self):
131136
assert resp.data['args']['foo'] == 'bar'
132137

133138

139+
@requires_splash
140+
@inlineCallbacks
141+
def test_bad_request(settings):
142+
class BadRequestSpider(ResponseSpider):
143+
custom_settings = {'HTTPERROR_ALLOW_ALL': True}
144+
145+
def start_requests(self):
146+
yield SplashRequest(self.url, endpoint='execute',
147+
args={'lua_source': DEFAULT_SCRIPT, 'wait': 'bar'})
148+
149+
class GoodRequestSpider(ResponseSpider):
150+
custom_settings = {'HTTPERROR_ALLOW_ALL': True}
151+
152+
def start_requests(self):
153+
yield SplashRequest(self.url, endpoint='execute',
154+
args={'lua_source': DEFAULT_SCRIPT})
155+
156+
157+
items, url, crawler = yield crawl_items(BadRequestSpider, HelloWorld,
158+
settings)
159+
resp = items[0]['response']
160+
assert resp.status == 400
161+
assert resp.splash_response_status == 400
162+
163+
items, url, crawler = yield crawl_items(GoodRequestSpider, Http400Resource,
164+
settings)
165+
resp = items[0]['response']
166+
assert resp.status == 400
167+
assert resp.splash_response_status == 200
168+
169+
134170
@requires_splash
135171
@inlineCallbacks
136172
def test_cookies(settings):

tests/test_middleware.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,12 +401,13 @@ def test_magic_response_http_error():
401401
"error": 400,
402402
"type": "ScriptError"
403403
}
404-
resp = TextResponse("http://mysplash.example.com/execute",
404+
resp = TextResponse("http://mysplash.example.com/execute", status=400,
405405
headers={b'Content-Type': b'application/json'},
406406
body=json.dumps(resp_data).encode('utf8'))
407407
resp = mw.process_response(req, resp, None)
408408
assert resp.data == resp_data
409409
assert resp.status == 404
410+
assert resp.splash_response_status == 400
410411
assert resp.url == "http://example.com/foo"
411412

412413

tests/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,13 @@ class HtmlResource(Resource):
2020
content_type = 'text/html'
2121
html = ''
2222
extra_headers = {}
23+
status_code = 200
2324

2425
def render_GET(self, request):
2526
request.setHeader(b'content-type', to_bytes(self.content_type))
2627
for name, value in self.extra_headers.items():
2728
request.setHeader(to_bytes(name), to_bytes(value))
29+
request.setResponseCode(self.status_code)
2830
return to_bytes(self.html)
2931

3032

0 commit comments

Comments
 (0)