Skip to content

Commit d660633

Browse files
authored
Merge pull request #116 from lopuhin/change-response-weird-headers
Do not convert TextResponse to SplashResponse
2 parents 7a56b3c + dc37d4a commit d660633

File tree

2 files changed

+45
-0
lines changed

2 files changed

+45
-0
lines changed

scrapy_splash/middleware.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
import scrapy
1414
from scrapy.exceptions import NotConfigured
1515
from scrapy.http.headers import Headers
16+
from scrapy.http.response.text import TextResponse
1617
from scrapy import signals
1718

1819
from scrapy_splash.responsetypes import responsetypes
@@ -399,6 +400,12 @@ def _change_response_class(self, request, response):
399400
# downloader middlewares are executed. Here it is set earlier.
400401
# Does it have any negative consequences?
401402
respcls = responsetypes.from_args(headers=response.headers)
403+
if isinstance(response, TextResponse) and respcls is SplashResponse:
404+
# Even if the headers say it's binary, it has already
405+
# been detected as a text response by scrapy (for example
406+
# because it was decoded successfully), so we should not
407+
# convert it to SplashResponse.
408+
respcls = SplashTextResponse
402409
response = response.replace(cls=respcls, request=request)
403410
return response
404411

tests/test_middleware.py

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -406,6 +406,44 @@ def test_magic_response_http_error():
406406
assert resp.url == "http://example.com/foo"
407407

408408

409+
def test_change_response_class_to_text():
    """A decodable response must still be a TextResponse after processing.

    The response built here mimics what can arrive when a file is
    downloaded or when splash:html() is returned: the Content-Type header
    says the payload is binary, but the body can be decoded, so it is
    already a TextResponse when it reaches the middleware.
    """
    middleware = _get_mw()
    original_request = SplashRequest('http://example.com/',
                                     magic_response=True)
    splash_request = middleware.process_request(original_request, None)

    incoming = TextResponse(
        'http://mysplash.example.com/execute',
        headers={b'Content-Type': b'application/pdf'},
        body=b'ascii binary data',
        encoding='utf-8',
    )
    processed = middleware.process_response(splash_request, incoming, None)

    # The text-ness of the response is preserved ...
    assert isinstance(processed, TextResponse)
    # ... the URL is the one originally requested, not the Splash endpoint ...
    assert processed.url == 'http://example.com/'
    # ... and headers and body come through unchanged.
    assert processed.headers == {b'Content-Type': [b'application/pdf']}
    assert processed.body == b'ascii binary data'
425+
426+
427+
def test_change_response_class_to_json_binary():
    """A non-decodable body under a JSON Content-Type is passed through.

    magic_response is switched off on purpose: plain binary data is not
    something we would expect from splash. With magic_response enabled
    the middleware would fail, which is acceptable, because that mode
    presumes a valid splash json response.
    """
    middleware = _get_mw()
    original_request = SplashRequest('http://example.com/',
                                     magic_response=False)
    splash_request = middleware.process_request(original_request, None)

    incoming = Response(
        'http://mysplash.example.com/execute',
        headers={b'Content-Type': b'application/json'},
        body=b'non-decodable data: \x98\x11\xe7\x17\x8f',
    )
    processed = middleware.process_response(splash_request, incoming, None)

    assert isinstance(processed, Response)
    # The URL is the one originally requested, not the Splash endpoint.
    assert processed.url == 'http://example.com/'
    # Headers and the raw bytes come through unchanged.
    assert processed.headers == {b'Content-Type': [b'application/json']}
    assert processed.body == b'non-decodable data: \x98\x11\xe7\x17\x8f'
445+
446+
409447
def test_magic_response_caching(tmpdir):
410448
# prepare middlewares
411449
spider = scrapy.Spider(name='foo')

0 commit comments

Comments
 (0)