
Commit b030299

[resources] add support for passing crawl arguments
1 parent 400f940 commit b030299

2 files changed: +50 -9 lines changed


scrapyrt/resources.py

Lines changed: 23 additions & 3 deletions
@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+import json
+from urllib.parse import unquote
+
 import demjson
 from scrapy.utils.misc import load_object
 from scrapy.utils.serialize import ScrapyJSONEncoder
@@ -134,6 +137,7 @@ def render_GET(self, request, **kwargs):
         scrapy_request_args = extract_scrapy_request_args(api_params,
                                                           raise_error=False)
         self.validate_options(scrapy_request_args, api_params)
+
         return self.prepare_crawl(api_params, scrapy_request_args, **kwargs)

     def render_POST(self, request, **kwargs):
@@ -171,7 +175,7 @@ def render_POST(self, request, **kwargs):
         _request = self.get_required_argument(api_params, "request")
         try:
             scrapy_request_args = extract_scrapy_request_args(
-                _request, raise_error=False
+                _request, raise_error=True
             )
         except ValueError as e:
             raise Error('400', str(e))
@@ -224,17 +228,33 @@ def prepare_crawl(self, api_params, scrapy_request_args, *args, **kwargs):
             max_requests = api_params['max_requests']
         except (KeyError, IndexError):
             max_requests = None
+
+        crawl_args = api_params.get("crawl_args")
+        if isinstance(crawl_args, str):
+            try:
+                crawl_args = json.loads(unquote(crawl_args))
+            except Exception as e:
+                msg = "crawl_args must be valid url encoded JSON"
+                msg += " this string cannot be decoded with JSON"
+                msg += f' {str(e)}'
+                raise Error('400', message=msg)
+
         dfd = self.run_crawl(
             spider_name, scrapy_request_args, max_requests,
-            start_requests=start_requests, *args, **kwargs)
+            start_requests=start_requests,
+            crawl_args=crawl_args,
+            *args,
+            **kwargs)
         dfd.addCallback(
             self.prepare_response, request_data=api_params, *args, **kwargs)
         return dfd

     def run_crawl(self, spider_name, scrapy_request_args,
-                  max_requests=None, start_requests=False, *args, **kwargs):
+                  max_requests=None, crawl_args=None, start_requests=False, *args, **kwargs):
         crawl_manager_cls = load_object(settings.CRAWL_MANAGER)
         manager = crawl_manager_cls(spider_name, scrapy_request_args, max_requests, start_requests=start_requests)
+        if crawl_args:
+            kwargs.update(crawl_args)
         dfd = manager.crawl(*args, **kwargs)
         return dfd
tests/test_resource_crawl.py

Lines changed: 27 additions & 6 deletions
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import json
 import os
+from urllib.parse import quote

 import pytest
 import re
@@ -413,15 +414,14 @@ def test_bytes_in_item(self, server, method):
         assert res.status_code == 200
         assert res.json()["items"] == [{'name': 'Some bytes here'}]

-    @pytest.mark.parametrize("method", [
-        perform_get, perform_post
-    ])
-    def test_crawl_with_argument(self, server, method):
+    def test_crawl_with_argument_get(self, server):
         url = server.url("crawl.json")
         postcode = "43-300"
-        res = method(url, {"spider_name": "test"}, {
+        argument = json.dumps({"postcode": postcode})
+        argument = quote(argument)
+        res = perform_get(url, {"spider_name": "test"}, {
             "url": server.target_site.url("page1.html"),
-            "postcode": postcode,
+            "crawl_args": argument,
             "callback": 'return_argument'
         })
         expected_items = [{
@@ -430,7 +430,28 @@ def test_crawl_with_argument(self, server, method):
         res_json = res.json()
         assert res_json["status"] == "ok"
         assert res_json["items_dropped"] == []
+        assert res_json['items']
+        assert len(res_json['items']) == len(expected_items)
+        assert res_json["items"] == expected_items

+    def test_crawl_with_argument_post(self, server):
+        url = server.url("crawl.json")
+        postcode = "43-300"
+        argument = {"postcode": postcode}
+        res = perform_post(url, {
+            "spider_name": "test",
+            "crawl_args": argument
+        }, {
+            "url": server.target_site.url("page1.html"),
+            "callback": 'return_argument'
+        })
+        expected_items = [{
+            u'name': postcode,
+        }]
+        res_json = res.json()
+        assert res_json["status"] == "ok"
+        assert not res_json.get("errors")
+        assert res_json["items_dropped"] == []
         assert res_json['items']
         assert len(res_json['items']) == len(expected_items)
         assert res_json["items"] == expected_items
