
Commit de1f7c3

Merge pull request #120 from scrapinghub/params-for-spider
Params for spider
2 parents: b65bf50 + 3d7b60f

File tree

5 files changed: 147 additions, 3 deletions

scrapyrt/core.py

Lines changed: 7 additions & 0 deletions
@@ -60,6 +60,13 @@ def __init__(self, settings, scrapyrt_manager):
     def crawl(self, spidercls, *args, **kwargs):
         if isinstance(spidercls, six.string_types):
             spidercls = self.spider_loader.load(spidercls)
+
+        for kw in kwargs:
+            attr_or_m = getattr(spidercls, kw, None)
+            if attr_or_m and callable(attr_or_m):
+                msg = 'Crawl argument cannot override spider method.'
+                msg += ' Got argument {} that overrides spider method {}'
+                raise Error('400', message=msg.format(kw, getattr(spidercls, kw)))
         # creating our own crawler that will allow us to disable start requests easily
         crawler = ScrapyrtCrawler(
             spidercls, self.settings, self.scrapyrt_manager.start_requests)
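
The guard above rejects any crawl argument whose name matches a callable on the spider class before the crawl is created; plain (non-callable) attributes can still be overridden. A minimal standalone sketch of the same check, using a hypothetical DemoSpider rather than ScrapyRT's real classes:

    class DemoSpider:
        name = "demo"
        some_attribute = "Yes|No"   # plain attribute: override allowed

        def parse(self, response):  # method: override rejected
            pass

    def validate_crawl_args(spidercls, kwargs):
        # Mirrors the loop added in crawl(): reject kwargs shadowing callables.
        for kw in kwargs:
            attr_or_m = getattr(spidercls, kw, None)
            if attr_or_m and callable(attr_or_m):
                raise ValueError(
                    "Crawl argument cannot override spider method. "
                    "Got argument {} that overrides spider method {}".format(
                        kw, attr_or_m))

    validate_crawl_args(DemoSpider, {"some_attribute": "new"})  # passes silently
    # validate_crawl_args(DemoSpider, {"parse": "x"})           # raises ValueError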

scrapyrt/resources.py

Lines changed: 24 additions & 2 deletions
@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+import json
+from urllib.parse import unquote
+
 import demjson
 from scrapy.utils.misc import load_object
 from scrapy.utils.serialize import ScrapyJSONEncoder
@@ -134,6 +137,7 @@ def render_GET(self, request, **kwargs):
         scrapy_request_args = extract_scrapy_request_args(api_params,
                                                           raise_error=False)
         self.validate_options(scrapy_request_args, api_params)
+
         return self.prepare_crawl(api_params, scrapy_request_args, **kwargs)

     def render_POST(self, request, **kwargs):
@@ -154,10 +158,12 @@ def render_POST(self, request, **kwargs):
         """
         request_body = request.content.getvalue()
         try:
+            # TODO replace demjson with json.loads
             api_params = demjson.decode(request_body)
         except demjson.JSONDecodeError as e:
             message = "Invalid JSON in POST body. {}"
             message = message.format(e.pretty_description())
+            # TODO should be integer not string
             raise Error('400', message=message)

         log.msg("{}".format(api_params))
@@ -222,17 +228,33 @@ def prepare_crawl(self, api_params, scrapy_request_args, *args, **kwargs):
             max_requests = api_params['max_requests']
         except (KeyError, IndexError):
             max_requests = None
+
+        crawl_args = api_params.get("crawl_args")
+        if isinstance(crawl_args, str):
+            try:
+                crawl_args = json.loads(unquote(crawl_args))
+            except Exception as e:
+                msg = "crawl_args must be valid url encoded JSON"
+                msg += " this string cannot be decoded with JSON"
+                msg += f' {str(e)}'
+                raise Error('400', message=msg)
+
         dfd = self.run_crawl(
             spider_name, scrapy_request_args, max_requests,
-            start_requests=start_requests, *args, **kwargs)
+            start_requests=start_requests,
+            crawl_args=crawl_args,
+            *args,
+            **kwargs)
         dfd.addCallback(
             self.prepare_response, request_data=api_params, *args, **kwargs)
         return dfd

     def run_crawl(self, spider_name, scrapy_request_args,
-                  max_requests=None, start_requests=False, *args, **kwargs):
+                  max_requests=None, crawl_args=None, start_requests=False, *args, **kwargs):
         crawl_manager_cls = load_object(settings.CRAWL_MANAGER)
         manager = crawl_manager_cls(spider_name, scrapy_request_args, max_requests, start_requests=start_requests)
+        if crawl_args:
+            kwargs.update(crawl_args)
         dfd = manager.crawl(*args, **kwargs)
         return dfd
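
On the client side this means a GET request must carry crawl_args as URL-encoded JSON, mirroring the unquote + json.loads pair above. A sketch of how a caller might build such a request; the requests library and the localhost:9080 address are assumptions (9080 is ScrapyRT's default port, but any deployment may differ):

    import json
    from urllib.parse import quote

    import requests  # assumed HTTP client, not part of this commit

    params = {
        "spider_name": "test",
        "url": "http://example.com/page1.html",
        "callback": "return_argument",
        # crawl_args travels as URL-encoded JSON; the server unquotes and parses it
        "crawl_args": quote(json.dumps({"postcode": "43-300"})),
    }
    res = requests.get("http://localhost:9080/crawl.json", params=params)
    print(res.json()["status"])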

scrapyrt/utils.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ def extract_scrapy_request_args(dictionary, raise_error=False):
     arguments.
     """
     result = dictionary.copy()
-    args = inspect.getargspec(Request.__init__).args
+    args = inspect.getfullargspec(Request.__init__).args
     for key in dictionary.keys():
         if key not in args:
             result.pop(key)
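
This one-line change matters because inspect.getargspec was deprecated in Python 3.0 and removed in Python 3.11; getfullargspec is the drop-in replacement and also reports keyword-only arguments. A quick illustration with a hypothetical function:

    import inspect

    def demo(url, callback=None, *, errback=None):
        pass

    spec = inspect.getfullargspec(demo)
    print(spec.args)        # ['url', 'callback'] (positional-or-keyword)
    print(spec.kwonlyargs)  # ['errback'], which getargspec could not report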

tests/sample_data/testproject/testproject/spiders/testspider.py

Lines changed: 4 additions & 0 deletions
@@ -7,6 +7,7 @@
 class TestSpider(scrapy.Spider):

     name = 'test'
+    some_attribute = "Yes|No"

     def parse(self, response):
         name = response.xpath('//h1/text()').extract()
@@ -18,3 +19,6 @@ def return_bytes(self, response):
     def some_errback(self, err):
         self.logger.error("Logging some error {}".format(err))
         return
+
+    def return_argument(self, response):
+        return TestprojectItem(name=self.postcode)
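
return_argument can read self.postcode because crawl arguments end up as spider attributes: ScrapyRT forwards crawl_args as keyword arguments to crawl(), and scrapy.Spider.__init__ copies unrecognized keyword arguments onto the instance. A simplified sketch of that behaviour (not Scrapy's actual source):

    class SketchSpider:
        """Simplified stand-in for scrapy.Spider's kwargs handling."""

        def __init__(self, name=None, **kwargs):
            if name is not None:
                self.name = name
            self.__dict__.update(kwargs)  # crawl kwargs become attributes

    spider = SketchSpider(name="test", postcode="43-300")
    print(spider.postcode)  # '43-300', which is what return_argument returns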

tests/test_resource_crawl.py

Lines changed: 111 additions & 0 deletions
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import json
 import os
+from urllib.parse import quote

 import pytest
 import re
@@ -412,3 +413,113 @@ def test_bytes_in_item(self, server, method):
                                 'callback': 'return_bytes'})
         assert res.status_code == 200
         assert res.json()["items"] == [{'name': 'Some bytes here'}]
+
+    def test_crawl_with_argument_get(self, server):
+        url = server.url("crawl.json")
+        postcode = "43-300"
+        argument = json.dumps({"postcode": postcode})
+        argument = quote(argument)
+        res = perform_get(url, {"spider_name": "test"}, {
+            "url": server.target_site.url("page1.html"),
+            "crawl_args": argument,
+            "callback": 'return_argument'
+        })
+        expected_items = [{
+            u'name': postcode,
+        }]
+        res_json = res.json()
+        assert res_json["status"] == "ok"
+        assert res_json["items_dropped"] == []
+        assert res_json['items']
+        assert len(res_json['items']) == len(expected_items)
+        assert res_json["items"] == expected_items
+
+    def test_crawl_with_argument_post(self, server):
+        url = server.url("crawl.json")
+        postcode = "43-300"
+        argument = {"postcode": postcode}
+        res = perform_post(url, {
+            "spider_name": "test",
+            "crawl_args": argument
+        }, {
+            "url": server.target_site.url("page1.html"),
+            "callback": 'return_argument'
+        })
+        expected_items = [{
+            u'name': postcode,
+        }]
+        res_json = res.json()
+        assert res.status_code == 200
+        assert res_json["status"] == "ok"
+        assert not res_json.get("errors")
+        assert res_json["items_dropped"] == []
+        assert res_json['items']
+        assert len(res_json['items']) == len(expected_items)
+        assert res_json["items"] == expected_items
+
+    def test_crawl_with_argument_invalid_json(self, server):
+        url = server.url("crawl.json")
+        argument = '"this is not valid json'
+        argument = quote(argument)
+        res = perform_get(url, {"spider_name": "test"}, {
+            "url": server.target_site.url("page1.html"),
+            "crawl_args": argument,
+            "callback": 'return_argument'
+        })
+        assert res.status_code == 400
+        res_json = res.json()
+        assert res_json["status"] == "error"
+        assert res_json.get('items') is None
+        assert res_json['code'] == 400
+        assert re.search(' must be valid url encoded JSON', res_json['message'])
+
+    def test_crawl_with_argument_invalid_name(self, server):
+        url = server.url("crawl.json")
+        argument = quote(json.dumps({"parse": "string"}))
+        res = perform_get(url, {"spider_name": "test"}, {
+            "url": server.target_site.url("page1.html"),
+            "crawl_args": argument,
+        })
+
+        def check_res(res):
+            res_json = res.json()
+            assert res.status_code == 400
+            assert res_json["status"] == "error"
+            assert res_json.get('items') is None
+            assert res_json['code'] == 400
+
+            msg = 'Crawl argument cannot override spider method'
+            assert re.search(msg, res_json['message'])
+
+        check_res(res)
+
+        res = perform_post(url, {
+            "spider_name": "test",
+            "crawl_args": argument
+        }, {
+            "url": server.target_site.url("page1.html"),
+            "callback": 'return_argument'
+        })
+
+        check_res(res)
+
+    def test_crawl_with_argument_attribute_collision(self, server):
+        """If an argument passed via the API collides with a spider
+        attribute, the argument from the request overrides the spider
+        attribute.
+        """
+        url = server.url("crawl.json")
+        argument = quote(json.dumps({"some_attribute": "string"}))
+        res = perform_get(url, {"spider_name": "test"}, {
+            "url": server.target_site.url("page1.html"),
+            "crawl_args": argument,
+        })

+        def check_res(res):
+            res_json = res.json()
+            assert res_json["status"] == "ok"
+            assert res.status_code == 200
+            assert res_json['items']
+            assert len(res_json['items']) == 1
+
+        check_res(res)
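
The POST test also shows that crawl_args needs no URL-encoding in a JSON body: the resource only unquotes and re-parses string values, so a plain object passes through untouched. A client-side sketch of the same call; the requests library, the host/port, and the body shape (request details nested under a "request" key, as in ScrapyRT's documented POST format) are assumptions:

    import requests  # assumed HTTP client

    body = {
        "spider_name": "test",
        "crawl_args": {"postcode": "43-300"},  # plain JSON object, no quoting
        "request": {
            "url": "http://example.com/page1.html",
            "callback": "return_argument",
        },
    }
    res = requests.post("http://localhost:9080/crawl.json", json=body)
    print(res.json().get("items"))  # expected: [{'name': '43-300'}]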
