Skip to content

Commit 3d7b60f

Browse files
committed
[spider manager] handling crawl params edge cases
1 parent 87c515c commit 3d7b60f

File tree

3 files changed

+61
-4
lines changed

3 files changed

+61
-4
lines changed

scrapyrt/core.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,13 @@ def __init__(self, settings, scrapyrt_manager):
6060
def crawl(self, spidercls, *args, **kwargs):
6161
if isinstance(spidercls, six.string_types):
6262
spidercls = self.spider_loader.load(spidercls)
63+
64+
for kw in kwargs:
65+
attr_or_m = getattr(spidercls, kw, None)
66+
if attr_or_m and callable(attr_or_m):
67+
msg = 'Crawl argument cannot override spider method.'
68+
msg += ' Got argument {} that overrides spider method {}'
69+
raise Error('400', message=msg.format(kw, getattr(spidercls, kw)))
6370
# creating our own crawler that will allow us to disable start requests easily
6471
crawler = ScrapyrtCrawler(
6572
spidercls, self.settings, self.scrapyrt_manager.start_requests)

tests/sample_data/testproject/testproject/spiders/testspider.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
class TestSpider(scrapy.Spider):
88

99
name = 'test'
10+
some_attribute = "Yes|No"
1011

1112
def parse(self, response):
1213
name = response.xpath('//h1/text()').extract()

tests/test_resource_crawl.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,7 @@ def test_crawl_with_argument_post(self, server):
449449
u'name': postcode,
450450
}]
451451
res_json = res.json()
452+
assert res.status_code == 200
452453
assert res_json["status"] == "ok"
453454
assert not res_json.get("errors")
454455
assert res_json["items_dropped"] == []
@@ -458,19 +459,67 @@ def test_crawl_with_argument_post(self, server):
458459

459460
def test_crawl_with_argument_invalid_json(self, server):
460461
url = server.url("crawl.json")
461-
postcode = "43-300"
462462
argument = '"this is not valid json'
463463
argument = quote(argument)
464464
res = perform_get(url, {"spider_name": "test"}, {
465465
"url": server.target_site.url("page1.html"),
466466
"crawl_args": argument,
467467
"callback": 'return_argument'
468468
})
469-
expected_items = [{
470-
u'name': postcode,
471-
}]
469+
assert res.status_code == 400
472470
res_json = res.json()
473471
assert res_json["status"] == "error"
474472
assert res_json.get('items') is None
475473
assert res_json['code'] == 400
476474
assert re.search(' must be valid url encoded JSON', res_json['message'])
475+
476+
def test_crawl_with_argument_invalid_name(self, server):
477+
url = server.url("crawl.json")
478+
argument = quote(json.dumps({"parse": "string"}))
479+
res = perform_get(url, {"spider_name": "test"}, {
480+
"url": server.target_site.url("page1.html"),
481+
"crawl_args": argument,
482+
})
483+
484+
def check_res(res):
485+
res_json = res.json()
486+
assert res.status_code == 400
487+
assert res_json["status"] == "error"
488+
assert res_json.get('items') is None
489+
assert res_json['code'] == 400
490+
491+
msg = 'Crawl argument cannot override spider method'
492+
assert re.search(msg, res_json['message'])
493+
494+
check_res(res)
495+
496+
res = perform_post(url, {
497+
"spider_name": "test",
498+
"crawl_args": argument
499+
}, {
500+
"url": server.target_site.url("page1.html"),
501+
"callback": 'return_argument'
502+
})
503+
504+
check_res(res)
505+
506+
def test_crawl_with_argument_attribute_collision(self, server):
507+
"""If there is attribute collision and some argument to spider
508+
is passed via the API, and this argument collides with a spider attribute,
509+
the argument from the request overrides the spider attribute.
510+
"""
511+
url = server.url("crawl.json")
512+
argument = quote(json.dumps({"some_attribute": "string"}))
513+
res = perform_get(url, {"spider_name": "test"}, {
514+
"url": server.target_site.url("page1.html"),
515+
"crawl_args": argument,
516+
})
517+
518+
def check_res(res):
519+
res_json = res.json()
520+
assert res_json["status"] == "ok"
521+
assert res.status_code == 200
522+
assert res_json['items']
523+
assert len(res_json['items']) == 1
524+
525+
check_res(res)

0 commit comments

Comments
 (0)