
Commit 0f9d9bc

Merge pull request #93 from scrapinghub/pedro-errback
Pedro errback
2 parents: a6c7892 + fc446d6

13 files changed: +123 −18


docs/source/api.rst

Lines changed: 8 additions & 1 deletion

@@ -96,9 +96,16 @@ callback
 - type: string
 - optional
 
-Should exist as method of scheduled spider, does not need to contain self.
+Must exist as a method of the scheduled spider; the string does not need to contain "self".
 If not passed or not found on spider default callback `parse`_ will be used.
 
+errback
+- type: string
+- optional
+
+Scrapy errback for the request made from the spider. It must exist as a method of the
+scheduled spider, otherwise an exception will be raised. The string does not need to contain "self".
+
 max_requests
 - type: integer
 - optional
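
As a usage sketch for the new parameter (not part of this commit): the call below
schedules a crawl and names a spider method as errback. The host, port, spider name,
and target URL are illustrative assumptions about a locally running ScrapyRT instance.

    # Hedged example: pass 'errback' alongside the usual crawl.json arguments.
    # 'some_errback' must exist as a method on the scheduled spider.
    import requests

    resp = requests.get(
        'http://localhost:9080/crawl.json',
        params={
            'spider_name': 'test',
            'url': 'http://example.com/index.html',  # assumed target
            'errback': 'some_errback',
        },
    )
    print(resp.json()['status'])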

docs/source/conf.py

Lines changed: 2 additions & 2 deletions

@@ -55,9 +55,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.10'
+version = '0.12'
 # The full version, including alpha/beta/rc tags.
-release = '0.10'
+release = '0.12'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

requirements-dev.txt

Lines changed: 1 addition & 0 deletions

@@ -5,3 +5,4 @@ mock==1.3.0
 pytest==2.9.1
 pytest-cov==2.2.1
 port-for==0.3.1
+Flask==1.1.1

scrapyrt/core.py

Lines changed: 8 additions & 1 deletion

@@ -105,6 +105,9 @@ def __init__(self, spider_name, request_kwargs, max_requests=None, start_request
         # callback will be added after instantiation of crawler object
         # because we need to know if spider has method available
         self.callback_name = request_kwargs.pop('callback', None) or 'parse'
+        # do the same for errback
+        self.errback_name = request_kwargs.pop('errback', None) or 'parse'
+
         if request_kwargs.get("url"):
             self.request = self.create_spider_request(deepcopy(request_kwargs))
         else:
@@ -145,7 +148,7 @@ def spider_idle(self, spider):
         """Handler of spider_idle signal.
 
         Schedule request for url given to api, with optional callback
-        that can be passed as GET parameter.
+        and errback that can be passed as GET parameter.
 
         spider_idle signal is used because we want to optionally enable
         start_requests for the spider and if request is scheduled in
@@ -157,6 +160,10 @@ def spider_idle(self, spider):
         callback = getattr(self.crawler.spider, self.callback_name)
         assert callable(callback), 'Invalid callback'
         self.request = self.request.replace(callback=callback)
+
+        errback = getattr(self.crawler.spider, self.errback_name)
+        assert callable(errback), 'Invalid errback'
+        self.request = self.request.replace(errback=errback)
         modify_request = getattr(
             self.crawler.spider, "modify_realtime_request", None)
         if callable(modify_request):
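
For reference, a minimal sketch of a spider this getattr lookup could resolve
against; the spider, its name, and the errback body are illustrative assumptions,
not code from this commit:

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'  # hypothetical spider

        def parse(self, response):
            yield {'title': response.xpath('//title/text()').get()}

        def handle_error(self, failure):
            # Scrapy invokes the errback with a twisted Failure when the
            # request fails; scheduling ?errback=handle_error lets
            # spider_idle above attach this method to the request.
            self.logger.error('Request failed: %r', failure)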

tests/sample_data/testproject/testproject/spiders/testspider.py

Lines changed: 4 additions & 0 deletions

@@ -11,3 +11,7 @@ class TestSpider(scrapy.Spider):
     def parse(self, response):
         name = response.xpath('//h1/text()').extract()
         return TestprojectItem(name=name)
+
+    def some_errback(self, err):
+        self.logger.error("Logging some error {}".format(err))
+        return

tests/servers.py

Lines changed: 6 additions & 10 deletions

@@ -1,18 +1,17 @@
 # -*- coding: utf-8 -*-
-import six
-from subprocess import Popen, PIPE
-from six.moves.urllib.parse import urljoin
 import fcntl
 import os
 import shutil
 import socket
 import sys
 import tempfile
 import time
+from subprocess import Popen, PIPE
 
 import port_for
+from six.moves.urllib.parse import urljoin
 
-from . import SAMPLE_DATA
+from . import TESTS_PATH
 from .utils import get_testenv, generate_project
 
 DEVNULL = open(os.devnull, 'wb')
@@ -30,12 +29,9 @@ def __init__(self, host='localhost', port=None, cwd=None, shell=False,
         self.stdin = stdin
         self.stdout = stdout
         self.stderr = stderr
-        if six.PY2:
-            command = 'SimpleHTTPServer'
-        else:
-            command = 'http.server'
+
         self.arguments = [
-            sys.executable, '-u', '-m', command, str(self.port)
+            'flask', 'run', '-p', str(self.port)
         ]
 
     def start(self):
@@ -126,4 +122,4 @@ class MockServer(BaseTestServer):
 
     def __init__(self, *args, **kwargs):
         super(MockServer, self).__init__(*args, **kwargs)
-        self.cwd = os.path.join(SAMPLE_DATA, 'testsite')
+        self.cwd = os.path.join(TESTS_PATH, 'testsite')
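
A standalone sketch of what the new arguments amount to (paths and port are
illustrative assumptions): Flask 1.x's CLI falls back to an app.py found in the
working directory when FLASK_APP is unset, which is presumably why pointing cwd
at the testsite directory is enough here.

    from subprocess import Popen

    # Assumed path and port; the test harness picks a free port via port_for.
    proc = Popen(['flask', 'run', '-p', '8998'], cwd='tests/testsite')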

tests/test_crawl_manager.py

Lines changed: 33 additions & 4 deletions

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import os
+import re
 from time import sleep
 import datetime
 
@@ -29,14 +30,15 @@ def setUp(self):
         self.crawler = MagicMock()
         self.spider = MetaSpider.from_crawler(self.crawler)
         self.crawler.spider = self.spider
-        self.crawl_manager = self._create_crawl_manager()
+        self.crawl_manager = self.create_crawl_manager()
         self.crawl_manager.crawler = self.crawler
         self.item = Item()
         self.response = Response('http://localhost')
         self.another_spider = MetaSpider.from_crawler(self.crawler)
 
-    def _create_crawl_manager(self):
-        crawl_manager = CrawlManager(self.spider.name, self.kwargs.copy())
+    def create_crawl_manager(self, kwargs=None):
+        kwargs = kwargs if kwargs else self.kwargs.copy()
+        crawl_manager = CrawlManager(self.spider.name, kwargs)
         crawl_manager.crawler = self.crawler
         return crawl_manager
 
@@ -136,6 +138,33 @@ def test_modify_realtime_request_is_not_callable(self):
                           self.crawl_manager.request, self.spider)
         self.assertNotEqual(self.request, self.crawl_manager.request)
 
+    def test_pass_wrong_spider_errback(self):
+        mng = self.create_crawl_manager(
+            {'url': 'http://localhost', 'errback': 'handle_error'}
+        )
+        try:
+            with pytest.raises(AttributeError) as err:
+                mng.spider_idle(self.spider)
+        except DontCloseSpider:
+            pass
+
+        assert mng.request.errback is None
+        msg = "AttributeError: 'MetaSpider' object has no attribute 'handle_error'"
+        assert re.search(msg, str(err))
+
+    def test_pass_good_spider_errback(self):
+        mng = self.create_crawl_manager(
+            {'url': 'http://localhost', 'errback': 'handle_error'}
+        )
+        self.crawler.spider.handle_error = lambda x: x
+        try:
+            mng.spider_idle(self.spider)
+        except DontCloseSpider:
+            pass
+
+        assert callable(mng.request.errback)
+        assert mng.request.errback('something') == 'something'
+
 
 class TestHandleScheduling(TestCrawlManager):
 
@@ -178,7 +207,7 @@ def test_string_number_timeout_value(self):
         _timeout = settings.TIMEOUT_LIMIT
         try:
             settings.TIMEOUT_LIMIT = '1'
-            self.crawl_manager = self._create_crawl_manager()
+            self.crawl_manager = self.create_crawl_manager()
             self._test_limit_runtime()
         finally:
             settings.TIMEOUT_LIMIT = _timeout

tests/test_resource_crawl.py

Lines changed: 22 additions & 0 deletions

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import json
+import os
 
 import pytest
 import re
@@ -378,3 +379,24 @@
             assert res_json[k] == v
         msg = "Invalid JSON in POST body"
         assert msg in res_json['message']
+
+    @pytest.mark.parametrize("method", [
+        perform_get, perform_post
+    ])
+    def test_passing_errback(self, server, method):
+        url = server.url("crawl.json")
+        res = method(url,
+                     {"spider_name": "test"},
+                     {"url": server.target_site.url("err/503"),
+                      'errback': 'some_errback'})
+
+        res_json = res.json()
+        assert res_json.get('stats').get('log_count/ERROR') == 1
+        assert res_json['status'] == 'ok'
+        logs_path = os.path.join(server.cwd, 'logs', 'test')
+        logs_files = os.listdir(logs_path)
+        with open(os.path.join(logs_path, logs_files[0])) as f:
+            log_file = f.read()
+
+        msg = 'ERROR: Logging some error'
+        assert re.search(msg, log_file)

tests/testsite/app.py

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+from flask import Flask, abort
+
+
+app = Flask(__name__)
+
+
+def read_file(filename):
+    with open(filename) as f:
+        return f.read()
+
+
+@app.route('/')
+def base():
+    return b'hello'
+
+
+@app.route('/index.html')
+def index():
+    return read_file('index.html')
+
+
+@app.route('/page1.html')
+def page1():
+    return read_file('page1.html')
+
+
+@app.route('/page2.html')
+def page2():
+    return read_file('page2.html')
+
+
+@app.route('/page3.html')
+def page3():
+    return read_file('page3.html')
+
+
+@app.route('/err/<int:code>')
+def return_code(code):
+    abort(code)
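
As a quick usage sketch (assumed port; the harness normally picks a free one via
port_for), the error route can be exercised directly once the app is running:

    # In one shell, start the mock site: cd tests/testsite && flask run -p 8998
    import requests

    # abort(503) in return_code() yields the status the errback test relies on.
    resp = requests.get('http://localhost:8998/err/503')
    assert resp.status_code == 503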
File renamed without changes.
