Skip to content

Commit f496cd3

Browse files
authored
Merge pull request #158 from scrapinghub/handling-errbacks
Handling errbacks
2 parents 652b835 + 3b5a394 commit f496cd3

File tree

6 files changed

+73
-16
lines changed

6 files changed

+73
-16
lines changed

docs/source/api.rst

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,20 @@ callback
9797
- optional
9898

9999
Must exist as method of scheduled spider, does not need to contain string "self".
100-
If not passed or not found on spider default callback `parse`_ will be used.
100+
If not passed, the default Scrapy callback `parse`_ will be used. If there is no spider method
101+
with the name specified by the callback argument, or the callback is not callable, the API will return a 400 HTTP error.
102+
103+
Example request with callback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&callback=parse_page``
101104

102105
errback
103106
- type: string
104107
- optional
105108

106109
Scrapy errback for request made from spider. It must exist as method of
107-
scheduled spider, otherwise exception will be raised. String does not need to contain 'self'.
110+
scheduled spider, otherwise the API will return a 400 HTTP error. String does not need to contain 'self'.
111+
Defaults to None, can be adjusted with `DEFAULT_ERRBACK_NAME`_ setting.
112+
113+
Example request with errback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&errback=my_errback``
108114

109115
max_requests
110116
- type: integer
@@ -517,6 +523,18 @@ Encoding that's used to encode log messages.
517523

518524
Default: ``utf-8``.
519525

526+
DEFAULT_ERRBACK_NAME
527+
~~~~~~~~~~~~~~~~~~~~
528+
529+
Default: ``None``
530+
531+
String with the name of the default errback_.
532+
533+
Use this setting to set default errback for scrapy spider requests made from ScrapyRT.
534+
Errback must exist as a method of the spider and must be callable, otherwise a 400 HTTP error will be raised.
535+
536+
.. _errback: https://docs.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing
537+
520538

521539
Spider settings
522540
---------------

scrapyrt/conf/default_settings.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,6 @@
3131
# disable in production
3232
DEBUG = True
3333

34-
TWISTED_REACTOR = None
34+
TWISTED_REACTOR = None
35+
36+
DEFAULT_ERRBACK_NAME = None

scrapyrt/core.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from copy import deepcopy
44
import datetime
55
import os
6+
import traceback
67

78
from scrapy import signals
89
from scrapy.crawler import CrawlerRunner, Crawler
@@ -109,6 +110,7 @@ def __init__(self, spider_name, request_kwargs,
109110
self.items = []
110111
self.items_dropped = []
111112
self.errors = []
113+
self.user_error = None
112114
self.max_requests = int(max_requests) if max_requests else None
113115
self.timeout_limit = int(app_settings.TIMEOUT_LIMIT)
114116
self.request_count = 0
@@ -120,7 +122,7 @@ def __init__(self, spider_name, request_kwargs,
120122
# because we need to know if spider has method available
121123
self.callback_name = request_kwargs.pop('callback', None) or 'parse'
122124
# do the same for errback
123-
self.errback_name = request_kwargs.pop('errback', None) or 'parse'
125+
self.errback_name = request_kwargs.pop('errback', None) or app_settings.DEFAULT_ERRBACK_NAME
124126

125127
if request_kwargs.get("url"):
126128
self.request = self.create_spider_request(deepcopy(request_kwargs))
@@ -171,17 +173,30 @@ def spider_idle(self, spider):
171173
172174
"""
173175
if spider is self.crawler.spider and self.request and not self._request_scheduled:
174-
callback = getattr(self.crawler.spider, self.callback_name)
175-
assert callable(callback), 'Invalid callback'
176-
self.request = self.request.replace(callback=callback)
176+
try:
177+
callback = getattr(self.crawler.spider, self.callback_name)
178+
assert callable(callback), 'Invalid callback'
179+
self.request = self.request.replace(callback=callback)
180+
except (AssertionError, AttributeError):
181+
msg = f"Invalid spider callback {self.callback_name}, callback not callable or not a method of a spider {self.spider_name}"
182+
self.user_error = Error(400, message=msg)
183+
try:
184+
if self.errback_name:
185+
errback = getattr(self.crawler.spider, self.errback_name)
186+
assert callable(errback), 'Invalid errback'
187+
self.request = self.request.replace(errback=errback)
188+
except (AssertionError, AttributeError):
189+
msg = f"Invalid spider errback {self.errback_name}, errback not callable or not a method of a spider {self.spider_name}"
190+
self.user_error = Error(400, message=msg)
191+
if self.user_error:
192+
log.msg(self.user_error.message, level=log.ERROR)
193+
return
177194

178-
errback = getattr(self.crawler.spider, self.errback_name)
179-
assert callable(errback), 'Invalid errback'
180-
self.request = self.request.replace(errback=errback)
181195
modify_request = getattr(
182196
self.crawler.spider, "modify_realtime_request", None)
183197
if callable(modify_request):
184198
self.request = modify_request(self.request)
199+
185200
spider.crawler.engine.crawl(self.request)
186201
self._request_scheduled = True
187202
raise DontCloseSpider
@@ -238,6 +253,9 @@ def return_items(self, result):
238253
"stats": stats,
239254
"spider_name": self.spider_name,
240255
}
256+
257+
results["user_error"] = self.user_error
258+
241259
if self.debug:
242260
results["errors"] = self.errors
243261
return results

scrapyrt/resources.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,9 @@ def run_crawl(self, spider_name, scrapy_request_args,
261261

262262
def prepare_response(self, result, *args, **kwargs):
263263
items = result.get("items")
264+
user_error = result.get("user_error", None)
265+
if user_error:
266+
raise user_error
264267
response = {
265268
"status": "ok",
266269
"items": items,

tests/test_crawl_manager.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,10 @@ def test_spider_opened(self):
111111

112112
def test_raise_error_if_not_callable(self):
113113
self.spider.parse_something = None
114-
self.assertRaises(
115-
AssertionError, self.crawl_manager.spider_idle, self.spider)
114+
self._call_spider_idle()
115+
self.assertIsNotNone(self.crawl_manager.user_error)
116+
msg = "Invalid spider callback parse_something"
117+
assert re.search(msg, self.crawl_manager.user_error.message)
116118
self.assertFalse(self.crawler.engine.crawl.called)
117119

118120
def test_modify_realtime_request(self):
@@ -142,15 +144,17 @@ def test_pass_wrong_spider_errback(self):
142144
mng = self.create_crawl_manager(
143145
{'url': 'http://localhost', 'errback': 'handle_error'}
144146
)
147+
145148
try:
146-
with pytest.raises(AttributeError) as err:
147-
mng.spider_idle(self.spider)
149+
mng.spider_idle(self.spider)
148150
except DontCloseSpider:
149151
pass
150152

151153
assert mng.request.errback is None
152-
msg = "has no attribute 'handle_error'"
153-
assert re.search(msg, str(err))
154+
155+
self.assertIsNotNone(mng.user_error)
156+
msg = "Invalid spider errback"
157+
assert re.search(msg, mng.user_error.message)
154158

155159
def test_pass_good_spider_errback(self):
156160
mng = self.create_crawl_manager(
@@ -330,6 +334,7 @@ def setUp(self):
330334
'items_dropped': self.crawl_manager.items_dropped,
331335
'stats': self.stats.copy(),
332336
'spider_name': self.spider.name,
337+
'user_error': None,
333338
}
334339

335340
def test_return_items(self):

tests/test_resource_crawl.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,17 @@ def test_prepare_response(self, resource):
142142
for key, value in expected:
143143
assert prepared_res[key] == value
144144

145+
def test_prepare_response_user_error_raised(self, resource):
146+
result = {
147+
'items': [1, 2],
148+
'stats': [99],
149+
'spider_name': 'test'
150+
}
151+
result['user_error'] = Exception("my exception")
152+
with pytest.raises(Exception) as e_info:
153+
resource.prepare_response(result)
154+
assert str(e_info.value) == "my exception"
155+
145156

146157
class TestCrawlResourceGetRequiredArgument(unittest.TestCase):
147158

0 commit comments

Comments
 (0)