Skip to content

Commit 31cb156

Browse files
committed
Add user_error support
Currently the application does not report an error to the user when the user provides an invalid errback or callback method. The scheduling of the request and the validation of the spider callback and errback happen in a different thread from the one handling the API request, so we need a mechanism other than simply raising the exception to communicate with the API request thread. We already do this for other errors and responses by adding properties to the CrawlManager object, so it seems best to communicate this exception to the API request in the same way, through a user_error property on the CrawlManager. The exception can then be raised in the context of the API request.
1 parent 40aa643 commit 31cb156

File tree

4 files changed

+49
-21
lines changed

4 files changed

+49
-21
lines changed

scrapyrt/core.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from copy import deepcopy
44
import datetime
55
import os
6+
import traceback
67

78
from scrapy import signals
89
from scrapy.crawler import CrawlerRunner, Crawler
@@ -109,6 +110,7 @@ def __init__(self, spider_name, request_kwargs,
109110
self.items = []
110111
self.items_dropped = []
111112
self.errors = []
113+
self.user_error = None
112114
self.max_requests = int(max_requests) if max_requests else None
113115
self.timeout_limit = int(app_settings.TIMEOUT_LIMIT)
114116
self.request_count = 0
@@ -171,22 +173,26 @@ def spider_idle(self, spider):
171173
172174
"""
173175
if spider is self.crawler.spider and self.request and not self._request_scheduled:
174-
callback = getattr(self.crawler.spider, self.callback_name)
175-
assert callable(callback), 'Invalid callback'
176-
self.request = self.request.replace(callback=callback)
177-
178-
179-
if self.errback_name:
180-
errback = getattr(self.crawler.spider, self.errback_name)
181-
assert callable(errback), 'Invalid errback'
182-
self.request = self.request.replace(errback=errback)
183-
modify_request = getattr(
184-
self.crawler.spider, "modify_realtime_request", None)
185-
if callable(modify_request):
186-
self.request = modify_request(self.request)
187-
spider.crawler.engine.crawl(self.request)
188-
self._request_scheduled = True
189-
raise DontCloseSpider
176+
try:
177+
callback = getattr(self.crawler.spider, self.callback_name)
178+
assert callable(callback), 'Invalid callback'
179+
self.request = self.request.replace(callback=callback)
180+
181+
182+
if self.errback_name:
183+
errback = getattr(self.crawler.spider, self.errback_name)
184+
assert callable(errback), 'Invalid errback'
185+
self.request = self.request.replace(errback=errback)
186+
modify_request = getattr(
187+
self.crawler.spider, "modify_realtime_request", None)
188+
if callable(modify_request):
189+
self.request = modify_request(self.request)
190+
spider.crawler.engine.crawl(self.request)
191+
self._request_scheduled = True
192+
except Exception as e:
193+
self.user_error = Error(400, message=traceback.format_exc())
194+
else:
195+
raise DontCloseSpider
190196

191197
def handle_scheduling(self, request, spider):
192198
"""Handler of request_scheduled signal.
@@ -240,6 +246,9 @@ def return_items(self, result):
240246
"stats": stats,
241247
"spider_name": self.spider_name,
242248
}
249+
250+
results["user_error"] = self.user_error
251+
243252
if self.debug:
244253
results["errors"] = self.errors
245254
return results

scrapyrt/resources.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,9 @@ def run_crawl(self, spider_name, scrapy_request_args,
261261

262262
def prepare_response(self, result, *args, **kwargs):
263263
items = result.get("items")
264+
user_error = result.get("user_error", None)
265+
if user_error:
266+
raise user_error
264267
response = {
265268
"status": "ok",
266269
"items": items,

tests/test_crawl_manager.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,10 @@ def test_spider_opened(self):
111111

112112
def test_raise_error_if_not_callable(self):
113113
self.spider.parse_something = None
114-
self.assertRaises(
115-
AssertionError, self.crawl_manager.spider_idle, self.spider)
114+
self._call_spider_idle()
115+
self.assertIsNotNone(self.crawl_manager.user_error)
116+
msg = "Invalid callback"
117+
assert re.search(msg, self.crawl_manager.user_error.message)
116118
self.assertFalse(self.crawler.engine.crawl.called)
117119

118120
def test_modify_realtime_request(self):
@@ -142,15 +144,17 @@ def test_pass_wrong_spider_errback(self):
142144
mng = self.create_crawl_manager(
143145
{'url': 'http://localhost', 'errback': 'handle_error'}
144146
)
147+
145148
try:
146-
with pytest.raises(AttributeError) as err:
147-
mng.spider_idle(self.spider)
149+
mng.spider_idle(self.spider)
148150
except DontCloseSpider:
149151
pass
150152

151153
assert mng.request.errback is None
154+
155+
self.assertIsNotNone(mng.user_error)
152156
msg = "has no attribute 'handle_error'"
153-
assert re.search(msg, str(err))
157+
assert re.search(msg, mng.user_error.message)
154158

155159
def test_pass_good_spider_errback(self):
156160
mng = self.create_crawl_manager(
@@ -330,6 +334,7 @@ def setUp(self):
330334
'items_dropped': self.crawl_manager.items_dropped,
331335
'stats': self.stats.copy(),
332336
'spider_name': self.spider.name,
337+
'user_error': None,
333338
}
334339

335340
def test_return_items(self):

tests/test_resource_crawl.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,17 @@ def test_prepare_response(self, resource):
142142
for key, value in expected:
143143
assert prepared_res[key] == value
144144

145+
def test_prepare_response_user_error_raised(self, resource):
146+
result = {
147+
'items': [1, 2],
148+
'stats': [99],
149+
'spider_name': 'test'
150+
}
151+
result['user_error'] = Exception("my exception")
152+
with pytest.raises(Exception) as e_info:
153+
resource.prepare_response(result)
154+
assert e_info.message == "my exception"
155+
145156

146157
class TestCrawlResourceGetRequiredArgument(unittest.TestCase):
147158

0 commit comments

Comments
 (0)