
Commit aaced79

Merge pull request #38 from scrapinghub/start_requests
Start requests
2 parents e84db05 + ebec1ae commit aaced79

File tree: 14 files changed, 387 additions and 119 deletions


docs/source/api.rst

Lines changed: 23 additions & 2 deletions
@@ -57,11 +57,19 @@ spider_name

 url
   - type: string
-  - required
+  - required if start_requests not enabled

   Absolute URL to send request to. URL should be urlencoded so that
   querystring from url will not interfere with api parameters.

+  By default API will crawl this url and won't execute any other requests.
+  Most importantly it will not execute ``start_requests`` and spider will
+  not visit urls defined in ``start_urls`` spider attribute. There will be
+  only one single request scheduled in API - request for resource identified
+  by url argument.
+
+  If you want to execute request pass start_requests argument.
+
 callback
   - type: string
   - optional
@@ -73,13 +81,26 @@ max_requests
   - type: integer
   - optional

-  Maximal amount of requests spider can generate. E.g. if it is set to ``1``
+  Maximum amount of requests spider can generate. E.g. if it is set to ``1``
   spider will only schedule one single request, other requests generated
   by spider (for example in callback, following links in first response)
   will be ignored. If your spider generates many requests in callback
   and you don't want to wait forever for it to finish
   you should probably pass it.

+start_requests
+  - type: boolean
+  - optional
+
+  Whether spider should execute ``Scrapy.Spider.start_requests`` method.
+  ``start_requests`` are executed by default when you run Scrapy Spider
+  normally without ScrapyRT, but this method is NOT executed in API by
+  default. By default we assume that spider is expected to crawl ONLY url
+  provided in parameters without making any requests to ``start_urls``
+  defined in ``Spider`` class. start_requests argument overrides this
+  behavior. If this argument is present API will execute start_requests
+  Spider method.
+
 If required parameters are missing api will return 400 Bad Request
 with hopefully helpful error message.
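For context, a minimal client-side sketch of the two modes documented above. It assumes a ScrapyRT instance on the default http://localhost:9080/crawl.json endpoint and a spider registered as foo; both are placeholders and not part of this commit:

    import requests

    # Single-URL mode: only the given url is crawled,
    # Spider.start_requests is never executed.
    requests.get("http://localhost:9080/crawl.json",
                 params={"spider_name": "foo",
                         "url": "http://example.com/page.html"})

    # start_requests mode: url may be omitted, the spider's own
    # start_requests()/start_urls drive the crawl.
    requests.get("http://localhost:9080/crawl.json",
                 params={"spider_name": "foo", "start_requests": "true"})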

requirements-dev.txt

Lines changed: 5 additions & 5 deletions
@@ -1,8 +1,8 @@
 -r requirements.txt

 fabric
-requests
-mock
-pytest
-pytest-cov
-port-for
+requests==2.9.1
+mock==1.3.0
+pytest==2.9.1
+pytest-cov==2.2.1
+port-for==0.3.1

scrapyrt/core.py

Lines changed: 8 additions & 13 deletions
@@ -130,7 +130,7 @@ class CrawlManager(object):
     Runs crawls
     """

-    def __init__(self, spider_name, request_kwargs, max_requests=None):
+    def __init__(self, spider_name, request_kwargs, max_requests=None, start_requests=False):
         self.spider_name = spider_name
         self.log_dir = settings.LOG_DIR
         self.items = []
@@ -145,8 +145,11 @@ def __init__(self, spider_name, request_kwargs, max_requests=None):
         # callback will be added after instantiation of crawler object
         # because we need to know if spider has method available
         self.callback_name = request_kwargs.pop('callback', None) or 'parse'
-        self.request = self.create_spider_request(deepcopy(request_kwargs))
-        self.start_requests = False
+        if request_kwargs.get("url"):
+            self.request = self.create_spider_request(deepcopy(request_kwargs))
+        else:
+            self.request = None
+        self.start_requests = start_requests
         self._request_scheduled = False

     def crawl(self, *args, **kwargs):
@@ -190,7 +193,7 @@ def spider_idle(self, spider):
         which is totally wrong.

         """
-        if spider is self.crawler.spider and not self._request_scheduled:
+        if spider is self.crawler.spider and self.request and not self._request_scheduled:
             callback = getattr(self.crawler.spider, self.callback_name)
             assert callable(callback), 'Invalid callback'
             self.request = self.request.replace(callback=callback)
@@ -264,15 +267,7 @@ def create_spider_request(self, kwargs):
         try:
             req = Request(url, **kwargs)
         except (TypeError, ValueError) as e:
-            # Bad arguments for scrapy Request
-            # we don't want to schedule spider if someone
-            # passes meaingless arguments to Request.
-            # We must raise this here so that this will be returned to client,
-            # Otherwise if this is raised in spider_idle it goes to
-            # spider logs where it does not really belong.
-            # It is needed because in POST handler we can pass
-            # all possible requests kwargs, so it is easy to make mistakes.
-            message = "Error while creating Request, {}".format(e.message)
+            message = "Error while creating Scrapy Request, {}".format(e.message)
             raise Error('400', message=message)

         req.dont_filter = True
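In short, CrawlManager now only builds a seed Request when a url was supplied, and spider_idle() skips scheduling when self.request is None. A rough sketch of the two resulting call patterns, with a placeholder spider name, assuming it runs inside a project ScrapyRT can load (run_crawl() in resources.py normally constructs the manager for you):

    from scrapyrt.core import CrawlManager

    # url given, start_requests left at its default: a single seed Request
    # is created and scheduled from spider_idle, as before this change.
    single = CrawlManager("foo", {"url": "http://example.com/page.html"})

    # no url, start_requests enabled: self.request stays None and the
    # spider's own start_requests()/start_urls drive the crawl.
    seeded = CrawlManager("foo", {}, start_requests=True)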

scrapyrt/resources.py

Lines changed: 48 additions & 39 deletions
@@ -1,14 +1,15 @@
 # -*- coding: utf-8 -*-
+import demjson
 from scrapy.utils.misc import load_object
 from scrapy.utils.serialize import ScrapyJSONEncoder
 from twisted.internet.defer import Deferred
 from twisted.python.failure import Failure
-from twisted.web import server, resource
-from twisted.web.error import UnsupportedMethod, Error
-import demjson
+from twisted.web import resource, server
+from twisted.web.error import Error, UnsupportedMethod

 from . import log
 from .conf import settings
+from .utils import extract_scrapy_request_args


 class ServiceResource(resource.Resource, object):
@@ -110,24 +111,14 @@ def render_GET(self, request, **kwargs):
         At the moment kwargs for scrapy request are not supported in GET.
         They are supported in POST handler.
         """
-        request_data = dict(
+        api_params = dict(
             (name.decode('utf-8'), value[0].decode('utf-8'))
             for name, value in request.args.items()
         )
-
-        spider_data = {
-            'url': self.get_required_argument(request_data, 'url'),
-            # TODO get optional Request arguments here
-            # distinguish between proper Request args and
-            # api parameters
-        }
-        try:
-            callback = request_data['callback']
-        except KeyError:
-            pass
-        else:
-            spider_data['callback'] = callback
-        return self.prepare_crawl(request_data, spider_data, **kwargs)
+        scrapy_request_args = extract_scrapy_request_args(api_params,
+                                                          raise_error=False)
+        self.validate_options(scrapy_request_args, api_params)
+        return self.prepare_crawl(api_params, scrapy_request_args, **kwargs)

     def render_POST(self, request, **kwargs):
         """
@@ -147,66 +138,85 @@ def render_POST(self, request, **kwargs):
         """
         request_body = request.content.getvalue()
         try:
-            request_data = demjson.decode(request_body)
+            api_params = demjson.decode(request_body)
         except ValueError as e:
             message = "Invalid JSON in POST body. {}"
             message.format(e.pretty_description())
             raise Error('400', message=message)

-        log.msg("{}".format(request_data))
-        spider_data = self.get_required_argument(request_data, "request")
-        error_msg = "Missing required key 'url' in 'request' object"
-        self.get_required_argument(spider_data, "url", error_msg=error_msg)
+        log.msg("{}".format(api_params))
+        if api_params.get("start_requests"):
+            # start requests passed so 'request' argument is optional
+            _request = api_params.get("request", {})
+        else:
+            # no start_requests, 'request' is required
+            _request = self.get_required_argument(api_params, "request")
+        try:
+            scrapy_request_args = extract_scrapy_request_args(
+                _request, raise_error=True
+            )
+        except ValueError as e:
+            raise Error(400, e.message)
+
+        self.validate_options(scrapy_request_args, api_params)
+        return self.prepare_crawl(api_params, scrapy_request_args, **kwargs)

-        return self.prepare_crawl(request_data, spider_data, **kwargs)
+    def validate_options(self, scrapy_request_args, api_params):
+        url = scrapy_request_args.get("url")
+        start_requests = api_params.get("start_requests")
+        if not url and not start_requests:
+            raise Error(400,
+                        "'url' is required if start_requests are disabled")

-    def get_required_argument(self, request_data, name, error_msg=None):
+    def get_required_argument(self, api_params, name, error_msg=None):
         """Get required API key from dict-like object.

-        :param dict request_data:
+        :param dict api_params:
             dictionary with names and values of parameters supplied to API.
         :param str name:
-            required key that must be found in request_data
+            required key that must be found in api_params
         :return: value of required param
         :raises Error: Bad Request response

         """
         if error_msg is None:
             error_msg = 'Missing required parameter: {}'.format(repr(name))
         try:
-            value = request_data[name]
+            value = api_params[name]
         except KeyError:
             raise Error('400', message=error_msg)
         if not value:
             raise Error('400', message=error_msg)
         return value

-    def prepare_crawl(self, request_data, spider_data, *args, **kwargs):
+    def prepare_crawl(self, api_params, scrapy_request_args, *args, **kwargs):
         """Schedule given spider with CrawlManager.

-        :param dict request_data:
+        :param dict api_params:
             arguments needed to find spider and set proper api parameters
             for crawl (max_requests for example)

-        :param dict spider_data:
+        :param dict scrapy_request_args:
             should contain positional and keyword arguments for Scrapy
             Request object that will be created
         """
-        spider_name = self.get_required_argument(request_data, 'spider_name')
+        spider_name = self.get_required_argument(api_params, 'spider_name')
+        start_requests = api_params.get("start_requests", False)
         try:
-            max_requests = request_data['max_requests']
+            max_requests = api_params['max_requests']
         except (KeyError, IndexError):
             max_requests = None
         dfd = self.run_crawl(
-            spider_name, spider_data, max_requests, *args, **kwargs)
+            spider_name, scrapy_request_args, max_requests,
+            start_requests=start_requests, *args, **kwargs)
         dfd.addCallback(
-            self.prepare_response, request_data=request_data, *args, **kwargs)
+            self.prepare_response, request_data=api_params, *args, **kwargs)
         return dfd

-    def run_crawl(self, spider_name, spider_data,
-                  max_requests=None, *args, **kwargs):
+    def run_crawl(self, spider_name, scrapy_request_args,
+                  max_requests=None, start_requests=False, *args, **kwargs):
         crawl_manager_cls = load_object(settings.CRAWL_MANAGER)
-        manager = crawl_manager_cls(spider_name, spider_data, max_requests)
+        manager = crawl_manager_cls(spider_name, scrapy_request_args, max_requests, start_requests=start_requests)
         dfd = manager.crawl(*args, **kwargs)
         return dfd

@@ -223,4 +233,3 @@ def prepare_response(self, result, *args, **kwargs):
         if errors:
             response["errors"] = errors
         return response
-
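A rough illustration of what the reworked POST handler accepts, again assuming the default http://localhost:9080/crawl.json endpoint and placeholder spider and callback names (not part of this commit):

    import requests

    # Without start_requests the 'request' object with a 'url' is still required.
    requests.post("http://localhost:9080/crawl.json", json={
        "spider_name": "foo",
        "request": {"url": "http://example.com/page.html",
                    "callback": "parse_item"},
    })

    # With start_requests enabled the 'request' object becomes optional;
    # validate_options() only rejects calls that have neither a url nor
    # start_requests.
    requests.post("http://localhost:9080/crawl.json", json={
        "spider_name": "foo",
        "start_requests": True,
        "max_requests": 10,
    })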

scrapyrt/utils.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import inspect
+from scrapy import Request
+
+
+def extract_scrapy_request_args(dictionary, raise_error=False):
+    """
+    :param dictionary: Dictionary with parameters passed to API
+    :param raise_error: raise ValueError if key is not valid arg for
+        scrapy.http.Request
+    :return: dictionary of valid scrapy.http.Request positional and keyword
+        arguments.
+    """
+    result = dictionary.copy()
+    args = inspect.getargspec(Request.__init__).args
+    for key in dictionary.keys():
+        if key not in args:
+            result.pop(key)
+            if raise_error:
+                msg = u"{!r} is not a valid argument for scrapy.Request.__init__"
+                raise ValueError(msg.format(key))
+    return result
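A small usage sketch of the new helper (parameter values are made up): it keeps only keys that scrapy.Request.__init__ accepts, and with raise_error=True it raises on an unknown key instead of silently dropping it:

    from scrapyrt.utils import extract_scrapy_request_args

    params = {
        "url": "http://example.com/page.html",
        "callback": "parse_item",    # valid Request argument
        "spider_name": "foo",        # API parameter, not a Request argument
        "start_requests": True,      # API parameter, not a Request argument
    }

    extract_scrapy_request_args(params)
    # -> {'url': 'http://example.com/page.html', 'callback': 'parse_item'}

    extract_scrapy_request_args(params, raise_error=True)
    # -> ValueError for the first API-only key it encounters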

tests/sample_data/testproject/testproject/items.py

Lines changed: 2 additions & 0 deletions
@@ -4,3 +4,5 @@

 class TestprojectItem(scrapy.Item):
     name = scrapy.Field()
+    referer = scrapy.Field()
+    response = scrapy.Field()

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+import scrapy
+
+from ..items import TestprojectItem
+
+
+class TestSpider(scrapy.Spider):
+
+    name = 'test_with_sr'
+    initial_urls = ["{0}", "{1}"]
+
+    def start_requests(self):
+        for url in self.initial_urls:
+            yield scrapy.Request(url, callback=self.some_callback, meta=dict(referer=url))
+
+    def some_callback(self, response):
+        name = response.xpath('//h1/text()').extract()
+        return TestprojectItem(name=name, referer=response.meta["referer"])
+
+    def parse(self, response):
+        name = response.xpath("//h1/text()").extract()
+        return TestprojectItem(name=name, referer=response.meta.get("referer"),
+                               response=response.url)
Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
 <html>

 <head>
-    <title>Page 1</title>
+    <title>Page 2</title>
 </head>

 <body>

-    <h1>Page 1</h1>
+    <h1>Page 2</h1>

 </body>
 </html>
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+<html>
+
+<head>
+    <title>Page 3</title>
+</head>
+
+<body>
+
+    <h1>Page 3</h1>
+
+</body>
+</html>

tests/servers.py

Lines changed: 12 additions & 1 deletion
@@ -103,17 +103,28 @@ def _non_block_read(output):

 class ScrapyrtTestServer(BaseTestServer):

-    def __init__(self, *args, **kwargs):
+    def __init__(self, site=None, *args, **kwargs):
         super(ScrapyrtTestServer, self).__init__(*args, **kwargs)
         self.arguments = [
             sys.executable, '-m', 'scrapyrt.cmdline', '-p', str(self.port)
         ]
         self.stderr = PIPE
         self.tmp_dir = tempfile.mkdtemp()
         self.cwd = os.path.join(self.tmp_dir, 'testproject')
+
         source = os.path.join(SAMPLE_DATA, 'testproject')
         shutil.copytree(
             source, self.cwd, ignore=shutil.ignore_patterns('*.pyc'))
+        # Pass site url to spider doing start requests
+        spider_name = "testspider_startrequests.py"
+        spider_filename = os.path.join(self.cwd, "testproject", "spider_templates", spider_name)
+        spider_target_place = os.path.join(self.cwd, "testproject", "spiders", spider_name)
+        if not site:
+            return
+        with open(spider_filename) as spider_file:
+            spider_string = spider_file.read().format(site.url("page1.html"), site.url("page2.html"))
+        with open(spider_target_place, "wb") as file_target:
+            file_target.write(spider_string)

     def stop(self):
         super(ScrapyrtTestServer, self).stop()
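The copied spider file is a template: its initial_urls placeholders are filled with the test site's pages before the file lands in the spiders package. A minimal standalone sketch of that substitution (host and port are made up for illustration):

    # What ScrapyrtTestServer does with the template, reduced to one line.
    template_line = 'initial_urls = ["{0}", "{1}"]'
    rendered = template_line.format("http://localhost:8888/page1.html",
                                    "http://localhost:8888/page2.html")
    print(rendered)
    # initial_urls = ["http://localhost:8888/page1.html", "http://localhost:8888/page2.html"]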
