Skip to content

Commit 9e94fc9

Browse files
authored
Merge pull request #229 from scrapinghub/batched-create-requests
batch oriented request creation in crawling strategy
2 parents 9952f0f + 79b20cc commit 9e94fc9

File tree

3 files changed

+74
-6
lines changed

3 files changed

+74
-6
lines changed

frontera/worker/strategies/__init__.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,9 @@ def schedule(self, request, score=1.0, dont_queue=False):
9898

9999
def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''):
100100
"""
101-
Creates request with specified fields, with state fetched from backend.
101+
Creates a request with the specified fields. This method only creates the request; it does not
102+
fetch its state from storage. Use self.refresh_states on a batch of requests to get their states
103+
from storage.
102104
103105
:param url: str
104106
:param method: str
@@ -110,5 +112,12 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No
110112
"""
111113
r = Request(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body)
112114
self.url_mw._add_fingerprint(r)
113-
self._states_context.refresh_and_keep(r)
114115
return r
116+
117+
def refresh_states(self, requests):
118+
"""
119+
Retrieves states for all requests from storage.
120+
121+
:param requests: list(:class:`Request <frontera.core.models.Request>`)
122+
"""
123+
self._states_context.refresh_and_keep(requests)

frontera/worker/strategy.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,11 @@ def fetch(self):
6363
self._states.fetch(self._fingerprints)
6464
self._fingerprints.clear()
6565

66-
def refresh_and_keep(self, request):
67-
self._states.fetch([request.meta[b'fingerprint']])
68-
self._states.set_states(request)
69-
self._requests.append(request)
66+
def refresh_and_keep(self, requests):
67+
self.to_fetch(requests)
68+
self.fetch()
69+
self._states.set_states(requests)
70+
self._requests.extend(requests)
7071

7172
def release(self):
7273
self._states.update_cache(self._requests)

tests/test_strategy.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# -*- coding: utf-8 -*-
2+
from frontera.worker.strategies import BaseCrawlingStrategy
3+
from frontera.worker.strategy import StatesContext
4+
from frontera.settings import Settings
5+
from tests.mocks.frontier_manager import FakeFrontierManager
6+
7+
from frontera.contrib.backends.memory import MemoryStates
8+
from frontera.core.components import States
9+
10+
11+
class TestingCrawlingStrategy(BaseCrawlingStrategy):
12+
def add_seeds(self, seeds):
13+
pass
14+
15+
def page_crawled(self, response):
16+
pass
17+
18+
def page_error(self, request, error):
19+
pass
20+
21+
def links_extracted(self, request, links):
22+
pass
23+
24+
25+
class MessageBusStream(object):
26+
def send(self, request, score=1.0, dont_queue=False):
27+
pass
28+
29+
def flush(self):
30+
pass
31+
32+
33+
class TestCrawlingStrategy(object):
34+
def strategy(self):
35+
settings = Settings()
36+
manager = FakeFrontierManager(settings)
37+
stream = MessageBusStream()
38+
states = MemoryStates(10)
39+
states_ctx = StatesContext(states)
40+
return TestingCrawlingStrategy.from_worker(manager, stream, states_ctx)
41+
42+
def test_create_request(self):
43+
s = self.strategy()
44+
req = s.create_request("http://test.com/someurl")
45+
assert req.meta[b'fingerprint'] == b'955ac04f1b1a96de60a5139ad90c80be87822159'
46+
47+
def test_states_refresh(self):
48+
s = self.strategy()
49+
states = s._states_context._states
50+
url = "http://test.com/someurl"
51+
req1 = s.create_request(url)
52+
req1.meta[b'state'] = States.CRAWLED
53+
states.update_cache(req1)
54+
55+
req2 = s.create_request(url)
56+
s.refresh_states([req2])
57+
assert req2.meta[b'state'] == req1.meta[b'state']
58+
assert req2.meta[b'state'] == States.CRAWLED

0 commit comments

Comments
 (0)