Commit 1cae2bc
feat: Add RQ id, name, alias args to add_requests and enqueue_links methods (#1413)
### Description

- Add support for specifying the destination `RequestQueue` (by `rq_id`, `rq_name`, or `rq_alias`) for the `add_requests` and `enqueue_links` methods.

### Issues

- Closes: #1402
Parent: 6f1752e
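The new arguments let a request handler route requests to a specific `RequestQueue` instead of the crawler's default `RequestManager`. A minimal usage sketch of the `add_requests` side (the queue name and URLs are illustrative, not part of the commit):

import asyncio

from crawlee import Request
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Send these requests to the named queue instead of the crawler's
        # default queue. Only one of rq_id, rq_name or rq_alias may be
        # passed per call; combining them raises a ValueError.
        await context.add_requests(
            [Request.from_url('https://example.com/detail')],
            rq_name='my-queue',  # illustrative queue name
        )

    await crawler.run(['https://example.com'])


asyncio.run(main())

As the tests below verify, requests routed to a non-default queue are stored there rather than crawled in the current run.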

File tree

6 files changed: +461 -4 lines changed
src/crawlee/_types.py

Lines changed: 48 additions & 2 deletions
@@ -180,6 +180,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
     requests: Sequence[str | Request]
     """Requests to be added to the `RequestManager`."""
 
+    rq_id: str | None
+    """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
+
+    rq_name: str | None
+    """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
+    rq_alias: str | None
+    """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
 
 class PushDataKwargs(TypedDict):
     """Keyword arguments for dataset's `push_data` method."""

@@ -261,10 +272,18 @@ def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:
     async def add_requests(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         """Track a call to the `add_requests` context helper."""
-        self.add_requests_calls.append(AddRequestsKwargs(requests=requests, **kwargs))
+        specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+        if specified_params > 1:
+            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
+        self.add_requests_calls.append(
+            AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
+        )
 
     async def push_data(
         self,

@@ -311,12 +330,21 @@ class AddRequestsFunction(Protocol):
     def __call__(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
 
         Args:
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """

@@ -344,12 +372,21 @@ def __call__(
         label: str | None = None,
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
 
     @overload
     def __call__(
-        self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
+        self,
+        *,
+        requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
 
     def __call__(

@@ -360,6 +397,9 @@ def __call__(
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call enqueue links function.

@@ -377,6 +417,12 @@ def __call__(
                 - `'skip'` to exclude the request from being enqueued,
                 - `'unchanged'` to use the original request options without modification.
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """
src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 28 additions & 2 deletions
@@ -944,6 +944,9 @@ async def enqueue_links(
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
         | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         kwargs.setdefault('strategy', 'same-hostname')

@@ -955,7 +958,9 @@ async def enqueue_links(
                     '`transform_request_function` arguments when `requests` is provided.'
                 )
             # Add directly passed requests.
-            await context.add_requests(requests or list[str | Request](), **kwargs)
+            await context.add_requests(
+                requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+            )
         else:
             # Add requests from extracted links.
             await context.add_requests(

@@ -965,6 +970,9 @@ async def enqueue_links(
                     user_data=user_data,
                     transform_request_function=transform_request_function,
                 ),
+                rq_id=rq_id,
+                rq_name=rq_name,
+                rq_alias=rq_alias,
                 **kwargs,
             )

@@ -1241,10 +1249,28 @@ async def _commit_request_handler_result(self, context: BasicCrawlingContext) ->
         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
         result = self._context_result_map[context]
 
-        request_manager = await self.get_request_manager()
+        base_request_manager = await self.get_request_manager()
+
         origin = context.request.loaded_url or context.request.url
 
         for add_requests_call in result.add_requests_calls:
+            rq_id = add_requests_call.get('rq_id')
+            rq_name = add_requests_call.get('rq_name')
+            rq_alias = add_requests_call.get('rq_alias')
+            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+            if specified_params > 1:
+                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
+            if rq_id or rq_name or rq_alias:
+                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
+                    id=rq_id,
+                    name=rq_name,
+                    alias=rq_alias,
+                    storage_client=self._service_locator.get_storage_client(),
+                    configuration=self._service_locator.get_configuration(),
+                )
+            else:
+                request_manager = base_request_manager
+
             requests = list[Request]()
 
             base_url = url if (url := add_requests_call.get('base_url')) else origin
tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 68 additions & 0 deletions
@@ -1549,3 +1549,71 @@ def listener(event_data: EventCrawlerStatusData) -> None:
     event_manager.off(event=Event.CRAWLER_STATUS, listener=listener)
 
     assert status_message_listener.called
+
+
+@pytest.mark.parametrize(
+    ('queue_name', 'queue_alias', 'by_id'),
+    [
+        pytest.param('named-queue', None, False, id='with rq_name'),
+        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
+        pytest.param('id-queue', None, True, id='with rq_id'),
+    ],
+)
+async def test_add_requests_with_rq_param(queue_name: str | None, queue_alias: str | None, *, by_id: bool) -> None:
+    crawler = BasicCrawler()
+    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)
+    if by_id:
+        queue_id = rq.id
+        queue_name = None
+    else:
+        queue_id = None
+    visit_urls = set()
+
+    check_requests = [
+        Request.from_url('https://a.placeholder.com'),
+        Request.from_url('https://b.placeholder.com'),
+        Request.from_url('https://c.placeholder.com'),
+    ]
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        visit_urls.add(context.request.url)
+        await context.add_requests(check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)
+
+    await crawler.run(['https://start.placeholder.com'])
+
+    requests_from_queue = []
+    while request := await rq.fetch_next_request():
+        requests_from_queue.append(request)
+
+    assert requests_from_queue == check_requests
+    assert visit_urls == {'https://start.placeholder.com'}
+
+    await rq.drop()
+
+
+@pytest.mark.parametrize(
+    ('queue_name', 'queue_alias', 'queue_id'),
+    [
+        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),
+        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),
+        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),
+        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),
+    ],
+)
+async def test_add_requests_error_with_multi_params(
+    queue_id: str | None, queue_name: str | None, queue_alias: str | None
+) -> None:
+    crawler = BasicCrawler()
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        with pytest.raises(ValueError, match='Only one of `rq_id`, `rq_name` or `rq_alias` can be set'):
+            await context.add_requests(
+                [Request.from_url('https://a.placeholder.com')],
+                rq_id=queue_id,
+                rq_name=queue_name,
+                rq_alias=queue_alias,
+            )
+
+    await crawler.run(['https://start.placeholder.com'])

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 107 additions & 0 deletions
@@ -3,8 +3,11 @@
 from typing import TYPE_CHECKING
 from unittest import mock
 
+import pytest
+
 from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
 from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.storages import RequestQueue
 
 if TYPE_CHECKING:
     from yarl import URL

@@ -198,3 +201,107 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
 
     assert len(extracted_links) == 1
     assert extracted_links[0] == str(server_url / 'page_1')
+
+
+@pytest.mark.parametrize(
+    ('queue_name', 'queue_alias', 'by_id'),
+    [
+        pytest.param('named-queue', None, False, id='with rq_name'),
+        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
+        pytest.param('id-queue', None, True, id='with rq_id'),
+    ],
+)
+async def test_enqueue_links_with_rq_param(
+    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool
+) -> None:
+    crawler = BeautifulSoupCrawler(http_client=http_client)
+    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)
+    if by_id:
+        queue_name = None
+        queue_id = rq.id
+    else:
+        queue_id = None
+    visit_urls: set[str] = set()
+
+    @crawler.router.default_handler
+    async def handler(context: BeautifulSoupCrawlingContext) -> None:
+        visit_urls.add(context.request.url)
+        await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)
+
+    await crawler.run([str(server_url / 'start_enqueue')])
+
+    requests_from_queue: list[str] = []
+    while request := await rq.fetch_next_request():
+        requests_from_queue.append(request.url)
+
+    assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')}
+    assert visit_urls == {str(server_url / 'start_enqueue')}
+
+    await rq.drop()
+
+
+@pytest.mark.parametrize(
+    ('queue_name', 'queue_alias', 'by_id'),
+    [
+        pytest.param('named-queue', None, False, id='with rq_name'),
+        pytest.param(None, 'alias-queue', False, id='with rq_alias'),
+        pytest.param('id-queue', None, True, id='with rq_id'),
+    ],
+)
+async def test_enqueue_links_requests_with_rq_param(
+    server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool
+) -> None:
+    crawler = BeautifulSoupCrawler(http_client=http_client)
+    rq = await RequestQueue.open(name=queue_name, alias=queue_alias)
+    if by_id:
+        queue_name = None
+        queue_id = rq.id
+    else:
+        queue_id = None
+    visit_urls: set[str] = set()
+
+    check_requests: list[str] = [
+        'https://a.placeholder.com',
+        'https://b.placeholder.com',
+        'https://c.placeholder.com',
+    ]
+
+    @crawler.router.default_handler
+    async def handler(context: BeautifulSoupCrawlingContext) -> None:
+        visit_urls.add(context.request.url)
+        await context.enqueue_links(
+            requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id, strategy='all'
+        )
+
+    await crawler.run([str(server_url / 'start_enqueue')])
+
+    requests_from_queue: list[str] = []
+    while request := await rq.fetch_next_request():
+        requests_from_queue.append(request.url)
+
+    assert set(requests_from_queue) == set(check_requests)
+    assert visit_urls == {str(server_url / 'start_enqueue')}
+
+    await rq.drop()
+
+
+@pytest.mark.parametrize(
+    ('queue_name', 'queue_alias', 'queue_id'),
+    [
+        pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'),
+        pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'),
+        pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'),
+        pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'),
+    ],
+)
+async def test_enqueue_links_error_with_multi_params(
+    server_url: URL, http_client: HttpClient, queue_id: str | None, queue_name: str | None, queue_alias: str | None
+) -> None:
+    crawler = BeautifulSoupCrawler(http_client=http_client)
+
+    @crawler.router.default_handler
+    async def handler(context: BeautifulSoupCrawlingContext) -> None:
+        with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'):
+            await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)
+
+    await crawler.run([str(server_url / 'start_enqueue')])
