44import re
55from asyncio import Task
66from functools import partial
7- from typing import Any
7+ from typing import Any , Union
88
9- from pydantic import BaseModel , Field
9+ from pydantic import BaseModel , Field , TypeAdapter
1010
1111from crawlee import Request
1212from crawlee ._types import HttpMethod
1313from crawlee .http_clients import BaseHttpClient , HttpxHttpClient
14- from crawlee .storages import RequestList
15-
16- from ._known_actor_input_keys import ActorInputKeys
14+ from crawlee .storages import RequestList as CrawleeRequestList
1715
1816URL_NO_COMMAS_REGEX = re .compile (
1917 r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
@@ -24,50 +22,63 @@ class _RequestDetails(BaseModel):
2422 method : HttpMethod = 'GET'
2523 payload : str = ''
2624 headers : dict [str , str ] = Field (default_factory = dict )
27- user_data : dict [str , str ] = Field (default_factory = dict , alias = ActorInputKeys . startUrls . userData )
25+ user_data : dict [str , str ] = Field (default_factory = dict , alias = ' userData' )
2826
2927
class _RequestsFromUrlInput(_RequestDetails):
    """Start-URL entry whose request URLs are gathered from a remote resource.

    Inherits the per-request fields of `_RequestDetails`.
    """

    # Filled from the `requestsFromUrl` key of the input entry. The alias must match that
    # key exactly (no surrounding whitespace), otherwise validation of the entry fails.
    requests_from_url: str = Field(alias='requestsFromUrl')
3230
3331
class _SimpleUrlInput(_RequestDetails):
    """Start-URL entry that names its target URL directly via the `url` key.

    Inherits the per-request fields of `_RequestDetails`.
    """

    url: str
3634
3735
38- async def create_request_list (
39- actor_start_urls_input : list [dict [str , Any ]], http_client : BaseHttpClient | None = None
# Validates the whole start-URLs input in one pass: every entry is parsed into either
# `_RequestsFromUrlInput` or `_SimpleUrlInput`.
url_input_adapter = TypeAdapter(
    list[Union[_RequestsFromUrlInput, _SimpleUrlInput]],
)
37+
38+
class RequestList(CrawleeRequestList):
    """Crawlee `RequestList` that can be opened from Actor start-URL input."""

    @classmethod
    async def open(
        cls,
        name: str | None = None,
        actor_start_urls_input: list[dict[str, Any]] | None = None,
        http_client: BaseHttpClient | None = None,
    ) -> RequestList:
        """Create a `RequestList` from the Actor input `requestListSources`.

        Args:
            name: Name of the returned `RequestList`.
            actor_start_urls_input: List of dicts, each with either a `url` key or a
                `requestsFromUrl` key. A missing or empty value is treated as an empty list.
            http_client: Client used to send the GET request to the URL given in
                `requestsFromUrl`.

        Returns:
            The `RequestList` built by `_create_request_list`.

        Example `actor_start_urls_input`::

            [
                # Gather urls from response body.
                {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
                # Directly include this url.
                {'url': 'https://crawlee.dev', 'method': 'GET'}
            ]
        """
        sources = actor_start_urls_input if actor_start_urls_input else []
        return await _create_request_list(name, sources, http_client)
65+
66+
async def _create_request_list(
    name: str | None, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None
) -> RequestList:
    """Build a `RequestList` from Actor start-URL input entries.

    Args:
        name: Name passed to the returned `RequestList`.
        actor_start_urls_input: List of dicts, each with either a `url` key (request is created
            directly) or a `requestsFromUrl` key (requests are created from the remote resource).
        http_client: Client passed to `_create_requests_from_url`. When None, a new
            `HttpxHttpClient` is created.

    Returns:
        A `RequestList` holding the requests from direct `url` entries followed by the requests
        obtained from `requestsFromUrl` entries.

    Raises:
        pydantic.ValidationError: Presumably raised by `validate_python` when an entry matches
            neither input model - confirm against the pydantic version in use.
    """
    if http_client is None:
        http_client = HttpxHttpClient()

    # Each validated item is an instance of _RequestsFromUrlInput or _SimpleUrlInput.
    url_inputs = url_input_adapter.validate_python(actor_start_urls_input)

    simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
    remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]

    simple_url_requests = _create_requests_from_input(simple_url_inputs)
    remote_url_requests = await _create_requests_from_url(remote_url_inputs, http_client=http_client)

    return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
7182
7283
7384def _create_requests_from_input (simple_url_inputs : list [_SimpleUrlInput ]) -> list [Request ]:
0 commit comments