
Commit 3f33145

Addressing review comments
1 parent 6ff9e90 commit 3f33145

File tree

4 files changed, +161 -153 lines

src/apify/storages/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
from crawlee.storages import Dataset, KeyValueStore, RequestQueue

-from .request_list import RequestList
+from ._request_list import RequestList

__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'RequestList']
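
Because RequestList stays in __all__, the public import path is unchanged by the move to the private _request_list module. A minimal sanity check, assuming the apify package from this commit is installed:

import apify.storages

from apify.storages import RequestList  # still resolves, now re-exported from the private module

# Should report 'apify.storages._request_list' after this commit.
print(RequestList.__module__)
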
src/apify/storages/_request_list.py (new file)

Lines changed: 140 additions & 0 deletions

@@ -0,0 +1,140 @@
from __future__ import annotations

import asyncio
import re
from asyncio import Task
from functools import partial
from typing import Annotated, Any, Union

from pydantic import BaseModel, Field, TypeAdapter

from crawlee import Request
from crawlee._types import HttpMethod
from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
from crawlee.storages import RequestList as CrawleeRequestList

URL_NO_COMMAS_REGEX = re.compile(
    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
)


class _RequestDetails(BaseModel):
    method: HttpMethod = 'GET'
    payload: str = ''
    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}


class _RequestsFromUrlInput(_RequestDetails):
    requests_from_url: str = Field(alias='requestsFromUrl')


class _SimpleUrlInput(_RequestDetails):
    url: str


url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])


# @docs_group('Classes') # Not yet available in crawlee
class RequestList(CrawleeRequestList):
    """Extends the crawlee RequestList.

    The open method is used to create a RequestList from the Actor's requestListSources input.
    """

    @staticmethod
    async def open(
        name: str | None = None,
        request_list_sources_input: list[dict[str, Any]] | None = None,
        http_client: BaseHttpClient | None = None,
    ) -> RequestList:
        """Creates a RequestList from the Actor input requestListSources.

        name is the name of the returned RequestList.
        request_list_sources_input can contain a list of dicts with either a url or a requestsFromUrl key.
        http_client is the client used to send a GET request to each url defined in requestsFromUrl.

        Example request_list_sources_input:
        [
            # Gather urls from response body.
            {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
            # Directly include this url.
            {'url': 'https://crawlee.dev', 'method': 'GET'}
        ]
        """
        request_list_sources_input = request_list_sources_input or []
        return await RequestList._create_request_list(name, request_list_sources_input, http_client)

    @staticmethod
    async def _create_request_list(
        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None
    ) -> RequestList:
        if not http_client:
            http_client = HttpxHttpClient()

        url_inputs = url_input_adapter.validate_python(request_list_sources_input)  # instance of list[Union[...]]

        simple_url_inputs = [url_input for url_input in url_inputs if type(url_input) is _SimpleUrlInput]
        remote_url_inputs = [url_input for url_input in url_inputs if type(url_input) is _RequestsFromUrlInput]

        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)

        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)

    @staticmethod
    def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
        return [
            Request.from_url(
                method=request_input.method,
                url=request_input.url,
                payload=request_input.payload.encode('utf-8'),
                headers=request_input.headers,
                user_data=request_input.user_data,
            )
            for request_input in simple_url_inputs
        ]

    @staticmethod
    async def _fetch_requests_from_url(
        remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
    ) -> list[Request]:
        """Create a list of requests from urls.

        Send a GET request to the url defined in each requests_from_url of remote_url_requests_inputs. Run an
        extraction callback on each response body that uses URL_NO_COMMAS_REGEX to find all links, and create a
        list of Requests from the collected links and the additional inputs stored in the other attributes of
        each input.
        """
        created_requests: list[Request] = []

        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
            """Callback to scrape the response body with the regexp and create Requests from matches."""
            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
            created_requests.extend(
                [
                    Request.from_url(
                        match.group(0),
                        method=request_input.method,
                        payload=request_input.payload.encode('utf-8'),
                        headers=request_input.headers,
                        user_data=request_input.user_data,
                    )
                    for match in matches
                ]
            )

        remote_url_requests = []
        for remote_url_requests_input in remote_url_requests_inputs:
            get_response_task = asyncio.create_task(
                http_client.send_request(
                    method='GET',
                    url=remote_url_requests_input.requests_from_url,
                )
            )

            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
            remote_url_requests.append(get_response_task)

        await asyncio.gather(*remote_url_requests)
        return created_requests
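
For orientation, a short usage sketch of the new class based on the docstring above. The URLs are illustrative, and the snippet assumes the apify package from this commit is installed; only 'url' entries are used, so no HTTP requests are sent (a 'requestsFromUrl' entry would trigger a GET to that URL instead).

import asyncio

from apify.storages import RequestList


async def main() -> None:
    # Build a RequestList directly from Actor-style input dicts.
    request_list = await RequestList.open(
        request_list_sources_input=[
            {'url': 'https://crawlee.dev', 'method': 'GET'},
            {'url': 'https://apify.com', 'method': 'GET'},
        ],
    )

    # Drain the list the same way the tests do.
    while request := await request_list.fetch_next_request():
        print(request.url)


asyncio.run(main())
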

src/apify/storages/request_list.py

Lines changed: 0 additions & 137 deletions
This file was deleted.

tests/unit/actor/test_request_list.py

Lines changed: 20 additions & 15 deletions
@@ -11,7 +11,7 @@
from crawlee._types import HttpHeaders, HttpMethod
from crawlee.http_clients import HttpResponse, HttpxHttpClient

-from apify.storages.request_list import URL_NO_COMMAS_REGEX, RequestList
+from apify.storages._request_list import URL_NO_COMMAS_REGEX, RequestList


@pytest.mark.parametrize('request_method', get_args(HttpMethod))

@@ -35,7 +35,7 @@ async def test_request_list_open_request_types(request_method: HttpMethod, optio
    }
    request_dict_input = {**minimal_request_dict_input, **optional_input}

-    request_list = await RequestList.open(actor_start_urls_input=[request_dict_input])
+    request_list = await RequestList.open(request_list_sources_input=[request_dict_input])
    assert not await request_list.is_empty()
    request = await request_list.fetch_next_request()
    assert request is not None

@@ -76,7 +76,7 @@ def read(self) -> bytes:

async def test__request_list_open_from_url_correctly_send_requests() -> None:
    """Test that injected HttpClient's method send_request is called with properly passed arguments."""
-    actor_start_urls_input: list[dict[str, Any]] = [
+    request_list_sources_input: list[dict[str, Any]] = [
        {
            'requestsFromUrl': 'https://abc.dev/file.txt',
            'method': 'GET',

@@ -94,19 +94,20 @@ async def test__request_list_open_from_url_correctly_send_requests() -> None:
        },
    ]

-    mocked_read_outputs = ('' for url in actor_start_urls_input)
-    http_client = HttpxHttpClient()
+    mocked_read_outputs = ('' for url in request_list_sources_input)
+
+    mocked_http_client = mock.Mock(spec_set=HttpxHttpClient)
    with mock.patch.object(
-        http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs)
+        mocked_http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs)
    ) as mocked_send_request:
-        await RequestList.open(actor_start_urls_input=actor_start_urls_input, http_client=http_client)
+        await RequestList.open(request_list_sources_input=request_list_sources_input, http_client=mocked_http_client)

    expected_calls = [
        call(
            method='GET',
            url=example_input['requestsFromUrl'],
        )
-        for example_input in actor_start_urls_input
+        for example_input in request_list_sources_input
    ]
    mocked_send_request.assert_has_calls(expected_calls)

@@ -124,7 +125,7 @@ async def test_request_list_open_from_url() -> None:
        )
    )

-    actor_start_urls_input = [
+    request_list_sources_input = [
        {
            'requestsFromUrl': 'https://abc.dev/file.txt',
            'method': 'GET',

@@ -136,9 +137,11 @@ async def test_request_list_open_from_url() -> None:
        },
    ]

-    http_client = HttpxHttpClient()
-    with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
-        request_list = await RequestList.open(actor_start_urls_input=actor_start_urls_input, http_client=http_client)
+    mocked_http_client = mock.Mock(spec_set=HttpxHttpClient)
+    with mock.patch.object(mocked_http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
+        request_list = await RequestList.open(
+            request_list_sources_input=request_list_sources_input, http_client=mocked_http_client
+        )
    generated_requests = []
    while request := await request_list.fetch_next_request():
        generated_requests.append(request)

@@ -159,9 +162,11 @@ async def test_request_list_open_from_url_additional_inputs() -> None:
    }

    response_bodies = iter((expected_simple_url,))
-    http_client = HttpxHttpClient()
-    with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
-        request_list = await RequestList.open(actor_start_urls_input=[example_start_url_input], http_client=http_client)
+    mocked_http_client = mock.Mock(spec_set=HttpxHttpClient)
+    with mock.patch.object(mocked_http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
+        request_list = await RequestList.open(
+            request_list_sources_input=[example_start_url_input], http_client=mocked_http_client
+        )
    request = await request_list.fetch_next_request()

    # Check all properties correctly created for request
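
The tests now use mock.Mock(spec_set=HttpxHttpClient) instead of a real client, which keeps them offline while still rejecting attribute access that the real client does not support. A standalone sketch of that pattern, independent of the project's test helpers:

from unittest import mock

from crawlee.http_clients import HttpxHttpClient

mocked_http_client = mock.Mock(spec_set=HttpxHttpClient)

# Attributes present on the real client are available as child mocks.
assert hasattr(mocked_http_client, 'send_request')

# Attributes the real client does not define raise immediately, catching typos in tests.
try:
    mocked_http_client.nonexistent_method
except AttributeError:
    print('spec_set rejects attributes that HttpxHttpClient does not define')
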
