
Commit a101a9a

WIP
Finalize tests. Split to its own file.
1 parent eb875e3 commit a101a9a

File tree

3 files changed: +158 −63
  src/apify/_actor.py
  src/apify/_actor_inputs.py
  tests/unit/actor/test_actor_create_proxy_configuration.py


src/apify/_actor.py

Lines changed: 16 additions & 42 deletions
@@ -8,7 +8,6 @@
 from itertools import chain
 from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast

-from crawlee.http_clients import HttpxHttpClient, HttpResponse, BaseHttpClient
 from lazy_object_proxy import Proxy
 from pydantic import AliasChoices
 from typing_extensions import Self
@@ -18,8 +17,10 @@
 from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
 from crawlee import Request, service_container
 from crawlee.events._types import Event, EventPersistStateData
+from crawlee.http_clients import BaseHttpClient, HttpResponse, HttpxHttpClient
 from crawlee.storages import RequestList

+from apify._actor_inputs import _create_request_list
 from apify._configuration import Configuration
 from apify._consts import EVENT_LISTENERS_TIMEOUT
 from apify._crypto import decrypt_input_secrets, load_private_key
@@ -42,9 +43,6 @@

 MainReturnType = TypeVar('MainReturnType')

-URL_NO_COMMAS_REGEX = re.compile(r"https?:\/\/(www\.)?([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?")
-# JS version. TODO rewrite to Python regexp
-# /https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?/giu;


 class _ActorType:
@@ -983,44 +981,20 @@ async def create_proxy_configuration(
         return proxy_configuration

     @staticmethod
-    async def create_request_list(*, actor_start_urls_input: dict, http_client: BaseHttpClient = HttpxHttpClient()) -> RequestList:
-        simple_url_requests_inputs = [request_input for request_input in actor_start_urls_input if "url" in request_input]
-        remote_url_requests_inputs = [request_input for request_input in actor_start_urls_input if "requestsFromUrl" in request_input]
-
-        simple_url_requests = Actor._create_requests_from_input(simple_url_requests_inputs)
-        remote_url_requests = await Actor._create_requests_from_url(remote_url_requests_inputs, http_client=http_client)
-
-        return RequestList(requests=simple_url_requests + remote_url_requests)
-
-    @staticmethod
-    def _create_requests_from_input(simple_url_requests_inputs: list[dict[str,str]]) -> list[Request]:
-        return [
-            Request.from_url(
-                method=request_input.get('method'),
-                url=request_input.get('url'),
-                payload=request_input.get('payload', '').encode('utf-8'),
-                headers=request_input.get('headers', {}),
-                user_data=request_input.get('userData', {}),
-            )
-            for request_input in simple_url_requests_inputs]
-
-    @staticmethod
-    async def _create_requests_from_url(remote_url_requests_inputs: list[dict[str,str]], http_client: BaseHttpClient) -> list[Request]:
-        remote_url_requests = []
-        for input in remote_url_requests_inputs:
-            remote_url_requests.append(asyncio.create_task(http_client.send_request(
-                url=input["requestsFromUrl"],
-                headers=input.get("headers", {}),
-                payload=input.get("payload", "").encode('utf-8'),
-            )))
-        await asyncio.gather(*remote_url_requests)
-        # TODO as callbacks
-        return list(chain.from_iterable((Actor.extract_requests_from_response(finished_request.result()) for finished_request in remote_url_requests)))
-
-    @staticmethod
-    def extract_requests_from_response(response: HttpResponse) -> list[Request]:
-        matches = list(re.finditer(URL_NO_COMMAS_REGEX, response.read().decode('utf-8')))
-        return [Request.from_url(match.group(0)) for match in matches]
+    async def create_request_list(
+        *, actor_start_urls_input: list[dict[str, str]], http_client: BaseHttpClient | None = None
+    ) -> RequestList:
+        """Create a request list from the Actor input field requestListSources.
+
+        This accepts both plain url entries and requestsFromUrl entries.
+
+        Example:
+            actor_start_urls_input = [
+                # Gather urls from the response body.
+                {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
+                # Directly include this url.
+                {'url': 'https://crawlee.dev', 'method': 'GET'},
+            ]
+        """
+        return await _create_request_list(actor_start_urls_input=actor_start_urls_input, http_client=http_client)

 Actor = cast(_ActorType, Proxy(_ActorType))
 """The entry point of the SDK, through which all the Actor operations should be done."""

src/apify/_actor_inputs.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+import asyncio
+import re
+from itertools import chain
+
+from crawlee import Request
+from crawlee.http_clients import BaseHttpClient, HttpResponse, HttpxHttpClient
+from crawlee.storages import RequestList
+
+URL_NO_COMMAS_REGEX = re.compile(
+    r'https?:\/\/(www\.)?([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?'
+)
+
+
+async def _create_request_list(
+    *, actor_start_urls_input: list[dict[str, str]], http_client: BaseHttpClient | None = None
+) -> RequestList:
+    if not http_client:
+        http_client = HttpxHttpClient()
+    simple_url_requests_inputs = [
+        request_input for request_input in actor_start_urls_input if 'url' in request_input
+    ]
+    remote_url_requests_inputs = [
+        request_input for request_input in actor_start_urls_input if 'requestsFromUrl' in request_input
+    ]
+
+    simple_url_requests = _create_requests_from_input(simple_url_requests_inputs)
+    remote_url_requests = await _create_requests_from_url(remote_url_requests_inputs, http_client=http_client)
+
+    return RequestList(requests=simple_url_requests + remote_url_requests)
+
+
+def _create_requests_from_input(simple_url_requests_inputs: list[dict[str, str]]) -> list[Request]:
+    return [
+        Request.from_url(
+            method=request_input.get('method'),
+            url=request_input.get('url'),
+            payload=request_input.get('payload', '').encode('utf-8'),
+            headers=request_input.get('headers', {}),
+            user_data=request_input.get('userData', {}),
+        )
+        for request_input in simple_url_requests_inputs
+    ]
+
+
+async def _create_requests_from_url(
+    remote_url_requests_inputs: list[dict[str, str]], http_client: BaseHttpClient
+) -> list[Request]:
+    remote_url_requests = []
+    for request_input in remote_url_requests_inputs:
+        remote_url_requests.append(
+            asyncio.create_task(
+                http_client.send_request(
+                    method=request_input['method'],
+                    url=request_input['requestsFromUrl'],
+                    headers=request_input.get('headers', {}),
+                    payload=request_input.get('payload', '').encode('utf-8'),
+                )
+            )
+        )
+    await asyncio.gather(*remote_url_requests)
+    # TODO as callbacks
+    return list(
+        chain.from_iterable(
+            extract_requests_from_response(finished_request.result()) for finished_request in remote_url_requests
+        )
+    )
+
+
+def extract_requests_from_response(response: HttpResponse) -> list[Request]:
+    matches = list(re.finditer(URL_NO_COMMAS_REGEX, response.read().decode('utf-8')))
+    return [Request.from_url(match.group(0)) for match in matches]
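
As a rough, standalone illustration (not part of the commit) of what the URL_NO_COMMAS_REGEX extraction in extract_requests_from_response does, run against a made-up response body:

    import re

    URL_NO_COMMAS_REGEX = re.compile(
        r'https?:\/\/(www\.)?([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?'
    )

    # Made-up body: two absolute urls plus a scheme-less one that should be ignored.
    body = 'see https://crawlee.dev/docs and http://www.example.com/page, but not www.no-scheme.com'
    urls = [match.group(0) for match in re.finditer(URL_NO_COMMAS_REGEX, body)]
    print(urls)  # ['https://crawlee.dev/docs', 'http://www.example.com/page']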

tests/unit/actor/test_actor_create_proxy_configuration.py

Lines changed: 69 additions & 21 deletions
@@ -3,6 +3,7 @@
 import typing
 from typing import TYPE_CHECKING
 from unittest import mock
+from unittest.mock import call

 import httpx
 import pytest
@@ -11,7 +12,7 @@
 from apify_shared.consts import ApifyEnvVars
 from crawlee._request import UserData
 from crawlee._types import HttpHeaders, HttpMethod
-from crawlee.http_clients import HttpxHttpClient, HttpResponse
+from crawlee.http_clients import HttpResponse, HttpxHttpClient

 from apify import Actor

@@ -160,14 +161,14 @@ async def test_proxy_configuration_with_actor_proxy_input(
 async def test_actor_create_request_list_request_types(
     request_method: HttpMethod, optional_input: dict[str, str]
 ) -> None:
-    """Tests proper request list generation from both minimal and full inputs for all method types."""
+    """Test proper request list generation from both minimal and full inputs for all method types for simple input."""
     minimal_request_dict_input = {'url': 'https://www.abc.com', 'method': request_method}
     request_dict_input = {**minimal_request_dict_input, **optional_input}
     example_start_urls_input = [
         request_dict_input,
     ]

-    generated_request_list =await Actor.create_request_list(actor_start_urls_input=example_start_urls_input)
+    generated_request_list = await Actor.create_request_list(actor_start_urls_input=example_start_urls_input)

     assert not await generated_request_list.is_empty()
     generated_request = await generated_request_list.fetch_next_request()
@@ -185,42 +186,89 @@ async def test_actor_create_request_list_request_types(
     assert generated_request.headers == expected_headers


-async def test_actor_create_request_list_from_url():
-    expected_urls = {"http://www.something.com", "https://www.something_else.com", "http://www.bla.net"}
-    response_body = "blablabla{} more blablabla{} ,\n even more blablbablba.{}".format(*expected_urls)
-    mocked_http_client = HttpxHttpClient()
+def _create_dummy_response(read_output: typing.Iterable[str]) -> HttpResponse:
+    """Create a dummy response that iterates through read_output each time dummy_response.read() is called."""
+
     class DummyResponse(HttpResponse):
         @property
         def http_version(self) -> str:
-            """The HTTP version used in the response."""
-            return ""
+            return ''

         @property
         def status_code(self) -> int:
-            """The HTTP status code received from the server."""
             return 200

         @property
         def headers(self) -> HttpHeaders:
-            """The HTTP headers received in the response."""
             return HttpHeaders()

         def read(self) -> bytes:
-            return response_body.encode('utf-8')
+            return next(read_output).encode('utf-8')

+    return DummyResponse()

-    async def mocked_send_request(*args, **kwargs):
-        return DummyResponse()
-    with mock.patch.object(mocked_http_client, "send_request", mocked_send_request) as mocked_send_request2:

-        example_start_urls_input = [
-            {"requestsFromUrl": "https://crawlee.dev/file.txt", 'method': "GET"}
-        ]
+async def test_actor_create_request_list_from_url_correctly_send_requests() -> None:
+    """Test that the injected HttpClient's send_request method is called with properly passed arguments."""

+    example_start_urls_input = [
+        {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
+        {'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'PUT'},
+        {
+            'requestsFromUrl': 'https://www.something.som',
+            'method': 'POST',
+            'headers': {'key': 'value'},
+            'payload': 'some_payload',
+            'userData': 'irrelevant',
+        },
+    ]
+    mocked_read_outputs = ('' for _ in example_start_urls_input)
+    http_client = HttpxHttpClient()
+    with mock.patch.object(
+        http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs)
+    ) as mocked_send_request:
+        await Actor.create_request_list(actor_start_urls_input=example_start_urls_input, http_client=http_client)
+
+    expected_calls = [
+        call(
+            method=example_input['method'],
+            url=example_input['requestsFromUrl'],
+            headers=example_input.get('headers', {}),
+            payload=example_input.get('payload', '').encode('utf-8'),
+        )
+        for example_input in example_start_urls_input
+    ]
+    mocked_send_request.assert_has_calls(expected_calls)
+
+
+async def test_actor_create_request_list_from_url() -> None:
+    """Test that create_request_list correctly reads urls from remote url sources and also from simple input."""
+    expected_simple_url = 'https://www.someurl.com'
+    expected_remote_urls_1 = {'http://www.something.com', 'https://www.somethingelse.com', 'http://www.bla.net'}
+    expected_remote_urls_2 = {'http://www.ok.com', 'https://www.true-positive.com'}
+    expected_urls = expected_remote_urls_1 | expected_remote_urls_2 | {expected_simple_url}
+    response_bodies = iter(
+        (
+            'blablabla{} more blablabla{} , even more blablabla. {} '.format(*expected_remote_urls_1),
+            'some stuff{} more stuff{} www.falsepositive www.false_positive.com'.format(*expected_remote_urls_2),
+        )
+    )

-        generated_request_list =await Actor.create_request_list(actor_start_urls_input=example_start_urls_input, http_client=mocked_http_client)
+    example_start_urls_input = [
+        {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
+        {'url': expected_simple_url, 'method': 'GET'},
+        {'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'GET'},
+    ]
+
+    http_client = HttpxHttpClient()
+    with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
+        generated_request_list = await Actor.create_request_list(
+            actor_start_urls_input=example_start_urls_input, http_client=http_client
+        )
         generated_requests = []
-        while request:= await generated_request_list.fetch_next_request():
+        while request := await generated_request_list.fetch_next_request():
             generated_requests.append(request)

-        assert set(generated_request.url for generated_request in generated_requests) == expected_urls
+    # Check correctly created requests' urls in the request list.
+    assert {generated_request.url for generated_request in generated_requests} == expected_urls
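
A note on the mocking pattern above: return_value hands every patched send_request call the same dummy response object, and each read() pulls the next prepared body from the shared iterator, so successive remote sources receive successive bodies. A stripped-down sketch of that mechanism with made-up names (not part of the commit):

    from unittest import mock

    bodies = iter(['first body', 'second body'])


    class SharedResponse:
        def read(self) -> str:
            # Every call consumes the next prepared body from the shared iterator.
            return next(bodies)


    class Fetcher:
        def fetch(self, url: str) -> SharedResponse:  # stand-in for an HTTP client's send_request
            raise NotImplementedError


    fetcher = Fetcher()
    with mock.patch.object(fetcher, 'fetch', return_value=SharedResponse()) as mocked_fetch:
        first = fetcher.fetch('https://example.com/a').read()
        second = fetcher.fetch('https://example.com/b').read()

    assert (first, second) == ('first body', 'second body')
    mocked_fetch.assert_has_calls([mock.call('https://example.com/a'), mock.call('https://example.com/b')])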
