
Commit f4b3fc5

Use Pydantic to handle raw inputs
Fix typing issues WIP
1 parent a101a9a commit f4b3fc5

6 files changed, +201 −175 lines

pyproject.toml

Lines changed: 3 additions & 0 deletions
@@ -141,6 +141,9 @@ indent-style = "space"
 docstring-quotes = "double"
 inline-quotes = "single"
 
+[tool.ruff.lint.flake8-type-checking]
+runtime-evaluated-base-classes = ["pydantic.BaseModel"]
+
 [tool.ruff.lint.flake8-builtins]
 builtins-ignorelist = ["id"]

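The new `[tool.ruff.lint.flake8-type-checking]` entry tells ruff that classes deriving from `pydantic.BaseModel` evaluate their field annotations at runtime, so imports used in those annotations must stay at module level instead of being auto-moved into `if TYPE_CHECKING:` blocks. A minimal, self-contained sketch of the behaviour this protects (the `RetryPolicy` model is illustrative only, not part of this commit):

from __future__ import annotations

from datetime import timedelta

from pydantic import BaseModel


class RetryPolicy(BaseModel):
    # Pydantic resolves the deferred annotation 'timedelta' against module
    # globals when the class is created, so the import above must exist at
    # runtime. If it were moved under `if TYPE_CHECKING:`, building this model
    # would fail with an undefined-annotation error.
    backoff: timedelta = timedelta(seconds=1)
    max_retries: int = 3


print(RetryPolicy().backoff)  # -> 0:00:01

This is presumably also why the `# noqa: TCH003` suppression removed from src/apify/_platform_event_manager.py further down is no longer needed.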
src/apify/_actor.py

Lines changed: 8 additions & 10 deletions
@@ -2,10 +2,8 @@
 
 import asyncio
 import os
-import re
 import sys
 from datetime import timedelta
-from itertools import chain
 from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast
 
 from lazy_object_proxy import Proxy
@@ -15,10 +13,8 @@
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
 from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
-from crawlee import Request, service_container
+from crawlee import service_container
 from crawlee.events._types import Event, EventPersistStateData
-from crawlee.http_clients import BaseHttpClient, HttpResponse, HttpxHttpClient
-from crawlee.storages import RequestList
 
 from apify._actor_inputs import _create_request_list
 from apify._configuration import Configuration
@@ -36,15 +32,16 @@
     import logging
     from types import TracebackType
 
+    from crawlee.http_clients import BaseHttpClient
     from crawlee.proxy_configuration import _NewUrlFunction
+    from crawlee.storages import RequestList
 
     from apify._models import Webhook
 
 
 MainReturnType = TypeVar('MainReturnType')
 
 
-
 class _ActorType:
     """The class of `Actor`. Only make a new instance if you're absolutely sure you need to."""
 
@@ -982,19 +979,20 @@ async def create_proxy_configuration(
 
     @staticmethod
     async def create_request_list(
-        *, actor_start_urls_input: list[dict[str,str]], http_client: BaseHttpClient | None= None
+        *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None
     ) -> RequestList:
-        """Creates request list from Actor input requestListSources. This accepts list of urls and requestsFromUrl.
+        """Creates request list from Actor input requestListSources. This accepts list of urls and requests_from_url.
 
         Example:
            actor_start_urls_input = [
                # Gather urls from response body.
-               {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
+               {'requests_from_url': 'https://crawlee.dev/file.txt', 'method': 'GET'},
                # Directly include this url.
               {'url': 'https://crawlee.dev', 'method': 'GET'}
            ]
-            """
+        """
         return await _create_request_list(actor_start_urls_input=actor_start_urls_input, http_client=http_client)
 
+
 Actor = cast(_ActorType, Proxy(_ActorType))
 """The entry point of the SDK, through which all the Actor operations should be done."""

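For context, a hedged usage sketch of the new `Actor.create_request_list` method (not part of this commit; the 'startUrls' input field name and the surrounding boilerplate are assumptions):

import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # Shape matches the docstring example above: plain 'url' entries plus
        # 'requests_from_url' entries pointing at remote lists of urls.
        request_list = await Actor.create_request_list(
            actor_start_urls_input=actor_input.get('startUrls', []),
        )
        while request := await request_list.fetch_next_request():
            Actor.log.info(f'Prepared request for {request.url}')


if __name__ == '__main__':
    asyncio.run(main())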
src/apify/_actor_inputs.py

Lines changed: 50 additions & 33 deletions
@@ -1,26 +1,46 @@
+from __future__ import annotations
+
 import asyncio
-from itertools import chain
 import re
+from asyncio import Task
+from typing import Any
+
+from pydantic import BaseModel, Field
 
 from crawlee import Request
-from crawlee.http_clients import BaseHttpClient, HttpxHttpClient, HttpResponse
+from crawlee._types import HttpMethod  # TODO: Make public in Crawlee?
+from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
 from crawlee.storages import RequestList
 
 URL_NO_COMMAS_REGEX = re.compile(
     r'https?:\/\/(www\.)?([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?'
 )
 
+class _RequestDetails(BaseModel):
+    method: HttpMethod
+    payload: str = ''
+    headers: dict[str, str] = Field(default_factory=dict)
+    user_data: dict[str, str] = Field(default_factory=dict, alias='user_data')
+
+class _RequestsFromUrlInput(_RequestDetails):
+    requests_from_url: str = Field(alias='requests_from_url')
+
+class _SimpleUrlInput(_RequestDetails):
+    url: str
+
+
 @staticmethod
 async def _create_request_list(
-    *, actor_start_urls_input: dict, http_client: BaseHttpClient | None = None
+    *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None
 ) -> RequestList:
     if not http_client:
         http_client = HttpxHttpClient()
     simple_url_requests_inputs = [
-        request_input for request_input in actor_start_urls_input if 'url' in request_input
-    ]
+        _SimpleUrlInput(**request_input) for request_input in actor_start_urls_input
+        if 'url' in request_input]
     remote_url_requests_inputs = [
-        request_input for request_input in actor_start_urls_input if 'requestsFromUrl' in request_input
+        _RequestsFromUrlInput(**request_input) for request_input in actor_start_urls_input
+        if 'requests_from_url' in request_input
     ]
 
     simple_url_requests = _create_requests_from_input(simple_url_requests_inputs)
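The Pydantic models above replace ad-hoc dict handling: each raw input dict is validated once, and the rest of the module works with typed attributes instead of chained `.get()` calls. A standalone sketch of the idea (hypothetical `SimpleUrlInput` model, only pydantic required):

from pydantic import BaseModel, Field


class SimpleUrlInput(BaseModel):
    url: str
    method: str = 'GET'
    payload: str = ''
    headers: dict[str, str] = Field(default_factory=dict)


raw = {'url': 'https://crawlee.dev', 'method': 'GET', 'headers': {'accept': 'text/html'}}
parsed = SimpleUrlInput(**raw)
print(parsed.url, parsed.headers)  # validated values, attribute access from here on

Wrong types fail fast with a ValidationError instead of surfacing later as a KeyError or TypeError deep inside request construction.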
@@ -30,44 +50,41 @@ async def _create_request_list(
 
 
 @staticmethod
-def _create_requests_from_input(simple_url_requests_inputs: list[dict[str, str]]) -> list[Request]:
+def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
     return [
         Request.from_url(
-            method=request_input.get('method'),
-            url=request_input.get('url'),
-            payload=request_input.get('payload', '').encode('utf-8'),
-            headers=request_input.get('headers', {}),
-            user_data=request_input.get('userData', {}),
+            method=request_input.method,
+            url=request_input.url,
+            payload=request_input.payload.encode('utf-8'),
+            headers=request_input.headers,
+            user_data=request_input.user_data,
         )
-        for request_input in simple_url_requests_inputs
+        for request_input in simple_url_inputs
     ]
 
 
 @staticmethod
 async def _create_requests_from_url(
-    remote_url_requests_inputs: list[dict[str, str]], http_client: BaseHttpClient
+    remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
 ) -> list[Request]:
+    created_requests: list[Request] = []
+
+    def extract_requests_from_response(task: Task) -> list[Request]:
+        matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+        created_requests.extend([Request.from_url(match.group(0)) for match in matches])
+
     remote_url_requests = []
     for request_input in remote_url_requests_inputs:
-        remote_url_requests.append(
-            asyncio.create_task(
-                http_client.send_request(
-                    method=request_input['method'],
-                    url=request_input['requestsFromUrl'],
-                    headers=request_input.get('headers', {}),
-                    payload=request_input.get('payload', '').encode('utf-8'),
-                )
+        task = asyncio.create_task(
+            http_client.send_request(
+                method=request_input.method,
+                url=request_input.requests_from_url,
+                headers=request_input.headers,
+                payload=request_input.payload.encode('utf-8'),
             )
         )
-    await asyncio.gather(*remote_url_requests)
-    # TODO as callbacks
-    a = list(
-        extract_requests_from_response(finished_request.result()) for finished_request in remote_url_requests
-    )
-    return list(chain.from_iterable(a))
+        task.add_done_callback(extract_requests_from_response)
+        remote_url_requests.append(task)
 
-
-@staticmethod
-def extract_requests_from_response(response: HttpResponse) -> list[Request]:
-    matches = list(re.finditer(URL_NO_COMMAS_REGEX, response.read().decode('utf-8')))
-    return [Request.from_url(match.group(0)) for match in matches]
+    await asyncio.gather(*remote_url_requests)
+    return created_requests
src/apify/_platform_event_manager.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import asyncio
-from datetime import datetime  # noqa: TCH003
+from datetime import datetime
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Union
 
 import websockets.client
Lines changed: 0 additions & 131 deletions
@@ -1,18 +1,12 @@
 from __future__ import annotations
 
-import typing
 from typing import TYPE_CHECKING
-from unittest import mock
-from unittest.mock import call
 
 import httpx
 import pytest
 
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ApifyEnvVars
-from crawlee._request import UserData
-from crawlee._types import HttpHeaders, HttpMethod
-from crawlee.http_clients import HttpResponse, HttpxHttpClient
 
 from apify import Actor
 
@@ -147,128 +141,3 @@ async def test_proxy_configuration_with_actor_proxy_input(
     assert len(route.calls) == 2
 
     await Actor.exit()
-
-
-@pytest.mark.parametrize('request_method', typing.get_args(HttpMethod))
-@pytest.mark.parametrize(
-    'optional_input',
-    [
-        {},
-        {'payload': 'some payload', 'userData': {'some key': 'some value'}, 'headers': {'h1': 'v1', 'h2': 'v2'}},
-    ],
-    ids=['minimal', 'all_options'],
-)
-async def test_actor_create_request_list_request_types(
-    request_method: HttpMethod, optional_input: dict[str, str]
-) -> None:
-    """Test proper request list generation from both minimal and full inputs for all method types for simple input."""
-    minimal_request_dict_input = {'url': 'https://www.abc.com', 'method': request_method}
-    request_dict_input = {**minimal_request_dict_input, **optional_input}
-    example_start_urls_input = [
-        request_dict_input,
-    ]
-
-    generated_request_list = await Actor.create_request_list(actor_start_urls_input=example_start_urls_input)
-
-    assert not await generated_request_list.is_empty()
-    generated_request = await generated_request_list.fetch_next_request()
-    assert await generated_request_list.is_empty()
-
-    assert generated_request.method == request_dict_input['method']
-    assert generated_request.url == request_dict_input['url']
-    assert generated_request.payload == request_dict_input.get('payload', '').encode('utf-8')
-    expected_user_data = UserData()
-    if 'userData' in optional_input:
-        for key, value in optional_input['userData'].items():
-            expected_user_data[key] = value
-    assert generated_request.user_data == expected_user_data
-    expected_headers = HttpHeaders(root=optional_input.get('headers', {}))
-    assert generated_request.headers == expected_headers
-
-
-def _create_dummy_response(read_output: typing.Iterable[str]) -> HttpResponse:
-    """Create dummy_response that will iterate through read_output when called like dummy_response.read()"""
-
-    class DummyResponse(HttpResponse):
-        @property
-        def http_version(self) -> str:
-            return ''
-
-        @property
-        def status_code(self) -> int:
-            return 200
-
-        @property
-        def headers(self) -> HttpHeaders:
-            return HttpHeaders()
-
-        def read(self) -> bytes:
-            return next(read_output).encode('utf-8')
-
-    return DummyResponse()
-
-
-async def test_actor_create_request_list_from_url_correctly_send_requests() -> None:
-    """Test that injected HttpClient's method send_request is called with properly passed arguments."""
-
-    example_start_urls_input = [
-        {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
-        {'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'PUT'},
-        {
-            'requestsFromUrl': 'https://www.something.som',
-            'method': 'POST',
-            'headers': {'key': 'value'},
-            'payload': 'some_payload',
-            'userData': 'irrelevant',
-        },
-    ]
-    mocked_read_outputs = ('' for url in example_start_urls_input)
-    http_client = HttpxHttpClient()
-    with mock.patch.object(
-        http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs)
-    ) as mocked_send_request:
-        await Actor.create_request_list(actor_start_urls_input=example_start_urls_input, http_client=http_client)
-
-    expected_calls = [
-        call(
-            method=example_input['method'],
-            url=example_input['requestsFromUrl'],
-            headers=example_input.get('headers', {}),
-            payload=example_input.get('payload', '').encode('utf-8'),
-        )
-        for example_input in example_start_urls_input
-    ]
-    mocked_send_request.assert_has_calls(expected_calls)
-
-
-async def test_actor_create_request_list_from_url() -> None:
-    """Test that create_request_list is correctly reading urls from remote url sources and also from simple input."""
-    expected_simple_url = 'https://www.someurl.com'
-    expected_remote_urls_1 = {'http://www.something.com', 'https://www.somethingelse.com', 'http://www.bla.net'}
-    expected_remote_urls_2 = {'http://www.ok.com', 'https://www.true-positive.com'}
-    expected_urls = expected_remote_urls_1 | expected_remote_urls_2 | {expected_simple_url}
-    response_bodies = iter(
-        (
-            'blablabla{} more blablabla{} , even more blablabla. {} '.format(*expected_remote_urls_1),
-            'some stuff{} more stuff{} www.falsepositive www.false_positive.com'.format(*expected_remote_urls_2),
-        )
-    )
-
-    example_start_urls_input = [
-        {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
-        {'url': expected_simple_url, 'method': 'GET'},
-        {'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'GET'},
-    ]
-
-    http_client = HttpxHttpClient()
-    with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
-        generated_request_list = await Actor.create_request_list(
-            actor_start_urls_input=example_start_urls_input, http_client=http_client
-        )
-    generated_requests = []
-    while request := await generated_request_list.fetch_next_request():
-        print(request)
-        generated_requests.append(request)
-
-    # Check correctly created requests' urls in request list
-    assert {generated_request.url for generated_request in generated_requests} == expected_urls
