Skip to content

Commit eb875e3

Browse files
committed
WIP
1 parent 2edee89 commit eb875e3

File tree

2 files changed

+81
-7
lines changed

2 files changed

+81
-7
lines changed

src/apify/_actor.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
22

33
import asyncio
44
import os
5+
import re
56
import sys
67
from datetime import timedelta
8+
from itertools import chain
79
from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast
810

11+
from crawlee.http_clients import HttpxHttpClient, HttpResponse, BaseHttpClient
912
from lazy_object_proxy import Proxy
1013
from pydantic import AliasChoices
1114
from typing_extensions import Self
@@ -39,6 +42,10 @@
3942

4043
MainReturnType = TypeVar('MainReturnType')
4144

45+
# Matches one http(s) URL; commas are deliberately excluded from every character
# class so that comma-separated URL lists split cleanly.
# Python port of the JS version below. TODO: extend to unicode letters (\p{L}) like the JS one.
# /https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?/giu;
URL_NO_COMMAS_REGEX = re.compile(
    r"https?://(www\.)?"                                                # scheme + optional www.
    r"([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])"  # host label
    r"\.[a-z]{2,63}"                                                    # TLD
    r"(:\d{1,5})?"                                                      # optional port
    r"(/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?"                                  # optional path/query
)
48+
4249

4350
class _ActorType:
4451
"""The class of `Actor`. Only make a new instance if you're absolutely sure you need to."""
@@ -976,20 +983,44 @@ async def create_proxy_configuration(
976983
return proxy_configuration
977984

978985
@staticmethod
async def create_request_list(
    *, actor_start_urls_input: list[dict], http_client: BaseHttpClient | None = None
) -> RequestList:
    """Create a `RequestList` from the Actor's startUrls input.

    Entries with a plain ``url`` key are turned into requests directly; entries with a
    ``requestsFromUrl`` key point to remote files whose bodies are scanned for URLs.

    Args:
        actor_start_urls_input: The startUrls entries from the Actor input.
        http_client: Client used to download the remote URL files. A fresh
            `HttpxHttpClient` is created when not supplied.

    Returns:
        A `RequestList` containing the requests built from both kinds of input.
    """
    # NOTE: a default of `HttpxHttpClient()` in the signature would be evaluated once at
    # definition time and shared across all calls; create a fresh client per call instead.
    if http_client is None:
        http_client = HttpxHttpClient()

    simple_url_requests_inputs = [
        request_input for request_input in actor_start_urls_input if "url" in request_input
    ]
    remote_url_requests_inputs = [
        request_input for request_input in actor_start_urls_input if "requestsFromUrl" in request_input
    ]

    simple_url_requests = Actor._create_requests_from_input(simple_url_requests_inputs)
    remote_url_requests = await Actor._create_requests_from_url(remote_url_requests_inputs, http_client=http_client)

    return RequestList(requests=simple_url_requests + remote_url_requests)
995+
@staticmethod
def _create_requests_from_input(simple_url_requests_inputs: list[dict]) -> list[Request]:
    """Build a `Request` for each plain start-url input entry.

    Args:
        simple_url_requests_inputs: startUrls entries that contain a ``url`` key.
            (Values are heterogeneous — ``headers``/``userData`` are dicts — so the
            previous ``list[dict[str, str]]`` annotation was inaccurate.)

    Returns:
        One `Request` per input entry, in input order.
    """
    return [
        Request.from_url(
            method=request_input.get('method'),
            url=request_input.get('url'),
            # Missing payload becomes an empty byte string.
            payload=request_input.get('payload', '').encode('utf-8'),
            headers=request_input.get('headers', {}),
            user_data=request_input.get('userData', {}),
        )
        for request_input in simple_url_requests_inputs
    ]
9921006

1007+
@staticmethod
async def _create_requests_from_url(
    remote_url_requests_inputs: list[dict], http_client: BaseHttpClient
) -> list[Request]:
    """Download each remote URL file and create a `Request` for every URL found in it.

    Args:
        remote_url_requests_inputs: startUrls entries that contain a ``requestsFromUrl`` key.
        http_client: Client used to fetch the remote files.

    Returns:
        Requests extracted from all responses, flattened in input order.
    """
    # Fire off all downloads concurrently; `gather` returns the responses in input
    # order, so the create_task/.result() dance of the original is unnecessary.
    # (Renamed the loop variable: `input` shadowed the builtin.)
    responses = await asyncio.gather(*(
        http_client.send_request(
            url=request_input["requestsFromUrl"],
            headers=request_input.get("headers", {}),
            payload=request_input.get("payload", "").encode('utf-8'),
        )
        for request_input in remote_url_requests_inputs
    ))
    # TODO as callbacks
    return list(chain.from_iterable(
        Actor.extract_requests_from_response(response) for response in responses
    ))
1019+
1020+
@staticmethod
def extract_requests_from_response(response: HttpResponse) -> list[Request]:
    """Scan the response body for URLs and turn each match into a `Request`.

    The body is decoded as UTF-8 and matched against `URL_NO_COMMAS_REGEX`.
    """
    body = response.read().decode('utf-8')
    return [Request.from_url(match.group(0)) for match in URL_NO_COMMAS_REGEX.finditer(body)]
9931024

9941025
Actor = cast(_ActorType, Proxy(_ActorType))
9951026
"""The entry point of the SDK, through which all the Actor operations should be done."""

tests/unit/actor/test_actor_create_proxy_configuration.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import typing
44
from typing import TYPE_CHECKING
5+
from unittest import mock
56

67
import httpx
78
import pytest
@@ -10,6 +11,7 @@
1011
from apify_shared.consts import ApifyEnvVars
1112
from crawlee._request import UserData
1213
from crawlee._types import HttpHeaders, HttpMethod
14+
from crawlee.http_clients import HttpxHttpClient, HttpResponse
1315

1416
from apify import Actor
1517

@@ -165,7 +167,7 @@ async def test_actor_create_request_list_request_types(
165167
request_dict_input,
166168
]
167169

168-
generated_request_list = Actor.create_request_list(actor_start_urls_input=example_start_urls_input)
170+
generated_request_list =await Actor.create_request_list(actor_start_urls_input=example_start_urls_input)
169171

170172
assert not await generated_request_list.is_empty()
171173
generated_request = await generated_request_list.fetch_next_request()
@@ -181,3 +183,44 @@ async def test_actor_create_request_list_request_types(
181183
assert generated_request.user_data == expected_user_data
182184
expected_headers = HttpHeaders(root=optional_input.get('headers', {}))
183185
assert generated_request.headers == expected_headers
186+
187+
188+
async def test_actor_create_request_list_from_url() -> None:
    """`Actor.create_request_list` should download remote URL files and extract every URL from them."""
    # A tuple (not a set) so that `.format(*expected_urls)` produces a deterministic body.
    expected_urls = ("http://www.something.com", "https://www.something_else.com", "http://www.bla.net")
    response_body = "blablabla{} more blablabla{} ,\n even more blablbablba.{}".format(*expected_urls)
    mocked_http_client = HttpxHttpClient()

    class DummyResponse(HttpResponse):
        """Minimal HttpResponse stub returning the canned body."""

        @property
        def http_version(self) -> str:
            """The HTTP version used in the response."""
            return ""

        @property
        def status_code(self) -> int:
            """The HTTP status code received from the server."""
            return 200

        @property
        def headers(self) -> HttpHeaders:
            """The HTTP headers received in the response."""
            return HttpHeaders()

        def read(self) -> bytes:
            return response_body.encode('utf-8')

    async def mocked_send_request(*args, **kwargs):
        return DummyResponse()

    # The `as ...` binding was unused in the original; the patch context alone suffices.
    with mock.patch.object(mocked_http_client, "send_request", mocked_send_request):
        example_start_urls_input = [
            {"requestsFromUrl": "https://crawlee.dev/file.txt", 'method': "GET"}
        ]

        generated_request_list = await Actor.create_request_list(
            actor_start_urls_input=example_start_urls_input, http_client=mocked_http_client
        )
        generated_requests = []
        while request := await generated_request_list.fetch_next_request():
            generated_requests.append(request)

    assert {generated_request.url for generated_request in generated_requests} == set(expected_urls)

0 commit comments

Comments
 (0)