|
2 | 2 |
|
3 | 3 | import asyncio
|
4 | 4 | import os
|
| 5 | +import re |
5 | 6 | import sys
|
6 | 7 | from datetime import timedelta
|
| 8 | +from itertools import chain |
7 | 9 | from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast
|
8 | 10 |
|
| 11 | +from crawlee.http_clients import HttpxHttpClient, HttpResponse, BaseHttpClient |
9 | 12 | from lazy_object_proxy import Proxy
|
10 | 13 | from pydantic import AliasChoices
|
11 | 14 | from typing_extensions import Self
|
|
39 | 42 |
|
MainReturnType = TypeVar('MainReturnType')

# Matches a single http(s) URL. Commas are not in any character class, so a
# comma-separated (or newline-separated) list of URLs can be scanned with
# `finditer` and split safely. ASCII-only port of the JS original:
#   /https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?/giu;
# TODO: support Unicode (\p{L}) host names like the JS version, e.g. via the
# third-party `regex` package (stdlib `re` has no \p{L}).
URL_NO_COMMAS_REGEX = re.compile(
    r'https?://(www\.)?'                                           # scheme + optional www.
    r'([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])'  # host label(s)
    r'\.[a-z]{2,63}'                                               # TLD
    r'(:\d{1,5})?'                                                 # optional port
    r'(/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?'                             # optional path/query
)
42 | 49 |
|
43 | 50 | class _ActorType:
|
44 | 51 | """The class of `Actor`. Only make a new instance if you're absolutely sure you need to."""
|
@@ -976,20 +983,44 @@ async def create_proxy_configuration(
|
976 | 983 | return proxy_configuration
|
977 | 984 |
|
978 | 985 | @staticmethod
|
979 |
| - def create_request_list(*, actor_start_urls_input: dict) -> RequestList: |
980 |
| - return RequestList( |
981 |
| - requests=[ |
| 986 | + async def create_request_list(*, actor_start_urls_input: dict, http_client: BaseHttpClient = HttpxHttpClient()) -> RequestList: |
| 987 | + simple_url_requests_inputs = [request_input for request_input in actor_start_urls_input if "url" in request_input] |
| 988 | + remote_url_requests_inputs = [request_input for request_input in actor_start_urls_input if "requestsFromUrl" in request_input] |
| 989 | + |
| 990 | + simple_url_requests = Actor._create_requests_from_input(simple_url_requests_inputs) |
| 991 | + remote_url_requests = await Actor._create_requests_from_url(remote_url_requests_inputs, http_client=http_client) |
| 992 | + |
| 993 | + return RequestList(requests=simple_url_requests + remote_url_requests) |
| 994 | + |
| 995 | + @staticmethod |
| 996 | + def _create_requests_from_input(simple_url_requests_inputs: list[dict[str,str]]) -> list[Request]: |
| 997 | + return [ |
982 | 998 | Request.from_url(
|
983 | 999 | method=request_input.get('method'),
|
984 | 1000 | url=request_input.get('url'),
|
985 | 1001 | payload=request_input.get('payload', '').encode('utf-8'),
|
986 | 1002 | headers=request_input.get('headers', {}),
|
987 | 1003 | user_data=request_input.get('userData', {}),
|
988 | 1004 | )
|
989 |
| - for request_input in actor_start_urls_input |
990 |
| - ] |
991 |
| - ) |
| 1005 | + for request_input in simple_url_requests_inputs] |
992 | 1006 |
|
| 1007 | + @staticmethod |
| 1008 | + async def _create_requests_from_url(remote_url_requests_inputs: list[dict[str,str]], http_client: BaseHttpClient ) -> list[Request]: |
| 1009 | + remote_url_requests = [] |
| 1010 | + for input in remote_url_requests_inputs: |
| 1011 | + remote_url_requests.append(asyncio.create_task(http_client.send_request( |
| 1012 | + url=input["requestsFromUrl"], |
| 1013 | + headers=input.get("headers", {}), |
| 1014 | + payload=input.get("payload", "").encode('utf-8'), |
| 1015 | + ))) |
| 1016 | + await asyncio.gather(*remote_url_requests) |
| 1017 | + # TODO as callbacks |
| 1018 | + return list(chain.from_iterable((Actor.extract_requests_from_response(finished_request.result()) for finished_request in remote_url_requests))) |
| 1019 | + |
| 1020 | + @staticmethod |
| 1021 | + def extract_requests_from_response(response: HttpResponse) -> list[Request]: |
| 1022 | + matches = list(re.finditer(URL_NO_COMMAS_REGEX, response.read().decode('utf-8'))) |
| 1023 | + return [Request.from_url(match.group(0)) for match in matches] |
993 | 1024 |
|
# Lazy module-level singleton: the Proxy defers constructing `_ActorType()`
# until first attribute access, while `cast` keeps static type checkers
# treating `Actor` as a plain `_ActorType` instance.
Actor = cast(_ActorType, Proxy(_ActorType))
"""The entry point of the SDK, through which all the Actor operations should be done."""
|
0 commit comments