From cc940235936aa8142a88065804dfcf3b2f11a88b Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 12 Nov 2024 14:10:58 +0100 Subject: [PATCH 01/20] Draft example of helper function to create RequestList --- src/apify/_actor.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index f60a99df..3d6255f1 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -6,6 +6,7 @@ from datetime import timedelta from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast +from crawlee.storages import RequestList from lazy_object_proxy import Proxy from pydantic import AliasChoices from typing_extensions import Self @@ -13,7 +14,7 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value -from crawlee import service_container +from crawlee import service_container, Request from crawlee.events._types import Event, EventPersistStateData from apify._configuration import Configuration @@ -974,6 +975,20 @@ async def create_proxy_configuration( return proxy_configuration + @staticmethod + def create_request_list( + *, + actor_start_urls_input: dict + ) ->RequestList: + return RequestList(requests=[ + Request.from_url( + method=request_input.get("method"), + url=request_input.get("url"), + payload=request_input.get("payload", "").encode("utf-8"), + headers=request_input.get("headers", {}), + user_data=request_input.get("userData", {}), + ) for request_input in actor_start_urls_input]) + Actor = cast(_ActorType, Proxy(_ActorType)) """The entry point of the SDK, through which all the Actor operations should be done.""" From ded055d893b3349bfd658eeb9910298748958459 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 14 Nov 2024 14:41:29 +0100 Subject: [PATCH 02/20] Add test for simple input --- src/apify/_actor.py | 29 +++++++------- .../test_actor_create_proxy_configuration.py | 40 +++++++++++++++++++ 2 files changed, 55 insertions(+), 14 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 3d6255f1..06bc2fb8 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -6,7 +6,6 @@ from datetime import timedelta from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast -from crawlee.storages import RequestList from lazy_object_proxy import Proxy from pydantic import AliasChoices from typing_extensions import Self @@ -14,8 +13,9 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value -from crawlee import service_container, Request +from crawlee import Request, service_container from crawlee.events._types import Event, EventPersistStateData +from crawlee.storages import RequestList from apify._configuration import Configuration from apify._consts import EVENT_LISTENERS_TIMEOUT @@ -976,18 +976,19 @@ async def create_proxy_configuration( return proxy_configuration @staticmethod - def create_request_list( - *, - actor_start_urls_input: dict - ) ->RequestList: - return RequestList(requests=[ - Request.from_url( - method=request_input.get("method"), - url=request_input.get("url"), - payload=request_input.get("payload", "").encode("utf-8"), - headers=request_input.get("headers", {}), - user_data=request_input.get("userData", {}), - ) for request_input in actor_start_urls_input]) + def create_request_list(*, 
actor_start_urls_input: dict) -> RequestList: + return RequestList( + requests=[ + Request.from_url( + method=request_input.get('method'), + url=request_input.get('url'), + payload=request_input.get('payload', '').encode('utf-8'), + headers=request_input.get('headers', {}), + user_data=request_input.get('userData', {}), + ) + for request_input in actor_start_urls_input + ] + ) Actor = cast(_ActorType, Proxy(_ActorType)) diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index e0c7cd57..b5dd293e 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -1,5 +1,6 @@ from __future__ import annotations +import typing from typing import TYPE_CHECKING import httpx @@ -7,6 +8,8 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars +from crawlee._request import UserData +from crawlee._types import HttpHeaders, HttpMethod from apify import Actor @@ -141,3 +144,40 @@ async def test_proxy_configuration_with_actor_proxy_input( assert len(route.calls) == 2 await Actor.exit() + + +@pytest.mark.parametrize('request_method', typing.get_args(HttpMethod)) +@pytest.mark.parametrize( + 'optional_input', + [ + {}, + {'payload': 'some payload', 'userData': {'some key': 'some value'}, 'headers': {'h1': 'v1', 'h2': 'v2'}}, + ], + ids=['minimal', 'all_options'], +) +async def test_actor_create_request_list_request_types( + request_method: HttpMethod, optional_input: dict[str, str] +) -> None: + """Tests proper request list generation from both minimal and full inputs for all method types.""" + minimal_request_dict_input = {'url': 'https://www.abc.com', 'method': request_method} + request_dict_input = {**minimal_request_dict_input, **optional_input} + example_start_urls_input = [ + request_dict_input, + ] + + generated_request_list = Actor.create_request_list(actor_start_urls_input=example_start_urls_input) + + assert not await generated_request_list.is_empty() + generated_request = await generated_request_list.fetch_next_request() + assert await generated_request_list.is_empty() + + assert generated_request.method == request_dict_input['method'] + assert generated_request.url == request_dict_input['url'] + assert generated_request.payload == request_dict_input.get('payload', '').encode('utf-8') + expected_user_data = UserData() + if 'userData' in optional_input: + for key, value in optional_input['userData'].items(): + expected_user_data[key] = value + assert generated_request.user_data == expected_user_data + expected_headers = HttpHeaders(root=optional_input.get('headers', {})) + assert generated_request.headers == expected_headers From 57dd329001c123f815c9306fcbea78a683fe7bc9 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 15 Nov 2024 11:32:10 +0100 Subject: [PATCH 03/20] WIP --- src/apify/_actor.py | 43 +++++++++++++++--- .../test_actor_create_proxy_configuration.py | 45 ++++++++++++++++++- 2 files changed, 81 insertions(+), 7 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 06bc2fb8..06023400 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -2,10 +2,13 @@ import asyncio import os +import re import sys from datetime import timedelta +from itertools import chain from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast +from crawlee.http_clients import HttpxHttpClient, HttpResponse, BaseHttpClient from lazy_object_proxy import Proxy from pydantic import AliasChoices 
from typing_extensions import Self @@ -39,6 +42,10 @@ MainReturnType = TypeVar('MainReturnType') +URL_NO_COMMAS_REGEX = re.compile(r"https?:\/\/(www\.)?([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?") +# JS version. TODO rewrite to Python regexp +# /https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?/giu; + class _ActorType: """The class of `Actor`. Only make a new instance if you're absolutely sure you need to.""" @@ -976,9 +983,18 @@ async def create_proxy_configuration( return proxy_configuration @staticmethod - def create_request_list(*, actor_start_urls_input: dict) -> RequestList: - return RequestList( - requests=[ + async def create_request_list(*, actor_start_urls_input: dict, http_client: BaseHttpClient = HttpxHttpClient()) -> RequestList: + simple_url_requests_inputs = [request_input for request_input in actor_start_urls_input if "url" in request_input] + remote_url_requests_inputs = [request_input for request_input in actor_start_urls_input if "requestsFromUrl" in request_input] + + simple_url_requests = Actor._create_requests_from_input(simple_url_requests_inputs) + remote_url_requests = await Actor._create_requests_from_url(remote_url_requests_inputs, http_client=http_client) + + return RequestList(requests=simple_url_requests + remote_url_requests) + + @staticmethod + def _create_requests_from_input(simple_url_requests_inputs: list[dict[str,str]]) -> list[Request]: + return [ Request.from_url( method=request_input.get('method'), url=request_input.get('url'), @@ -986,10 +1002,25 @@ def create_request_list(*, actor_start_urls_input: dict) -> RequestList: headers=request_input.get('headers', {}), user_data=request_input.get('userData', {}), ) - for request_input in actor_start_urls_input - ] - ) + for request_input in simple_url_requests_inputs] + @staticmethod + async def _create_requests_from_url(remote_url_requests_inputs: list[dict[str,str]], http_client: BaseHttpClient ) -> list[Request]: + remote_url_requests = [] + for input in remote_url_requests_inputs: + remote_url_requests.append(asyncio.create_task(http_client.send_request( + url=input["requestsFromUrl"], + headers=input.get("headers", {}), + payload=input.get("payload", "").encode('utf-8'), + ))) + await asyncio.gather(*remote_url_requests) + # TODO as callbacks + return list(chain.from_iterable((Actor.extract_requests_from_response(finished_request.result()) for finished_request in remote_url_requests))) + + @staticmethod + def extract_requests_from_response(response: HttpResponse) -> list[Request]: + matches = list(re.finditer(URL_NO_COMMAS_REGEX, response.read().decode('utf-8'))) + return [Request.from_url(match.group(0)) for match in matches] Actor = cast(_ActorType, Proxy(_ActorType)) """The entry point of the SDK, through which all the Actor operations should be done.""" diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index b5dd293e..93440996 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -2,6 +2,7 @@ import typing from typing import TYPE_CHECKING +from unittest import mock import httpx import pytest @@ -10,6 +11,7 @@ from apify_shared.consts import ApifyEnvVars from crawlee._request import UserData from crawlee._types import HttpHeaders, HttpMethod +from crawlee.http_clients import 
HttpxHttpClient, HttpResponse from apify import Actor @@ -165,7 +167,7 @@ async def test_actor_create_request_list_request_types( request_dict_input, ] - generated_request_list = Actor.create_request_list(actor_start_urls_input=example_start_urls_input) + generated_request_list =await Actor.create_request_list(actor_start_urls_input=example_start_urls_input) assert not await generated_request_list.is_empty() generated_request = await generated_request_list.fetch_next_request() @@ -181,3 +183,44 @@ async def test_actor_create_request_list_request_types( assert generated_request.user_data == expected_user_data expected_headers = HttpHeaders(root=optional_input.get('headers', {})) assert generated_request.headers == expected_headers + + +async def test_actor_create_request_list_from_url(): + expected_urls = {"http://www.something.com", "https://www.something_else.com", "http://www.bla.net"} + response_body = "blablabla{} more blablabla{} ,\n even more blablbablba.{}".format(*expected_urls) + mocked_http_client = HttpxHttpClient() + class DummyResponse(HttpResponse): + @property + def http_version(self) -> str: + """The HTTP version used in the response.""" + return "" + + @property + def status_code(self) -> int: + """The HTTP status code received from the server.""" + return 200 + + @property + def headers(self) -> HttpHeaders: + """The HTTP headers received in the response.""" + return HttpHeaders() + + def read(self) -> bytes: + return response_body.encode('utf-8') + + + async def mocked_send_request(*args, **kwargs): + return DummyResponse() + with mock.patch.object(mocked_http_client, "send_request", mocked_send_request) as mocked_send_request2: + + example_start_urls_input = [ + {"requestsFromUrl": "https://crawlee.dev/file.txt", 'method': "GET"} + ] + + + generated_request_list =await Actor.create_request_list(actor_start_urls_input=example_start_urls_input, http_client=mocked_http_client) + generated_requests = [] + while request:= await generated_request_list.fetch_next_request(): + generated_requests.append(request) + + assert set(generated_request.url for generated_request in generated_requests) == expected_urls From 0a465be93c4ca5d020593529c7e2c3ea1531d5ff Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 15 Nov 2024 14:19:59 +0100 Subject: [PATCH 04/20] WIP Finalize tests. Split to its own file. 
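
For context, a minimal usage sketch of the helper finalized in this patch (not part of the diff): the 'startUrls' input field name and the surrounding boilerplate are assumptions for illustration; only Actor.create_request_list and its actor_start_urls_input argument come from this change.

    from apify import Actor

    async def main() -> None:
        async with Actor:
            actor_input = await Actor.get_input() or {}
            # 'startUrls' is an assumed input field name for this sketch; the helper
            # accepts both plain {'url': ...} and {'requestsFromUrl': ...} entries.
            request_list = await Actor.create_request_list(
                actor_start_urls_input=actor_input.get('startUrls', []),
            )
            while request := await request_list.fetch_next_request():
                Actor.log.info(f'Fetched request for {request.url}')
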
--- src/apify/_actor.py | 58 ++++-------- src/apify/_actor_inputs.py | 73 +++++++++++++++ .../test_actor_create_proxy_configuration.py | 90 ++++++++++++++----- 3 files changed, 158 insertions(+), 63 deletions(-) create mode 100644 src/apify/_actor_inputs.py diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 06023400..a30e189e 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -8,7 +8,6 @@ from itertools import chain from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast -from crawlee.http_clients import HttpxHttpClient, HttpResponse, BaseHttpClient from lazy_object_proxy import Proxy from pydantic import AliasChoices from typing_extensions import Self @@ -18,8 +17,10 @@ from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value from crawlee import Request, service_container from crawlee.events._types import Event, EventPersistStateData +from crawlee.http_clients import BaseHttpClient, HttpResponse, HttpxHttpClient from crawlee.storages import RequestList +from apify._actor_inputs import _create_request_list from apify._configuration import Configuration from apify._consts import EVENT_LISTENERS_TIMEOUT from apify._crypto import decrypt_input_secrets, load_private_key @@ -42,9 +43,6 @@ MainReturnType = TypeVar('MainReturnType') -URL_NO_COMMAS_REGEX = re.compile(r"https?:\/\/(www\.)?([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?") -# JS version. TODO rewrite to Python regexp -# /https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?/giu; class _ActorType: @@ -983,44 +981,20 @@ async def create_proxy_configuration( return proxy_configuration @staticmethod - async def create_request_list(*, actor_start_urls_input: dict, http_client: BaseHttpClient = HttpxHttpClient()) -> RequestList: - simple_url_requests_inputs = [request_input for request_input in actor_start_urls_input if "url" in request_input] - remote_url_requests_inputs = [request_input for request_input in actor_start_urls_input if "requestsFromUrl" in request_input] - - simple_url_requests = Actor._create_requests_from_input(simple_url_requests_inputs) - remote_url_requests = await Actor._create_requests_from_url(remote_url_requests_inputs, http_client=http_client) - - return RequestList(requests=simple_url_requests + remote_url_requests) - - @staticmethod - def _create_requests_from_input(simple_url_requests_inputs: list[dict[str,str]]) -> list[Request]: - return [ - Request.from_url( - method=request_input.get('method'), - url=request_input.get('url'), - payload=request_input.get('payload', '').encode('utf-8'), - headers=request_input.get('headers', {}), - user_data=request_input.get('userData', {}), - ) - for request_input in simple_url_requests_inputs] - - @staticmethod - async def _create_requests_from_url(remote_url_requests_inputs: list[dict[str,str]], http_client: BaseHttpClient ) -> list[Request]: - remote_url_requests = [] - for input in remote_url_requests_inputs: - remote_url_requests.append(asyncio.create_task(http_client.send_request( - url=input["requestsFromUrl"], - headers=input.get("headers", {}), - payload=input.get("payload", "").encode('utf-8'), - ))) - await asyncio.gather(*remote_url_requests) - # TODO as callbacks - return list(chain.from_iterable((Actor.extract_requests_from_response(finished_request.result()) for finished_request in remote_url_requests))) - - @staticmethod - def 
extract_requests_from_response(response: HttpResponse) -> list[Request]: - matches = list(re.finditer(URL_NO_COMMAS_REGEX, response.read().decode('utf-8'))) - return [Request.from_url(match.group(0)) for match in matches] + async def create_request_list( + *, actor_start_urls_input: list[dict[str,str]], http_client: BaseHttpClient | None= None + ) -> RequestList: + """Creates request list from Actor input requestListSources. This accepts list of urls and requestsFromUrl. + + Example: + actor_start_urls_input = [ + # Gather urls from response body. + {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + # Directly include this url. + {'url': 'https://crawlee.dev', 'method': 'GET'} + ] + """ + return await _create_request_list(actor_start_urls_input=actor_start_urls_input, http_client=http_client) Actor = cast(_ActorType, Proxy(_ActorType)) """The entry point of the SDK, through which all the Actor operations should be done.""" diff --git a/src/apify/_actor_inputs.py b/src/apify/_actor_inputs.py new file mode 100644 index 00000000..bf7d6fca --- /dev/null +++ b/src/apify/_actor_inputs.py @@ -0,0 +1,73 @@ +import asyncio +from itertools import chain +import re + +from crawlee import Request +from crawlee.http_clients import BaseHttpClient, HttpxHttpClient, HttpResponse +from crawlee.storages import RequestList + +URL_NO_COMMAS_REGEX = re.compile( + r'https?:\/\/(www\.)?([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?' +) + +@staticmethod +async def _create_request_list( + *, actor_start_urls_input: dict, http_client: BaseHttpClient | None = None +) -> RequestList: + if not http_client: + http_client = HttpxHttpClient() + simple_url_requests_inputs = [ + request_input for request_input in actor_start_urls_input if 'url' in request_input + ] + remote_url_requests_inputs = [ + request_input for request_input in actor_start_urls_input if 'requestsFromUrl' in request_input + ] + + simple_url_requests = _create_requests_from_input(simple_url_requests_inputs) + remote_url_requests = await _create_requests_from_url(remote_url_requests_inputs, http_client=http_client) + + return RequestList(requests=simple_url_requests + remote_url_requests) + + +@staticmethod +def _create_requests_from_input(simple_url_requests_inputs: list[dict[str, str]]) -> list[Request]: + return [ + Request.from_url( + method=request_input.get('method'), + url=request_input.get('url'), + payload=request_input.get('payload', '').encode('utf-8'), + headers=request_input.get('headers', {}), + user_data=request_input.get('userData', {}), + ) + for request_input in simple_url_requests_inputs + ] + + +@staticmethod +async def _create_requests_from_url( + remote_url_requests_inputs: list[dict[str, str]], http_client: BaseHttpClient +) -> list[Request]: + remote_url_requests = [] + for request_input in remote_url_requests_inputs: + remote_url_requests.append( + asyncio.create_task( + http_client.send_request( + method=request_input['method'], + url=request_input['requestsFromUrl'], + headers=request_input.get('headers', {}), + payload=request_input.get('payload', '').encode('utf-8'), + ) + ) + ) + await asyncio.gather(*remote_url_requests) + # TODO as callbacks + a = list( + extract_requests_from_response(finished_request.result()) for finished_request in remote_url_requests + ) + return list(chain.from_iterable(a)) + + +@staticmethod +def extract_requests_from_response(response: HttpResponse) -> list[Request]: + matches = 
list(re.finditer(URL_NO_COMMAS_REGEX, response.read().decode('utf-8'))) + return [Request.from_url(match.group(0)) for match in matches] diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index 93440996..0ed45f52 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -3,6 +3,7 @@ import typing from typing import TYPE_CHECKING from unittest import mock +from unittest.mock import call import httpx import pytest @@ -11,7 +12,7 @@ from apify_shared.consts import ApifyEnvVars from crawlee._request import UserData from crawlee._types import HttpHeaders, HttpMethod -from crawlee.http_clients import HttpxHttpClient, HttpResponse +from crawlee.http_clients import HttpResponse, HttpxHttpClient from apify import Actor @@ -160,14 +161,14 @@ async def test_proxy_configuration_with_actor_proxy_input( async def test_actor_create_request_list_request_types( request_method: HttpMethod, optional_input: dict[str, str] ) -> None: - """Tests proper request list generation from both minimal and full inputs for all method types.""" + """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" minimal_request_dict_input = {'url': 'https://www.abc.com', 'method': request_method} request_dict_input = {**minimal_request_dict_input, **optional_input} example_start_urls_input = [ request_dict_input, ] - generated_request_list =await Actor.create_request_list(actor_start_urls_input=example_start_urls_input) + generated_request_list = await Actor.create_request_list(actor_start_urls_input=example_start_urls_input) assert not await generated_request_list.is_empty() generated_request = await generated_request_list.fetch_next_request() @@ -185,42 +186,89 @@ async def test_actor_create_request_list_request_types( assert generated_request.headers == expected_headers -async def test_actor_create_request_list_from_url(): - expected_urls = {"http://www.something.com", "https://www.something_else.com", "http://www.bla.net"} - response_body = "blablabla{} more blablabla{} ,\n even more blablbablba.{}".format(*expected_urls) - mocked_http_client = HttpxHttpClient() +def _create_dummy_response(read_output: typing.Iterable[str]) -> HttpResponse: + """Create dummy_response that will iterate through read_output when called like dummy_response.read()""" + class DummyResponse(HttpResponse): @property def http_version(self) -> str: - """The HTTP version used in the response.""" - return "" + return '' @property def status_code(self) -> int: - """The HTTP status code received from the server.""" return 200 @property def headers(self) -> HttpHeaders: - """The HTTP headers received in the response.""" return HttpHeaders() def read(self) -> bytes: - return response_body.encode('utf-8') + return next(read_output).encode('utf-8') + return DummyResponse() - async def mocked_send_request(*args, **kwargs): - return DummyResponse() - with mock.patch.object(mocked_http_client, "send_request", mocked_send_request) as mocked_send_request2: - example_start_urls_input = [ - {"requestsFromUrl": "https://crawlee.dev/file.txt", 'method': "GET"} - ] +async def test_actor_create_request_list_from_url_correctly_send_requests() -> None: + """Test that injected HttpClient's method send_request is called with properly passed arguments.""" + example_start_urls_input = [ + {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + 
{'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'PUT'}, + { + 'requestsFromUrl': 'https://www.something.som', + 'method': 'POST', + 'headers': {'key': 'value'}, + 'payload': 'some_payload', + 'userData': 'irrelevant', + }, + ] + mocked_read_outputs = ('' for url in example_start_urls_input) + http_client = HttpxHttpClient() + with mock.patch.object( + http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs) + ) as mocked_send_request: + await Actor.create_request_list(actor_start_urls_input=example_start_urls_input, http_client=http_client) + + expected_calls = [ + call( + method=example_input['method'], + url=example_input['requestsFromUrl'], + headers=example_input.get('headers', {}), + payload=example_input.get('payload', '').encode('utf-8'), + ) + for example_input in example_start_urls_input + ] + mocked_send_request.assert_has_calls(expected_calls) + + +async def test_actor_create_request_list_from_url() -> None: + """Test that create_request_list is correctly reading urls from remote url sources and also from simple input.""" + expected_simple_url = 'https://www.someurl.com' + expected_remote_urls_1 = {'http://www.something.com', 'https://www.somethingelse.com', 'http://www.bla.net'} + expected_remote_urls_2 = {'http://www.ok.com', 'https://www.true-positive.com'} + expected_urls = expected_remote_urls_1 | expected_remote_urls_2 | {expected_simple_url} + response_bodies = iter( + ( + 'blablabla{} more blablabla{} , even more blablabla. {} '.format(*expected_remote_urls_1), + 'some stuff{} more stuff{} www.falsepositive www.false_positive.com'.format(*expected_remote_urls_2), + ) + ) - generated_request_list =await Actor.create_request_list(actor_start_urls_input=example_start_urls_input, http_client=mocked_http_client) + example_start_urls_input = [ + {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + {'url': expected_simple_url, 'method': 'GET'}, + {'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'GET'}, + ] + + http_client = HttpxHttpClient() + with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): + generated_request_list = await Actor.create_request_list( + actor_start_urls_input=example_start_urls_input, http_client=http_client + ) generated_requests = [] - while request:= await generated_request_list.fetch_next_request(): + while request := await generated_request_list.fetch_next_request(): + print(request) generated_requests.append(request) - assert set(generated_request.url for generated_request in generated_requests) == expected_urls + # Check correctly created requests' urls in request list + assert {generated_request.url for generated_request in generated_requests} == expected_urls From 4b167374bd4c5bc6f8bf71a8c563d9511f835a8f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 15 Nov 2024 15:43:08 +0100 Subject: [PATCH 05/20] Use Pydantic to handle raw inputs Fix typing issues WIP --- pyproject.toml | 3 + src/apify/_actor.py | 18 +-- src/apify/_actor_inputs.py | 83 ++++++----- src/apify/_platform_event_manager.py | 2 +- .../test_actor_create_proxy_configuration.py | 131 ----------------- .../actor/test_actor_create_request_list.py | 139 ++++++++++++++++++ 6 files changed, 201 insertions(+), 175 deletions(-) create mode 100644 tests/unit/actor/test_actor_create_request_list.py diff --git a/pyproject.toml b/pyproject.toml index 0d41756e..c3a01c41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -141,6 +141,9 @@ indent-style = "space" 
docstring-quotes = "double" inline-quotes = "single" +[tool.ruff.lint.flake8-type-checking] +runtime-evaluated-base-classes = ["pydantic.BaseModel"] + [tool.ruff.lint.flake8-builtins] builtins-ignorelist = ["id"] diff --git a/src/apify/_actor.py b/src/apify/_actor.py index a30e189e..872416a0 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -2,10 +2,8 @@ import asyncio import os -import re import sys from datetime import timedelta -from itertools import chain from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast from lazy_object_proxy import Proxy @@ -15,10 +13,8 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value -from crawlee import Request, service_container +from crawlee import service_container from crawlee.events._types import Event, EventPersistStateData -from crawlee.http_clients import BaseHttpClient, HttpResponse, HttpxHttpClient -from crawlee.storages import RequestList from apify._actor_inputs import _create_request_list from apify._configuration import Configuration @@ -36,7 +32,9 @@ import logging from types import TracebackType + from crawlee.http_clients import BaseHttpClient from crawlee.proxy_configuration import _NewUrlFunction + from crawlee.storages import RequestList from apify._models import Webhook @@ -44,7 +42,6 @@ MainReturnType = TypeVar('MainReturnType') - class _ActorType: """The class of `Actor`. Only make a new instance if you're absolutely sure you need to.""" @@ -982,19 +979,20 @@ async def create_proxy_configuration( @staticmethod async def create_request_list( - *, actor_start_urls_input: list[dict[str,str]], http_client: BaseHttpClient | None= None + *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None ) -> RequestList: - """Creates request list from Actor input requestListSources. This accepts list of urls and requestsFromUrl. + """Creates request list from Actor input requestListSources. This accepts list of urls and requests_from_url. Example: actor_start_urls_input = [ # Gather urls from response body. - {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + {'requests_from_url': 'https://crawlee.dev/file.txt', 'method': 'GET'}, # Directly include this url. {'url': 'https://crawlee.dev', 'method': 'GET'} ] - """ + """ return await _create_request_list(actor_start_urls_input=actor_start_urls_input, http_client=http_client) + Actor = cast(_ActorType, Proxy(_ActorType)) """The entry point of the SDK, through which all the Actor operations should be done.""" diff --git a/src/apify/_actor_inputs.py b/src/apify/_actor_inputs.py index bf7d6fca..921064aa 100644 --- a/src/apify/_actor_inputs.py +++ b/src/apify/_actor_inputs.py @@ -1,26 +1,46 @@ +from __future__ import annotations + import asyncio -from itertools import chain import re +from asyncio import Task +from typing import Any + +from pydantic import BaseModel, Field from crawlee import Request -from crawlee.http_clients import BaseHttpClient, HttpxHttpClient, HttpResponse +from crawlee._types import HttpMethod # TODO: Make public in Crawlee? +from crawlee.http_clients import BaseHttpClient, HttpxHttpClient from crawlee.storages import RequestList URL_NO_COMMAS_REGEX = re.compile( r'https?:\/\/(www\.)?([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?' 
) +class _RequestDetails(BaseModel): + method: HttpMethod + payload: str = '' + headers: dict[str, str] = Field(default_factory=dict) + user_data: dict[str, str]= Field(default_factory=dict, alias='user_data') + +class _RequestsFromUrlInput(_RequestDetails): + requests_from_url: str = Field(alias='requests_from_url') + +class _SimpleUrlInput(_RequestDetails): + url: str + + @staticmethod async def _create_request_list( - *, actor_start_urls_input: dict, http_client: BaseHttpClient | None = None + *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None ) -> RequestList: if not http_client: http_client = HttpxHttpClient() simple_url_requests_inputs = [ - request_input for request_input in actor_start_urls_input if 'url' in request_input - ] + _SimpleUrlInput(**request_input) for request_input in actor_start_urls_input + if 'url' in request_input] remote_url_requests_inputs = [ - request_input for request_input in actor_start_urls_input if 'requestsFromUrl' in request_input + _RequestsFromUrlInput(**request_input) for request_input in actor_start_urls_input + if 'requests_from_url' in request_input ] simple_url_requests = _create_requests_from_input(simple_url_requests_inputs) @@ -30,44 +50,41 @@ async def _create_request_list( @staticmethod -def _create_requests_from_input(simple_url_requests_inputs: list[dict[str, str]]) -> list[Request]: +def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]: return [ Request.from_url( - method=request_input.get('method'), - url=request_input.get('url'), - payload=request_input.get('payload', '').encode('utf-8'), - headers=request_input.get('headers', {}), - user_data=request_input.get('userData', {}), + method=request_input.method, + url=request_input.url, + payload=request_input.payload.encode('utf-8'), + headers=request_input.headers, + user_data=request_input.user_data, ) - for request_input in simple_url_requests_inputs + for request_input in simple_url_inputs ] @staticmethod async def _create_requests_from_url( - remote_url_requests_inputs: list[dict[str, str]], http_client: BaseHttpClient + remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient ) -> list[Request]: + created_requests: list[Request] = [] + + def extract_requests_from_response(task: Task) -> list[Request]: + matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) + created_requests.extend([Request.from_url(match.group(0)) for match in matches]) + remote_url_requests = [] for request_input in remote_url_requests_inputs: - remote_url_requests.append( - asyncio.create_task( - http_client.send_request( - method=request_input['method'], - url=request_input['requestsFromUrl'], - headers=request_input.get('headers', {}), - payload=request_input.get('payload', '').encode('utf-8'), - ) + task = asyncio.create_task( + http_client.send_request( + method=request_input.method, + url=request_input.requests_from_url, + headers=request_input.headers, + payload=request_input.payload.encode('utf-8'), ) ) - await asyncio.gather(*remote_url_requests) - # TODO as callbacks - a = list( - extract_requests_from_response(finished_request.result()) for finished_request in remote_url_requests - ) - return list(chain.from_iterable(a)) + task.add_done_callback(extract_requests_from_response) + remote_url_requests.append(task) - -@staticmethod -def extract_requests_from_response(response: HttpResponse) -> list[Request]: - matches = list(re.finditer(URL_NO_COMMAS_REGEX, 
response.read().decode('utf-8'))) - return [Request.from_url(match.group(0)) for match in matches] + await asyncio.gather(*remote_url_requests) + return created_requests diff --git a/src/apify/_platform_event_manager.py b/src/apify/_platform_event_manager.py index afbf2c5d..348438bf 100644 --- a/src/apify/_platform_event_manager.py +++ b/src/apify/_platform_event_manager.py @@ -1,7 +1,7 @@ from __future__ import annotations import asyncio -from datetime import datetime # noqa: TCH003 +from datetime import datetime from typing import TYPE_CHECKING, Annotated, Any, Literal, Union import websockets.client diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index 0ed45f52..e0c7cd57 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -1,18 +1,12 @@ from __future__ import annotations -import typing from typing import TYPE_CHECKING -from unittest import mock -from unittest.mock import call import httpx import pytest from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars -from crawlee._request import UserData -from crawlee._types import HttpHeaders, HttpMethod -from crawlee.http_clients import HttpResponse, HttpxHttpClient from apify import Actor @@ -147,128 +141,3 @@ async def test_proxy_configuration_with_actor_proxy_input( assert len(route.calls) == 2 await Actor.exit() - - -@pytest.mark.parametrize('request_method', typing.get_args(HttpMethod)) -@pytest.mark.parametrize( - 'optional_input', - [ - {}, - {'payload': 'some payload', 'userData': {'some key': 'some value'}, 'headers': {'h1': 'v1', 'h2': 'v2'}}, - ], - ids=['minimal', 'all_options'], -) -async def test_actor_create_request_list_request_types( - request_method: HttpMethod, optional_input: dict[str, str] -) -> None: - """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" - minimal_request_dict_input = {'url': 'https://www.abc.com', 'method': request_method} - request_dict_input = {**minimal_request_dict_input, **optional_input} - example_start_urls_input = [ - request_dict_input, - ] - - generated_request_list = await Actor.create_request_list(actor_start_urls_input=example_start_urls_input) - - assert not await generated_request_list.is_empty() - generated_request = await generated_request_list.fetch_next_request() - assert await generated_request_list.is_empty() - - assert generated_request.method == request_dict_input['method'] - assert generated_request.url == request_dict_input['url'] - assert generated_request.payload == request_dict_input.get('payload', '').encode('utf-8') - expected_user_data = UserData() - if 'userData' in optional_input: - for key, value in optional_input['userData'].items(): - expected_user_data[key] = value - assert generated_request.user_data == expected_user_data - expected_headers = HttpHeaders(root=optional_input.get('headers', {})) - assert generated_request.headers == expected_headers - - -def _create_dummy_response(read_output: typing.Iterable[str]) -> HttpResponse: - """Create dummy_response that will iterate through read_output when called like dummy_response.read()""" - - class DummyResponse(HttpResponse): - @property - def http_version(self) -> str: - return '' - - @property - def status_code(self) -> int: - return 200 - - @property - def headers(self) -> HttpHeaders: - return HttpHeaders() - - def read(self) -> bytes: - return 
next(read_output).encode('utf-8') - - return DummyResponse() - - -async def test_actor_create_request_list_from_url_correctly_send_requests() -> None: - """Test that injected HttpClient's method send_request is called with properly passed arguments.""" - - example_start_urls_input = [ - {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, - {'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'PUT'}, - { - 'requestsFromUrl': 'https://www.something.som', - 'method': 'POST', - 'headers': {'key': 'value'}, - 'payload': 'some_payload', - 'userData': 'irrelevant', - }, - ] - mocked_read_outputs = ('' for url in example_start_urls_input) - http_client = HttpxHttpClient() - with mock.patch.object( - http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs) - ) as mocked_send_request: - await Actor.create_request_list(actor_start_urls_input=example_start_urls_input, http_client=http_client) - - expected_calls = [ - call( - method=example_input['method'], - url=example_input['requestsFromUrl'], - headers=example_input.get('headers', {}), - payload=example_input.get('payload', '').encode('utf-8'), - ) - for example_input in example_start_urls_input - ] - mocked_send_request.assert_has_calls(expected_calls) - - -async def test_actor_create_request_list_from_url() -> None: - """Test that create_request_list is correctly reading urls from remote url sources and also from simple input.""" - expected_simple_url = 'https://www.someurl.com' - expected_remote_urls_1 = {'http://www.something.com', 'https://www.somethingelse.com', 'http://www.bla.net'} - expected_remote_urls_2 = {'http://www.ok.com', 'https://www.true-positive.com'} - expected_urls = expected_remote_urls_1 | expected_remote_urls_2 | {expected_simple_url} - response_bodies = iter( - ( - 'blablabla{} more blablabla{} , even more blablabla. {} '.format(*expected_remote_urls_1), - 'some stuff{} more stuff{} www.falsepositive www.false_positive.com'.format(*expected_remote_urls_2), - ) - ) - - example_start_urls_input = [ - {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, - {'url': expected_simple_url, 'method': 'GET'}, - {'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'GET'}, - ] - - http_client = HttpxHttpClient() - with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - generated_request_list = await Actor.create_request_list( - actor_start_urls_input=example_start_urls_input, http_client=http_client - ) - generated_requests = [] - while request := await generated_request_list.fetch_next_request(): - print(request) - generated_requests.append(request) - - # Check correctly created requests' urls in request list - assert {generated_request.url for generated_request in generated_requests} == expected_urls diff --git a/tests/unit/actor/test_actor_create_request_list.py b/tests/unit/actor/test_actor_create_request_list.py new file mode 100644 index 00000000..c4528f3d --- /dev/null +++ b/tests/unit/actor/test_actor_create_request_list.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +import typing +from unittest import mock +from unittest.mock import call + +import pytest + +from crawlee._request import UserData # TODO: Make public in Crawlee? +from crawlee._types import HttpHeaders, HttpMethod # TODO: Make public in Crawlee? 
+from crawlee.http_clients import HttpResponse, HttpxHttpClient + +from apify import Actor + + +@pytest.mark.parametrize('request_method', typing.get_args(HttpMethod)) +@pytest.mark.parametrize( + 'optional_input', + [ + {}, + {'payload': 'some payload', 'user_data': {'some key': 'some value'}, 'headers': {'h1': 'v1', 'h2': 'v2'}}, + ], + ids=['minimal', 'all_options'], +) +async def test_actor_create_request_list_request_types( + request_method: HttpMethod, optional_input: dict[str, typing.Any] +) -> None: + """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" + minimal_request_dict_input = {'url': 'https://www.abc.com', 'method': request_method} + request_dict_input = {**minimal_request_dict_input, **optional_input} + example_start_urls_input = [ + request_dict_input, + ] + + generated_request_list = await Actor.create_request_list(actor_start_urls_input=example_start_urls_input) + + assert not await generated_request_list.is_empty() + generated_request = await generated_request_list.fetch_next_request() + assert generated_request is not None + assert await generated_request_list.is_empty() + + assert generated_request.method == request_dict_input['method'] + assert generated_request.url == request_dict_input['url'] + assert generated_request.payload == request_dict_input.get('payload', '').encode('utf-8') + expected_user_data = UserData() + if 'user_data' in optional_input: + for key, value in optional_input['user_data'].items(): + expected_user_data[key] = value + assert generated_request.user_data == expected_user_data + expected_headers = HttpHeaders(root=optional_input.get('headers', {})) + assert generated_request.headers == expected_headers + + +def _create_dummy_response(read_output: typing.Iterator[str]) -> HttpResponse: + """Create dummy_response that will iterate through read_output when called like dummy_response.read()""" + + class DummyResponse(HttpResponse): + @property + def http_version(self) -> str: + return '' + + @property + def status_code(self) -> int: + return 200 + + @property + def headers(self) -> HttpHeaders: + return HttpHeaders() + + def read(self) -> bytes: + return next(read_output).encode('utf-8') + + return DummyResponse() + + +async def test_actor_create_request_list_from_url_correctly_send_requests() -> None: + """Test that injected HttpClient's method send_request is called with properly passed arguments.""" + + example_start_urls_input: list[dict[str, typing.Any]] = [ + {'requests_from_url': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + {'requests_from_url': 'https://www.crawlee.dev/file2', 'method': 'PUT'}, + { + 'requests_from_url': 'https://www.something.som', + 'method': 'POST', + 'headers': {'key': 'value'}, + 'payload': 'some_payload', + 'user_data': {'another_key': 'another_value'}, + }, + ] + mocked_read_outputs = ('' for url in example_start_urls_input) + http_client = HttpxHttpClient() + with mock.patch.object( + http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs) + ) as mocked_send_request: + await Actor.create_request_list(actor_start_urls_input=example_start_urls_input, http_client=http_client) + + expected_calls = [ + call( + method=example_input['method'], + url=example_input['requests_from_url'], + headers=example_input.get('headers', {}), + payload=example_input.get('payload', '').encode('utf-8'), + ) + for example_input in example_start_urls_input + ] + mocked_send_request.assert_has_calls(expected_calls) + + +async def 
test_actor_create_request_list_from_url() -> None: + """Test that create_request_list is correctly reading urls from remote url sources and also from simple input.""" + expected_simple_url = 'https://www.someurl.com' + expected_remote_urls_1 = {'http://www.something.com', 'https://www.somethingelse.com', 'http://www.bla.net'} + expected_remote_urls_2 = {'http://www.ok.com', 'https://www.true-positive.com'} + expected_urls = expected_remote_urls_1 | expected_remote_urls_2 | {expected_simple_url} + response_bodies = iter( + ( + 'blablabla{} more blablabla{} , even more blablabla. {} '.format(*expected_remote_urls_1), + 'some stuff{} more stuff{} www.falsepositive www.false_positive.com'.format(*expected_remote_urls_2), + ) + ) + + example_start_urls_input = [ + {'requests_from_url': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + {'url': expected_simple_url, 'method': 'GET'}, + {'requests_from_url': 'https://www.crawlee.dev/file2', 'method': 'GET'}, + ] + + http_client = HttpxHttpClient() + with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): + generated_request_list = await Actor.create_request_list( + actor_start_urls_input=example_start_urls_input, http_client=http_client + ) + generated_requests = [] + while request := await generated_request_list.fetch_next_request(): + print(request) + generated_requests.append(request) + + # Check correctly created requests' urls in request list + assert {generated_request.url for generated_request in generated_requests} == expected_urls From f2c24404e62d4d1668767ebfe68d4fe973257220 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 15 Nov 2024 16:15:26 +0100 Subject: [PATCH 06/20] Properly pass request creation settings. TODO: Finish test for it. WIP --- src/apify/_actor_inputs.py | 28 +++++++++++++------ .../actor/test_actor_create_request_list.py | 8 ++++-- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/src/apify/_actor_inputs.py b/src/apify/_actor_inputs.py index 921064aa..088b9aa5 100644 --- a/src/apify/_actor_inputs.py +++ b/src/apify/_actor_inputs.py @@ -5,6 +5,7 @@ from asyncio import Task from typing import Any +from functools import partial from pydantic import BaseModel, Field from crawlee import Request @@ -67,23 +68,34 @@ def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> lis async def _create_requests_from_url( remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient ) -> list[Request]: + """Crete list of requests from url. + + Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting + callback on each response body and use URL_NO_COMMAS_REGEX regexp to find all links. Create list of Requests from + collected links and additional inputs stored in other attributes of each remote_url_requests_inputs. 
+ """ created_requests: list[Request] = [] - def extract_requests_from_response(task: Task) -> list[Request]: + def create_requests_from_response(request_input: _SimpleUrlInput, task: Task) -> list[Request]: + """Callback to scrape response body with regexp and create Requests from macthes.""" matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) - created_requests.extend([Request.from_url(match.group(0)) for match in matches]) + created_requests.extend([Request.from_url( + match.group(0), + method=request_input.method, + payload=request_input.payload.encode('utf-8'), + headers=request_input.headers, + user_data=request_input.user_data) for match in matches]) remote_url_requests = [] - for request_input in remote_url_requests_inputs: + for remote_url_requests_input in remote_url_requests_inputs: task = asyncio.create_task( http_client.send_request( - method=request_input.method, - url=request_input.requests_from_url, - headers=request_input.headers, - payload=request_input.payload.encode('utf-8'), + method='GET', + url=remote_url_requests_input.requests_from_url, ) ) - task.add_done_callback(extract_requests_from_response) + + task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input)) remote_url_requests.append(task) await asyncio.gather(*remote_url_requests) diff --git a/tests/unit/actor/test_actor_create_request_list.py b/tests/unit/actor/test_actor_create_request_list.py index c4528f3d..f77812ca 100644 --- a/tests/unit/actor/test_actor_create_request_list.py +++ b/tests/unit/actor/test_actor_create_request_list.py @@ -96,10 +96,8 @@ async def test_actor_create_request_list_from_url_correctly_send_requests() -> N expected_calls = [ call( - method=example_input['method'], + method='GET', url=example_input['requests_from_url'], - headers=example_input.get('headers', {}), - payload=example_input.get('payload', '').encode('utf-8'), ) for example_input in example_start_urls_input ] @@ -137,3 +135,7 @@ async def test_actor_create_request_list_from_url() -> None: # Check correctly created requests' urls in request list assert {generated_request.url for generated_request in generated_requests} == expected_urls + +async def test_actor_create_request_list_from_url_additional_inputs() -> None: + assert False + # TODO test that will check that additional properties, like payload, headers request type are all properly passed. From 5af94058bb64e9c7430f36a18cedd60e0a9e458c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Sun, 17 Nov 2024 11:21:17 +0100 Subject: [PATCH 07/20] Add tests fro regexp. Add test for checking all genrated request properties. 
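
For context on what the new regexp tests below cover, this is roughly how URL_NO_COMMAS_REGEX is consumed when scanning a remote response body for links (a standalone sketch, not part of the diff; the sample text is made up):

    import re

    from apify._actor_inputs import URL_NO_COMMAS_REGEX

    # Only absolute http(s) URLs are extracted; bare domains without a scheme are ignored.
    body = 'intro http://www.something.com text\nmore https://with-hypen.com, www.no-scheme.com'
    urls = [match.group(0) for match in re.finditer(URL_NO_COMMAS_REGEX, body)]
    assert urls == ['http://www.something.com', 'https://with-hypen.com']
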
--- src/apify/_actor_inputs.py | 2 +- .../actor/test_actor_create_request_list.py | 73 +++++++++++++++++-- 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/src/apify/_actor_inputs.py b/src/apify/_actor_inputs.py index 088b9aa5..d4048cb0 100644 --- a/src/apify/_actor_inputs.py +++ b/src/apify/_actor_inputs.py @@ -3,9 +3,9 @@ import asyncio import re from asyncio import Task +from functools import partial from typing import Any -from functools import partial from pydantic import BaseModel, Field from crawlee import Request diff --git a/tests/unit/actor/test_actor_create_request_list.py b/tests/unit/actor/test_actor_create_request_list.py index f77812ca..3114f8a2 100644 --- a/tests/unit/actor/test_actor_create_request_list.py +++ b/tests/unit/actor/test_actor_create_request_list.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re import typing from unittest import mock from unittest.mock import call @@ -11,6 +12,7 @@ from crawlee.http_clients import HttpResponse, HttpxHttpClient from apify import Actor +from apify._actor_inputs import URL_NO_COMMAS_REGEX @pytest.mark.parametrize('request_method', typing.get_args(HttpMethod)) @@ -47,8 +49,7 @@ async def test_actor_create_request_list_request_types( for key, value in optional_input['user_data'].items(): expected_user_data[key] = value assert generated_request.user_data == expected_user_data - expected_headers = HttpHeaders(root=optional_input.get('headers', {})) - assert generated_request.headers == expected_headers + assert generated_request.headers.root == optional_input.get('headers', {}) def _create_dummy_response(read_output: typing.Iterator[str]) -> HttpResponse: @@ -130,12 +131,74 @@ async def test_actor_create_request_list_from_url() -> None: ) generated_requests = [] while request := await generated_request_list.fetch_next_request(): - print(request) generated_requests.append(request) # Check correctly created requests' urls in request list assert {generated_request.url for generated_request in generated_requests} == expected_urls async def test_actor_create_request_list_from_url_additional_inputs() -> None: - assert False - # TODO test that will check that additional properties, like payload, headers request type are all properly passed. 
+ """Test that all generated request properties are correctly populated from input values.""" + expected_simple_url = 'https://www.someurl.com' + example_start_urls_input = [ + {'requests_from_url': 'https://crawlee.dev/file.txt', 'method': 'POST', + 'headers': {'key': 'value'}, + 'payload': 'some_payload', + 'user_data': {'another_key': 'another_value'}}, + ] + response_bodies = iter((expected_simple_url,)) + http_client = HttpxHttpClient() + with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): + generated_request_list = await Actor.create_request_list( + actor_start_urls_input=example_start_urls_input, http_client=http_client + ) + request = await generated_request_list.fetch_next_request() + + # Check all properties correctly created for request + assert request.url == expected_simple_url + assert request.method == example_start_urls_input[0]['method'] + assert request.headers.root == example_start_urls_input[0]['headers'] + assert request.payload == example_start_urls_input[0]['payload'].encode('utf-8') + expected_user_data = UserData() + for key, value in example_start_urls_input[0]['user_data'].items(): + expected_user_data[key] = value + assert request.user_data == expected_user_data + + +@pytest.mark.parametrize('true_positive', [ + 'http://www.something.com', + 'https://www.something.net', + 'http://nowww.cz', + 'https://with-hypen.com', + 'http://number1.com', + 'http://www.number.123', + 'http://many.dots.com', + 'http://a.com', + 'http://www.something.com/somethignelse' + 'http://www.something.com/somethignelse.txt', + # "http://non-english-chars-รก.com" # re module not suitable, regex can do this with \p{L}. Do we want this? +]) +def test_url_no_commas_regex_true_positives(true_positive: str) -> None: + example_string= f'Some text {true_positive} some more text' + matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) + assert len(matches) == 1 + assert matches[0].group(0) == true_positive + +@pytest.mark.parametrize('false_positive',[ + 'http://www.a', + 'http://a', + 'http://a.a', + 'http://123.456', + 'www.something.com', + 'http:www.something.com', +]) +def test_url_no_commas_regex_false_positives(false_positive: str) -> None: + example_string= f'Some text {false_positive} some more text' + matches = list(re.findall(URL_NO_COMMAS_REGEX, example_string)) + assert len(matches) == 0 + +def test_url_no_commas_regex_multi_line() -> None: + true_positives = ('http://www.something.com', 'http://www.else.com') + example_string= 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives) + matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) + assert len(matches) == 2 + assert {match.group(0) for match in matches} == set(true_positives) From cf4534a08b6a7cd6baf463aae7ed92f543ddc293 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 18 Nov 2024 17:12:19 +0100 Subject: [PATCH 08/20] Use regex instead of re. Add helper class for input keys Add top level Input class for handling actor inputs. 
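
The switch from re to the third-party regex package is motivated by Unicode property classes: the JS pattern quoted in an earlier patch relies on \p{L}, which re cannot express but regex can (see the commented-out non-ASCII case in the regexp tests from the previous patch). A small illustration, assuming only that regex is installed; the exact pattern adopted by this change is not reproduced here:

    import regex

    # \p{L} matches any Unicode letter, so internationalized hostnames are accepted.
    pattern = regex.compile(r'https?://(www\.)?[\p{L}0-9][-\p{L}0-9._]*\.[a-z]{2,63}')
    assert pattern.search('http://non-english-chars-á.com')
    assert not regex.match(r'[a-zA-Z0-9]+', 'á')  # the ASCII-only class would miss this letter
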
--- poetry.lock | 116 +++++++++++++++- pyproject.toml | 2 + src/apify/_actor.py | 19 --- src/apify/storages/__init__.py | 4 +- src/apify/{ => storages}/_actor_inputs.py | 70 +++++++--- src/apify/storages/_known_actor_input_keys.py | 28 ++++ ...e_request_list.py => test_actor_inputs.py} | 129 +++++++++--------- 7 files changed, 261 insertions(+), 107 deletions(-) rename src/apify/{ => storages}/_actor_inputs.py (56%) create mode 100644 src/apify/storages/_known_actor_input_keys.py rename tests/unit/actor/{test_actor_create_request_list.py => test_actor_inputs.py} (52%) diff --git a/poetry.lock b/poetry.lock index 768801be..f94db0ae 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2449,6 +2449,109 @@ files = [ {file = "readchar-4.2.1.tar.gz", hash = "sha256:91ce3faf07688de14d800592951e5575e9c7a3213738ed01d394dcc949b79adb"}, ] +[[package]] +name = "regex" +version = "2024.11.6" +description = "Alternative regular expression module, to replace re." +optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91"}, + {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0"}, + {file = "regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:164d8b7b3b4bcb2068b97428060b2a53be050085ef94eca7f240e7947f1b080e"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3660c82f209655a06b587d55e723f0b813d3a7db2e32e5e7dc64ac2a9e86fde"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d22326fcdef5e08c154280b71163ced384b428343ae16a5ab2b3354aed12436e"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1ac758ef6aebfc8943560194e9fd0fa18bcb34d89fd8bd2af18183afd8da3a2"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:997d6a487ff00807ba810e0f8332c18b4eb8d29463cfb7c820dc4b6e7562d0cf"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:02a02d2bb04fec86ad61f3ea7f49c015a0681bf76abb9857f945d26159d2968c"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f02f93b92358ee3f78660e43b4b0091229260c5d5c408d17d60bf26b6c900e86"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:06eb1be98df10e81ebaded73fcd51989dcf534e3c753466e4b60c4697a003b67"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:040df6fe1a5504eb0f04f048e6d09cd7c7110fef851d7c567a6b6e09942feb7d"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabbfc59f2c6edba2a6622c647b716e34e8e3867e0ab975412c5c2f79b82da2"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8447d2d39b5abe381419319f942de20b7ecd60ce86f16a23b0698f22e1b70008"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:da8f5fc57d1933de22a9e23eec290a0d8a5927a5370d24bda9a6abe50683fe62"}, + {file = "regex-2024.11.6-cp310-cp310-win32.whl", hash = "sha256:b489578720afb782f6ccf2840920f3a32e31ba28a4b162e13900c3e6bd3f930e"}, + {file = "regex-2024.11.6-cp310-cp310-win_amd64.whl", hash = 
"sha256:5071b2093e793357c9d8b2929dfc13ac5f0a6c650559503bb81189d0a3814519"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5478c6962ad548b54a591778e93cd7c456a7a29f8eca9c49e4f9a806dcc5d638"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c89a8cc122b25ce6945f0423dc1352cb9593c68abd19223eebbd4e56612c5b7"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94d87b689cdd831934fa3ce16cc15cd65748e6d689f5d2b8f4f4df2065c9fa20"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1062b39a0a2b75a9c694f7a08e7183a80c63c0d62b301418ffd9c35f55aaa114"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:167ed4852351d8a750da48712c3930b031f6efdaa0f22fa1933716bfcd6bf4a3"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d548dafee61f06ebdb584080621f3e0c23fff312f0de1afc776e2a2ba99a74f"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a19f302cd1ce5dd01a9099aaa19cae6173306d1302a43b627f62e21cf18ac0"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bec9931dfb61ddd8ef2ebc05646293812cb6b16b60cf7c9511a832b6f1854b55"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9714398225f299aa85267fd222f7142fcb5c769e73d7733344efc46f2ef5cf89"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:202eb32e89f60fc147a41e55cb086db2a3f8cb82f9a9a88440dcfc5d37faae8d"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4181b814e56078e9b00427ca358ec44333765f5ca1b45597ec7446d3a1ef6e34"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:068376da5a7e4da51968ce4c122a7cd31afaaec4fccc7856c92f63876e57b51d"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f2c4184420d881a3475fb2c6f4d95d53a8d50209a2500723d831036f7c45"}, + {file = "regex-2024.11.6-cp311-cp311-win32.whl", hash = "sha256:c36f9b6f5f8649bb251a5f3f66564438977b7ef8386a52460ae77e6070d309d9"}, + {file = "regex-2024.11.6-cp311-cp311-win_amd64.whl", hash = "sha256:02e28184be537f0e75c1f9b2f8847dc51e08e6e171c6bde130b2687e0c33cf60"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:52fb28f528778f184f870b7cf8f225f5eef0a8f6e3778529bdd40c7b3920796a"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:805e6b60c54bf766b251e94526ebad60b7de0c70f70a4e6210ee2891acb70bf2"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b85c2530be953a890eaffde05485238f07029600e8f098cdf1848d414a8b45e4"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb26437975da7dc36b7efad18aa9dd4ea569d2357ae6b783bf1118dabd9ea577"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abfa5080c374a76a251ba60683242bc17eeb2c9818d0d30117b4486be10c59d3"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:70b7fa6606c2881c1db9479b0eaa11ed5dfa11c8d60a474ff0e095099f39d98e"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c32f75920cf99fe6b6c539c399a4a128452eaf1af27f39bce8909c9a3fd8cbe"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:982e6d21414e78e1f51cf595d7f321dcd14de1f2881c5dc6a6e23bbbbd68435e"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a7c2155f790e2fb448faed6dd241386719802296ec588a8b9051c1f5c481bc29"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149f5008d286636e48cd0b1dd65018548944e495b0265b45e1bffecce1ef7f39"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:e5364a4502efca094731680e80009632ad6624084aff9a23ce8c8c6820de3e51"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0a86e7eeca091c09e021db8eb72d54751e527fa47b8d5787caf96d9831bd02ad"}, + {file = "regex-2024.11.6-cp312-cp312-win32.whl", hash = "sha256:32f9a4c643baad4efa81d549c2aadefaeba12249b2adc5af541759237eee1c54"}, + {file = "regex-2024.11.6-cp312-cp312-win_amd64.whl", hash = "sha256:a93c194e2df18f7d264092dc8539b8ffb86b45b899ab976aa15d48214138e81b"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a6ba92c0bcdf96cbf43a12c717eae4bc98325ca3730f6b130ffa2e3c3c723d84"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:525eab0b789891ac3be914d36893bdf972d483fe66551f79d3e27146191a37d4"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:086a27a0b4ca227941700e0b31425e7a28ef1ae8e5e05a33826e17e47fbfdba0"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde01f35767c4a7899b7eb6e823b125a64de314a8ee9791367c9a34d56af18d0"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b583904576650166b3d920d2bcce13971f6f9e9a396c673187f49811b2769dc7"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c4de13f06a0d54fa0d5ab1b7138bfa0d883220965a29616e3ea61b35d5f5fc7"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cde6e9f2580eb1665965ce9bf17ff4952f34f5b126beb509fee8f4e994f143c"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d7f453dca13f40a02b79636a339c5b62b670141e63efd511d3f8f73fba162b3"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59dfe1ed21aea057a65c6b586afd2a945de04fc7db3de0a6e3ed5397ad491b07"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b97c1e0bd37c5cd7902e65f410779d39eeda155800b65fc4d04cc432efa9bc6e"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d1e379028e0fc2ae3654bac3cbbef81bf3fd571272a42d56c24007979bafb6"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:13291b39131e2d002a7940fb176e120bec5145f3aeb7621be6534e46251912c4"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f51f88c126370dcec4908576c5a627220da6c09d0bff31cfa89f2523843316d"}, + {file = "regex-2024.11.6-cp313-cp313-win32.whl", hash = "sha256:63b13cfd72e9601125027202cad74995ab26921d8cd935c25f09c630436348ff"}, + {file = 
"regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3a51ccc315653ba012774efca4f23d1d2a8a8f278a6072e29c7147eee7da446b"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ad182d02e40de7459b73155deb8996bbd8e96852267879396fb274e8700190e3"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba9b72e5643641b7d41fa1f6d5abda2c9a263ae835b917348fc3c928182ad467"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40291b1b89ca6ad8d3f2b82782cc33807f1406cf68c8d440861da6304d8ffbbd"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf58d0e516ee426a48f7b2c03a332a4114420716d55769ff7108c37a09951bf"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a36fdf2af13c2b14738f6e973aba563623cb77d753bbbd8d414d18bfaa3105dd"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cee317bfc014c2419a76bcc87f071405e3966da434e03e13beb45f8aced1a6"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50153825ee016b91549962f970d6a4442fa106832e14c918acd1c8e479916c4f"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ea1bfda2f7162605f6e8178223576856b3d791109f15ea99a9f95c16a7636fb5"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:df951c5f4a1b1910f1a99ff42c473ff60f8225baa1cdd3539fe2819d9543e9df"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:072623554418a9911446278f16ecb398fb3b540147a7828c06e2011fa531e773"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f654882311409afb1d780b940234208a252322c24a93b442ca714d119e68086c"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:89d75e7293d2b3e674db7d4d9b1bee7f8f3d1609428e293771d1a962617150cc"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:f65557897fc977a44ab205ea871b690adaef6b9da6afda4790a2484b04293a5f"}, + {file = "regex-2024.11.6-cp38-cp38-win32.whl", hash = "sha256:6f44ec28b1f858c98d3036ad5d7d0bfc568bdd7a74f9c24e25f41ef1ebfd81a4"}, + {file = "regex-2024.11.6-cp38-cp38-win_amd64.whl", hash = "sha256:bb8f74f2f10dbf13a0be8de623ba4f9491faf58c24064f32b65679b021ed0001"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5704e174f8ccab2026bd2f1ab6c510345ae8eac818b613d7d73e785f1310f839"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:220902c3c5cc6af55d4fe19ead504de80eb91f786dc102fbd74894b1551f095e"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e7e351589da0850c125f1600a4c4ba3c722efefe16b297de54300f08d734fbf"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5056b185ca113c88e18223183aa1a50e66507769c9640a6ff75859619d73957b"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e34b51b650b23ed3354b5a07aab37034d9f923db2a40519139af34f485f77d0"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:5670bce7b200273eee1840ef307bfa07cda90b38ae56e9a6ebcc9f50da9c469b"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08986dce1339bc932923e7d1232ce9881499a0e02925f7402fb7c982515419ef"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93c0b12d3d3bc25af4ebbf38f9ee780a487e8bf6954c115b9f015822d3bb8e48"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:764e71f22ab3b305e7f4c21f1a97e1526a25ebdd22513e251cf376760213da13"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f056bf21105c2515c32372bbc057f43eb02aae2fda61052e2f7622c801f0b4e2"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:69ab78f848845569401469da20df3e081e6b5a11cb086de3eed1d48f5ed57c95"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:86fddba590aad9208e2fa8b43b4c098bb0ec74f15718bb6a704e3c63e2cef3e9"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:684d7a212682996d21ca12ef3c17353c021fe9de6049e19ac8481ec35574a70f"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a03e02f48cd1abbd9f3b7e3586d97c8f7a9721c436f51a5245b3b9483044480b"}, + {file = "regex-2024.11.6-cp39-cp39-win32.whl", hash = "sha256:41758407fc32d5c3c5de163888068cfee69cb4c2be844e7ac517a52770f9af57"}, + {file = "regex-2024.11.6-cp39-cp39-win_amd64.whl", hash = "sha256:b2837718570f95dd41675328e111345f9b7095d821bac435aac173ac80b19983"}, + {file = "regex-2024.11.6.tar.gz", hash = "sha256:7ab159b063c52a0333c884e4679f8d7a85112ee3078fe3d9004b2dd875585519"}, +] + [[package]] name = "requests" version = "2.32.3" @@ -2823,6 +2926,17 @@ files = [ {file = "types_python_dateutil-2.9.0.20241003-py3-none-any.whl", hash = "sha256:250e1d8e80e7bbc3a6c99b907762711d1a1cdd00e978ad39cb5940f6f0a87f3d"}, ] +[[package]] +name = "types-regex" +version = "2024.11.6.20241108" +description = "Typing stubs for regex" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-regex-2024.11.6.20241108.tar.gz", hash = "sha256:a774e307b99b3df49481b29e8b4962f021693052a8d8a2b9e6792fcec896cf5e"}, + {file = "types_regex-2024.11.6.20241108-py3-none-any.whl", hash = "sha256:adec2ff2dfed00855551057334466fde923606599d01e7440556d53a3ef20835"}, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -3215,4 +3329,4 @@ scrapy = ["scrapy"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "3698d5b2d562a7a83489d316a70b6685d4276f9aa9adb904ea5f39479cc8eeee" +content-hash = "bd3a28d080a0548f41c8d9a50ede2725a8b12985203f2afebd98b7607471b003" diff --git a/pyproject.toml b/pyproject.toml index c3a01c41..dc4f56c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ lazy-object-proxy = ">=1.10.0" scrapy = { version = ">=2.11.0", optional = true } typing-extensions = ">=4.1.0" websockets = ">=10.0 <14.0.0" +regex = "^2024.11.6" [tool.poetry.group.dev.dependencies] build = "~1.2.0" @@ -72,6 +73,7 @@ pytest-xdist = "~3.6.0" respx = "~0.21.0" ruff = "~0.7.0" setuptools = "~75.0.0" # setuptools are used by pytest but not explicitly required +types-regex = "^2024.11.6.20241108" [tool.poetry.extras] scrapy = ["scrapy"] diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 872416a0..f60a99df 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -16,7 +16,6 @@ from crawlee import 
service_container from crawlee.events._types import Event, EventPersistStateData -from apify._actor_inputs import _create_request_list from apify._configuration import Configuration from apify._consts import EVENT_LISTENERS_TIMEOUT from apify._crypto import decrypt_input_secrets, load_private_key @@ -32,9 +31,7 @@ import logging from types import TracebackType - from crawlee.http_clients import BaseHttpClient from crawlee.proxy_configuration import _NewUrlFunction - from crawlee.storages import RequestList from apify._models import Webhook @@ -977,22 +974,6 @@ async def create_proxy_configuration( return proxy_configuration - @staticmethod - async def create_request_list( - *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None - ) -> RequestList: - """Creates request list from Actor input requestListSources. This accepts list of urls and requests_from_url. - - Example: - actor_start_urls_input = [ - # Gather urls from response body. - {'requests_from_url': 'https://crawlee.dev/file.txt', 'method': 'GET'}, - # Directly include this url. - {'url': 'https://crawlee.dev', 'method': 'GET'} - ] - """ - return await _create_request_list(actor_start_urls_input=actor_start_urls_input, http_client=http_client) - Actor = cast(_ActorType, Proxy(_ActorType)) """The entry point of the SDK, through which all the Actor operations should be done.""" diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index 2ed85e84..8fd33ba3 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,3 +1,5 @@ from crawlee.storages import Dataset, KeyValueStore, RequestQueue -__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue'] +from ._actor_inputs import Input + +__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'Input'] diff --git a/src/apify/_actor_inputs.py b/src/apify/storages/_actor_inputs.py similarity index 56% rename from src/apify/_actor_inputs.py rename to src/apify/storages/_actor_inputs.py index d4048cb0..c53a051a 100644 --- a/src/apify/_actor_inputs.py +++ b/src/apify/storages/_actor_inputs.py @@ -1,47 +1,79 @@ from __future__ import annotations import asyncio -import re from asyncio import Task from functools import partial -from typing import Any +from typing import TYPE_CHECKING, Any -from pydantic import BaseModel, Field +import regex +from pydantic import BaseModel, ConfigDict, Field + +if TYPE_CHECKING: + from typing_extensions import Self from crawlee import Request -from crawlee._types import HttpMethod # TODO: Make public in Crawlee? +from crawlee._types import HttpMethod from crawlee.http_clients import BaseHttpClient, HttpxHttpClient from crawlee.storages import RequestList -URL_NO_COMMAS_REGEX = re.compile( - r'https?:\/\/(www\.)?([a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{0,254}[a-zA-Z0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-a-zA-Z0-9@:%_+.~#?&/=()]*)?' +from ._known_actor_input_keys import ActorInputKeys + +URL_NO_COMMAS_REGEX = regex.compile( + r'https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?' 
) + + class _RequestDetails(BaseModel): method: HttpMethod payload: str = '' headers: dict[str, str] = Field(default_factory=dict) - user_data: dict[str, str]= Field(default_factory=dict, alias='user_data') + user_data: dict[str, str]= Field(default_factory=dict, alias=ActorInputKeys.startUrls.userData) class _RequestsFromUrlInput(_RequestDetails): - requests_from_url: str = Field(alias='requests_from_url') + requests_from_url: str = Field(alias=ActorInputKeys.startUrls.requestsFromUrl) + class _SimpleUrlInput(_RequestDetails): url: str +class Input(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + start_urls: RequestList + + @classmethod + async def read(cls, raw_input: dict[str, Any], http_client: BaseHttpClient | None = None) -> Self: + if ActorInputKeys.startUrls in raw_input: + request_list = await _create_request_list( + actor_start_urls_input=raw_input[ActorInputKeys.startUrls], http_client=http_client) + else: + request_list = RequestList() + return cls(start_urls=request_list) -@staticmethod async def _create_request_list( *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None ) -> RequestList: + """Creates RequestList from Actor input requestListSources. + + actor_start_urls_input can contain list dicts with either url or requestsFromUrl key + http_client is client that will be used to send get request to url defined in requestsFromUrl + + Example: + actor_start_urls_input = [ + # Gather urls from response body. + {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + # Directly include this url. + {'url': 'https://crawlee.dev', 'method': 'GET'} + ] + """ if not http_client: http_client = HttpxHttpClient() simple_url_requests_inputs = [ _SimpleUrlInput(**request_input) for request_input in actor_start_urls_input - if 'url' in request_input] + if ActorInputKeys.startUrls.url in request_input] remote_url_requests_inputs = [ _RequestsFromUrlInput(**request_input) for request_input in actor_start_urls_input - if 'requests_from_url' in request_input + if ActorInputKeys.startUrls.requestsFromUrl in request_input ] simple_url_requests = _create_requests_from_input(simple_url_requests_inputs) @@ -50,7 +82,6 @@ async def _create_request_list( return RequestList(requests=simple_url_requests + remote_url_requests) -@staticmethod def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]: return [ Request.from_url( @@ -64,21 +95,20 @@ def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> lis ] -@staticmethod async def _create_requests_from_url( remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient ) -> list[Request]: """Crete list of requests from url. Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting - callback on each response body and use URL_NO_COMMAS_REGEX regexp to find all links. Create list of Requests from + callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from collected links and additional inputs stored in other attributes of each remote_url_requests_inputs. 
""" created_requests: list[Request] = [] - def create_requests_from_response(request_input: _SimpleUrlInput, task: Task) -> list[Request]: - """Callback to scrape response body with regexp and create Requests from macthes.""" - matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) + def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: + """Callback to scrape response body with regexp and create Requests from matches.""" + matches = regex.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) created_requests.extend([Request.from_url( match.group(0), method=request_input.method, @@ -88,15 +118,15 @@ def create_requests_from_response(request_input: _SimpleUrlInput, task: Task) -> remote_url_requests = [] for remote_url_requests_input in remote_url_requests_inputs: - task = asyncio.create_task( + get_response_task = asyncio.create_task( http_client.send_request( method='GET', url=remote_url_requests_input.requests_from_url, ) ) - task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input)) - remote_url_requests.append(task) + get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input)) + remote_url_requests.append(get_response_task) await asyncio.gather(*remote_url_requests) return created_requests diff --git a/src/apify/storages/_known_actor_input_keys.py b/src/apify/storages/_known_actor_input_keys.py new file mode 100644 index 00000000..49347393 --- /dev/null +++ b/src/apify/storages/_known_actor_input_keys.py @@ -0,0 +1,28 @@ + + +class _KnownInputKey(str): + __slots__ = ('_name',) + def __init__(self, name: str) -> None: + self._name = name + + def __str__(self) -> str: + return self._name + + def __repr__(self) ->str: + return str(self) + +class _StartUrls(_KnownInputKey): + url='url' + requestsFromUrl = 'requestsFromUrl' # noqa: N815 # Intentional to respect actual naming of input keys. + method='method' + payload= 'payload' + userData='userData' # noqa: N815 # Intentional to respect actual naming of input keys. + headers='headers' + + +class _ActorInputKeys: + # Helper class to have actor input strings all in one place and easy to use with code completion. + startUrls: _StartUrls = _StartUrls('startUrls') # noqa: N815 # Intentional to respect actual naming of input keys. + # More inputs should be gradually added + +ActorInputKeys = _ActorInputKeys() diff --git a/tests/unit/actor/test_actor_create_request_list.py b/tests/unit/actor/test_actor_inputs.py similarity index 52% rename from tests/unit/actor/test_actor_create_request_list.py rename to tests/unit/actor/test_actor_inputs.py index 3114f8a2..4d7ef83e 100644 --- a/tests/unit/actor/test_actor_create_request_list.py +++ b/tests/unit/actor/test_actor_inputs.py @@ -1,58 +1,57 @@ from __future__ import annotations -import re -import typing +from typing import Any, Iterator, get_args from unittest import mock from unittest.mock import call import pytest +import regex -from crawlee._request import UserData # TODO: Make public in Crawlee? -from crawlee._types import HttpHeaders, HttpMethod # TODO: Make public in Crawlee? 
+from crawlee._request import UserData +from crawlee._types import HttpHeaders, HttpMethod from crawlee.http_clients import HttpResponse, HttpxHttpClient -from apify import Actor -from apify._actor_inputs import URL_NO_COMMAS_REGEX +from apify.storages._actor_inputs import URL_NO_COMMAS_REGEX, ActorInputKeys, Input -@pytest.mark.parametrize('request_method', typing.get_args(HttpMethod)) +@pytest.mark.parametrize('request_method', get_args(HttpMethod)) @pytest.mark.parametrize( 'optional_input', [ {}, - {'payload': 'some payload', 'user_data': {'some key': 'some value'}, 'headers': {'h1': 'v1', 'h2': 'v2'}}, + {ActorInputKeys.startUrls.payload: 'some payload', ActorInputKeys.startUrls.userData: + {'some key': 'some value'}, ActorInputKeys.startUrls.headers: {'h1': 'v1', 'h2': 'v2'}}, ], ids=['minimal', 'all_options'], ) async def test_actor_create_request_list_request_types( - request_method: HttpMethod, optional_input: dict[str, typing.Any] + request_method: HttpMethod, optional_input: dict[str, Any] ) -> None: """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" - minimal_request_dict_input = {'url': 'https://www.abc.com', 'method': request_method} + minimal_request_dict_input = {ActorInputKeys.startUrls.url: 'https://www.abc.com', + ActorInputKeys.startUrls.method: request_method} request_dict_input = {**minimal_request_dict_input, **optional_input} - example_start_urls_input = [ - request_dict_input, - ] + example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [request_dict_input]} - generated_request_list = await Actor.create_request_list(actor_start_urls_input=example_start_urls_input) + generated_input = await Input.read(example_actor_input) - assert not await generated_request_list.is_empty() - generated_request = await generated_request_list.fetch_next_request() + assert not await generated_input.start_urls.is_empty() + generated_request = await generated_input.start_urls.fetch_next_request() assert generated_request is not None - assert await generated_request_list.is_empty() + assert await generated_input.start_urls.is_empty() - assert generated_request.method == request_dict_input['method'] - assert generated_request.url == request_dict_input['url'] - assert generated_request.payload == request_dict_input.get('payload', '').encode('utf-8') + assert generated_request.method == request_dict_input[ActorInputKeys.startUrls.method] + assert generated_request.url == request_dict_input[ActorInputKeys.startUrls.url] + assert generated_request.payload == request_dict_input.get(ActorInputKeys.startUrls.payload, '').encode('utf-8') expected_user_data = UserData() - if 'user_data' in optional_input: - for key, value in optional_input['user_data'].items(): + if ActorInputKeys.startUrls.userData in optional_input: + for key, value in optional_input[ActorInputKeys.startUrls.userData].items(): expected_user_data[key] = value assert generated_request.user_data == expected_user_data - assert generated_request.headers.root == optional_input.get('headers', {}) + assert generated_request.headers.root == optional_input.get(ActorInputKeys.startUrls.headers, {}) -def _create_dummy_response(read_output: typing.Iterator[str]) -> HttpResponse: +def _create_dummy_response(read_output: Iterator[str]) -> HttpResponse: """Create dummy_response that will iterate through read_output when called like dummy_response.read()""" class DummyResponse(HttpResponse): @@ -76,31 +75,31 @@ def read(self) -> bytes: async def 
test_actor_create_request_list_from_url_correctly_send_requests() -> None: """Test that injected HttpClient's method send_request is called with properly passed arguments.""" - - example_start_urls_input: list[dict[str, typing.Any]] = [ - {'requests_from_url': 'https://crawlee.dev/file.txt', 'method': 'GET'}, - {'requests_from_url': 'https://www.crawlee.dev/file2', 'method': 'PUT'}, + example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [ + {ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', ActorInputKeys.startUrls.method: 'GET'}, + {ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', ActorInputKeys.startUrls.method: 'PUT'}, { - 'requests_from_url': 'https://www.something.som', - 'method': 'POST', - 'headers': {'key': 'value'}, - 'payload': 'some_payload', - 'user_data': {'another_key': 'another_value'}, + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som', + ActorInputKeys.startUrls.method: 'POST', + ActorInputKeys.startUrls.headers: {'key': 'value'}, + ActorInputKeys.startUrls.payload: 'some_payload', + ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, }, - ] - mocked_read_outputs = ('' for url in example_start_urls_input) + ]} + + mocked_read_outputs = ('' for url in example_actor_input[ActorInputKeys.startUrls]) http_client = HttpxHttpClient() with mock.patch.object( http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs) ) as mocked_send_request: - await Actor.create_request_list(actor_start_urls_input=example_start_urls_input, http_client=http_client) + await Input.read(example_actor_input, http_client=http_client) expected_calls = [ call( method='GET', - url=example_input['requests_from_url'], + url=example_input[ActorInputKeys.startUrls.requestsFromUrl], ) - for example_input in example_start_urls_input + for example_input in example_actor_input[ActorInputKeys.startUrls] ] mocked_send_request.assert_has_calls(expected_calls) @@ -118,19 +117,17 @@ async def test_actor_create_request_list_from_url() -> None: ) ) - example_start_urls_input = [ - {'requests_from_url': 'https://crawlee.dev/file.txt', 'method': 'GET'}, - {'url': expected_simple_url, 'method': 'GET'}, - {'requests_from_url': 'https://www.crawlee.dev/file2', 'method': 'GET'}, - ] + example_actor_input:dict[str, Any] = {ActorInputKeys.startUrls:[ + {ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', ActorInputKeys.startUrls.method: 'GET'}, + {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'}, + {ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', ActorInputKeys.startUrls.method: 'GET'}, + ]} http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - generated_request_list = await Actor.create_request_list( - actor_start_urls_input=example_start_urls_input, http_client=http_client - ) + generated_input = await Input.read(example_actor_input, http_client=http_client) generated_requests = [] - while request := await generated_request_list.fetch_next_request(): + while request := await generated_input.start_urls.fetch_next_request(): generated_requests.append(request) # Check correctly created requests' urls in request list @@ -139,27 +136,27 @@ async def test_actor_create_request_list_from_url() -> None: async def test_actor_create_request_list_from_url_additional_inputs() -> None: """Test that all generated request properties are correctly populated 
from input values.""" expected_simple_url = 'https://www.someurl.com' - example_start_urls_input = [ - {'requests_from_url': 'https://crawlee.dev/file.txt', 'method': 'POST', - 'headers': {'key': 'value'}, - 'payload': 'some_payload', - 'user_data': {'another_key': 'another_value'}}, - ] + example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls:[ + {ActorInputKeys.startUrls.requestsFromUrl: 'https://crawlee.dev/file.txt', 'method': 'POST', + ActorInputKeys.startUrls.headers: {'key': 'value'}, + ActorInputKeys.startUrls.payload: 'some_payload', + ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}}, + ]} response_bodies = iter((expected_simple_url,)) http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - generated_request_list = await Actor.create_request_list( - actor_start_urls_input=example_start_urls_input, http_client=http_client - ) - request = await generated_request_list.fetch_next_request() + generated_input = await Input.read(example_actor_input, http_client=http_client) + request = await generated_input.start_urls.fetch_next_request() # Check all properties correctly created for request + assert request assert request.url == expected_simple_url - assert request.method == example_start_urls_input[0]['method'] - assert request.headers.root == example_start_urls_input[0]['headers'] - assert request.payload == example_start_urls_input[0]['payload'].encode('utf-8') + assert request.method == example_actor_input[ActorInputKeys.startUrls][0][ActorInputKeys.startUrls.method] + assert request.headers.root == example_actor_input[ActorInputKeys.startUrls][0][ActorInputKeys.startUrls.headers] + assert request.payload == example_actor_input[ActorInputKeys.startUrls][0][ActorInputKeys.startUrls.payload].encode( + 'utf-8') expected_user_data = UserData() - for key, value in example_start_urls_input[0]['user_data'].items(): + for key, value in example_actor_input[ActorInputKeys.startUrls][0][ActorInputKeys.startUrls.userData].items(): expected_user_data[key] = value assert request.user_data == expected_user_data @@ -170,16 +167,16 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None: 'http://nowww.cz', 'https://with-hypen.com', 'http://number1.com', - 'http://www.number.123', + 'http://www.number.123.abc', 'http://many.dots.com', 'http://a.com', 'http://www.something.com/somethignelse' 'http://www.something.com/somethignelse.txt', - # "http://non-english-chars-รก.com" # re module not suitable, regex can do this with \p{L}. Do we want this? 
+ 'http://non-english-chars-รกรญรฉรฅรผ.com' ]) def test_url_no_commas_regex_true_positives(true_positive: str) -> None: example_string= f'Some text {true_positive} some more text' - matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) + matches = list(regex.finditer(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 1 assert matches[0].group(0) == true_positive @@ -193,12 +190,12 @@ def test_url_no_commas_regex_true_positives(true_positive: str) -> None: ]) def test_url_no_commas_regex_false_positives(false_positive: str) -> None: example_string= f'Some text {false_positive} some more text' - matches = list(re.findall(URL_NO_COMMAS_REGEX, example_string)) + matches = list(regex.findall(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 0 def test_url_no_commas_regex_multi_line() -> None: true_positives = ('http://www.something.com', 'http://www.else.com') example_string= 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives) - matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) + matches = list(regex.finditer(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 2 assert {match.group(0) for match in matches} == set(true_positives) From ff3e047049281ee8840581cb7a70f680fea74921 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 18 Nov 2024 17:49:22 +0100 Subject: [PATCH 09/20] Use re with \w Add few more tests for regex --- poetry.lock | 105 +------------------------- pyproject.toml | 1 - src/apify/storages/_actor_inputs.py | 8 +- tests/unit/actor/test_actor_inputs.py | 12 +-- 4 files changed, 12 insertions(+), 114 deletions(-) diff --git a/poetry.lock b/poetry.lock index f94db0ae..f43b2f3a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2449,109 +2449,6 @@ files = [ {file = "readchar-4.2.1.tar.gz", hash = "sha256:91ce3faf07688de14d800592951e5575e9c7a3213738ed01d394dcc949b79adb"}, ] -[[package]] -name = "regex" -version = "2024.11.6" -description = "Alternative regular expression module, to replace re." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91"}, - {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0"}, - {file = "regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:164d8b7b3b4bcb2068b97428060b2a53be050085ef94eca7f240e7947f1b080e"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3660c82f209655a06b587d55e723f0b813d3a7db2e32e5e7dc64ac2a9e86fde"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d22326fcdef5e08c154280b71163ced384b428343ae16a5ab2b3354aed12436e"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1ac758ef6aebfc8943560194e9fd0fa18bcb34d89fd8bd2af18183afd8da3a2"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:997d6a487ff00807ba810e0f8332c18b4eb8d29463cfb7c820dc4b6e7562d0cf"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:02a02d2bb04fec86ad61f3ea7f49c015a0681bf76abb9857f945d26159d2968c"}, - {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f02f93b92358ee3f78660e43b4b0091229260c5d5c408d17d60bf26b6c900e86"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:06eb1be98df10e81ebaded73fcd51989dcf534e3c753466e4b60c4697a003b67"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:040df6fe1a5504eb0f04f048e6d09cd7c7110fef851d7c567a6b6e09942feb7d"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabbfc59f2c6edba2a6622c647b716e34e8e3867e0ab975412c5c2f79b82da2"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8447d2d39b5abe381419319f942de20b7ecd60ce86f16a23b0698f22e1b70008"}, - {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:da8f5fc57d1933de22a9e23eec290a0d8a5927a5370d24bda9a6abe50683fe62"}, - {file = "regex-2024.11.6-cp310-cp310-win32.whl", hash = "sha256:b489578720afb782f6ccf2840920f3a32e31ba28a4b162e13900c3e6bd3f930e"}, - {file = "regex-2024.11.6-cp310-cp310-win_amd64.whl", hash = "sha256:5071b2093e793357c9d8b2929dfc13ac5f0a6c650559503bb81189d0a3814519"}, - {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5478c6962ad548b54a591778e93cd7c456a7a29f8eca9c49e4f9a806dcc5d638"}, - {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c89a8cc122b25ce6945f0423dc1352cb9593c68abd19223eebbd4e56612c5b7"}, - {file = "regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94d87b689cdd831934fa3ce16cc15cd65748e6d689f5d2b8f4f4df2065c9fa20"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1062b39a0a2b75a9c694f7a08e7183a80c63c0d62b301418ffd9c35f55aaa114"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:167ed4852351d8a750da48712c3930b031f6efdaa0f22fa1933716bfcd6bf4a3"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:2d548dafee61f06ebdb584080621f3e0c23fff312f0de1afc776e2a2ba99a74f"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a19f302cd1ce5dd01a9099aaa19cae6173306d1302a43b627f62e21cf18ac0"}, - {file = "regex-2024.11.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bec9931dfb61ddd8ef2ebc05646293812cb6b16b60cf7c9511a832b6f1854b55"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9714398225f299aa85267fd222f7142fcb5c769e73d7733344efc46f2ef5cf89"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:202eb32e89f60fc147a41e55cb086db2a3f8cb82f9a9a88440dcfc5d37faae8d"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4181b814e56078e9b00427ca358ec44333765f5ca1b45597ec7446d3a1ef6e34"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:068376da5a7e4da51968ce4c122a7cd31afaaec4fccc7856c92f63876e57b51d"}, - {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f2c4184420d881a3475fb2c6f4d95d53a8d50209a2500723d831036f7c45"}, - {file = "regex-2024.11.6-cp311-cp311-win32.whl", hash = "sha256:c36f9b6f5f8649bb251a5f3f66564438977b7ef8386a52460ae77e6070d309d9"}, - {file = "regex-2024.11.6-cp311-cp311-win_amd64.whl", hash = "sha256:02e28184be537f0e75c1f9b2f8847dc51e08e6e171c6bde130b2687e0c33cf60"}, - {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:52fb28f528778f184f870b7cf8f225f5eef0a8f6e3778529bdd40c7b3920796a"}, - {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9"}, - {file = "regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:805e6b60c54bf766b251e94526ebad60b7de0c70f70a4e6210ee2891acb70bf2"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b85c2530be953a890eaffde05485238f07029600e8f098cdf1848d414a8b45e4"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb26437975da7dc36b7efad18aa9dd4ea569d2357ae6b783bf1118dabd9ea577"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abfa5080c374a76a251ba60683242bc17eeb2c9818d0d30117b4486be10c59d3"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b7fa6606c2881c1db9479b0eaa11ed5dfa11c8d60a474ff0e095099f39d98e"}, - {file = "regex-2024.11.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c32f75920cf99fe6b6c539c399a4a128452eaf1af27f39bce8909c9a3fd8cbe"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:982e6d21414e78e1f51cf595d7f321dcd14de1f2881c5dc6a6e23bbbbd68435e"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a7c2155f790e2fb448faed6dd241386719802296ec588a8b9051c1f5c481bc29"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149f5008d286636e48cd0b1dd65018548944e495b0265b45e1bffecce1ef7f39"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:e5364a4502efca094731680e80009632ad6624084aff9a23ce8c8c6820de3e51"}, - {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0a86e7eeca091c09e021db8eb72d54751e527fa47b8d5787caf96d9831bd02ad"}, - {file = 
"regex-2024.11.6-cp312-cp312-win32.whl", hash = "sha256:32f9a4c643baad4efa81d549c2aadefaeba12249b2adc5af541759237eee1c54"}, - {file = "regex-2024.11.6-cp312-cp312-win_amd64.whl", hash = "sha256:a93c194e2df18f7d264092dc8539b8ffb86b45b899ab976aa15d48214138e81b"}, - {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a6ba92c0bcdf96cbf43a12c717eae4bc98325ca3730f6b130ffa2e3c3c723d84"}, - {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:525eab0b789891ac3be914d36893bdf972d483fe66551f79d3e27146191a37d4"}, - {file = "regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:086a27a0b4ca227941700e0b31425e7a28ef1ae8e5e05a33826e17e47fbfdba0"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde01f35767c4a7899b7eb6e823b125a64de314a8ee9791367c9a34d56af18d0"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b583904576650166b3d920d2bcce13971f6f9e9a396c673187f49811b2769dc7"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c4de13f06a0d54fa0d5ab1b7138bfa0d883220965a29616e3ea61b35d5f5fc7"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cde6e9f2580eb1665965ce9bf17ff4952f34f5b126beb509fee8f4e994f143c"}, - {file = "regex-2024.11.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d7f453dca13f40a02b79636a339c5b62b670141e63efd511d3f8f73fba162b3"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59dfe1ed21aea057a65c6b586afd2a945de04fc7db3de0a6e3ed5397ad491b07"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b97c1e0bd37c5cd7902e65f410779d39eeda155800b65fc4d04cc432efa9bc6e"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d1e379028e0fc2ae3654bac3cbbef81bf3fd571272a42d56c24007979bafb6"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:13291b39131e2d002a7940fb176e120bec5145f3aeb7621be6534e46251912c4"}, - {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f51f88c126370dcec4908576c5a627220da6c09d0bff31cfa89f2523843316d"}, - {file = "regex-2024.11.6-cp313-cp313-win32.whl", hash = "sha256:63b13cfd72e9601125027202cad74995ab26921d8cd935c25f09c630436348ff"}, - {file = "regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a"}, - {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3a51ccc315653ba012774efca4f23d1d2a8a8f278a6072e29c7147eee7da446b"}, - {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ad182d02e40de7459b73155deb8996bbd8e96852267879396fb274e8700190e3"}, - {file = "regex-2024.11.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba9b72e5643641b7d41fa1f6d5abda2c9a263ae835b917348fc3c928182ad467"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40291b1b89ca6ad8d3f2b82782cc33807f1406cf68c8d440861da6304d8ffbbd"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf58d0e516ee426a48f7b2c03a332a4114420716d55769ff7108c37a09951bf"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:a36fdf2af13c2b14738f6e973aba563623cb77d753bbbd8d414d18bfaa3105dd"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cee317bfc014c2419a76bcc87f071405e3966da434e03e13beb45f8aced1a6"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50153825ee016b91549962f970d6a4442fa106832e14c918acd1c8e479916c4f"}, - {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ea1bfda2f7162605f6e8178223576856b3d791109f15ea99a9f95c16a7636fb5"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:df951c5f4a1b1910f1a99ff42c473ff60f8225baa1cdd3539fe2819d9543e9df"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:072623554418a9911446278f16ecb398fb3b540147a7828c06e2011fa531e773"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f654882311409afb1d780b940234208a252322c24a93b442ca714d119e68086c"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:89d75e7293d2b3e674db7d4d9b1bee7f8f3d1609428e293771d1a962617150cc"}, - {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:f65557897fc977a44ab205ea871b690adaef6b9da6afda4790a2484b04293a5f"}, - {file = "regex-2024.11.6-cp38-cp38-win32.whl", hash = "sha256:6f44ec28b1f858c98d3036ad5d7d0bfc568bdd7a74f9c24e25f41ef1ebfd81a4"}, - {file = "regex-2024.11.6-cp38-cp38-win_amd64.whl", hash = "sha256:bb8f74f2f10dbf13a0be8de623ba4f9491faf58c24064f32b65679b021ed0001"}, - {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5704e174f8ccab2026bd2f1ab6c510345ae8eac818b613d7d73e785f1310f839"}, - {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:220902c3c5cc6af55d4fe19ead504de80eb91f786dc102fbd74894b1551f095e"}, - {file = "regex-2024.11.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e7e351589da0850c125f1600a4c4ba3c722efefe16b297de54300f08d734fbf"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5056b185ca113c88e18223183aa1a50e66507769c9640a6ff75859619d73957b"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e34b51b650b23ed3354b5a07aab37034d9f923db2a40519139af34f485f77d0"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5670bce7b200273eee1840ef307bfa07cda90b38ae56e9a6ebcc9f50da9c469b"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08986dce1339bc932923e7d1232ce9881499a0e02925f7402fb7c982515419ef"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93c0b12d3d3bc25af4ebbf38f9ee780a487e8bf6954c115b9f015822d3bb8e48"}, - {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:764e71f22ab3b305e7f4c21f1a97e1526a25ebdd22513e251cf376760213da13"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f056bf21105c2515c32372bbc057f43eb02aae2fda61052e2f7622c801f0b4e2"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:69ab78f848845569401469da20df3e081e6b5a11cb086de3eed1d48f5ed57c95"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = 
"sha256:86fddba590aad9208e2fa8b43b4c098bb0ec74f15718bb6a704e3c63e2cef3e9"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:684d7a212682996d21ca12ef3c17353c021fe9de6049e19ac8481ec35574a70f"}, - {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a03e02f48cd1abbd9f3b7e3586d97c8f7a9721c436f51a5245b3b9483044480b"}, - {file = "regex-2024.11.6-cp39-cp39-win32.whl", hash = "sha256:41758407fc32d5c3c5de163888068cfee69cb4c2be844e7ac517a52770f9af57"}, - {file = "regex-2024.11.6-cp39-cp39-win_amd64.whl", hash = "sha256:b2837718570f95dd41675328e111345f9b7095d821bac435aac173ac80b19983"}, - {file = "regex-2024.11.6.tar.gz", hash = "sha256:7ab159b063c52a0333c884e4679f8d7a85112ee3078fe3d9004b2dd875585519"}, -] - [[package]] name = "requests" version = "2.32.3" @@ -3329,4 +3226,4 @@ scrapy = ["scrapy"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "bd3a28d080a0548f41c8d9a50ede2725a8b12985203f2afebd98b7607471b003" +content-hash = "da388b618b4c9b95567d426529d0b7cda05d33909995c409e595c99e6f1767ff" diff --git a/pyproject.toml b/pyproject.toml index dc4f56c1..68091faa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,6 @@ lazy-object-proxy = ">=1.10.0" scrapy = { version = ">=2.11.0", optional = true } typing-extensions = ">=4.1.0" websockets = ">=10.0 <14.0.0" -regex = "^2024.11.6" [tool.poetry.group.dev.dependencies] build = "~1.2.0" diff --git a/src/apify/storages/_actor_inputs.py b/src/apify/storages/_actor_inputs.py index c53a051a..9437a578 100644 --- a/src/apify/storages/_actor_inputs.py +++ b/src/apify/storages/_actor_inputs.py @@ -1,11 +1,11 @@ from __future__ import annotations import asyncio +import re from asyncio import Task from functools import partial from typing import TYPE_CHECKING, Any -import regex from pydantic import BaseModel, ConfigDict, Field if TYPE_CHECKING: @@ -18,8 +18,8 @@ from ._known_actor_input_keys import ActorInputKeys -URL_NO_COMMAS_REGEX = regex.compile( - r'https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?' +URL_NO_COMMAS_REGEX = re.compile( + r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?' 
) @@ -108,7 +108,7 @@ async def _create_requests_from_url( def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: """Callback to scrape response body with regexp and create Requests from matches.""" - matches = regex.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) + matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) created_requests.extend([Request.from_url( match.group(0), method=request_input.method, diff --git a/tests/unit/actor/test_actor_inputs.py b/tests/unit/actor/test_actor_inputs.py index 4d7ef83e..f039e549 100644 --- a/tests/unit/actor/test_actor_inputs.py +++ b/tests/unit/actor/test_actor_inputs.py @@ -1,11 +1,11 @@ from __future__ import annotations +import re from typing import Any, Iterator, get_args from unittest import mock from unittest.mock import call import pytest -import regex from crawlee._request import UserData from crawlee._types import HttpHeaders, HttpMethod @@ -172,11 +172,13 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None: 'http://a.com', 'http://www.something.com/somethignelse' 'http://www.something.com/somethignelse.txt', - 'http://non-english-chars-รกรญรฉรฅรผ.com' + 'http://non-english-chars-รกรญรฉรฅรผ.com', + 'http://www.port.com:1234', + 'http://username:password@something.apify.com' ]) def test_url_no_commas_regex_true_positives(true_positive: str) -> None: example_string= f'Some text {true_positive} some more text' - matches = list(regex.finditer(URL_NO_COMMAS_REGEX, example_string)) + matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 1 assert matches[0].group(0) == true_positive @@ -190,12 +192,12 @@ def test_url_no_commas_regex_true_positives(true_positive: str) -> None: ]) def test_url_no_commas_regex_false_positives(false_positive: str) -> None: example_string= f'Some text {false_positive} some more text' - matches = list(regex.findall(URL_NO_COMMAS_REGEX, example_string)) + matches = list(re.findall(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 0 def test_url_no_commas_regex_multi_line() -> None: true_positives = ('http://www.something.com', 'http://www.else.com') example_string= 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives) - matches = list(regex.finditer(URL_NO_COMMAS_REGEX, example_string)) + matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 2 assert {match.group(0) for match in matches} == set(true_positives) From b4ad24fb37db5d4a66431e7642576847a08c8e5e Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 18 Nov 2024 18:07:47 +0100 Subject: [PATCH 10/20] Reduce some test code repetition. 
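Editorial note on the regex change above: the pattern now relies on the standard `re` module with `\w` / `[^\W_]` classes, which are Unicode-aware in Python 3 and therefore still accept the non-English-letter URLs covered by the tests. A small self-contained sketch of the matching behaviour follows; the pattern is copied from `src/apify/storages/_actor_inputs.py` as patched, while the sample sentence and the expected output are illustrative assumptions, not part of the test suite.

    import re

    # Same pattern as in src/apify/storages/_actor_inputs.py after this series.
    URL_NO_COMMAS_REGEX = re.compile(
        r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
    )

    text = 'See http://www.example.com/page and https://docs.example.org:8080/path, thanks.'
    print([match.group(0) for match in re.finditer(URL_NO_COMMAS_REGEX, text)])
    # Expected: ['http://www.example.com/page', 'https://docs.example.org:8080/path']
    # The trailing comma is not part of the match, and a bare 'www.example.com'
    # without a scheme does not match at all.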
--- tests/unit/actor/test_actor_inputs.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/unit/actor/test_actor_inputs.py b/tests/unit/actor/test_actor_inputs.py index f039e549..08a3f155 100644 --- a/tests/unit/actor/test_actor_inputs.py +++ b/tests/unit/actor/test_actor_inputs.py @@ -136,12 +136,13 @@ async def test_actor_create_request_list_from_url() -> None: async def test_actor_create_request_list_from_url_additional_inputs() -> None: """Test that all generated request properties are correctly populated from input values.""" expected_simple_url = 'https://www.someurl.com' - example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls:[ - {ActorInputKeys.startUrls.requestsFromUrl: 'https://crawlee.dev/file.txt', 'method': 'POST', - ActorInputKeys.startUrls.headers: {'key': 'value'}, - ActorInputKeys.startUrls.payload: 'some_payload', - ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}}, - ]} + example_start_url_input = { + ActorInputKeys.startUrls.requestsFromUrl: 'https://crawlee.dev/file.txt', + ActorInputKeys.startUrls.method: 'POST', + ActorInputKeys.startUrls.headers: {'key': 'value'}, + ActorInputKeys.startUrls.payload: 'some_payload', + ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}} + example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls:[example_start_url_input]} response_bodies = iter((expected_simple_url,)) http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): @@ -149,12 +150,12 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None: request = await generated_input.start_urls.fetch_next_request() # Check all properties correctly created for request + example_start_url_input = example_actor_input[ActorInputKeys.startUrls][0] assert request assert request.url == expected_simple_url - assert request.method == example_actor_input[ActorInputKeys.startUrls][0][ActorInputKeys.startUrls.method] - assert request.headers.root == example_actor_input[ActorInputKeys.startUrls][0][ActorInputKeys.startUrls.headers] - assert request.payload == example_actor_input[ActorInputKeys.startUrls][0][ActorInputKeys.startUrls.payload].encode( - 'utf-8') + assert request.method == example_start_url_input[ActorInputKeys.startUrls.method] + assert request.headers.root == example_start_url_input[ActorInputKeys.startUrls.headers] + assert request.payload == str(example_start_url_input[ActorInputKeys.startUrls.payload]).encode('utf-8') expected_user_data = UserData() for key, value in example_actor_input[ActorInputKeys.startUrls][0][ActorInputKeys.startUrls.userData].items(): expected_user_data[key] = value @@ -174,7 +175,7 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None: 'http://www.something.com/somethignelse.txt', 'http://non-english-chars-รกรญรฉรฅรผ.com', 'http://www.port.com:1234', - 'http://username:password@something.apify.com' + 'http://username:password@something.else.com' ]) def test_url_no_commas_regex_true_positives(true_positive: str) -> None: example_string= f'Some text {true_positive} some more text' From 05d048a0f4afcd9a578f5333945a76e7276df39b Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 18 Nov 2024 18:09:21 +0100 Subject: [PATCH 11/20] Remove types-regex --- poetry.lock | 13 +------------ pyproject.toml | 1 - 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/poetry.lock b/poetry.lock index f43b2f3a..768801be 100644 --- 
a/poetry.lock +++ b/poetry.lock @@ -2823,17 +2823,6 @@ files = [ {file = "types_python_dateutil-2.9.0.20241003-py3-none-any.whl", hash = "sha256:250e1d8e80e7bbc3a6c99b907762711d1a1cdd00e978ad39cb5940f6f0a87f3d"}, ] -[[package]] -name = "types-regex" -version = "2024.11.6.20241108" -description = "Typing stubs for regex" -optional = false -python-versions = ">=3.8" -files = [ - {file = "types-regex-2024.11.6.20241108.tar.gz", hash = "sha256:a774e307b99b3df49481b29e8b4962f021693052a8d8a2b9e6792fcec896cf5e"}, - {file = "types_regex-2024.11.6.20241108-py3-none-any.whl", hash = "sha256:adec2ff2dfed00855551057334466fde923606599d01e7440556d53a3ef20835"}, -] - [[package]] name = "typing-extensions" version = "4.12.2" @@ -3226,4 +3215,4 @@ scrapy = ["scrapy"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "da388b618b4c9b95567d426529d0b7cda05d33909995c409e595c99e6f1767ff" +content-hash = "3698d5b2d562a7a83489d316a70b6685d4276f9aa9adb904ea5f39479cc8eeee" diff --git a/pyproject.toml b/pyproject.toml index 68091faa..c3a01c41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,6 @@ pytest-xdist = "~3.6.0" respx = "~0.21.0" ruff = "~0.7.0" setuptools = "~75.0.0" # setuptools are used by pytest but not explicitly required -types-regex = "^2024.11.6.20241108" [tool.poetry.extras] scrapy = ["scrapy"] From 376ae8bea6d13f949c46fe5274df556adf559cdf Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 18 Nov 2024 18:23:37 +0100 Subject: [PATCH 12/20] Make ruff happy --- src/apify/_actor.py | 3 +- src/apify/storages/_actor_inputs.py | 36 +++-- src/apify/storages/_known_actor_input_keys.py | 17 +-- tests/unit/actor/test_actor_inputs.py | 128 +++++++++++------- 4 files changed, 114 insertions(+), 70 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index f60a99df..4f3f032f 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -8,7 +8,6 @@ from lazy_object_proxy import Proxy from pydantic import AliasChoices -from typing_extensions import Self from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars @@ -31,6 +30,8 @@ import logging from types import TracebackType + from typing_extensions import Self + from crawlee.proxy_configuration import _NewUrlFunction from apify._models import Webhook diff --git a/src/apify/storages/_actor_inputs.py b/src/apify/storages/_actor_inputs.py index 9437a578..524ac70b 100644 --- a/src/apify/storages/_actor_inputs.py +++ b/src/apify/storages/_actor_inputs.py @@ -23,12 +23,12 @@ ) - class _RequestDetails(BaseModel): method: HttpMethod payload: str = '' headers: dict[str, str] = Field(default_factory=dict) - user_data: dict[str, str]= Field(default_factory=dict, alias=ActorInputKeys.startUrls.userData) + user_data: dict[str, str] = Field(default_factory=dict, alias=ActorInputKeys.startUrls.userData) + class _RequestsFromUrlInput(_RequestDetails): requests_from_url: str = Field(alias=ActorInputKeys.startUrls.requestsFromUrl) @@ -37,6 +37,7 @@ class _RequestsFromUrlInput(_RequestDetails): class _SimpleUrlInput(_RequestDetails): url: str + class Input(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) start_urls: RequestList @@ -45,11 +46,13 @@ class Input(BaseModel): async def read(cls, raw_input: dict[str, Any], http_client: BaseHttpClient | None = None) -> Self: if ActorInputKeys.startUrls in raw_input: request_list = await _create_request_list( - actor_start_urls_input=raw_input[ActorInputKeys.startUrls], http_client=http_client) + 
actor_start_urls_input=raw_input[ActorInputKeys.startUrls], http_client=http_client + ) else: request_list = RequestList() return cls(start_urls=request_list) + async def _create_request_list( *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None ) -> RequestList: @@ -69,10 +72,13 @@ async def _create_request_list( if not http_client: http_client = HttpxHttpClient() simple_url_requests_inputs = [ - _SimpleUrlInput(**request_input) for request_input in actor_start_urls_input - if ActorInputKeys.startUrls.url in request_input] + _SimpleUrlInput(**request_input) + for request_input in actor_start_urls_input + if ActorInputKeys.startUrls.url in request_input + ] remote_url_requests_inputs = [ - _RequestsFromUrlInput(**request_input) for request_input in actor_start_urls_input + _RequestsFromUrlInput(**request_input) + for request_input in actor_start_urls_input if ActorInputKeys.startUrls.requestsFromUrl in request_input ] @@ -109,12 +115,18 @@ async def _create_requests_from_url( def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: """Callback to scrape response body with regexp and create Requests from matches.""" matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) - created_requests.extend([Request.from_url( - match.group(0), - method=request_input.method, - payload=request_input.payload.encode('utf-8'), - headers=request_input.headers, - user_data=request_input.user_data) for match in matches]) + created_requests.extend( + [ + Request.from_url( + match.group(0), + method=request_input.method, + payload=request_input.payload.encode('utf-8'), + headers=request_input.headers, + user_data=request_input.user_data, + ) + for match in matches + ] + ) remote_url_requests = [] for remote_url_requests_input in remote_url_requests_inputs: diff --git a/src/apify/storages/_known_actor_input_keys.py b/src/apify/storages/_known_actor_input_keys.py index 49347393..2283a056 100644 --- a/src/apify/storages/_known_actor_input_keys.py +++ b/src/apify/storages/_known_actor_input_keys.py @@ -1,23 +1,23 @@ - - class _KnownInputKey(str): __slots__ = ('_name',) + def __init__(self, name: str) -> None: self._name = name def __str__(self) -> str: return self._name - def __repr__(self) ->str: + def __repr__(self) -> str: return str(self) + class _StartUrls(_KnownInputKey): - url='url' + url = 'url' requestsFromUrl = 'requestsFromUrl' # noqa: N815 # Intentional to respect actual naming of input keys. - method='method' - payload= 'payload' - userData='userData' # noqa: N815 # Intentional to respect actual naming of input keys. - headers='headers' + method = 'method' + payload = 'payload' + userData = 'userData' # noqa: N815 # Intentional to respect actual naming of input keys. + headers = 'headers' class _ActorInputKeys: @@ -25,4 +25,5 @@ class _ActorInputKeys: startUrls: _StartUrls = _StartUrls('startUrls') # noqa: N815 # Intentional to respect actual naming of input keys. 
# More inputs should be gradually added + ActorInputKeys = _ActorInputKeys() diff --git a/tests/unit/actor/test_actor_inputs.py b/tests/unit/actor/test_actor_inputs.py index 08a3f155..736a1137 100644 --- a/tests/unit/actor/test_actor_inputs.py +++ b/tests/unit/actor/test_actor_inputs.py @@ -19,8 +19,11 @@ 'optional_input', [ {}, - {ActorInputKeys.startUrls.payload: 'some payload', ActorInputKeys.startUrls.userData: - {'some key': 'some value'}, ActorInputKeys.startUrls.headers: {'h1': 'v1', 'h2': 'v2'}}, + { + ActorInputKeys.startUrls.payload: 'some payload', + ActorInputKeys.startUrls.userData: {'some key': 'some value'}, + ActorInputKeys.startUrls.headers: {'h1': 'v1', 'h2': 'v2'}, + }, ], ids=['minimal', 'all_options'], ) @@ -28,8 +31,10 @@ async def test_actor_create_request_list_request_types( request_method: HttpMethod, optional_input: dict[str, Any] ) -> None: """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" - minimal_request_dict_input = {ActorInputKeys.startUrls.url: 'https://www.abc.com', - ActorInputKeys.startUrls.method: request_method} + minimal_request_dict_input = { + ActorInputKeys.startUrls.url: 'https://www.abc.com', + ActorInputKeys.startUrls.method: request_method, + } request_dict_input = {**minimal_request_dict_input, **optional_input} example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [request_dict_input]} @@ -75,17 +80,25 @@ def read(self) -> bytes: async def test_actor_create_request_list_from_url_correctly_send_requests() -> None: """Test that injected HttpClient's method send_request is called with properly passed arguments.""" - example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [ - {ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', ActorInputKeys.startUrls.method: 'GET'}, - {ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', ActorInputKeys.startUrls.method: 'PUT'}, - { - ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som', - ActorInputKeys.startUrls.method: 'POST', - ActorInputKeys.startUrls.headers: {'key': 'value'}, - ActorInputKeys.startUrls.payload: 'some_payload', - ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, - }, - ]} + example_actor_input: dict[str, Any] = { + ActorInputKeys.startUrls: [ + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', + ActorInputKeys.startUrls.method: 'GET', + }, + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', + ActorInputKeys.startUrls.method: 'PUT', + }, + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som', + ActorInputKeys.startUrls.method: 'POST', + ActorInputKeys.startUrls.headers: {'key': 'value'}, + ActorInputKeys.startUrls.payload: 'some_payload', + ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, + }, + ] + } mocked_read_outputs = ('' for url in example_actor_input[ActorInputKeys.startUrls]) http_client = HttpxHttpClient() @@ -117,11 +130,19 @@ async def test_actor_create_request_list_from_url() -> None: ) ) - example_actor_input:dict[str, Any] = {ActorInputKeys.startUrls:[ - {ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', ActorInputKeys.startUrls.method: 'GET'}, - {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'}, - {ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', ActorInputKeys.startUrls.method: 'GET'}, - ]} + example_actor_input: dict[str, Any] = { + 
ActorInputKeys.startUrls: [ + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', + ActorInputKeys.startUrls.method: 'GET', + }, + {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'}, + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', + ActorInputKeys.startUrls.method: 'GET', + }, + ] + } http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): @@ -133,7 +154,8 @@ async def test_actor_create_request_list_from_url() -> None: # Check correctly created requests' urls in request list assert {generated_request.url for generated_request in generated_requests} == expected_urls -async def test_actor_create_request_list_from_url_additional_inputs() -> None: + +async def test_actor_create_request_list_from_url_additional_inputs() -> None: """Test that all generated request properties are correctly populated from input values.""" expected_simple_url = 'https://www.someurl.com' example_start_url_input = { @@ -141,8 +163,9 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None: ActorInputKeys.startUrls.method: 'POST', ActorInputKeys.startUrls.headers: {'key': 'value'}, ActorInputKeys.startUrls.payload: 'some_payload', - ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}} - example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls:[example_start_url_input]} + ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, + } + example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [example_start_url_input]} response_bodies = iter((expected_simple_url,)) http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): @@ -162,43 +185,50 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None: assert request.user_data == expected_user_data -@pytest.mark.parametrize('true_positive', [ - 'http://www.something.com', - 'https://www.something.net', - 'http://nowww.cz', - 'https://with-hypen.com', - 'http://number1.com', - 'http://www.number.123.abc', - 'http://many.dots.com', - 'http://a.com', - 'http://www.something.com/somethignelse' - 'http://www.something.com/somethignelse.txt', - 'http://non-english-chars-รกรญรฉรฅรผ.com', - 'http://www.port.com:1234', - 'http://username:password@something.else.com' -]) +@pytest.mark.parametrize( + 'true_positive', + [ + 'http://www.something.com', + 'https://www.something.net', + 'http://nowww.cz', + 'https://with-hypen.com', + 'http://number1.com', + 'http://www.number.123.abc', + 'http://many.dots.com', + 'http://a.com', + 'http://www.something.com/somethignelse' 'http://www.something.com/somethignelse.txt', + 'http://non-english-chars-รกรญรฉรฅรผ.com', + 'http://www.port.com:1234', + 'http://username:password@something.else.com', + ], +) def test_url_no_commas_regex_true_positives(true_positive: str) -> None: - example_string= f'Some text {true_positive} some more text' + example_string = f'Some text {true_positive} some more text' matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 1 assert matches[0].group(0) == true_positive -@pytest.mark.parametrize('false_positive',[ - 'http://www.a', - 'http://a', - 'http://a.a', - 'http://123.456', - 'www.something.com', - 'http:www.something.com', -]) + +@pytest.mark.parametrize( + 'false_positive', + [ + 'http://www.a', + 'http://a', + 'http://a.a', + 
'http://123.456', + 'www.something.com', + 'http:www.something.com', + ], +) def test_url_no_commas_regex_false_positives(false_positive: str) -> None: - example_string= f'Some text {false_positive} some more text' + example_string = f'Some text {false_positive} some more text' matches = list(re.findall(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 0 + def test_url_no_commas_regex_multi_line() -> None: true_positives = ('http://www.something.com', 'http://www.else.com') - example_string= 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives) + example_string = 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives) matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 2 assert {match.group(0) for match in matches} == set(true_positives) From 629939ef904c1e85976905812e213bca8e0ec264 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 19 Nov 2024 08:45:44 +0100 Subject: [PATCH 13/20] Remove Input class It had too many assumptions that users might not be interested in. Users should create such Input helper classes based on their specific inputs and their names. --- src/apify/storages/__init__.py | 4 +- src/apify/storages/_actor_inputs.py | 28 ++----- tests/unit/actor/test_actor_inputs.py | 105 ++++++++++++-------------- 3 files changed, 56 insertions(+), 81 deletions(-) diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index 8fd33ba3..1c77e7b3 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,5 +1,5 @@ from crawlee.storages import Dataset, KeyValueStore, RequestQueue -from ._actor_inputs import Input +from ._actor_inputs import create_request_list -__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'Input'] +__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'create_request_list'] diff --git a/src/apify/storages/_actor_inputs.py b/src/apify/storages/_actor_inputs.py index 524ac70b..675e7fb2 100644 --- a/src/apify/storages/_actor_inputs.py +++ b/src/apify/storages/_actor_inputs.py @@ -4,12 +4,9 @@ import re from asyncio import Task from functools import partial -from typing import TYPE_CHECKING, Any +from typing import Any -from pydantic import BaseModel, ConfigDict, Field - -if TYPE_CHECKING: - from typing_extensions import Self +from pydantic import BaseModel, Field from crawlee import Request from crawlee._types import HttpMethod @@ -24,7 +21,7 @@ class _RequestDetails(BaseModel): - method: HttpMethod + method: HttpMethod = 'GET' payload: str = '' headers: dict[str, str] = Field(default_factory=dict) user_data: dict[str, str] = Field(default_factory=dict, alias=ActorInputKeys.startUrls.userData) @@ -38,23 +35,8 @@ class _SimpleUrlInput(_RequestDetails): url: str -class Input(BaseModel): - model_config = ConfigDict(arbitrary_types_allowed=True) - start_urls: RequestList - - @classmethod - async def read(cls, raw_input: dict[str, Any], http_client: BaseHttpClient | None = None) -> Self: - if ActorInputKeys.startUrls in raw_input: - request_list = await _create_request_list( - actor_start_urls_input=raw_input[ActorInputKeys.startUrls], http_client=http_client - ) - else: - request_list = RequestList() - return cls(start_urls=request_list) - - -async def _create_request_list( - *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None +async def create_request_list( + actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None ) -> RequestList: """Creates 
RequestList from Actor input requestListSources. diff --git a/tests/unit/actor/test_actor_inputs.py b/tests/unit/actor/test_actor_inputs.py index 736a1137..91c253f0 100644 --- a/tests/unit/actor/test_actor_inputs.py +++ b/tests/unit/actor/test_actor_inputs.py @@ -11,7 +11,7 @@ from crawlee._types import HttpHeaders, HttpMethod from crawlee.http_clients import HttpResponse, HttpxHttpClient -from apify.storages._actor_inputs import URL_NO_COMMAS_REGEX, ActorInputKeys, Input +from apify.storages._actor_inputs import URL_NO_COMMAS_REGEX, ActorInputKeys, create_request_list @pytest.mark.parametrize('request_method', get_args(HttpMethod)) @@ -36,24 +36,22 @@ async def test_actor_create_request_list_request_types( ActorInputKeys.startUrls.method: request_method, } request_dict_input = {**minimal_request_dict_input, **optional_input} - example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [request_dict_input]} - generated_input = await Input.read(example_actor_input) + request_list = await create_request_list([request_dict_input]) + assert not await request_list.is_empty() + request = await request_list.fetch_next_request() + assert request is not None + assert await request_list.is_empty() - assert not await generated_input.start_urls.is_empty() - generated_request = await generated_input.start_urls.fetch_next_request() - assert generated_request is not None - assert await generated_input.start_urls.is_empty() - - assert generated_request.method == request_dict_input[ActorInputKeys.startUrls.method] - assert generated_request.url == request_dict_input[ActorInputKeys.startUrls.url] - assert generated_request.payload == request_dict_input.get(ActorInputKeys.startUrls.payload, '').encode('utf-8') + assert request.method == request_dict_input[ActorInputKeys.startUrls.method] + assert request.url == request_dict_input[ActorInputKeys.startUrls.url] + assert request.payload == request_dict_input.get(ActorInputKeys.startUrls.payload, '').encode('utf-8') expected_user_data = UserData() if ActorInputKeys.startUrls.userData in optional_input: for key, value in optional_input[ActorInputKeys.startUrls.userData].items(): expected_user_data[key] = value - assert generated_request.user_data == expected_user_data - assert generated_request.headers.root == optional_input.get(ActorInputKeys.startUrls.headers, {}) + assert request.user_data == expected_user_data + assert request.headers.root == optional_input.get(ActorInputKeys.startUrls.headers, {}) def _create_dummy_response(read_output: Iterator[str]) -> HttpResponse: @@ -80,39 +78,37 @@ def read(self) -> bytes: async def test_actor_create_request_list_from_url_correctly_send_requests() -> None: """Test that injected HttpClient's method send_request is called with properly passed arguments.""" - example_actor_input: dict[str, Any] = { - ActorInputKeys.startUrls: [ - { - ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', - ActorInputKeys.startUrls.method: 'GET', - }, - { - ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', - ActorInputKeys.startUrls.method: 'PUT', - }, - { - ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som', - ActorInputKeys.startUrls.method: 'POST', - ActorInputKeys.startUrls.headers: {'key': 'value'}, - ActorInputKeys.startUrls.payload: 'some_payload', - ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, - }, - ] - } + actor_start_urls_input: list[dict[str, Any]] = [ + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', + 
ActorInputKeys.startUrls.method: 'GET', + }, + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', + ActorInputKeys.startUrls.method: 'PUT', + }, + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som', + ActorInputKeys.startUrls.method: 'POST', + ActorInputKeys.startUrls.headers: {'key': 'value'}, + ActorInputKeys.startUrls.payload: 'some_payload', + ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, + }, + ] - mocked_read_outputs = ('' for url in example_actor_input[ActorInputKeys.startUrls]) + mocked_read_outputs = ('' for url in actor_start_urls_input) http_client = HttpxHttpClient() with mock.patch.object( http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs) ) as mocked_send_request: - await Input.read(example_actor_input, http_client=http_client) + await create_request_list(actor_start_urls_input, http_client=http_client) expected_calls = [ call( method='GET', url=example_input[ActorInputKeys.startUrls.requestsFromUrl], ) - for example_input in example_actor_input[ActorInputKeys.startUrls] + for example_input in actor_start_urls_input ] mocked_send_request.assert_has_calls(expected_calls) @@ -130,25 +126,23 @@ async def test_actor_create_request_list_from_url() -> None: ) ) - example_actor_input: dict[str, Any] = { - ActorInputKeys.startUrls: [ - { - ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', - ActorInputKeys.startUrls.method: 'GET', - }, - {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'}, - { - ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', - ActorInputKeys.startUrls.method: 'GET', - }, - ] - } + actor_start_urls_input = [ + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', + ActorInputKeys.startUrls.method: 'GET', + }, + {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'}, + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', + ActorInputKeys.startUrls.method: 'GET', + }, + ] http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - generated_input = await Input.read(example_actor_input, http_client=http_client) + request_list = await create_request_list(actor_start_urls_input, http_client=http_client) generated_requests = [] - while request := await generated_input.start_urls.fetch_next_request(): + while request := await request_list.fetch_next_request(): generated_requests.append(request) # Check correctly created requests' urls in request list @@ -158,29 +152,28 @@ async def test_actor_create_request_list_from_url() -> None: async def test_actor_create_request_list_from_url_additional_inputs() -> None: """Test that all generated request properties are correctly populated from input values.""" expected_simple_url = 'https://www.someurl.com' - example_start_url_input = { + example_start_url_input: dict[str, Any] = { ActorInputKeys.startUrls.requestsFromUrl: 'https://crawlee.dev/file.txt', ActorInputKeys.startUrls.method: 'POST', ActorInputKeys.startUrls.headers: {'key': 'value'}, ActorInputKeys.startUrls.payload: 'some_payload', ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, } - example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [example_start_url_input]} + response_bodies = iter((expected_simple_url,)) http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', 
return_value=_create_dummy_response(response_bodies)): - generated_input = await Input.read(example_actor_input, http_client=http_client) - request = await generated_input.start_urls.fetch_next_request() + request_list = await create_request_list([example_start_url_input], http_client=http_client) + request = await request_list.fetch_next_request() # Check all properties correctly created for request - example_start_url_input = example_actor_input[ActorInputKeys.startUrls][0] assert request assert request.url == expected_simple_url assert request.method == example_start_url_input[ActorInputKeys.startUrls.method] assert request.headers.root == example_start_url_input[ActorInputKeys.startUrls.headers] assert request.payload == str(example_start_url_input[ActorInputKeys.startUrls.payload]).encode('utf-8') expected_user_data = UserData() - for key, value in example_actor_input[ActorInputKeys.startUrls][0][ActorInputKeys.startUrls.userData].items(): + for key, value in example_start_url_input[ActorInputKeys.startUrls.userData].items(): expected_user_data[key] = value assert request.user_data == expected_user_data From 910d11f142c959597f9c1bac29b35e03e32bb700 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 19 Nov 2024 13:43:11 +0100 Subject: [PATCH 14/20] Review comments. Update TCH001, TCH002, TCH003 uses. --- pyproject.toml | 2 +- src/apify/_configuration.py | 1 - src/apify/_models.py | 1 - src/apify/scrapy/middlewares/apify_proxy.py | 6 +- .../scrapy/pipelines/actor_dataset_push.py | 5 +- src/apify/scrapy/scheduler.py | 5 +- src/apify/scrapy/utils.py | 5 +- src/apify/storages/__init__.py | 4 +- src/apify/storages/_known_actor_input_keys.py | 29 ------ .../{_actor_inputs.py => request_list.py} | 81 +++++++++------- ...t_actor_inputs.py => test_request_list.py} | 96 ++++++++++--------- 11 files changed, 115 insertions(+), 120 deletions(-) delete mode 100644 src/apify/storages/_known_actor_input_keys.py rename src/apify/storages/{_actor_inputs.py => request_list.py} (61%) rename tests/unit/actor/{test_actor_inputs.py => test_request_list.py} (63%) diff --git a/pyproject.toml b/pyproject.toml index c3a01c41..7647dfa0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,7 +142,7 @@ docstring-quotes = "double" inline-quotes = "single" [tool.ruff.lint.flake8-type-checking] -runtime-evaluated-base-classes = ["pydantic.BaseModel"] +runtime-evaluated-base-classes = ["pydantic.BaseModel", "crawlee.configuration.Configuration", "ApifyHttpProxyMiddleware"] [tool.ruff.lint.flake8-builtins] builtins-ignorelist = ["id"] diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index ab249284..00bb0336 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -1,4 +1,3 @@ -# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work) from __future__ import annotations from datetime import datetime, timedelta diff --git a/src/apify/_models.py b/src/apify/_models.py index 5963ec9a..f9b2f9a8 100644 --- a/src/apify/_models.py +++ b/src/apify/_models.py @@ -1,4 +1,3 @@ -# ruff: noqa: TCH001 TCH002 TCH003 (Pydantic) from __future__ import annotations from datetime import datetime, timedelta diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index 3a7f7b75..b1dc2b88 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -1,11 +1,13 @@ from __future__ import annotations +from typing import TYPE_CHECKING from urllib.parse import ParseResult, urlparse try: - 
from scrapy import Request, Spider # noqa: TCH002 + if TYPE_CHECKING: + from scrapy import Request, Spider + from scrapy.crawler import Crawler from scrapy.core.downloader.handlers.http11 import TunnelError - from scrapy.crawler import Crawler # noqa: TCH002 from scrapy.exceptions import NotConfigured except ImportError as exc: raise ImportError( diff --git a/src/apify/scrapy/pipelines/actor_dataset_push.py b/src/apify/scrapy/pipelines/actor_dataset_push.py index 15026475..d2d983cc 100644 --- a/src/apify/scrapy/pipelines/actor_dataset_push.py +++ b/src/apify/scrapy/pipelines/actor_dataset_push.py @@ -1,9 +1,12 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from itemadapter.adapter import ItemAdapter try: - from scrapy import Item, Spider # noqa: TCH002 + if TYPE_CHECKING: + from scrapy import Item, Spider except ImportError as exc: raise ImportError( 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 849e5376..da79ac64 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -1,6 +1,7 @@ from __future__ import annotations import traceback +from typing import TYPE_CHECKING from apify._configuration import Configuration from apify.apify_storage_client import ApifyStorageClient @@ -8,8 +9,10 @@ try: from scrapy import Spider from scrapy.core.scheduler import BaseScheduler - from scrapy.http.request import Request # noqa: TCH002 from scrapy.utils.reactor import is_asyncio_reactor_installed + + if TYPE_CHECKING: + from scrapy.http.request import Request except ImportError as exc: raise ImportError( 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index f22a60de..1f92d4ff 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -2,14 +2,17 @@ import asyncio from base64 import b64encode +from typing import TYPE_CHECKING from urllib.parse import unquote from apify_shared.utils import ignore_docs try: - from scrapy.settings import Settings # noqa: TCH002 from scrapy.utils.project import get_project_settings from scrapy.utils.python import to_bytes + + if TYPE_CHECKING: + from scrapy.settings import Settings except ImportError as exc: raise ImportError( 'To use this module, you need to install the "scrapy" extra. 
For example, if you use pip, run ' diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index 1c77e7b3..fc812aa1 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,5 +1,5 @@ from crawlee.storages import Dataset, KeyValueStore, RequestQueue -from ._actor_inputs import create_request_list +from .request_list import RequestList -__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'create_request_list'] +__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'RequestList'] diff --git a/src/apify/storages/_known_actor_input_keys.py b/src/apify/storages/_known_actor_input_keys.py deleted file mode 100644 index 2283a056..00000000 --- a/src/apify/storages/_known_actor_input_keys.py +++ /dev/null @@ -1,29 +0,0 @@ -class _KnownInputKey(str): - __slots__ = ('_name',) - - def __init__(self, name: str) -> None: - self._name = name - - def __str__(self) -> str: - return self._name - - def __repr__(self) -> str: - return str(self) - - -class _StartUrls(_KnownInputKey): - url = 'url' - requestsFromUrl = 'requestsFromUrl' # noqa: N815 # Intentional to respect actual naming of input keys. - method = 'method' - payload = 'payload' - userData = 'userData' # noqa: N815 # Intentional to respect actual naming of input keys. - headers = 'headers' - - -class _ActorInputKeys: - # Helper class to have actor input strings all in one place and easy to use with code completion. - startUrls: _StartUrls = _StartUrls('startUrls') # noqa: N815 # Intentional to respect actual naming of input keys. - # More inputs should be gradually added - - -ActorInputKeys = _ActorInputKeys() diff --git a/src/apify/storages/_actor_inputs.py b/src/apify/storages/request_list.py similarity index 61% rename from src/apify/storages/_actor_inputs.py rename to src/apify/storages/request_list.py index 675e7fb2..a3e11006 100644 --- a/src/apify/storages/_actor_inputs.py +++ b/src/apify/storages/request_list.py @@ -4,16 +4,14 @@ import re from asyncio import Task from functools import partial -from typing import Any +from typing import Any, Union -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, TypeAdapter from crawlee import Request from crawlee._types import HttpMethod from crawlee.http_clients import BaseHttpClient, HttpxHttpClient -from crawlee.storages import RequestList - -from ._known_actor_input_keys import ActorInputKeys +from crawlee.storages import RequestList as CrawleeRequestList URL_NO_COMMAS_REGEX = re.compile( r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?' 
@@ -24,50 +22,63 @@ class _RequestDetails(BaseModel): method: HttpMethod = 'GET' payload: str = '' headers: dict[str, str] = Field(default_factory=dict) - user_data: dict[str, str] = Field(default_factory=dict, alias=ActorInputKeys.startUrls.userData) + user_data: dict[str, str] = Field(default_factory=dict, alias='userData') class _RequestsFromUrlInput(_RequestDetails): - requests_from_url: str = Field(alias=ActorInputKeys.startUrls.requestsFromUrl) + requests_from_url: str = Field(alias='requestsFromUrl') class _SimpleUrlInput(_RequestDetails): url: str -async def create_request_list( - actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None +url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]]) + + +class RequestList(CrawleeRequestList): + """Extends crawlee RequestList.""" + + @classmethod + async def open( + cls, + name: str | None = None, + actor_start_urls_input: list[dict[str, Any]] | None = None, + http_client: BaseHttpClient | None = None, + ) -> RequestList: + """Creates RequestList from Actor input requestListSources. + + name is name of the returned RequestList + actor_start_urls_input can contain list dicts with either url or requestsFromUrl key + http_client is client that will be used to send get request to url defined in requestsFromUrl + + Example actor_start_urls_input: + [ + # Gather urls from response body. + {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + # Directly include this url. + {'url': 'https://crawlee.dev', 'method': 'GET'} + ] + """ + actor_start_urls_input = actor_start_urls_input or [] + return await _create_request_list(name, actor_start_urls_input, http_client) + + +async def _create_request_list( + name: str | None, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None ) -> RequestList: - """Creates RequestList from Actor input requestListSources. - - actor_start_urls_input can contain list dicts with either url or requestsFromUrl key - http_client is client that will be used to send get request to url defined in requestsFromUrl - - Example: - actor_start_urls_input = [ - # Gather urls from response body. - {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, - # Directly include this url. 
- {'url': 'https://crawlee.dev', 'method': 'GET'} - ] - """ if not http_client: http_client = HttpxHttpClient() - simple_url_requests_inputs = [ - _SimpleUrlInput(**request_input) - for request_input in actor_start_urls_input - if ActorInputKeys.startUrls.url in request_input - ] - remote_url_requests_inputs = [ - _RequestsFromUrlInput(**request_input) - for request_input in actor_start_urls_input - if ActorInputKeys.startUrls.requestsFromUrl in request_input - ] - simple_url_requests = _create_requests_from_input(simple_url_requests_inputs) - remote_url_requests = await _create_requests_from_url(remote_url_requests_inputs, http_client=http_client) + ulr_inputs = url_input_adapter.validate_python(actor_start_urls_input) # instance of list[Union[...]] + + simple_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _SimpleUrlInput] + remote_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _RequestsFromUrlInput] + + simple_url_requests = _create_requests_from_input(simple_url_inputs) + remote_url_requests = await _create_requests_from_url(remote_url_inputs, http_client=http_client) - return RequestList(requests=simple_url_requests + remote_url_requests) + return RequestList(name=name, requests=simple_url_requests + remote_url_requests) def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]: diff --git a/tests/unit/actor/test_actor_inputs.py b/tests/unit/actor/test_request_list.py similarity index 63% rename from tests/unit/actor/test_actor_inputs.py rename to tests/unit/actor/test_request_list.py index 91c253f0..e3bbfb36 100644 --- a/tests/unit/actor/test_actor_inputs.py +++ b/tests/unit/actor/test_request_list.py @@ -11,7 +11,7 @@ from crawlee._types import HttpHeaders, HttpMethod from crawlee.http_clients import HttpResponse, HttpxHttpClient -from apify.storages._actor_inputs import URL_NO_COMMAS_REGEX, ActorInputKeys, create_request_list +from apify.storages.request_list import URL_NO_COMMAS_REGEX, RequestList @pytest.mark.parametrize('request_method', get_args(HttpMethod)) @@ -20,38 +20,36 @@ [ {}, { - ActorInputKeys.startUrls.payload: 'some payload', - ActorInputKeys.startUrls.userData: {'some key': 'some value'}, - ActorInputKeys.startUrls.headers: {'h1': 'v1', 'h2': 'v2'}, + 'payload': 'some payload', + 'userData': {'some key': 'some value'}, + 'headers': {'h1': 'v1', 'h2': 'v2'}, }, ], ids=['minimal', 'all_options'], ) -async def test_actor_create_request_list_request_types( - request_method: HttpMethod, optional_input: dict[str, Any] -) -> None: +async def test_request_list_open_request_types(request_method: HttpMethod, optional_input: dict[str, Any]) -> None: """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" minimal_request_dict_input = { - ActorInputKeys.startUrls.url: 'https://www.abc.com', - ActorInputKeys.startUrls.method: request_method, + 'url': 'https://www.abc.com', + 'method': request_method, } request_dict_input = {**minimal_request_dict_input, **optional_input} - request_list = await create_request_list([request_dict_input]) + request_list = await RequestList.open(actor_start_urls_input=[request_dict_input]) assert not await request_list.is_empty() request = await request_list.fetch_next_request() assert request is not None assert await request_list.is_empty() - assert request.method == request_dict_input[ActorInputKeys.startUrls.method] - assert request.url == request_dict_input[ActorInputKeys.startUrls.url] - assert 
request.payload == request_dict_input.get(ActorInputKeys.startUrls.payload, '').encode('utf-8') + assert request.method == request_dict_input['method'] + assert request.url == request_dict_input['url'] + assert request.payload == request_dict_input.get('payload', '').encode('utf-8') expected_user_data = UserData() - if ActorInputKeys.startUrls.userData in optional_input: - for key, value in optional_input[ActorInputKeys.startUrls.userData].items(): + if 'userData' in optional_input: + for key, value in optional_input['userData'].items(): expected_user_data[key] = value assert request.user_data == expected_user_data - assert request.headers.root == optional_input.get(ActorInputKeys.startUrls.headers, {}) + assert request.headers.root == optional_input.get('headers', {}) def _create_dummy_response(read_output: Iterator[str]) -> HttpResponse: @@ -76,23 +74,23 @@ def read(self) -> bytes: return DummyResponse() -async def test_actor_create_request_list_from_url_correctly_send_requests() -> None: +async def test__request_list_open_from_url_correctly_send_requests() -> None: """Test that injected HttpClient's method send_request is called with properly passed arguments.""" actor_start_urls_input: list[dict[str, Any]] = [ { - ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', - ActorInputKeys.startUrls.method: 'GET', + 'requestsFromUrl': 'https://abc.dev/file.txt', + 'method': 'GET', }, { - ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', - ActorInputKeys.startUrls.method: 'PUT', + 'requestsFromUrl': 'https://www.abc.dev/file2', + 'method': 'PUT', }, { - ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som', - ActorInputKeys.startUrls.method: 'POST', - ActorInputKeys.startUrls.headers: {'key': 'value'}, - ActorInputKeys.startUrls.payload: 'some_payload', - ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, + 'requestsFromUrl': 'https://www.something.som', + 'method': 'POST', + 'headers': {'key': 'value'}, + 'payload': 'some_payload', + 'userData': {'another_key': 'another_value'}, }, ] @@ -101,19 +99,19 @@ async def test_actor_create_request_list_from_url_correctly_send_requests() -> N with mock.patch.object( http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs) ) as mocked_send_request: - await create_request_list(actor_start_urls_input, http_client=http_client) + await RequestList.open(actor_start_urls_input=actor_start_urls_input, http_client=http_client) expected_calls = [ call( method='GET', - url=example_input[ActorInputKeys.startUrls.requestsFromUrl], + url=example_input['requestsFromUrl'], ) for example_input in actor_start_urls_input ] mocked_send_request.assert_has_calls(expected_calls) -async def test_actor_create_request_list_from_url() -> None: +async def test_request_list_open_from_url() -> None: """Test that create_request_list is correctly reading urls from remote url sources and also from simple input.""" expected_simple_url = 'https://www.someurl.com' expected_remote_urls_1 = {'http://www.something.com', 'https://www.somethingelse.com', 'http://www.bla.net'} @@ -128,19 +126,19 @@ async def test_actor_create_request_list_from_url() -> None: actor_start_urls_input = [ { - ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', - ActorInputKeys.startUrls.method: 'GET', + 'requestsFromUrl': 'https://abc.dev/file.txt', + 'method': 'GET', }, - {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'}, + {'url': expected_simple_url, 
'method': 'GET'}, { - ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', - ActorInputKeys.startUrls.method: 'GET', + 'requestsFromUrl': 'https://www.abc.dev/file2', + 'method': 'GET', }, ] http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - request_list = await create_request_list(actor_start_urls_input, http_client=http_client) + request_list = await RequestList.open(actor_start_urls_input=actor_start_urls_input, http_client=http_client) generated_requests = [] while request := await request_list.fetch_next_request(): generated_requests.append(request) @@ -149,35 +147,41 @@ async def test_actor_create_request_list_from_url() -> None: assert {generated_request.url for generated_request in generated_requests} == expected_urls -async def test_actor_create_request_list_from_url_additional_inputs() -> None: +async def test_request_list_open_from_url_additional_inputs() -> None: """Test that all generated request properties are correctly populated from input values.""" expected_simple_url = 'https://www.someurl.com' example_start_url_input: dict[str, Any] = { - ActorInputKeys.startUrls.requestsFromUrl: 'https://crawlee.dev/file.txt', - ActorInputKeys.startUrls.method: 'POST', - ActorInputKeys.startUrls.headers: {'key': 'value'}, - ActorInputKeys.startUrls.payload: 'some_payload', - ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, + 'requestsFromUrl': 'https://crawlee.dev/file.txt', + 'method': 'POST', + 'headers': {'key': 'value'}, + 'payload': 'some_payload', + 'userData': {'another_key': 'another_value'}, } response_bodies = iter((expected_simple_url,)) http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - request_list = await create_request_list([example_start_url_input], http_client=http_client) + request_list = await RequestList.open(actor_start_urls_input=[example_start_url_input], http_client=http_client) request = await request_list.fetch_next_request() # Check all properties correctly created for request assert request assert request.url == expected_simple_url - assert request.method == example_start_url_input[ActorInputKeys.startUrls.method] - assert request.headers.root == example_start_url_input[ActorInputKeys.startUrls.headers] - assert request.payload == str(example_start_url_input[ActorInputKeys.startUrls.payload]).encode('utf-8') + assert request.method == example_start_url_input['method'] + assert request.headers.root == example_start_url_input['headers'] + assert request.payload == str(example_start_url_input['payload']).encode('utf-8') expected_user_data = UserData() - for key, value in example_start_url_input[ActorInputKeys.startUrls.userData].items(): + for key, value in example_start_url_input['userData'].items(): expected_user_data[key] = value assert request.user_data == expected_user_data +async def test_request_list_open_name() -> None: + name = 'some_name' + request_list = await RequestList.open(name=name) + assert request_list.name == name + + @pytest.mark.parametrize( 'true_positive', [ From 6ff9e9058afa3522cf3cc425b3249fc5d96dae37 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 19 Nov 2024 13:45:30 +0100 Subject: [PATCH 15/20] Remove unnecessary pyproject setting value. 
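For context on how the helper is meant to be consumed at this point in the series, a rough usage sketch inside an Actor's main coroutine; `Actor.get_input()` and the `startUrls` input key are assumptions here, only `RequestList.open(actor_start_urls_input=...)` and `fetch_next_request` come from the diffs above:

```python
from apify import Actor
from apify.storages import RequestList


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # 'startUrls' is an assumed input key holding the url / requestsFromUrl entries.
        start_urls = actor_input.get('startUrls', [])
        request_list = await RequestList.open(actor_start_urls_input=start_urls)
        while request := await request_list.fetch_next_request():
            Actor.log.info(f'Loaded request for {request.url}')
```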
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7647dfa0..bda4c509 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,7 +142,7 @@ docstring-quotes = "double" inline-quotes = "single" [tool.ruff.lint.flake8-type-checking] -runtime-evaluated-base-classes = ["pydantic.BaseModel", "crawlee.configuration.Configuration", "ApifyHttpProxyMiddleware"] +runtime-evaluated-base-classes = ["pydantic.BaseModel", "crawlee.configuration.Configuration"] [tool.ruff.lint.flake8-builtins] builtins-ignorelist = ["id"] From 3f3314529e7929e823a0540262de45dd6456c1bb Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 19 Nov 2024 16:25:38 +0100 Subject: [PATCH 16/20] Addresing review comments --- src/apify/storages/__init__.py | 2 +- src/apify/storages/_request_list.py | 140 ++++++++++++++++++++++++++ src/apify/storages/request_list.py | 137 ------------------------- tests/unit/actor/test_request_list.py | 35 ++++--- 4 files changed, 161 insertions(+), 153 deletions(-) create mode 100644 src/apify/storages/_request_list.py delete mode 100644 src/apify/storages/request_list.py diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index fc812aa1..63ac7af6 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,5 +1,5 @@ from crawlee.storages import Dataset, KeyValueStore, RequestQueue -from .request_list import RequestList +from ._request_list import RequestList __all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'RequestList'] diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py new file mode 100644 index 00000000..c4ca1d9a --- /dev/null +++ b/src/apify/storages/_request_list.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +import asyncio +import re +from asyncio import Task +from functools import partial +from typing import Annotated, Any, Union + +from pydantic import BaseModel, Field, TypeAdapter + +from crawlee import Request +from crawlee._types import HttpMethod +from crawlee.http_clients import BaseHttpClient, HttpxHttpClient +from crawlee.storages import RequestList as CrawleeRequestList + +URL_NO_COMMAS_REGEX = re.compile( + r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?' +) + + +class _RequestDetails(BaseModel): + method: HttpMethod = 'GET' + payload: str = '' + headers: Annotated[dict[str, str], Field(default_factory=dict)] = {} + user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {} + + +class _RequestsFromUrlInput(_RequestDetails): + requests_from_url: str = Field(alias='requestsFromUrl') + + +class _SimpleUrlInput(_RequestDetails): + url: str + + +url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]]) + + +# @docs_group('Classes') # Not yet available in crawlee +class RequestList(CrawleeRequestList): + """Extends crawlee RequestList. + + Method open is used to create RequestList from actor's requestListSources input. + """ + + @staticmethod + async def open( + name: str | None = None, + request_list_sources_input: list[dict[str, Any]] | None = None, + http_client: BaseHttpClient | None = None, + ) -> RequestList: + """Creates RequestList from Actor input requestListSources. 
+ + name is name of the returned RequestList + request_list_sources_input can contain list dicts with either url or requestsFromUrl key + http_client is client that will be used to send get request to url defined in requestsFromUrl + + Example request_list_sources_input: + [ + # Gather urls from response body. + {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + # Directly include this url. + {'url': 'https://crawlee.dev', 'method': 'GET'} + ] + """ + request_list_sources_input = request_list_sources_input or [] + return await RequestList._create_request_list(name, request_list_sources_input, http_client) + + @staticmethod + async def _create_request_list( + name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None + ) -> RequestList: + if not http_client: + http_client = HttpxHttpClient() + + ulr_inputs = url_input_adapter.validate_python(request_list_sources_input) # instance of list[Union[...]] + + simple_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _SimpleUrlInput] + remote_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _RequestsFromUrlInput] + + simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs) + remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client) + + return RequestList(name=name, requests=simple_url_requests + remote_url_requests) + + @staticmethod + def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]: + return [ + Request.from_url( + method=request_input.method, + url=request_input.url, + payload=request_input.payload.encode('utf-8'), + headers=request_input.headers, + user_data=request_input.user_data, + ) + for request_input in simple_url_inputs + ] + + @staticmethod + async def _fetch_requests_from_url( + remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient + ) -> list[Request]: + """Crete list of requests from url. + + Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting + callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from + collected links and additional inputs stored in other attributes of each remote_url_requests_inputs. 
+ """ + created_requests: list[Request] = [] + + def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: + """Callback to scrape response body with regexp and create Requests from matches.""" + matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) + created_requests.extend( + [ + Request.from_url( + match.group(0), + method=request_input.method, + payload=request_input.payload.encode('utf-8'), + headers=request_input.headers, + user_data=request_input.user_data, + ) + for match in matches + ] + ) + + remote_url_requests = [] + for remote_url_requests_input in remote_url_requests_inputs: + get_response_task = asyncio.create_task( + http_client.send_request( + method='GET', + url=remote_url_requests_input.requests_from_url, + ) + ) + + get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input)) + remote_url_requests.append(get_response_task) + + await asyncio.gather(*remote_url_requests) + return created_requests diff --git a/src/apify/storages/request_list.py b/src/apify/storages/request_list.py deleted file mode 100644 index a3e11006..00000000 --- a/src/apify/storages/request_list.py +++ /dev/null @@ -1,137 +0,0 @@ -from __future__ import annotations - -import asyncio -import re -from asyncio import Task -from functools import partial -from typing import Any, Union - -from pydantic import BaseModel, Field, TypeAdapter - -from crawlee import Request -from crawlee._types import HttpMethod -from crawlee.http_clients import BaseHttpClient, HttpxHttpClient -from crawlee.storages import RequestList as CrawleeRequestList - -URL_NO_COMMAS_REGEX = re.compile( - r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?' -) - - -class _RequestDetails(BaseModel): - method: HttpMethod = 'GET' - payload: str = '' - headers: dict[str, str] = Field(default_factory=dict) - user_data: dict[str, str] = Field(default_factory=dict, alias='userData') - - -class _RequestsFromUrlInput(_RequestDetails): - requests_from_url: str = Field(alias='requestsFromUrl') - - -class _SimpleUrlInput(_RequestDetails): - url: str - - -url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]]) - - -class RequestList(CrawleeRequestList): - """Extends crawlee RequestList.""" - - @classmethod - async def open( - cls, - name: str | None = None, - actor_start_urls_input: list[dict[str, Any]] | None = None, - http_client: BaseHttpClient | None = None, - ) -> RequestList: - """Creates RequestList from Actor input requestListSources. - - name is name of the returned RequestList - actor_start_urls_input can contain list dicts with either url or requestsFromUrl key - http_client is client that will be used to send get request to url defined in requestsFromUrl - - Example actor_start_urls_input: - [ - # Gather urls from response body. - {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, - # Directly include this url. 
- {'url': 'https://crawlee.dev', 'method': 'GET'} - ] - """ - actor_start_urls_input = actor_start_urls_input or [] - return await _create_request_list(name, actor_start_urls_input, http_client) - - -async def _create_request_list( - name: str | None, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None -) -> RequestList: - if not http_client: - http_client = HttpxHttpClient() - - ulr_inputs = url_input_adapter.validate_python(actor_start_urls_input) # instance of list[Union[...]] - - simple_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _SimpleUrlInput] - remote_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _RequestsFromUrlInput] - - simple_url_requests = _create_requests_from_input(simple_url_inputs) - remote_url_requests = await _create_requests_from_url(remote_url_inputs, http_client=http_client) - - return RequestList(name=name, requests=simple_url_requests + remote_url_requests) - - -def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]: - return [ - Request.from_url( - method=request_input.method, - url=request_input.url, - payload=request_input.payload.encode('utf-8'), - headers=request_input.headers, - user_data=request_input.user_data, - ) - for request_input in simple_url_inputs - ] - - -async def _create_requests_from_url( - remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient -) -> list[Request]: - """Crete list of requests from url. - - Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting - callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from - collected links and additional inputs stored in other attributes of each remote_url_requests_inputs. 
- """ - created_requests: list[Request] = [] - - def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: - """Callback to scrape response body with regexp and create Requests from matches.""" - matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) - created_requests.extend( - [ - Request.from_url( - match.group(0), - method=request_input.method, - payload=request_input.payload.encode('utf-8'), - headers=request_input.headers, - user_data=request_input.user_data, - ) - for match in matches - ] - ) - - remote_url_requests = [] - for remote_url_requests_input in remote_url_requests_inputs: - get_response_task = asyncio.create_task( - http_client.send_request( - method='GET', - url=remote_url_requests_input.requests_from_url, - ) - ) - - get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input)) - remote_url_requests.append(get_response_task) - - await asyncio.gather(*remote_url_requests) - return created_requests diff --git a/tests/unit/actor/test_request_list.py b/tests/unit/actor/test_request_list.py index e3bbfb36..6c26ba64 100644 --- a/tests/unit/actor/test_request_list.py +++ b/tests/unit/actor/test_request_list.py @@ -11,7 +11,7 @@ from crawlee._types import HttpHeaders, HttpMethod from crawlee.http_clients import HttpResponse, HttpxHttpClient -from apify.storages.request_list import URL_NO_COMMAS_REGEX, RequestList +from apify.storages._request_list import URL_NO_COMMAS_REGEX, RequestList @pytest.mark.parametrize('request_method', get_args(HttpMethod)) @@ -35,7 +35,7 @@ async def test_request_list_open_request_types(request_method: HttpMethod, optio } request_dict_input = {**minimal_request_dict_input, **optional_input} - request_list = await RequestList.open(actor_start_urls_input=[request_dict_input]) + request_list = await RequestList.open(request_list_sources_input=[request_dict_input]) assert not await request_list.is_empty() request = await request_list.fetch_next_request() assert request is not None @@ -76,7 +76,7 @@ def read(self) -> bytes: async def test__request_list_open_from_url_correctly_send_requests() -> None: """Test that injected HttpClient's method send_request is called with properly passed arguments.""" - actor_start_urls_input: list[dict[str, Any]] = [ + request_list_sources_input: list[dict[str, Any]] = [ { 'requestsFromUrl': 'https://abc.dev/file.txt', 'method': 'GET', @@ -94,19 +94,20 @@ async def test__request_list_open_from_url_correctly_send_requests() -> None: }, ] - mocked_read_outputs = ('' for url in actor_start_urls_input) - http_client = HttpxHttpClient() + mocked_read_outputs = ('' for url in request_list_sources_input) + + mocked_http_client = mock.Mock(spec_set=HttpxHttpClient) with mock.patch.object( - http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs) + mocked_http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs) ) as mocked_send_request: - await RequestList.open(actor_start_urls_input=actor_start_urls_input, http_client=http_client) + await RequestList.open(request_list_sources_input=request_list_sources_input, http_client=mocked_http_client) expected_calls = [ call( method='GET', url=example_input['requestsFromUrl'], ) - for example_input in actor_start_urls_input + for example_input in request_list_sources_input ] mocked_send_request.assert_has_calls(expected_calls) @@ -124,7 +125,7 @@ async def test_request_list_open_from_url() -> None: ) ) - actor_start_urls_input = [ + 
request_list_sources_input = [ { 'requestsFromUrl': 'https://abc.dev/file.txt', 'method': 'GET', @@ -136,9 +137,11 @@ async def test_request_list_open_from_url() -> None: }, ] - http_client = HttpxHttpClient() - with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - request_list = await RequestList.open(actor_start_urls_input=actor_start_urls_input, http_client=http_client) + mocked_http_client = mock.Mock(spec_set=HttpxHttpClient) + with mock.patch.object(mocked_http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): + request_list = await RequestList.open( + request_list_sources_input=request_list_sources_input, http_client=mocked_http_client + ) generated_requests = [] while request := await request_list.fetch_next_request(): generated_requests.append(request) @@ -159,9 +162,11 @@ async def test_request_list_open_from_url_additional_inputs() -> None: } response_bodies = iter((expected_simple_url,)) - http_client = HttpxHttpClient() - with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - request_list = await RequestList.open(actor_start_urls_input=[example_start_url_input], http_client=http_client) + mocked_http_client = mock.Mock(spec_set=HttpxHttpClient) + with mock.patch.object(mocked_http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): + request_list = await RequestList.open( + request_list_sources_input=[example_start_url_input], http_client=mocked_http_client + ) request = await request_list.fetch_next_request() # Check all properties correctly created for request From 318c9c8353c9d0c0a433c075de350ddb3ef0be3c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 20 Nov 2024 09:26:50 +0100 Subject: [PATCH 17/20] Addresing review comments 2 --- src/apify/storages/_request_list.py | 32 +++++--- tests/unit/actor/test_request_list.py | 108 ++++++++++---------------- 2 files changed, 62 insertions(+), 78 deletions(-) diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py index c4ca1d9a..b3ce3f03 100644 --- a/src/apify/storages/_request_list.py +++ b/src/apify/storages/_request_list.py @@ -51,17 +51,25 @@ async def open( ) -> RequestList: """Creates RequestList from Actor input requestListSources. - name is name of the returned RequestList - request_list_sources_input can contain list dicts with either url or requestsFromUrl key - http_client is client that will be used to send get request to url defined in requestsFromUrl - - Example request_list_sources_input: - [ - # Gather urls from response body. - {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, - # Directly include this url. - {'url': 'https://crawlee.dev', 'method': 'GET'} - ] + Args: + name: Name of the returned RequestList. + request_list_sources_input: List of dicts with either url key or requestsFromUrl key. + http_client: Client that will be used to send get request to urls defined by value of requestsFromUrl keys. + + Returns: + RequestList created from request_list_sources_input. + + ### Usage + + ```python + example_input = [ + # Gather urls from response body. + {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'}, + # Directly include this url. 
+ {'url': 'https://crawlee.dev', 'method': 'GET'} + ] + request_list = await RequestList.open(request_list_sources_input=example_input) + ``` """ request_list_sources_input = request_list_sources_input or [] return await RequestList._create_request_list(name, request_list_sources_input, http_client) @@ -73,7 +81,7 @@ async def _create_request_list( if not http_client: http_client = HttpxHttpClient() - ulr_inputs = url_input_adapter.validate_python(request_list_sources_input) # instance of list[Union[...]] + ulr_inputs = url_input_adapter.validate_python(request_list_sources_input) simple_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _SimpleUrlInput] remote_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _RequestsFromUrlInput] diff --git a/tests/unit/actor/test_request_list.py b/tests/unit/actor/test_request_list.py index 6c26ba64..4f4c75ac 100644 --- a/tests/unit/actor/test_request_list.py +++ b/tests/unit/actor/test_request_list.py @@ -1,15 +1,15 @@ from __future__ import annotations import re -from typing import Any, Iterator, get_args -from unittest import mock -from unittest.mock import call +from dataclasses import dataclass +from typing import Any, get_args import pytest +import respx +from httpx import Response from crawlee._request import UserData -from crawlee._types import HttpHeaders, HttpMethod -from crawlee.http_clients import HttpResponse, HttpxHttpClient +from crawlee._types import HttpMethod from apify.storages._request_list import URL_NO_COMMAS_REGEX, RequestList @@ -52,30 +52,9 @@ async def test_request_list_open_request_types(request_method: HttpMethod, optio assert request.headers.root == optional_input.get('headers', {}) -def _create_dummy_response(read_output: Iterator[str]) -> HttpResponse: - """Create dummy_response that will iterate through read_output when called like dummy_response.read()""" - - class DummyResponse(HttpResponse): - @property - def http_version(self) -> str: - return '' - - @property - def status_code(self) -> int: - return 200 - - @property - def headers(self) -> HttpHeaders: - return HttpHeaders() - - def read(self) -> bytes: - return next(read_output).encode('utf-8') - - return DummyResponse() - - -async def test__request_list_open_from_url_correctly_send_requests() -> None: - """Test that injected HttpClient's method send_request is called with properly passed arguments.""" +@respx.mock +async def test_request_list_open_from_url_correctly_send_requests() -> None: + """Test that requests are sent to expected urls.""" request_list_sources_input: list[dict[str, Any]] = [ { 'requestsFromUrl': 'https://abc.dev/file.txt', @@ -94,65 +73,65 @@ async def test__request_list_open_from_url_correctly_send_requests() -> None: }, ] - mocked_read_outputs = ('' for url in request_list_sources_input) + routes = [respx.get(entry['requestsFromUrl']) for entry in request_list_sources_input] - mocked_http_client = mock.Mock(spec_set=HttpxHttpClient) - with mock.patch.object( - mocked_http_client, 'send_request', return_value=_create_dummy_response(mocked_read_outputs) - ) as mocked_send_request: - await RequestList.open(request_list_sources_input=request_list_sources_input, http_client=mocked_http_client) + await RequestList.open(request_list_sources_input=request_list_sources_input) - expected_calls = [ - call( - method='GET', - url=example_input['requestsFromUrl'], - ) - for example_input in request_list_sources_input - ] - mocked_send_request.assert_has_calls(expected_calls) + for route in routes: + assert 
route.called +@respx.mock async def test_request_list_open_from_url() -> None: """Test that create_request_list is correctly reading urls from remote url sources and also from simple input.""" expected_simple_url = 'https://www.someurl.com' expected_remote_urls_1 = {'http://www.something.com', 'https://www.somethingelse.com', 'http://www.bla.net'} expected_remote_urls_2 = {'http://www.ok.com', 'https://www.true-positive.com'} expected_urls = expected_remote_urls_1 | expected_remote_urls_2 | {expected_simple_url} - response_bodies = iter( - ( + + @dataclass + class MockedUrlInfo: + url: str + response_text: str + + mocked_urls = ( + MockedUrlInfo( + 'https://abc.dev/file.txt', 'blablabla{} more blablabla{} , even more blablabla. {} '.format(*expected_remote_urls_1), - 'some stuff{} more stuff{} www.falsepositive www.false_positive.com'.format(*expected_remote_urls_2), - ) + ), + MockedUrlInfo( + 'https://www.abc.dev/file2', + 'some stuff{} more stuff{} www.false_positive.com'.format(*expected_remote_urls_2), + ), ) request_list_sources_input = [ { - 'requestsFromUrl': 'https://abc.dev/file.txt', + 'requestsFromUrl': mocked_urls[0].url, 'method': 'GET', }, {'url': expected_simple_url, 'method': 'GET'}, { - 'requestsFromUrl': 'https://www.abc.dev/file2', + 'requestsFromUrl': mocked_urls[1].url, 'method': 'GET', }, ] + for mocked_url in mocked_urls: + respx.get(mocked_url.url).mock(return_value=Response(200, text=mocked_url.response_text)) - mocked_http_client = mock.Mock(spec_set=HttpxHttpClient) - with mock.patch.object(mocked_http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - request_list = await RequestList.open( - request_list_sources_input=request_list_sources_input, http_client=mocked_http_client - ) - generated_requests = [] - while request := await request_list.fetch_next_request(): - generated_requests.append(request) + request_list = await RequestList.open(request_list_sources_input=request_list_sources_input) + generated_requests = [] + while request := await request_list.fetch_next_request(): + generated_requests.append(request) # Check correctly created requests' urls in request list assert {generated_request.url for generated_request in generated_requests} == expected_urls +@respx.mock async def test_request_list_open_from_url_additional_inputs() -> None: """Test that all generated request properties are correctly populated from input values.""" - expected_simple_url = 'https://www.someurl.com' + expected_url = 'https://www.someurl.com' example_start_url_input: dict[str, Any] = { 'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'POST', @@ -161,17 +140,14 @@ async def test_request_list_open_from_url_additional_inputs() -> None: 'userData': {'another_key': 'another_value'}, } - response_bodies = iter((expected_simple_url,)) - mocked_http_client = mock.Mock(spec_set=HttpxHttpClient) - with mock.patch.object(mocked_http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): - request_list = await RequestList.open( - request_list_sources_input=[example_start_url_input], http_client=mocked_http_client - ) - request = await request_list.fetch_next_request() + respx.get(example_start_url_input['requestsFromUrl']).mock(return_value=Response(200, text=expected_url)) + + request_list = await RequestList.open(request_list_sources_input=[example_start_url_input]) + request = await request_list.fetch_next_request() # Check all properties correctly created for request assert request - assert request.url == 
expected_simple_url + assert request.url == expected_url assert request.method == example_start_url_input['method'] assert request.headers.root == example_start_url_input['headers'] assert request.payload == str(example_start_url_input['payload']).encode('utf-8') From feff6badb5badccb01170fa2f8c62f1480158291 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Wed, 20 Nov 2024 15:18:10 +0100 Subject: [PATCH 18/20] Update src/apify/storages/_request_list.py Co-authored-by: Jan Buchar --- src/apify/storages/_request_list.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py index b3ce3f03..3d21871c 100644 --- a/src/apify/storages/_request_list.py +++ b/src/apify/storages/_request_list.py @@ -81,10 +81,10 @@ async def _create_request_list( if not http_client: http_client = HttpxHttpClient() - ulr_inputs = url_input_adapter.validate_python(request_list_sources_input) + url_inputs = url_input_adapter.validate_python(request_list_sources_input) - simple_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _SimpleUrlInput] - remote_url_inputs = [url_input for url_input in ulr_inputs if type(url_input) is _RequestsFromUrlInput] + simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)] + remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)] simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs) remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client) From b150e1dec152791d373de8a27f7d050e1d96381a Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 20 Nov 2024 15:30:40 +0100 Subject: [PATCH 19/20] Use docs_group decorator --- src/apify/storages/_request_list.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py index 3d21871c..0ad521c6 100644 --- a/src/apify/storages/_request_list.py +++ b/src/apify/storages/_request_list.py @@ -13,6 +13,8 @@ from crawlee.http_clients import BaseHttpClient, HttpxHttpClient from crawlee.storages import RequestList as CrawleeRequestList +from apify._utils import docs_group + URL_NO_COMMAS_REGEX = re.compile( r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?' ) @@ -36,7 +38,7 @@ class _SimpleUrlInput(_RequestDetails): url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]]) -# @docs_group('Classes') # Not yet available in crawlee +@docs_group('Classes') # Not yet available in crawlee class RequestList(CrawleeRequestList): """Extends crawlee RequestList. 
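For orientation, the `requestsFromUrl` code path exercised by the tests above reduces to fetching a plain-text body and scanning it with `URL_NO_COMMAS_REGEX`. A minimal, self-contained sketch of that extraction step follows; the regex literal is copied verbatim from `src/apify/storages/_request_list.py` as shown in the hunk above, while the sample body and the expected output in the final comment are illustrative assumptions rather than anything taken from the patches.

```python
import re

# Copied verbatim from src/apify/storages/_request_list.py (see the hunk above).
URL_NO_COMMAS_REGEX = re.compile(
    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
)

# Hypothetical body of a file referenced by a 'requestsFromUrl' source.
body = 'See https://crawlee.dev, https://www.example.com/path?x=1 and www.no-scheme.com'

# Same extraction the RequestList code performs on each fetched response body:
# every regex match becomes the URL of a new Request.
urls = [match.group(0) for match in re.finditer(URL_NO_COMMAS_REGEX, body)]
print(urls)  # ['https://crawlee.dev', 'https://www.example.com/path?x=1']
```

Because a comma cannot appear inside a match, comma-separated URL lists split cleanly, and bare domains without a scheme (such as `www.no-scheme.com`) are ignored, which is exactly what the false-positive strings in `test_request_list_open_from_url` assert.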
From 470041f3bc1c76c0ef6e2ebe46a3ff1b41dfb991 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 20 Nov 2024 16:44:31 +0100 Subject: [PATCH 20/20] Update src/apify/storages/_request_list.py --- src/apify/storages/_request_list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py index 0ad521c6..2dd381fa 100644 --- a/src/apify/storages/_request_list.py +++ b/src/apify/storages/_request_list.py @@ -38,7 +38,7 @@ class _SimpleUrlInput(_RequestDetails): url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]]) -@docs_group('Classes') # Not yet available in crawlee +@docs_group('Classes') class RequestList(CrawleeRequestList): """Extends crawlee RequestList.
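End-to-end, the series leaves `RequestList.open` as the single entry point for turning request list input into a ready-to-use request source. The sketch below shows how an Actor might call it; it is an illustration under stated assumptions, not code from the patches. It assumes the module path used by the unit tests (`apify.storages._request_list`; a public re-export is not shown here), an Actor input field named `requestListSources` (mirroring the docstring wording), and an illustrative storage name.

```python
import asyncio

from apify import Actor
from apify.storages._request_list import RequestList  # import path as used by the unit tests


async def main() -> None:
    async with Actor:
        # 'requestListSources' is an assumed input field name; align it with the Actor's input schema.
        actor_input = await Actor.get_input() or {}

        request_list = await RequestList.open(
            name='my-request-list',  # illustrative name for the returned RequestList
            request_list_sources_input=actor_input.get('requestListSources', []),
        )

        # Drain the list the same way the unit tests do: plain 'url' sources are included
        # directly, while 'requestsFromUrl' sources are fetched and scanned for links first.
        while request := await request_list.fetch_next_request():
            Actor.log.info(f'Prepared request: {request.method} {request.url}')


if __name__ == '__main__':
    asyncio.run(main())
```

Passing a custom `http_client` (for example a preconfigured `crawlee.http_clients.HttpxHttpClient`) remains possible through the keyword argument of the same name; when it is omitted, the code falls back to a default `HttpxHttpClient`, as the `_create_request_list` hunk shows.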