3 changes: 3 additions & 0 deletions pyproject.toml
@@ -141,6 +141,9 @@ indent-style = "space"
docstring-quotes = "double"
inline-quotes = "single"

[tool.ruff.lint.flake8-type-checking]
runtime-evaluated-base-classes = ["pydantic.BaseModel", "crawlee.configuration.Configuration"]
Contributor Author:
Even though crawlee.configuration.Configuration inherits from pydantic_settings.BaseSettings, which in turn inherits from pydantic.BaseModel, ruff has trouble following an inheritance hierarchy that deep. So until that changes, some models have to be listed here explicitly even though they ultimately inherit from pydantic.BaseModel.
See the closed issue where this is described as a known limitation, to a certain degree:
astral-sh/ruff#8725
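To illustrate the underlying constraint, here is a minimal sketch (not taken from this repository; JobConfig is a made-up model): pydantic resolves field annotations to the actual types at runtime, when the class is created, so the annotated types must be importable at runtime rather than only under TYPE_CHECKING. That is exactly what runtime-evaluated-base-classes tells ruff to account for.

from __future__ import annotations

from datetime import timedelta  # must stay a real runtime import for pydantic

from pydantic import BaseModel


class JobConfig(BaseModel):  # hypothetical model, for illustration only
    timeout: timedelta


# If `timedelta` were imported only under `if TYPE_CHECKING:`, pydantic could not
# resolve the annotation above and the model would fail at runtime.
print(JobConfig(timeout=timedelta(seconds=30)))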


[tool.ruff.lint.flake8-builtins]
builtins-ignorelist = ["id"]

3 changes: 2 additions & 1 deletion src/apify/_actor.py
@@ -8,7 +8,6 @@

from lazy_object_proxy import Proxy
from pydantic import AliasChoices
from typing_extensions import Self

from apify_client import ApifyClientAsync
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
@@ -31,6 +30,8 @@
import logging
from types import TracebackType

from typing_extensions import Self

from crawlee.proxy_configuration import _NewUrlFunction

from apify._models import Webhook
1 change: 0 additions & 1 deletion src/apify/_configuration.py
@@ -1,4 +1,3 @@
# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work)
from __future__ import annotations

from datetime import datetime, timedelta
1 change: 0 additions & 1 deletion src/apify/_models.py
@@ -1,4 +1,3 @@
# ruff: noqa: TCH001 TCH002 TCH003 (Pydantic)
from __future__ import annotations

from datetime import datetime, timedelta
2 changes: 1 addition & 1 deletion src/apify/_platform_event_manager.py
@@ -1,7 +1,7 @@
from __future__ import annotations

import asyncio
from datetime import datetime # noqa: TCH003
from datetime import datetime
from typing import TYPE_CHECKING, Annotated, Any, Literal, Union

import websockets.client
6 changes: 4 additions & 2 deletions src/apify/scrapy/middlewares/apify_proxy.py
@@ -1,11 +1,13 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urlparse

try:
from scrapy import Request, Spider # noqa: TCH002
if TYPE_CHECKING:
from scrapy import Request, Spider
from scrapy.crawler import Crawler
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.crawler import Crawler # noqa: TCH002
from scrapy.exceptions import NotConfigured
except ImportError as exc:
raise ImportError(
5 changes: 4 additions & 1 deletion src/apify/scrapy/pipelines/actor_dataset_push.py
@@ -1,9 +1,12 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from itemadapter.adapter import ItemAdapter

try:
from scrapy import Item, Spider # noqa: TCH002
if TYPE_CHECKING:
from scrapy import Item, Spider
except ImportError as exc:
raise ImportError(
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
5 changes: 4 additions & 1 deletion src/apify/scrapy/scheduler.py
@@ -1,15 +1,18 @@
from __future__ import annotations

import traceback
from typing import TYPE_CHECKING

from apify._configuration import Configuration
from apify.apify_storage_client import ApifyStorageClient

try:
from scrapy import Spider
from scrapy.core.scheduler import BaseScheduler
from scrapy.http.request import Request # noqa: TCH002
from scrapy.utils.reactor import is_asyncio_reactor_installed

if TYPE_CHECKING:
from scrapy.http.request import Request
except ImportError as exc:
raise ImportError(
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
5 changes: 4 additions & 1 deletion src/apify/scrapy/utils.py
@@ -2,14 +2,17 @@

import asyncio
from base64 import b64encode
from typing import TYPE_CHECKING
from urllib.parse import unquote

from apify_shared.utils import ignore_docs

try:
from scrapy.settings import Settings # noqa: TCH002
from scrapy.utils.project import get_project_settings
from scrapy.utils.python import to_bytes

if TYPE_CHECKING:
from scrapy.settings import Settings
except ImportError as exc:
raise ImportError(
'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
4 changes: 3 additions & 1 deletion src/apify/storages/__init__.py
@@ -1,3 +1,5 @@
from crawlee.storages import Dataset, KeyValueStore, RequestQueue

__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue']
from .request_list import RequestList

__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'RequestList']
137 changes: 137 additions & 0 deletions src/apify/storages/request_list.py
@@ -0,0 +1,137 @@
from __future__ import annotations

import asyncio
import re
from asyncio import Task
from functools import partial
from typing import Any, Union

from pydantic import BaseModel, Field, TypeAdapter

from crawlee import Request
from crawlee._types import HttpMethod
from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
from crawlee.storages import RequestList as CrawleeRequestList

URL_NO_COMMAS_REGEX = re.compile(
r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
)


class _RequestDetails(BaseModel):
method: HttpMethod = 'GET'
payload: str = ''
headers: dict[str, str] = Field(default_factory=dict)
user_data: dict[str, str] = Field(default_factory=dict, alias='userData')


class _RequestsFromUrlInput(_RequestDetails):
requests_from_url: str = Field(alias='requestsFromUrl')


class _SimpleUrlInput(_RequestDetails):
url: str


url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])


class RequestList(CrawleeRequestList):
"""Extends crawlee RequestList."""

@classmethod
async def open(
cls,
name: str | None = None,
actor_start_urls_input: list[dict[str, Any]] | None = None,
http_client: BaseHttpClient | None = None,
) -> RequestList:
"""Creates RequestList from Actor input requestListSources.
name is name of the returned RequestList
actor_start_urls_input can contain list dicts with either url or requestsFromUrl key
http_client is client that will be used to send get request to url defined in requestsFromUrl
Example actor_start_urls_input:
[
# Gather urls from response body.
{'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
# Directly include this url.
{'url': 'https://crawlee.dev', 'method': 'GET'}
]
"""
actor_start_urls_input = actor_start_urls_input or []
return await _create_request_list(name, actor_start_urls_input, http_client)


async def _create_request_list(
name: str | None, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None
) -> RequestList:
if not http_client:
http_client = HttpxHttpClient()

url_inputs = url_input_adapter.validate_python(actor_start_urls_input)  # instance of list[Union[...]]

simple_url_inputs = [url_input for url_input in url_inputs if type(url_input) is _SimpleUrlInput]
remote_url_inputs = [url_input for url_input in url_inputs if type(url_input) is _RequestsFromUrlInput]

simple_url_requests = _create_requests_from_input(simple_url_inputs)
remote_url_requests = await _create_requests_from_url(remote_url_inputs, http_client=http_client)

return RequestList(name=name, requests=simple_url_requests + remote_url_requests)


def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
return [
Request.from_url(
method=request_input.method,
url=request_input.url,
payload=request_input.payload.encode('utf-8'),
headers=request_input.headers,
user_data=request_input.user_data,
)
for request_input in simple_url_inputs
Contributor:
Btw, thinking about this class reminded me of this: apify/crawlee#2466

Basically, in JS Crawlee we used to instantiate all the requests at the beginning (like here), but that ate up much more memory than just keeping the plain POJO objects (dicts in Python, I guess) and instantiating the Request class only once we really need it. Maybe Python handles this better (has slimmer class instances).

It was at that time that we learned users love to make request lists that are over 1,000,000 requests long 😅
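As an illustration of that lazy approach, a rough sketch (not part of this PR; iter_requests_lazily is a hypothetical helper): only the lightweight input dicts are kept in memory, and each Request is materialized on demand.

from collections.abc import Iterator
from typing import Any

from crawlee import Request


def iter_requests_lazily(raw_inputs: list[dict[str, Any]]) -> Iterator[Request]:
    # Keep only plain dicts around; each Request instance exists just while the
    # caller is processing it, instead of holding millions of instances at once.
    for raw in raw_inputs:
        yield Request.from_url(
            raw['url'],
            method=raw.get('method', 'GET'),
            user_data=raw.get('userData', {}),
        )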

Contributor Author:

That is a valid point, but I think it will be addressed later in apify/crawlee-python#99, as the current class is admittedly not as mature as the JS version:

https://github.com/apify/crawlee-python/blob/master/src/crawlee/storages/_request_list.py#L23

Contributor:

Agreed, let's make sure the SDK supports everything the platform can offer first, then we can optimize.

]


async def _create_requests_from_url(
remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
) -> list[Request]:
"""Crete list of requests from url.
Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting
callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from
collected links and additional inputs stored in other attributes of each remote_url_requests_inputs.
"""
created_requests: list[Request] = []

def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
"""Callback to scrape response body with regexp and create Requests from matches."""
matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
created_requests.extend(
[
Request.from_url(
match.group(0),
method=request_input.method,
payload=request_input.payload.encode('utf-8'),
headers=request_input.headers,
user_data=request_input.user_data,
)
for match in matches
]
)

remote_url_requests = []
for remote_url_requests_input in remote_url_requests_inputs:
get_response_task = asyncio.create_task(
http_client.send_request(
method='GET',
url=remote_url_requests_input.requests_from_url,
)
)

get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
remote_url_requests.append(get_response_task)

await asyncio.gather(*remote_url_requests)
return created_requests
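For reference, a short usage sketch of the new RequestList.open API (the list name is illustrative; the input mirrors the example from the docstring above):

import asyncio

from apify.storages import RequestList


async def main() -> None:
    request_list = await RequestList.open(
        name='start-urls',
        actor_start_urls_input=[
            # Fetch this remote file and extract every URL found in its body.
            {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
            # Include this URL directly.
            {'url': 'https://crawlee.dev', 'method': 'GET'},
        ],
    )
    # The resulting RequestList can then be passed to a crawler as its request provider.
    print(request_list)


asyncio.run(main())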