Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from playwright.async_api import Page, Response
from typing_extensions import Self

from crawlee.crawlers._playwright._types import BlockRequestsFunction
from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


TStaticParseResult = TypeVar('TStaticParseResult')
Expand Down Expand Up @@ -190,8 +190,9 @@ async def from_playwright_crawling_context(
http_response = await PlaywrightHttpResponse.from_playwright_response(
response=context.response, protocol=protocol_guess or ''
)
# block_requests is useful only on pre-navigation contexts. It is useless here.
# block_requests and goto_options are useful only on pre-navigation contexts. They are useless here.
context_kwargs.pop('block_requests')
context_kwargs.pop('goto_options')
return cls(
parsed_content=await parser.parse(http_response),
http_response=http_response,
Expand All @@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
block_requests: BlockRequestsFunction | None = None
"""Blocks network requests matching specified URL patterns."""

goto_options: GotoOptions | None = None
"""Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""

@property
def page(self) -> Page:
"""The Playwright `Page` object for the current page.
Expand Down
9 changes: 8 additions & 1 deletion src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from ._playwright_crawling_context import PlaywrightCrawlingContext
from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
from ._types import GotoOptions
from ._utils import block_requests, infinite_scroll

TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
Expand Down Expand Up @@ -108,6 +109,7 @@ def __init__(
user_data_dir: str | Path | None = None,
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
goto_options: GotoOptions | None = None,
fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
headless: bool | None = None,
use_incognito_pages: bool | None = None,
Expand Down Expand Up @@ -142,6 +144,8 @@ def __init__(
This option should not be used if `browser_pool` is provided.
navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
the request handler)
goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
not supported; use `navigation_timeout` instead.
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
"""
configuration = kwargs.pop('configuration', None)
Expand Down Expand Up @@ -213,6 +217,7 @@ def __init__(
kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
self._goto_options = goto_options or GotoOptions()

super().__init__(**kwargs)

Expand All @@ -238,6 +243,7 @@ async def _open_page(
log=context.log,
page=crawlee_page.page,
block_requests=partial(block_requests, page=crawlee_page.page),
goto_options=GotoOptions(**self._goto_options),
)

context_id = id(pre_navigation_context)
Expand Down Expand Up @@ -321,7 +327,7 @@ async def _navigate(
try:
async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
response = await context.page.goto(
context.request.url, timeout=remaining_timeout.total_seconds() * 1000
context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
)
except playwright.async_api.TimeoutError as exc:
raise asyncio.TimeoutError from exc
Expand Down Expand Up @@ -351,6 +357,7 @@ async def _navigate(
extract_links=extract_links,
enqueue_links=self._create_enqueue_links_function(context, extract_links),
block_requests=partial(block_requests, page=context.page),
goto_options=context.goto_options,
)

if context.session:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
if TYPE_CHECKING:
from playwright.async_api import Page

from ._types import BlockRequestsFunction
from ._types import BlockRequestsFunction, GotoOptions


@dataclass(frozen=True)
Expand All @@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
block_requests: BlockRequestsFunction
"""Blocks network requests matching specified URL patterns."""

goto_options: GotoOptions
"""Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""

async def get_snapshot(self) -> PageSnapshot:
"""Get snapshot of crawled page."""
html = None
Expand Down
14 changes: 12 additions & 2 deletions src/crawlee/crawlers/_playwright/_types.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Protocol
from typing import TYPE_CHECKING, Literal, Protocol, TypedDict

from crawlee import HttpHeaders
from crawlee._utils.docs import docs_group
Expand All @@ -10,7 +10,7 @@
from collections.abc import AsyncGenerator

from playwright.async_api import APIResponse, Response
from typing_extensions import Self
from typing_extensions import NotRequired, Self


@docs_group('Functions')
Expand Down Expand Up @@ -58,3 +58,13 @@ async def from_playwright_response(cls, response: Response | APIResponse, protoc
_content = await response.body()

return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)


class GotoOptions(TypedDict):
    """Keyword arguments for Playwright's `Page.goto()` method.

    Every key is declared `NotRequired`, so any subset of the keys (including none) may be supplied.
    `timeout` is not a key of this dict.
    """

    # Optional key; its value is restricted to the four literal strings listed in the annotation.
    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
    """When to consider operation succeeded, defaults to 'load' event."""

    # Optional key; a plain string.
    referer: NotRequired[str]
    """Referer header value."""
Loading