Skip to content

Commit 0b82f3b

Browse files
authored
feat: Add goto_options for PlaywrightCrawler (#1599)
### Description

- This PR adds `goto_options` for `PlaywrightCrawler`, which allows additional configuration of `page.goto`.
- Also, `goto_options` is now an attribute of `PlaywrightPreNavCrawlingContext`, so it can be adjusted for specific URLs in `pre_navigation_hooks`.

### Issues

- Closes: #1576
1 parent 8351aa1 commit 0b82f3b

File tree

4 files changed

+30
-6
lines changed

4 files changed

+30
-6
lines changed

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from playwright.async_api import Page, Response
1818
from typing_extensions import Self
1919

20-
from crawlee.crawlers._playwright._types import BlockRequestsFunction
20+
from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions
2121

2222

2323
TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ async def from_playwright_crawling_context(
190190
http_response = await PlaywrightHttpResponse.from_playwright_response(
191191
response=context.response, protocol=protocol_guess or ''
192192
)
193-
# block_requests is useful only on pre-navigation contexts. It is useless here.
193+
# block_requests and goto_options are useful only on pre-navigation contexts. They are useless here.
194194
context_kwargs.pop('block_requests')
195+
context_kwargs.pop('goto_options')
195196
return cls(
196197
parsed_content=await parser.parse(http_response),
197198
http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
212213
block_requests: BlockRequestsFunction | None = None
213214
"""Blocks network requests matching specified URL patterns."""
214215

216+
goto_options: GotoOptions | None = None
217+
"""Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
218+
215219
@property
216220
def page(self) -> Page:
217221
"""The Playwright `Page` object for the current page.

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from ._playwright_crawling_context import PlaywrightCrawlingContext
3636
from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
3737
from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
38+
from ._types import GotoOptions
3839
from ._utils import block_requests, infinite_scroll
3940

4041
TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -108,6 +109,7 @@ def __init__(
108109
user_data_dir: str | Path | None = None,
109110
browser_launch_options: Mapping[str, Any] | None = None,
110111
browser_new_context_options: Mapping[str, Any] | None = None,
112+
goto_options: GotoOptions | None = None,
111113
fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
112114
headless: bool | None = None,
113115
use_incognito_pages: bool | None = None,
@@ -142,6 +144,8 @@ def __init__(
142144
This option should not be used if `browser_pool` is provided.
143145
navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
144146
the request handler)
147+
goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
148+
not supported; use `navigation_timeout` instead.
145149
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
146150
"""
147151
configuration = kwargs.pop('configuration', None)
@@ -213,6 +217,7 @@ def __init__(
213217
kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
214218

215219
self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
220+
self._goto_options = goto_options or GotoOptions()
216221

217222
super().__init__(**kwargs)
218223

@@ -238,6 +243,7 @@ async def _open_page(
238243
log=context.log,
239244
page=crawlee_page.page,
240245
block_requests=partial(block_requests, page=crawlee_page.page),
246+
goto_options=GotoOptions(**self._goto_options),
241247
)
242248

243249
context_id = id(pre_navigation_context)
@@ -321,7 +327,7 @@ async def _navigate(
321327
try:
322328
async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
323329
response = await context.page.goto(
324-
context.request.url, timeout=remaining_timeout.total_seconds() * 1000
330+
context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
325331
)
326332
except playwright.async_api.TimeoutError as exc:
327333
raise asyncio.TimeoutError from exc
@@ -351,6 +357,7 @@ async def _navigate(
351357
extract_links=extract_links,
352358
enqueue_links=self._create_enqueue_links_function(context, extract_links),
353359
block_requests=partial(block_requests, page=context.page),
360+
goto_options=context.goto_options,
354361
)
355362

356363
if context.session:

src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
if TYPE_CHECKING:
1010
from playwright.async_api import Page
1111

12-
from ._types import BlockRequestsFunction
12+
from ._types import BlockRequestsFunction, GotoOptions
1313

1414

1515
@dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
2626
block_requests: BlockRequestsFunction
2727
"""Blocks network requests matching specified URL patterns."""
2828

29+
goto_options: GotoOptions
30+
"""Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
31+
2932
async def get_snapshot(self) -> PageSnapshot:
3033
"""Get snapshot of crawled page."""
3134
html = None

src/crawlee/crawlers/_playwright/_types.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
from dataclasses import dataclass
4-
from typing import TYPE_CHECKING, Protocol
4+
from typing import TYPE_CHECKING, Literal, Protocol, TypedDict
55

66
from crawlee import HttpHeaders
77
from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@
1010
from collections.abc import AsyncGenerator
1111

1212
from playwright.async_api import APIResponse, Response
13-
from typing_extensions import Self
13+
from typing_extensions import NotRequired, Self
1414

1515

1616
@docs_group('Functions')
@@ -58,3 +58,13 @@ async def from_playwright_response(cls, response: Response | APIResponse, protoc
5858
_content = await response.body()
5959

6060
return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
61+
62+
63+
class GotoOptions(TypedDict):
64+
"""Keyword arguments for Playwright's `Page.goto()` method."""
65+
66+
wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
67+
"""When to consider the operation succeeded; defaults to the 'load' event."""
68+
69+
referer: NotRequired[str]
70+
"""Referer header value."""

0 commit comments

Comments
 (0)