Skip to content

Commit eae3a33

Browse files
authored
feat: add support use_incognito_pages for browser_launch_options in PlaywrightCrawler (#941)
### Description - Improve cookie handling for `PlaywrightCrawler`. Cookies are now stored in the `Session` and set in Playwright Context from the `Session`. - Add `use_incognito_pages` option for `browser_launch_options` allowing each new page to be launched in a separate context. ### Issues - #722 - #933
1 parent 3c89827 commit eae3a33

File tree

6 files changed

+151
-13
lines changed

6 files changed

+151
-13
lines changed

src/crawlee/browsers/_browser_pool.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def with_default_plugin(
103103
browser_launch_options: Mapping[str, Any] | None = None,
104104
browser_new_context_options: Mapping[str, Any] | None = None,
105105
headless: bool | None = None,
106+
use_incognito_pages: bool | None = False,
106107
**kwargs: Any,
107108
) -> BrowserPool:
108109
"""Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
@@ -116,6 +117,8 @@ def with_default_plugin(
116117
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
117118
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
118119
headless: Whether to run the browser in headless mode.
120+
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
121+
own context that is destroyed once the page is closed or crashes.
119122
kwargs: Additional arguments for default constructor.
120123
"""
121124
plugin_options: dict = defaultdict(dict)
@@ -125,6 +128,9 @@ def with_default_plugin(
125128
if headless is not None:
126129
plugin_options['browser_launch_options']['headless'] = headless
127130

131+
if use_incognito_pages is not None:
132+
plugin_options['use_incognito_pages'] = use_incognito_pages
133+
128134
if browser_type:
129135
plugin_options['browser_type'] = browser_type
130136

src/crawlee/browsers/_playwright_browser_controller.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,20 +41,24 @@ def __init__(
4141
browser: Browser,
4242
*,
4343
max_open_pages_per_browser: int = 20,
44+
use_incognito_pages: bool = False,
4445
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
4546
) -> None:
4647
"""A default constructor.
4748
4849
Args:
4950
browser: The browser instance to control.
5051
max_open_pages_per_browser: The maximum number of pages that can be open at the same time.
52+
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
53+
own context that is destroyed once the page is closed or crashes.
5154
header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
5255
requests made by the browser. By default, a predefined header generator is used. Set to `None` to
5356
disable automatic header modifications.
5457
"""
5558
self._browser = browser
5659
self._max_open_pages_per_browser = max_open_pages_per_browser
5760
self._header_generator = header_generator
61+
self._use_incognito_pages = use_incognito_pages
5862

5963
self._browser_context: BrowserContext | None = None
6064
self._pages = list[Page]()
@@ -115,13 +119,20 @@ async def new_page(
115119
Raises:
116120
ValueError: If the browser has reached the maximum number of open pages.
117121
"""
118-
if not self._browser_context:
119-
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)
120-
121122
if not self.has_free_capacity:
122123
raise ValueError('Cannot open more pages in this browser.')
123124

124-
page = await self._browser_context.new_page()
125+
if self._use_incognito_pages:
126+
# We use https://playwright.dev/python/docs/api/class-browser#browser-new-page to create a page in
127+
# a separate context.
128+
page_context_options = self._create_context_options(browser_new_context_options, proxy_info)
129+
page = await self._browser.new_page(**page_context_options)
130+
else:
131+
# We use https://playwright.dev/python/docs/api/class-browser#browser-new-context to create a context.
132+
# Pages are then created in this context.
133+
if not self._browser_context:
134+
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)
135+
page = await self._browser_context.new_page()
125136

126137
# Handle page close event
127138
page.on(event='close', f=self._on_page_close)
@@ -153,10 +164,10 @@ def _on_page_close(self, page: Page) -> None:
153164
"""Handle actions after a page is closed."""
154165
self._pages.remove(page)
155166

156-
async def _create_browser_context(
167+
def _create_context_options(
157168
self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
158-
) -> BrowserContext:
159-
"""Create a new browser context with the specified proxy settings."""
169+
) -> Mapping[str, Any]:
170+
"""Create context options for context and single pages with the specified proxy settings."""
160171
if self._header_generator:
161172
common_headers = self._header_generator.get_common_headers()
162173
sec_ch_ua_headers = self._header_generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
@@ -179,5 +190,11 @@ async def _create_browser_context(
179190
username=proxy_info.username,
180191
password=proxy_info.password,
181192
)
193+
return browser_new_context_options
182194

195+
async def _create_browser_context(
196+
self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
197+
) -> BrowserContext:
198+
"""Create a new browser context with the specified proxy settings."""
199+
browser_new_context_options = self._create_context_options(browser_new_context_options, proxy_info)
183200
return await self._browser.new_context(**browser_new_context_options)

src/crawlee/browsers/_playwright_browser_plugin.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def __init__(
4343
browser_launch_options: dict[str, Any] | None = None,
4444
browser_new_context_options: dict[str, Any] | None = None,
4545
max_open_pages_per_browser: int = 20,
46+
use_incognito_pages: bool = False,
4647
) -> None:
4748
"""A default constructor.
4849
@@ -56,6 +57,8 @@ def __init__(
5657
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
5758
max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
5859
Once reached, a new browser instance will be launched to handle the excess.
60+
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
61+
own context that is destroyed once the page is closed or crashes.
5962
"""
6063
config = service_locator.get_configuration()
6164

@@ -70,6 +73,7 @@ def __init__(
7073
self._browser_launch_options = default_launch_browser_options | (browser_launch_options or {})
7174
self._browser_new_context_options = browser_new_context_options or {}
7275
self._max_open_pages_per_browser = max_open_pages_per_browser
76+
self._use_incognito_pages = use_incognito_pages
7377

7478
self._playwright_context_manager = async_playwright()
7579
self._playwright: Playwright | None = None
@@ -154,5 +158,6 @@ async def new_browser(self) -> PlaywrightBrowserController:
154158

155159
return PlaywrightBrowserController(
156160
browser,
161+
use_incognito_pages=self._use_incognito_pages,
157162
max_open_pages_per_browser=self._max_open_pages_per_browser,
158163
)

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import TYPE_CHECKING, Any, Callable
66

77
from pydantic import ValidationError
8+
from yarl import URL
89

910
from crawlee import EnqueueStrategy, RequestTransformAction
1011
from crawlee._request import Request, RequestOptions
@@ -22,6 +23,7 @@
2223
if TYPE_CHECKING:
2324
from collections.abc import AsyncGenerator, Awaitable, Mapping
2425

26+
from playwright.async_api import Page
2527
from typing_extensions import Unpack
2628

2729
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
@@ -76,6 +78,7 @@ def __init__(
7678
browser_launch_options: Mapping[str, Any] | None = None,
7779
browser_new_context_options: Mapping[str, Any] | None = None,
7880
headless: bool | None = None,
81+
use_incognito_pages: bool | None = None,
7982
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
8083
) -> None:
8184
"""A default constructor.
@@ -94,17 +97,27 @@ def __init__(
9497
This option should not be used if `browser_pool` is provided.
9598
headless: Whether to run the browser in headless mode.
9699
This option should not be used if `browser_pool` is provided.
100+
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
101+
own context that is destroyed once the page is closed or crashes.
102+
This option should not be used if `browser_pool` is provided.
97103
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
98104
"""
99105
if browser_pool:
100106
# Raise an exception if browser_pool is provided together with other browser-related arguments.
101107
if any(
102108
param is not None
103-
for param in (headless, browser_type, browser_launch_options, browser_new_context_options)
109+
for param in (
110+
use_incognito_pages,
111+
headless,
112+
browser_type,
113+
browser_launch_options,
114+
browser_new_context_options,
115+
)
104116
):
105117
raise ValueError(
106-
'You cannot provide `headless`, `browser_type`, `browser_launch_options` or '
107-
'`browser_new_context_options` arguments when `browser_pool` is provided.'
118+
'You cannot provide `headless`, `browser_type`, `browser_launch_options`'
119+
'`browser_new_context_options` or `use_incognito_pages` arguments when '
120+
'`browser_pool` is provided.'
108121
)
109122

110123
# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
@@ -114,6 +127,7 @@ def __init__(
114127
browser_type=browser_type,
115128
browser_launch_options=browser_launch_options,
116129
browser_new_context_options=browser_new_context_options,
130+
use_incognito_pages=use_incognito_pages,
117131
)
118132

119133
self._browser_pool = browser_pool
@@ -175,6 +189,9 @@ async def _navigate(
175189
infinite_scroll and block_requests).
176190
"""
177191
async with context.page:
192+
if context.session:
193+
await self._set_cookies(context.page, context.request.url, context.session.cookies)
194+
178195
if context.request.headers:
179196
await context.page.set_extra_http_headers(context.request.headers.model_dump())
180197
# Navigate to the URL and get response.
@@ -186,6 +203,10 @@ async def _navigate(
186203
# Set the loaded URL to the actual URL after redirection.
187204
context.request.loaded_url = context.page.url
188205

206+
if context.session:
207+
cookies = await self._get_cookies(context.page)
208+
context.session.cookies.update(cookies)
209+
189210
async def enqueue_links(
190211
*,
191212
selector: str = 'a',
@@ -295,3 +316,15 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
295316
hook: A coroutine function to be called before each navigation.
296317
"""
297318
self._pre_navigation_hooks.append(hook)
319+
320+
async def _get_cookies(self, page: Page) -> dict[str, str]:
321+
"""Get the cookies from the page."""
322+
cookies = await page.context.cookies()
323+
return {cookie['name']: cookie['value'] for cookie in cookies if cookie.get('name') and cookie.get('value')}
324+
325+
async def _set_cookies(self, page: Page, url: str, cookies: dict[str, str]) -> None:
326+
"""Set the cookies to the page."""
327+
parsed_url = URL(url)
328+
await page.context.add_cookies(
329+
[{'name': name, 'value': value, 'domain': parsed_url.host, 'path': '/'} for name, value in cookies.items()]
330+
)

src/crawlee/sessions/_session.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def __init__(
3838
usage_count: int = 0,
3939
max_usage_count: int = 50,
4040
error_score: float = 0.0,
41-
cookies: dict | None = None,
41+
cookies: dict[str, str] | None = None,
4242
blocked_status_codes: list | None = None,
4343
) -> None:
4444
"""A default constructor.
@@ -94,7 +94,7 @@ def user_data(self) -> dict:
9494
return self._user_data
9595

9696
@property
97-
def cookies(self) -> dict:
97+
def cookies(self) -> dict[str, str]:
9898
"""Get the cookies."""
9999
return self._cookies
100100

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
from typing import TYPE_CHECKING
99
from unittest import mock
1010

11-
from crawlee import Glob, HttpHeaders, Request, RequestTransformAction
11+
import pytest
12+
13+
from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction
1214
from crawlee._types import EnqueueStrategy
1315
from crawlee.crawlers import PlaywrightCrawler
1416
from crawlee.fingerprint_suite._consts import (
@@ -19,6 +21,7 @@
1921
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT,
2022
)
2123
from crawlee.proxy_configuration import ProxyConfiguration
24+
from crawlee.sessions import SessionPool
2225

2326
if TYPE_CHECKING:
2427
from yarl import URL
@@ -247,3 +250,77 @@ async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
247250
await crawler.run(['https://test.com'])
248251

249252
assert handler_data.get('proxy') == proxy_value
253+
254+
255+
@pytest.mark.parametrize(
256+
'use_incognito_pages',
257+
[
258+
pytest.param(False, id='without use_incognito_pages'),
259+
pytest.param(True, id='with use_incognito_pages'),
260+
],
261+
)
262+
async def test_isolation_cookies(*, use_incognito_pages: bool, httpbin: URL) -> None:
263+
sessions_ids: list[str] = []
264+
sessions_cookies: dict[str, dict[str, str]] = {}
265+
response_cookies: dict[str, dict[str, str]] = {}
266+
267+
crawler = PlaywrightCrawler(
268+
session_pool=SessionPool(max_pool_size=1),
269+
use_incognito_pages=use_incognito_pages,
270+
concurrency_settings=ConcurrencySettings(max_concurrency=1),
271+
)
272+
273+
@crawler.router.default_handler
274+
async def handler(context: PlaywrightCrawlingContext) -> None:
275+
if not context.session:
276+
return
277+
278+
sessions_ids.append(context.session.id)
279+
280+
if context.request.unique_key not in {'1', '2'}:
281+
return
282+
283+
sessions_cookies[context.session.id] = context.session.cookies
284+
response_data = json.loads(await context.response.text())
285+
response_cookies[context.session.id] = response_data.get('cookies')
286+
287+
if context.request.user_data.get('retire_session'):
288+
context.session.retire()
289+
290+
await crawler.run(
291+
[
292+
# The first request sets the cookie in the session
293+
str(httpbin.with_path('/cookies/set').extend_query(a=1)),
294+
# With the second request, we check the cookies in the session and set retire
295+
Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
296+
Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
297+
]
298+
)
299+
300+
assert len(sessions_cookies) == 2
301+
assert len(response_cookies) == 2
302+
303+
assert sessions_ids[0] == sessions_ids[1]
304+
305+
cookie_session_id = sessions_ids[0]
306+
clean_session_id = sessions_ids[2]
307+
308+
assert cookie_session_id != clean_session_id
309+
310+
# When using `use_incognito_pages` there should be full cookie isolation
311+
if use_incognito_pages:
312+
# The initiated cookies must match in both the response and the session store
313+
assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}
314+
315+
# For a clean session, the cookie should not be in the session store or in the response
316+
# This way we can be sure that no cookies are being leaked through the http client
317+
assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
318+
# Without `use_incognito_pages` we will have access to the session cookie,
319+
# but there will be a cookie leak via PlaywrightContext
320+
else:
321+
# The initiated cookies must match in both the response and the session store
322+
assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}
323+
324+
# PlaywrightContext makes cookies shared by all sessions that work with it.
325+
# So in this case a clean session contains the same cookies
326+
assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {'a': '1'}

0 commit comments

Comments
 (0)