5
5
from typing import TYPE_CHECKING , Any , Callable
6
6
7
7
from pydantic import ValidationError
8
+ from yarl import URL
8
9
9
10
from crawlee import EnqueueStrategy , RequestTransformAction
10
11
from crawlee ._request import Request , RequestOptions
22
23
if TYPE_CHECKING :
23
24
from collections .abc import AsyncGenerator , Awaitable , Mapping
24
25
26
+ from playwright .async_api import Page
25
27
from typing_extensions import Unpack
26
28
27
29
from crawlee ._types import BasicCrawlingContext , EnqueueLinksKwargs
@@ -76,6 +78,7 @@ def __init__(
76
78
browser_launch_options : Mapping [str , Any ] | None = None ,
77
79
browser_new_context_options : Mapping [str , Any ] | None = None ,
78
80
headless : bool | None = None ,
81
+ use_incognito_pages : bool | None = None ,
79
82
** kwargs : Unpack [BasicCrawlerOptions [PlaywrightCrawlingContext ]],
80
83
) -> None :
81
84
"""A default constructor.
@@ -94,17 +97,27 @@ def __init__(
94
97
This option should not be used if `browser_pool` is provided.
95
98
headless: Whether to run the browser in headless mode.
96
99
This option should not be used if `browser_pool` is provided.
100
+ use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
101
+ own context that is destroyed once the page is closed or crashes.
102
+ This option should not be used if `browser_pool` is provided.
97
103
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
98
104
"""
99
105
if browser_pool :
100
106
# Raise an exception if browser_pool is provided together with other browser-related arguments.
101
107
if any (
102
108
param is not None
103
- for param in (headless , browser_type , browser_launch_options , browser_new_context_options )
109
+ for param in (
110
+ use_incognito_pages ,
111
+ headless ,
112
+ browser_type ,
113
+ browser_launch_options ,
114
+ browser_new_context_options ,
115
+ )
104
116
):
105
117
raise ValueError (
106
- 'You cannot provide `headless`, `browser_type`, `browser_launch_options` or '
107
- '`browser_new_context_options` arguments when `browser_pool` is provided.'
118
+ 'You cannot provide `headless`, `browser_type`, `browser_launch_options`'
119
+ '`browser_new_context_options` or `use_incognito_pages` arguments when '
120
+ '`browser_pool` is provided.'
108
121
)
109
122
110
123
# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
@@ -114,6 +127,7 @@ def __init__(
114
127
browser_type = browser_type ,
115
128
browser_launch_options = browser_launch_options ,
116
129
browser_new_context_options = browser_new_context_options ,
130
+ use_incognito_pages = use_incognito_pages ,
117
131
)
118
132
119
133
self ._browser_pool = browser_pool
@@ -175,6 +189,9 @@ async def _navigate(
175
189
infinite_scroll and block_requests).
176
190
"""
177
191
async with context .page :
192
+ if context .session :
193
+ await self ._set_cookies (context .page , context .request .url , context .session .cookies )
194
+
178
195
if context .request .headers :
179
196
await context .page .set_extra_http_headers (context .request .headers .model_dump ())
180
197
# Navigate to the URL and get response.
@@ -186,6 +203,10 @@ async def _navigate(
186
203
# Set the loaded URL to the actual URL after redirection.
187
204
context .request .loaded_url = context .page .url
188
205
206
+ if context .session :
207
+ cookies = await self ._get_cookies (context .page )
208
+ context .session .cookies .update (cookies )
209
+
189
210
async def enqueue_links (
190
211
* ,
191
212
selector : str = 'a' ,
@@ -295,3 +316,15 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
295
316
hook: A coroutine function to be called before each navigation.
296
317
"""
297
318
self ._pre_navigation_hooks .append (hook )
319
+
320
+ async def _get_cookies (self , page : Page ) -> dict [str , str ]:
321
+ """Get the cookies from the page."""
322
+ cookies = await page .context .cookies ()
323
+ return {cookie ['name' ]: cookie ['value' ] for cookie in cookies if cookie .get ('name' ) and cookie .get ('value' )}
324
+
325
+ async def _set_cookies (self , page : Page , url : str , cookies : dict [str , str ]) -> None :
326
+ """Set the cookies to the page."""
327
+ parsed_url = URL (url )
328
+ await page .context .add_cookies (
329
+ [{'name' : name , 'value' : value , 'domain' : parsed_url .host , 'path' : '/' } for name , value in cookies .items ()]
330
+ )
0 commit comments