Skip to content

Commit bb181e5

Browse files
authored
fix: Fix memory leak in PlaywrightCrawler on browser context creation (#1446)
### Description - Add `Lock` to `PlaywrightBrowserController._browser_context` creation to ensure no orphaned duplicate contexts are created. ### Issues - Closes: #1443 ### Testing - Unit test added. - Benchmark actor with PlaywrightCrawler used to confirm normal memory consumption with this fix.
1 parent 29d2585 commit bb181e5

File tree

2 files changed

+48
-16
lines changed

2 files changed

+48
-16
lines changed

src/crawlee/browsers/_playwright_browser_controller.py

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
from asyncio import Lock
56
from datetime import datetime, timedelta, timezone
67
from typing import TYPE_CHECKING, Any, cast
78

@@ -77,6 +78,19 @@ def __init__(
7778

7879
self._total_opened_pages = 0
7980

81+
self._context_creation_lock: Lock | None = None
82+
83+
async def _get_context_creation_lock(self) -> Lock:
84+
"""Get context checking and creation lock.
85+
86+
It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
87+
memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
88+
"""
89+
if self._context_creation_lock:
90+
return self._context_creation_lock
91+
self._context_creation_lock = Lock()
92+
return self._context_creation_lock
93+
8094
@property
8195
@override
8296
def pages(self) -> list[Page]:
@@ -137,12 +151,6 @@ async def new_page(
137151
Raises:
138152
ValueError: If the browser has reached the maximum number of open pages.
139153
"""
140-
if not self._browser_context:
141-
self._browser_context = await self._create_browser_context(
142-
browser_new_context_options=browser_new_context_options,
143-
proxy_info=proxy_info,
144-
)
145-
146154
if not self.has_free_capacity:
147155
raise ValueError('Cannot open more pages in this browser.')
148156

@@ -154,11 +162,12 @@ async def new_page(
154162
)
155163
page = await new_context.new_page()
156164
else:
157-
if not self._browser_context:
158-
self._browser_context = await self._create_browser_context(
159-
browser_new_context_options=browser_new_context_options,
160-
proxy_info=proxy_info,
161-
)
165+
async with await self._get_context_creation_lock():
166+
if not self._browser_context:
167+
self._browser_context = await self._create_browser_context(
168+
browser_new_context_options=browser_new_context_options,
169+
proxy_info=proxy_info,
170+
)
162171
page = await self._browser_context.new_page()
163172

164173
# Handle page close event
@@ -169,7 +178,6 @@ async def new_page(
169178
self._last_page_opened_at = datetime.now(timezone.utc)
170179

171180
self._total_opened_pages += 1
172-
173181
return page
174182

175183
@override
@@ -206,7 +214,6 @@ async def _create_browser_context(
206214
`self._fingerprint_generator` is available.
207215
"""
208216
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
209-
210217
if proxy_info:
211218
if browser_new_context_options.get('proxy'):
212219
logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
@@ -244,5 +251,4 @@ async def _create_browser_context(
244251
browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
245252
'extra_http_headers', extra_http_headers
246253
)
247-
248254
return await self._browser.new_context(**browser_new_context_options)

tests/unit/browsers/test_playwright_browser_controller.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22

33
import asyncio
44
from datetime import datetime, timedelta, timezone
5-
from typing import TYPE_CHECKING
5+
from typing import TYPE_CHECKING, Any
6+
from unittest.mock import AsyncMock
67

78
import pytest
89
from playwright.async_api import Browser, Playwright, async_playwright
910

10-
from crawlee.browsers import PlaywrightBrowserController
11+
from crawlee.browsers import PlaywrightBrowserController, PlaywrightPersistentBrowser
1112

1213
if TYPE_CHECKING:
1314
from collections.abc import AsyncGenerator
@@ -106,3 +107,28 @@ async def test_close_browser_with_open_pages(browser: Browser) -> None:
106107

107108
assert controller.pages_count == 0
108109
assert not controller.is_browser_connected
110+
111+
112+
async def test_memory_leak_on_concurrent_context_creation() -> None:
113+
"""Test that only one browser context is created when multiple pages are opened concurrently."""
114+
115+
# Prepare mocked browser with relevant methods and attributes
116+
mocked_browser = AsyncMock()
117+
mocked_context_launcher = AsyncMock()
118+
119+
async def delayed_launch_persistent_context(*args: Any, **kwargs: Any) -> Any:
120+
"""Ensure that both calls to create context overlap in time."""
121+
await asyncio.sleep(5) # Simulate delay in creation to make sure race condition happens
122+
return await mocked_context_launcher(*args, **kwargs)
123+
124+
mocked_browser.launch_persistent_context = delayed_launch_persistent_context
125+
126+
# Create minimal instance of PlaywrightBrowserController with mocked browser
127+
controller = PlaywrightBrowserController(
128+
PlaywrightPersistentBrowser(mocked_browser, None, {}), header_generator=None, fingerprint_generator=None
129+
)
130+
131+
# Both calls will try to create browser context at the same time, but only one context should be created.
132+
await asyncio.gather(controller.new_page(), controller.new_page())
133+
134+
assert mocked_context_launcher.call_count == 1

0 commit comments

Comments
 (0)