Skip to content

Commit 00ffb7e

Browse files
Mantisus and vdusek authored
feat: Add pre/post launch hooks to BrowserPool (#1879)
### Description

Add two browser launch hooks to `BrowserPool`, registered as decorators:

- `pre_launch_hook` - called before a new browser is launched.
- `post_launch_hook` - called after a new browser is launched; receives the newly created `BrowserController`. If the hook raises, the browser is closed before the error is propagated.

### Issues

- Closes: #1741

### Testing

- Added new tests for `BrowserPool`.

---------

Co-authored-by: Vlada Dusek <v.dusek96@gmail.com>
1 parent b723b58 commit 00ffb7e

4 files changed

Lines changed: 187 additions & 7 deletions

File tree

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
import logging
5+
from typing import TYPE_CHECKING
6+
7+
from crawlee.browsers import BrowserPool
8+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
9+
10+
if TYPE_CHECKING:
11+
from crawlee.browsers._browser_controller import BrowserController
12+
from crawlee.browsers._browser_plugin import BrowserPlugin
13+
14+
logger = logging.getLogger(__name__)
15+
16+
17+
async def main() -> None:
    """Crawl a handful of pages using a `BrowserPool` that logs each browser launch."""
    start_urls = ['https://crawlee.dev']

    async with BrowserPool() as browser_pool:

        # Runs once per browser, right before the plugin launches it.
        @browser_pool.pre_launch_hook
        async def announce_launch(page_id: str, plugin: BrowserPlugin) -> None:
            """Log before a new browser instance is launched."""
            logger.info(f'Launching {plugin.browser_type} browser for page {page_id}...')

        # Runs once per browser, right after it has been launched.
        @browser_pool.post_launch_hook
        async def announce_launched(
            page_id: str, controller: BrowserController
        ) -> None:
            """Log after a new browser instance has been launched."""
            logger.info(f'Browser launched for page {page_id}, controller: {controller}')

        # Hand the pool with the registered hooks over to the crawler.
        crawler = PlaywrightCrawler(
            browser_pool=browser_pool,
            max_requests_per_crawl=5,
        )

        @crawler.router.default_handler
        async def handle_request(context: PlaywrightCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

            await context.enqueue_links()

        # Run the crawler with the initial list of URLs.
        await crawler.run(start_urls)
45+
46+
47+
if __name__ == '__main__':
48+
asyncio.run(main())

docs/guides/playwright_crawler.mdx

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
1111
import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py';
1212
import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py';
1313
import NavigationHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/navigation_hooks_example.py';
14+
import BrowserPoolLaunchHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_launch_hooks_example.py';
1415
import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py';
1516
import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py';
1617

@@ -57,9 +58,21 @@ You can also configure each plugin used by <ApiLink to="class/BrowserPool">`Brow
5758

5859
For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.
5960

60-
## Page configuration with lifecycle page hooks
61+
## Browser pool lifecycle hooks
6162

62-
For additional setup or event-driven actions around page creation and closure, the <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes four lifecycle hooks: <ApiLink to="class/BrowserPool#pre_page_create_hook">`pre_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#post_page_create_hook">`post_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#pre_page_close_hook">`pre_page_close_hook`</ApiLink>, and <ApiLink to="class/BrowserPool#post_page_close_hook">`post_page_close_hook`</ApiLink>. To use them, create a `BrowserPool` instance and pass it to <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.
63+
The <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes lifecycle hooks for both browser launches and page creation/closure. To use them, create a `BrowserPool` instance and pass it to <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.
64+
65+
### Browser launch hooks
66+
67+
The <ApiLink to="class/BrowserPool#pre_launch_hook">`pre_launch_hook`</ApiLink> and <ApiLink to="class/BrowserPool#post_launch_hook">`post_launch_hook`</ApiLink> are called once per browser instance, before and after it is launched. Use them for logging, metrics, or any setup at the browser level. Note that these hooks are not called when a new page is created in an already-running browser.
68+
69+
<RunnableCodeBlock className="language-python" language="python">
70+
{BrowserPoolLaunchHooksExample}
71+
</RunnableCodeBlock>
72+
73+
### Page lifecycle hooks
74+
75+
For additional setup or event-driven actions around page creation and closure, the <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes four hooks: <ApiLink to="class/BrowserPool#pre_page_create_hook">`pre_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#post_page_create_hook">`post_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#pre_page_close_hook">`pre_page_close_hook`</ApiLink>, and <ApiLink to="class/BrowserPool#post_page_close_hook">`post_page_close_hook`</ApiLink>.
6376

6477
<RunnableCodeBlock className="language-python" language="python">
6578
{BrowserPoolPageHooksExample}
@@ -75,4 +88,4 @@ Navigation hooks allow for additional configuration at specific points during pa
7588

7689
## Conclusion
7790

78-
This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
91+
This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> lifecycle hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!

src/crawlee/browsers/_browser_pool.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ def __init__(
9999
self._pages = WeakValueDictionary[str, CrawleePage]() # Track the pages in the pool
100100
self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins
101101

102+
# Hooks for custom behavior at different stages of the browser and page lifecycles.
103+
self._pre_launch_hooks: list[Callable[[str, BrowserPlugin], Awaitable[None]]] = []
104+
self._post_launch_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = []
102105
self._pre_page_create_hooks: list[
103106
Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
104107
] = []
@@ -307,7 +310,7 @@ async def _get_new_page(
307310

308311
try:
309312
if not browser_controller:
310-
browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
313+
browser_controller = await asyncio.wait_for(self._launch_new_browser(page_id, plugin), timeout)
311314
browser_new_context_options = dict(plugin.browser_new_context_options)
312315

313316
await self._execute_hooks(
@@ -356,9 +359,22 @@ def _retire_browser(self, browser: BrowserController) -> None:
356359
self._active_browsers.remove(browser)
357360
self._inactive_browsers.append(browser)
358361

359-
async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController:
362+
async def _launch_new_browser(self, page_id: str, plugin: BrowserPlugin) -> BrowserController:
    """Launch a browser through `plugin`, running the launch hooks around it.

    Pre-launch hooks are awaited with `page_id` and `plugin` before the browser is created.
    Post-launch hooks are awaited with `page_id` and the new controller afterwards. The controller
    is added to the active browsers only when every post-launch hook succeeded; otherwise the
    browser is force-closed and the original error is re-raised.
    """
    await self._execute_hooks(self._pre_launch_hooks, page_id, plugin)
    launched = await plugin.new_browser()

    try:
        await self._execute_hooks(self._post_launch_hooks, page_id, launched)
    except BaseException:
        # BaseException (not just Exception) so the browser is also cleaned up when the
        # caller's `asyncio.wait_for(...)` cancels this coroutine with CancelledError.
        try:
            await launched.close(force=True)
        except Exception:
            # A failing close must not hide the error that brought us here.
            logger.exception('Failed to close browser after post_launch_hook error.')
        raise
    else:
        self._active_browsers.append(launched)

    return launched
364380

@@ -395,6 +411,27 @@ async def close_with_hooks(*args: Any, **kwargs: Any) -> None:
395411

396412
crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks
397413

414+
def pre_launch_hook(
    self, hook: Callable[[str, BrowserPlugin], Awaitable[None]]
) -> Callable[[str, BrowserPlugin], Awaitable[None]]:
    """Register a hook that runs immediately before a new browser is launched.

    The hook is called with the ID of the page whose creation triggered the launch and with the
    `BrowserPlugin` that is about to launch the browser. It is suited to logging, metrics, or
    other side effects tied to a browser launch.

    The hook is returned unchanged, so this method can be used as a decorator.
    """
    registered_hooks = self._pre_launch_hooks
    registered_hooks.append(hook)
    return hook
424+
425+
def post_launch_hook(
    self, hook: Callable[[str, BrowserController], Awaitable[None]]
) -> Callable[[str, BrowserController], Awaitable[None]]:
    """Register a hook that runs immediately after a new browser is launched.

    The hook is called with the ID of the page whose creation triggered the launch and with the
    freshly created `BrowserController`. If a post-launch hook raises, the pool closes that
    browser and re-raises the error.

    The hook is returned unchanged, so this method can be used as a decorator.
    """
    registered_hooks = self._post_launch_hooks
    registered_hooks.append(hook)
    return hook
434+
398435
def pre_page_create_hook(
399436
self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
400437
) -> Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]:

tests/unit/browsers/test_browser_pool.py

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
from yarl import URL
1919

20+
from crawlee.browsers._browser_plugin import BrowserPlugin
2021
from crawlee.proxy_configuration import ProxyInfo
2122

2223

@@ -309,11 +310,19 @@ async def hook(page_id: str, controller: BrowserController) -> None:
309310
assert isinstance(controller, BrowserController)
310311

311312

312-
async def test_page_hooks_execution_order() -> None:
313+
async def test_hooks_execution_order() -> None:
313314
call_order: list[str] = []
314315

315316
async with BrowserPool() as browser_pool:
316317

318+
@browser_pool.pre_launch_hook
319+
async def pre_launch(_page_id: str, _plugin: BrowserPlugin) -> None:
320+
call_order.append('pre_launch')
321+
322+
@browser_pool.post_launch_hook
323+
async def post_launch(_page_id: str, _controller: BrowserController) -> None:
324+
call_order.append('post_launch')
325+
317326
@browser_pool.pre_page_create_hook
318327
async def pre_create(
319328
_page_id: str,
@@ -338,7 +347,7 @@ async def post_close(_page_id: str, _controller: BrowserController) -> None:
338347
page = await browser_pool.new_page()
339348
await page.page.close()
340349

341-
assert call_order == ['pre_create', 'post_create', 'pre_close', 'post_close']
350+
assert call_order == ['pre_launch', 'post_launch', 'pre_create', 'post_create', 'pre_close', 'post_close']
342351

343352

344353
async def test_multiple_hooks_all_called() -> None:
@@ -358,3 +367,76 @@ async def second(_crawlee_page: CrawleePage, _controller: BrowserController) ->
358367
await page.page.close()
359368

360369
assert call_order == ['first', 'second']
370+
371+
372+
async def test_pre_launch_hook_is_called() -> None:
    """The pre-launch hook is awaited once, with the triggering page ID and the plugin."""
    recorder = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.pre_launch_hook
        async def hook(page_id: str, plugin: BrowserPlugin) -> None:
            await recorder(page_id, plugin)

        # Opening the first page forces the pool to launch a browser.
        opened_page = await browser_pool.new_page()
        await opened_page.page.close()

        recorder.assert_awaited_once()
        received_page_id, received_plugin = recorder.call_args.args

        assert isinstance(received_page_id, str)
        assert opened_page.id == received_page_id
        assert isinstance(received_plugin, PlaywrightBrowserPlugin)
390+
391+
392+
async def test_post_launch_hook_is_called() -> None:
    """The post-launch hook is awaited once, with the triggering page ID and the controller."""
    recorder = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.post_launch_hook
        async def hook(page_id: str, controller: BrowserController) -> None:
            await recorder(page_id, controller)

        # Opening the first page forces the pool to launch a browser.
        opened_page = await browser_pool.new_page()
        await opened_page.page.close()

        recorder.assert_awaited_once()
        received_page_id, received_controller = recorder.call_args.args

        assert isinstance(received_page_id, str)
        assert opened_page.id == received_page_id
        assert isinstance(received_controller, BrowserController)
410+
411+
412+
async def test_post_launch_hook_error_closes_browser() -> None:
    """A failing post-launch hook propagates its error and leaves no browser behind.

    Besides checking that the pool does not track the browser, the test keeps the controller
    handed to the hook and verifies that it was really closed. Empty pool lists alone would
    also be observed if the browser had simply been leaked.
    """
    launched_controllers: list[BrowserController] = []

    async with BrowserPool() as browser_pool:

        @browser_pool.post_launch_hook
        async def hook(_page_id: str, controller: BrowserController) -> None:
            # Remember the controller so its state can be inspected after the failure.
            launched_controllers.append(controller)
            raise ValueError('Hook failed')

        with pytest.raises(ValueError, match='Hook failed'):
            await browser_pool.new_page()

        # The failed browser must not be tracked by the pool in any state.
        assert len(browser_pool.active_browsers) == 0
        assert len(browser_pool.inactive_browsers) == 0

        # Exactly one browser was launched and it has been shut down.
        assert len(launched_controllers) == 1
        assert not launched_controllers[0].is_browser_connected
424+
425+
426+
async def test_launch_hooks_not_called_for_existing_browser() -> None:
    """Both launch hooks fire once per browser, not once per page opened in it.

    Two pages are opened back to back. The second one reuses the already running browser,
    so neither the pre-launch nor the post-launch hook may run a second time.
    """
    launch_hook_calls = {'pre': 0, 'post': 0}

    async with BrowserPool() as browser_pool:

        @browser_pool.pre_launch_hook
        async def pre_hook(_page_id: str, _plugin: BrowserPlugin) -> None:
            launch_hook_calls['pre'] += 1

        @browser_pool.post_launch_hook
        async def post_hook(_page_id: str, _controller: BrowserController) -> None:
            launch_hook_calls['post'] += 1

        page_1 = await browser_pool.new_page()
        page_2 = await browser_pool.new_page()

        await page_1.page.close()
        await page_2.page.close()

        assert launch_hook_calls == {'pre': 1, 'post': 1}

0 commit comments

Comments
 (0)