Skip to content

Commit 439d81e

Browse files
authored
docs: Add guide about integrating Stagehand (#1290)
### Description - Add guide about integrating `stagehand-python` v.0.4.0 ### Issues - Closes: #1278
1 parent df3fb5d commit 439d81e

File tree

6 files changed

+291
-0
lines changed

6 files changed

+291
-0
lines changed

docs/guides/code_examples/playwright_crawler_stagehand/__init__.py

Whitespace-only changes.
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
from __future__ import annotations
2+
3+
from datetime import datetime, timezone
4+
from typing import TYPE_CHECKING, Any, cast
5+
6+
from stagehand.context import StagehandContext
7+
from typing_extensions import override
8+
9+
from crawlee.browsers import (
10+
PlaywrightBrowserController,
11+
PlaywrightBrowserPlugin,
12+
PlaywrightPersistentBrowser,
13+
)
14+
15+
from .support_classes import CrawleeStagehandPage
16+
17+
if TYPE_CHECKING:
18+
from collections.abc import Mapping
19+
20+
from playwright.async_api import Page
21+
from stagehand import Stagehand
22+
23+
from crawlee.proxy_configuration import ProxyInfo
24+
25+
26+
class StagehandBrowserController(PlaywrightBrowserController):
    """Browser controller that opens its pages through a `StagehandContext`."""

    @override
    def __init__(
        self, browser: PlaywrightPersistentBrowser, stagehand: Stagehand, **kwargs: Any
    ) -> None:
        """Set up the base controller and remember the Stagehand instance.

        Args:
            browser: Persistent Playwright browser handed to the base controller.
            stagehand: Stagehand instance used later to build the `StagehandContext`.
            **kwargs: Extra options forwarded unchanged to the base controller.
        """
        super().__init__(browser, **kwargs)

        # The Stagehand context is created on the first `new_page` call.
        self._stagehand_context: StagehandContext | None = None
        self._stagehand = stagehand

    @override
    async def new_page(
        self,
        browser_new_context_options: Mapping[str, Any] | None = None,
        proxy_info: ProxyInfo | None = None,
    ) -> Page:
        """Open a new page via Stagehand and register it with this controller.

        Both arguments are only used when the browser context does not exist yet;
        once it has been created, later calls reuse it.

        Args:
            browser_new_context_options: Options for creating the browser context.
            proxy_info: Proxy settings for creating the browser context.

        Returns:
            A `CrawleeStagehandPage` wrapping the new Stagehand page, typed as `Page`.
        """
        # Lazily create the browser context on first use.
        if not self._browser_context:
            self._browser_context = await self._create_browser_context(
                browser_new_context_options=browser_new_context_options,
                proxy_info=proxy_info,
            )

        # Lazily bind Stagehand to that browser context.
        if not self._stagehand_context:
            self._stagehand_context = await StagehandContext.init(
                self._browser_context, self._stagehand
            )

        stagehand_page = await self._stagehand_context.new_page()

        # The controller tracks the page object held inside the Stagehand page.
        raw_page = stagehand_page._page  # noqa: SLF001
        raw_page.on(event='close', f=self._on_page_close)

        # Bookkeeping for the newly opened page.
        self._pages.append(raw_page)
        self._total_opened_pages += 1
        self._last_page_opened_at = datetime.now(timezone.utc)

        # Hand back the wrapper so callers get the interface they expect.
        return cast('Page', CrawleeStagehandPage(stagehand_page))
72+
73+
74+
class StagehandPlugin(PlaywrightBrowserPlugin):
    """Playwright browser plugin whose controllers create pages via Stagehand."""

    @override
    def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None:
        """Initialize the base plugin and keep the Stagehand instance.

        Args:
            stagehand: Stagehand instance handed to every controller created here.
            **kwargs: Options forwarded unchanged to `PlaywrightBrowserPlugin`.
        """
        super().__init__(**kwargs)

        self._stagehand = stagehand

    @override
    async def new_browser(self) -> StagehandBrowserController:
        """Create a persistent Chromium browser wrapped in a Stagehand controller.

        Raises:
            RuntimeError: If the plugin's Playwright instance is not set.
        """
        if not self._playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        # Stagehand can run only on a Chromium-based browser.
        chromium = self._playwright.chromium
        persistent_browser = PlaywrightPersistentBrowser(
            chromium,
            self._user_data_dir,
            self._browser_launch_options,
        )

        # Header and fingerprint generators are explicitly disabled here.
        return StagehandBrowserController(
            browser=persistent_browser,
            stagehand=self._stagehand,
            header_generator=None,
            fingerprint_generator=None,
        )
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
import os
5+
from typing import cast
6+
7+
from stagehand import StagehandConfig, StagehandPage
8+
9+
from crawlee import ConcurrencySettings
10+
from crawlee.browsers import BrowserPool
11+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
12+
13+
from .browser_classes import StagehandPlugin
14+
from .support_classes import CrawleeStagehand
15+
16+
17+
async def main() -> None:
    """Run a `PlaywrightCrawler` whose browser pages are created by Stagehand."""
    # Local Stagehand setup using a Gemini model; the API key comes from the
    # GEMINI_API_KEY environment variable.
    stagehand = CrawleeStagehand(
        StagehandConfig(
            env='LOCAL',
            model_name='google/gemini-2.5-flash-preview-05-20',
            model_api_key=os.getenv('GEMINI_API_KEY'),
        )
    )

    # Throttle the crawl to at most 10 tasks per minute.
    concurrency = ConcurrencySettings(max_tasks_per_minute=10)

    # Custom browser pool: its only plugin is the Stagehand-aware one.
    stagehand_plugin = StagehandPlugin(
        stagehand, browser_launch_options={'headless': True}
    )
    pool = BrowserPool(plugins=[stagehand_plugin])

    crawler = PlaywrightCrawler(
        # Cap the number of requests. Remove or raise it to crawl all links.
        max_requests_per_crawl=10,
        concurrency_settings=concurrency,
        browser_pool=pool,
    )

    # Default request handler, invoked for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # The cast only serves IDE type hints for the Stagehand methods.
        stagehand_page = cast('StagehandPage', context.page)

        # Plain Playwright call.
        playwright_title = await stagehand_page.title()
        context.log.info(f'Playwright page title: {playwright_title}')

        # highlight-start
        # AI-powered extraction driven by a natural language instruction
        gemini_title = await stagehand_page.extract('Extract page title')
        context.log.info(f'Gemini page title: {gemini_title}')
        # highlight-end

        await context.enqueue_links()

    # Start crawling from the initial list of URLs.
    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING, Any
4+
5+
from stagehand import Stagehand, StagehandPage
6+
7+
if TYPE_CHECKING:
8+
from types import TracebackType
9+
10+
11+
class CrawleeStagehandPage:
    """StagehandPage wrapper for Crawlee.

    `goto` is forwarded to the page object held inside the wrapped `StagehandPage`
    so that its navigation result is returned; every other attribute is delegated
    to the `StagehandPage` itself. The wrapper is also an async context manager
    that closes the page on exit.
    """

    def __init__(self, page: StagehandPage) -> None:
        """Store the wrapped `StagehandPage`."""
        self._page = page

    async def goto(
        self,
        url: str,
        *,
        referer: str | None = None,
        timeout: int | None = None,
        wait_until: str | None = None,
    ) -> Any:
        """Navigate to the specified URL.

        Args:
            url: Address to navigate to.
            referer: Forwarded unchanged to the inner page's `goto`.
            timeout: Forwarded unchanged to the inner page's `goto`.
            wait_until: Forwarded unchanged to the inner page's `goto`.

        Returns:
            Whatever the inner page's `goto` returns.
        """
        # Override goto to return navigation result that `PlaywrightCrawler` expects
        return await self._page._page.goto(  # noqa: SLF001
            url,
            referer=referer,
            timeout=timeout,
            wait_until=wait_until,
        )

    def __getattr__(self, name: str) -> Any:
        """Delegate all other attributes to the underlying StagehandPage.

        Raises:
            AttributeError: If `_page` itself is requested, or if the wrapped
                page has no attribute called `name`.
        """
        # `__getattr__` only runs when normal lookup fails. If `_page` is the
        # missing name (instance created without `__init__`, e.g. by `copy` or
        # `pickle`), reading `self._page` below would re-enter this method and
        # recurse until `RecursionError`. Fail with a regular AttributeError.
        if name == '_page':
            raise AttributeError(name)
        return getattr(self._page, name)

    async def __aenter__(self) -> CrawleeStagehandPage:
        """Enter the context manager."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Close the wrapped page when leaving the context manager."""
        await self._page.close()
49+
50+
51+
class CrawleeStagehand(Stagehand):
    """Stagehand wrapper for Crawlee to disable the launch of Playwright."""

    async def init(self) -> None:
        """Mark the instance as initialized without performing any other setup."""
        # Stagehand's own Playwright initialization is skipped on purpose:
        # Crawlee's PlaywrightBrowserPlugin manages the browser lifecycle.
        self._initialized = True
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
---
2+
id: playwright-crawler-stagehand
3+
title: Playwright with Stagehand
4+
description: How to integrate Stagehand AI-powered automation with PlaywrightCrawler.
5+
---
6+
7+
import ApiLink from '@site/src/components/ApiLink';
8+
import CodeBlock from '@theme/CodeBlock';
9+
10+
import SupportClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/support_classes.py';
11+
import BrowserClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/browser_classes.py';
12+
import StagehandRun from '!!raw-loader!./code_examples/playwright_crawler_stagehand/stagehand_run.py';
13+
14+
[Stagehand](https://docs.stagehand.dev/) is a framework that combines [Playwright](https://playwright.dev/python/) with AI-driven natural language understanding and decision-making capabilities. With Stagehand, you can use natural language instructions to interact with web pages instead of writing complex selectors and automation logic.
15+
16+
Stagehand supports multiple AI models through [`LiteLLM`](https://docs.litellm.ai/docs/). This guide demonstrates how to integrate Stagehand with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> using [Gemini](https://ai.google.dev/gemini-api/docs) as the AI model provider.
17+
18+
:::info
19+
20+
This guide is based on stagehand-python v0.4.0 with local configuration settings and may not be compatible with newer versions.
21+
22+
:::
23+
24+
## Get Gemini API key
25+
26+
You need to register with [Google AI Studio](https://aistudio.google.com/) and navigate to [Get API key](https://aistudio.google.com/app/apikey) to obtain your API key.
27+
28+
## Create support classes for Stagehand
29+
30+
To integrate Stagehand with Crawlee, you need to create wrapper classes that allow <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink> to manage the Playwright lifecycle.
31+
32+
Create `CrawleeStagehand` - a custom Stagehand subclass that overrides the `init` method to prevent Stagehand from launching its own Playwright instance.
33+
34+
Create `CrawleeStagehandPage` - a wrapper class for `StagehandPage` that implements the [Playwright Page](https://playwright.dev/python/docs/next/api/class-page) behavior expected by <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.
35+
36+
<CodeBlock className="language-python" title="support_classes.py">
37+
{SupportClasses}
38+
</CodeBlock>
39+
40+
## Create browser integration classes
41+
42+
You need to create a custom browser plugin and controller that properly initialize Stagehand and obtain browser pages from `StagehandContext`.
43+
44+
Create `StagehandPlugin` - a subclass of <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink> that holds the Stagehand instance and creates `PlaywrightPersistentBrowser` instances.
45+
46+
Create `StagehandBrowserController` - a subclass of <ApiLink to="class/PlaywrightBrowserController">`PlaywrightBrowserController`</ApiLink> that lazily initializes `StagehandContext` and creates new pages with AI capabilities on demand.
47+
48+
<CodeBlock className="language-python" title="browser_classes.py">
49+
{BrowserClasses}
50+
</CodeBlock>
51+
52+
## Create a crawler
53+
54+
Now you can create a <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> that uses Stagehand's AI capabilities to interact with web pages using natural language commands:
55+
56+
<CodeBlock className="language-python" title="stagehand_run.py">
57+
{StagehandRun}
58+
</CodeBlock>
59+
60+
The integration works through several key components:
61+
- `CrawleeStagehand` prevents Stagehand from launching its own Playwright instance, allowing Crawlee to manage the browser lifecycle
62+
- `StagehandPlugin` extends the Playwright browser plugin to create Stagehand-enabled browser instances
63+
- `StagehandBrowserController` uses `StagehandContext` to create pages with AI capabilities
64+
- `CrawleeStagehandPage` provides interface compatibility between Stagehand pages and Crawlee's expectations
65+
66+
In the request handler, you can use natural language commands like `page.extract('Extract page title')` to perform intelligent data extraction without writing complex selectors.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ module = [
239239
"apify_fingerprint_datapoints", # Untyped and stubs not available
240240
"camoufox", # Example code shows integration of camoufox and crawlee.
241241
"fastapi", # Example code shows running in webserver.
242+
"stagehand.*", # Example code shows integration of Stagehand and crawlee.
242243
"starlette.*", # Example code shows running in webserver.
243244
"flask", # Example code shows deploy on Google Cloud.
244245
"functions_framework", # Example code shows deploy on Google Cloud.

0 commit comments

Comments
 (0)