Skip to content

Commit 1d327e6

Browse files
committed
add CDP
1 parent 67b028d commit 1d327e6

File tree

5 files changed

+54
-43
lines changed

5 files changed

+54
-43
lines changed

.env.example

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ PROXY_SERVER=
77
PROXY_USERNAME=
88
PROXY_PASSWORD=
99
PORT=3000
10-
PERSISTENT_CONTEXT=False
10+
PERSISTENT_CONTEXT=False
11+
REMOTE_CDP=

Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim
22

33
LABEL maintainer="loorisr"
4-
LABEL repository="https://github.com/loorisr/playwright-scrape-api"
4+
LABEL repository="https://github.com/loorisr/patchright-scrape-api"
55
LABEL description="Simple scraping API based on patchright "
6-
LABEL date="2025-02-24"
6+
LABEL date="2025-02-26"
77

88
# Install the project into `/app`
99
WORKDIR /app

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ Features:
1313
* better media blocking handling
1414
* scrape multiple pages in parallel
1515
* scrape endpoint compatible with Firecrawl API
16-
* return cleaned html and markdow
16+
* return cleaned html and markdown
1717
* temporary or persistent context
18+
* can connect to remote browser via CDP
1819
* lightweight: 1.2 GB
1920

2021
Available on Docker hub: `docker pull loorisr/patchright-scrape-api:latest`
@@ -38,6 +39,8 @@ Available on Docker hub: `docker pull loorisr/patchright-scrape-api:latest`
3839

3940
* `PERSISTENT_CONTEXT`: Enables a persistent browser context. If true, a volume must be mounted at /context. Default: False
4041

42+
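* `REMOTE_CDP`: Address of a remote browser exposing CDP (Chrome DevTools Protocol), given without the scheme: the app connects to `wss://<REMOTE_CDP>`. When set, no local browser is launched and `PERSISTENT_CONTEXT` is ignored. This lets you connect to a hosted browser provider or to a self-hosted browser such as https://github.com/JacobLinCool/playwright-docker.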
43+
4144
## Endpoints
4245
* `/scrape`
4346
- **url**: url to scrape : http://www.domain.tld

app/app.py

Lines changed: 45 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
PERSISTENT_CONTEXT = os.getenv("PERSISTENT_CONTEXT", 'False').lower() in ('true', '1', 't')
4646
PATH_CONTEXT = "/context"
4747

48+
REMOTE_CDP = os.getenv('REMOTE_CDP')
49+
4850
# Global browser and context instances
4951
browser: Browser = None
5052
context: BrowserContext = None
@@ -170,46 +172,50 @@ async def lifespan(app: FastAPI):
170172
# Startup logic
171173
playwright = await async_playwright().start()
172174

173-
if PERSISTENT_CONTEXT:
174-
print("Launching Chrome with persistent context.")
175-
context = await playwright.chromium.launch_persistent_context(
176-
user_data_dir=PATH_CONTEXT,
177-
headless=True,
178-
channel="chrome",
179-
args=[
180-
'--no-sandbox',
181-
'--disable-setuid-sandbox',
182-
'--disable-dev-shm-usage',
183-
'--disable-accelerated-2d-canvas',
184-
'--no-first-run',
185-
'--no-zygote',
186-
'--single-process',
187-
'--disable-gpu',
188-
'--no-default-browser-check',
189-
'--disable-infobars'
190-
],
191-
**context_options
192-
)
175+
if REMOTE_CDP:
176+
browser = await playwright.chromium.connect_over_cdp(f"wss://{REMOTE_CDP}")
177+
context = browser.contexts[0]
193178
else:
194-
print("Launching Chrome with temporary context.")
195-
browser = await playwright.chromium.launch(
196-
headless=True,
197-
channel="chrome",
198-
args=[
199-
'--no-sandbox',
200-
'--disable-setuid-sandbox',
201-
'--disable-dev-shm-usage',
202-
'--disable-accelerated-2d-canvas',
203-
'--no-first-run',
204-
'--no-zygote',
205-
'--single-process',
206-
'--disable-gpu',
207-
'--no-default-browser-check',
208-
'--no-startup-window',
209-
'--disable-infobars'
210-
]
211-
)
212-
context = await browser.new_context(**context_options)
179+
if PERSISTENT_CONTEXT:
180+
print("Launching Chrome with persistent context.")
181+
context = await playwright.chromium.launch_persistent_context(
182+
user_data_dir=PATH_CONTEXT,
183+
headless=True,
184+
channel="chrome",
185+
args=[
186+
'--no-sandbox',
187+
'--disable-setuid-sandbox',
188+
'--disable-dev-shm-usage',
189+
'--disable-accelerated-2d-canvas',
190+
'--no-first-run',
191+
'--no-zygote',
192+
'--single-process',
193+
'--disable-gpu',
194+
'--no-default-browser-check',
195+
'--disable-infobars'
196+
],
197+
**context_options
198+
)
199+
else:
200+
print("Launching Chrome with temporary context.")
201+
browser = await playwright.chromium.launch(
202+
headless=True,
203+
channel="chrome",
204+
args=[
205+
'--no-sandbox',
206+
'--disable-setuid-sandbox',
207+
'--disable-dev-shm-usage',
208+
'--disable-accelerated-2d-canvas',
209+
'--no-first-run',
210+
'--no-zygote',
211+
'--single-process',
212+
'--disable-gpu',
213+
'--no-default-browser-check',
214+
'--no-startup-window',
215+
'--disable-infobars'
216+
]
217+
)
218+
context = await browser.new_context(**context_options)
213219

214220

215221
if RESOURCES_BLOCKED:

docker-compose.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ services:
1414
- PROXY_PASSWORD=
1515
- PORT=3000
1616
- PERSISTENT_CONTEXT=False
17+
- REMOTE_CDP=
1718
volumes:
1819
- patchright_context:/context # needed if PERSISTENT_CONTEXT=True
1920
restart: unless-stopped

0 commit comments

Comments
 (0)