Skip to content

Commit ff0a2e7

Browse files
SkylarKelty and claude committed
Harden Playwright: context recycling, resource blocking, HTML size cap
- Recycle the browser context every N pages (default 50, configurable via PLAYWRIGHT_CONTEXT_RECYCLE_PAGES) to prevent memory leaks from long-lived contexts. The Chromium process itself is kept alive.
- Block heavy resource types (images, fonts, stylesheets, media) via page.route() to reduce memory and bandwidth per fetch.
- Cap extracted HTML size (default 5MB, configurable via PLAYWRIGHT_MAX_HTML_BYTES) to prevent OOM from huge pages.
- Increment page counter in the finally block so it tracks even on navigation failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7899699 commit ff0a2e7

File tree

1 file changed

+77
-19
lines changed

1 file changed

+77
-19
lines changed

artemis/extractor.py

Lines changed: 77 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@
3333
_context: BrowserContext | None = None
3434
_pw_instance = None # Playwright instance holder
3535
_launch_lock = asyncio.Lock()
36+
_page_count: int = 0 # Pages fetched on current context
37+
38+
# Resource types to block during page fetch (saves memory + bandwidth)
39+
_BLOCKED_RESOURCE_TYPES: frozenset[str] = frozenset({
40+
"image", "media", "font", "stylesheet",
41+
})
3642

3743
# Module-level content cache (created lazily based on config)
3844
_content_cache: AsyncTTLCache[Optional[str]] | None = None
@@ -73,37 +79,64 @@ def _is_blocked(url: str) -> bool:
7379
return False
7480

7581

82+
def _new_context_kwargs() -> dict:
83+
"""Kwargs for creating a fresh BrowserContext."""
84+
return dict(
85+
user_agent=(
86+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
87+
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
88+
),
89+
java_script_enabled=True,
90+
ignore_https_errors=True,
91+
)
92+
93+
7694
async def _get_context() -> BrowserContext:
7795
"""Return a shared Playwright browser context, launching Chromium lazily.
7896
97+
Recycles the context after a configurable number of pages to prevent
98+
memory leaks from long-lived contexts. The browser process itself is
99+
kept alive (launching Chromium is expensive).
100+
79101
Uses an asyncio.Lock to prevent concurrent callers from launching
80-
duplicate browser instances.
102+
duplicate browser instances or racing on context recycling.
81103
"""
82-
global _browser, _context, _pw_instance
83-
if _context is not None:
104+
global _browser, _context, _pw_instance, _page_count
105+
settings = get_settings()
106+
recycle_threshold = settings.playwright_context_recycle_pages
107+
108+
if _context is not None and _page_count < recycle_threshold:
84109
return _context
110+
85111
async with _launch_lock:
86-
# Double-check after acquiring the lock
112+
# Recycle: close old context but keep the browser process
113+
if _context is not None and _page_count >= recycle_threshold:
114+
logger.info("Recycling browser context after %d pages", _page_count)
115+
try:
116+
await _context.close()
117+
except Exception:
118+
logger.warning("Failed to close old browser context", exc_info=True)
119+
_context = None
120+
_page_count = 0
121+
87122
if _context is not None:
88123
return _context
89-
logger.info("Launching Playwright headless Chromium browser")
90-
_pw_instance = await async_playwright().start()
91-
_browser = await _pw_instance.chromium.launch(headless=True)
92-
logger.info("Playwright Chromium browser launched successfully")
93-
_context = await _browser.new_context(
94-
user_agent=(
95-
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
96-
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
97-
),
98-
java_script_enabled=True,
99-
ignore_https_errors=True,
100-
)
124+
125+
# Launch browser process if needed
126+
if _browser is None:
127+
logger.info("Launching Playwright headless Chromium browser")
128+
_pw_instance = await async_playwright().start()
129+
_browser = await _pw_instance.chromium.launch(headless=True)
130+
logger.info("Playwright Chromium browser launched successfully")
131+
132+
_context = await _browser.new_context(**_new_context_kwargs())
133+
_page_count = 0
101134
return _context
102135

103136

104137
async def close_client() -> None:
105138
"""Shut down the Playwright browser and clear cache (called during app shutdown)."""
106-
global _browser, _context, _pw_instance, _content_cache
139+
global _browser, _context, _pw_instance, _content_cache, _page_count
107140
logger.info("Shutting down Playwright browser")
108141
if _context is not None:
109142
await _context.close()
@@ -114,6 +147,7 @@ async def close_client() -> None:
114147
if _pw_instance is not None:
115148
await _pw_instance.stop()
116149
_pw_instance = None
150+
_page_count = 0
117151
if _content_cache is not None:
118152
logger.info(
119153
"Content cache stats: hits=%d misses=%d coalesced=%d hit_rate=%.1f%%",
@@ -151,7 +185,9 @@ async def fetch_page(url: str, timeout: float = 15.0) -> Optional[str]:
151185
"""Fetch raw HTML from a URL using headless Chromium.
152186
153187
Skips known paywalled domains. Uses a real browser to avoid
154-
TLS-fingerprint and JS-challenge bot detection.
188+
TLS-fingerprint and JS-challenge bot detection. Blocks heavy
189+
resource types (images, fonts, stylesheets, media) to save memory
190+
and bandwidth. Caps the extracted HTML size to prevent OOM.
155191
156192
Args:
157193
url: The URL to fetch
@@ -160,12 +196,28 @@ async def fetch_page(url: str, timeout: float = 15.0) -> Optional[str]:
160196
Returns:
161197
Raw HTML string, or None on failure
162198
"""
199+
global _page_count
163200
if _is_blocked(url):
164201
logger.debug("Skipping blocked domain: %s", url)
165202
return None
203+
204+
settings = get_settings()
205+
max_html_bytes = settings.playwright_max_html_bytes
206+
166207
try:
167208
context = await _get_context()
168209
page = await context.new_page()
210+
211+
# Block heavy resource types to save memory and bandwidth
212+
await page.route(
213+
"**/*",
214+
lambda route: (
215+
route.abort()
216+
if route.request.resource_type in _BLOCKED_RESOURCE_TYPES
217+
else route.continue_()
218+
),
219+
)
220+
169221
logger.debug("Playwright: navigating to %s", url)
170222
try:
171223
response = await page.goto(
@@ -179,10 +231,16 @@ async def fetch_page(url: str, timeout: float = 15.0) -> Optional[str]:
179231
content_type = response.headers.get("content-type", "")
180232
if "text/html" not in content_type and "application/xhtml" not in content_type:
181233
return None
234+
html = await page.content()
235+
if len(html) > max_html_bytes:
236+
logger.debug("Playwright: truncating HTML from %d to %d bytes for %s",
237+
len(html), max_html_bytes, url)
238+
html = html[:max_html_bytes]
182239
logger.debug("Playwright: successfully fetched %s", url)
183-
return await page.content()
240+
return html
184241
finally:
185242
await page.close()
243+
_page_count += 1
186244
except Exception as exc:
187245
logger.warning("Playwright: failed to fetch %s: %s", url, exc)
188246
return None

0 commit comments

Comments
 (0)