Harden Playwright: context recycling, resource blocking, HTML size cap

SkylarKelty · claude · SkylarKelty · commit ff0a2e7fca05 · 2026-03-11T12:27:50.000Z
- Recycle the browser context every N pages (default 50, configurable
  via PLAYWRIGHT_CONTEXT_RECYCLE_PAGES) to prevent memory leaks from
  long-lived contexts. The Chromium process itself is kept alive.
- Block heavy resource types (images, fonts, stylesheets, media) via
  page.route() to reduce memory and bandwidth per fetch.
- Cap extracted HTML size (default 5MB, configurable via
  PLAYWRIGHT_MAX_HTML_BYTES) to prevent OOM from huge pages.
- Increment the page counter in the finally block so that pages are
  counted even when navigation fails.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/artemis/extractor.py b/artemis/extractor.py
@@ -33,6 +33,12 @@
 _context: BrowserContext | None = None
 _pw_instance = None  # Playwright instance holder
 _launch_lock = asyncio.Lock()
+_page_count: int = 0  # Pages fetched on current context
+
+# Resource types to block during page fetch (saves memory + bandwidth)
+_BLOCKED_RESOURCE_TYPES: frozenset[str] = frozenset({
+    "image", "media", "font", "stylesheet",
+})
 
 # Module-level content cache (created lazily based on config)
 _content_cache: AsyncTTLCache[Optional[str]] | None = None
@@ -73,37 +79,64 @@ def _is_blocked(url: str) -> bool:
     return False
 
 
+def _new_context_kwargs() -> dict:
+    """Kwargs for creating a fresh BrowserContext."""
+    return dict(
+        user_agent=(
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+        ),
+        java_script_enabled=True,
+        ignore_https_errors=True,
+    )
+
+
 async def _get_context() -> BrowserContext:
     """Return a shared Playwright browser context, launching Chromium lazily.
 
+    Recycles the context after a configurable number of pages to prevent
+    memory leaks from long-lived contexts. The browser process itself is
+    kept alive (launching Chromium is expensive).
+
     Uses an asyncio.Lock to prevent concurrent callers from launching
-    duplicate browser instances.
+    duplicate browser instances or racing on context recycling.
     """
-    global _browser, _context, _pw_instance
-    if _context is not None:
+    global _browser, _context, _pw_instance, _page_count
+    settings = get_settings()
+    recycle_threshold = settings.playwright_context_recycle_pages
+
+    if _context is not None and _page_count < recycle_threshold:
         return _context
+
     async with _launch_lock:
-        # Double-check after acquiring the lock
+        # Recycle: close old context but keep the browser process
+        if _context is not None and _page_count >= recycle_threshold:
+            logger.info("Recycling browser context after %d pages", _page_count)
+            try:
+                await _context.close()
+            except Exception:
+                logger.warning("Failed to close old browser context", exc_info=True)
+            _context = None
+            _page_count = 0
+
         if _context is not None:
             return _context
-        logger.info("Launching Playwright headless Chromium browser")
-        _pw_instance = await async_playwright().start()
-        _browser = await _pw_instance.chromium.launch(headless=True)
-        logger.info("Playwright Chromium browser launched successfully")
-        _context = await _browser.new_context(
-            user_agent=(
-                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
-                "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
-            ),
-            java_script_enabled=True,
-            ignore_https_errors=True,
-        )
+
+        # Launch browser process if needed
+        if _browser is None:
+            logger.info("Launching Playwright headless Chromium browser")
+            _pw_instance = await async_playwright().start()
+            _browser = await _pw_instance.chromium.launch(headless=True)
+            logger.info("Playwright Chromium browser launched successfully")
+
+        _context = await _browser.new_context(**_new_context_kwargs())
+        _page_count = 0
         return _context
 
 
 async def close_client() -> None:
     """Shut down the Playwright browser and clear cache (called during app shutdown)."""
-    global _browser, _context, _pw_instance, _content_cache
+    global _browser, _context, _pw_instance, _content_cache, _page_count
     logger.info("Shutting down Playwright browser")
     if _context is not None:
         await _context.close()
@@ -114,6 +147,7 @@ async def close_client() -> None:
     if _pw_instance is not None:
         await _pw_instance.stop()
         _pw_instance = None
+    _page_count = 0
     if _content_cache is not None:
         logger.info(
             "Content cache stats: hits=%d misses=%d coalesced=%d hit_rate=%.1f%%",
@@ -151,7 +185,9 @@ async def fetch_page(url: str, timeout: float = 15.0) -> Optional[str]:
     """Fetch raw HTML from a URL using headless Chromium.
 
     Skips known paywalled domains. Uses a real browser to avoid
-    TLS-fingerprint and JS-challenge bot detection.
+    TLS-fingerprint and JS-challenge bot detection. Blocks heavy
+    resource types (images, fonts, stylesheets, media) to save memory
+    and bandwidth. Caps the extracted HTML size to prevent OOM.
 
     Args:
         url: The URL to fetch
@@ -160,12 +196,28 @@ async def fetch_page(url: str, timeout: float = 15.0) -> Optional[str]:
     Returns:
         Raw HTML string, or None on failure
     """
+    global _page_count
     if _is_blocked(url):
         logger.debug("Skipping blocked domain: %s", url)
         return None
+
+    settings = get_settings()
+    max_html_bytes = settings.playwright_max_html_bytes
+
     try:
         context = await _get_context()
         page = await context.new_page()
+
+        # Block heavy resource types to save memory and bandwidth
+        await page.route(
+            "**/*",
+            lambda route: (
+                route.abort()
+                if route.request.resource_type in _BLOCKED_RESOURCE_TYPES
+                else route.continue_()
+            ),
+        )
+
         logger.debug("Playwright: navigating to %s", url)
         try:
             response = await page.goto(
@@ -179,10 +231,16 @@ async def fetch_page(url: str, timeout: float = 15.0) -> Optional[str]:
             content_type = response.headers.get("content-type", "")
             if "text/html" not in content_type and "application/xhtml" not in content_type:
                 return None
+            html = await page.content()
+            if len(html) > max_html_bytes:
+                logger.debug("Playwright: truncating HTML from %d to %d bytes for %s",
+                             len(html), max_html_bytes, url)
+                html = html[:max_html_bytes]
             logger.debug("Playwright: successfully fetched %s", url)
-            return await page.content()
+            return html
         finally:
             await page.close()
+            _page_count += 1
     except Exception as exc:
         logger.warning("Playwright: failed to fetch %s: %s", url, exc)
         return None