# Shared browser context handed out by _get_context(); None until one is created.
_context: BrowserContext | None = None
# Playwright instance holder; None until the browser has been launched.
_pw_instance = None
# Held while launching the browser or recycling the shared context.
_launch_lock = asyncio.Lock()
# Pages fetched on the current context (reset whenever a new context is created).
_page_count: int = 0

# Resource types aborted during page fetch: they are not needed to read the
# HTML, and skipping them saves memory and bandwidth.
_BLOCKED_RESOURCE_TYPES: frozenset[str] = frozenset(
    ("image", "media", "font", "stylesheet")
)

# Module-level content cache (created lazily based on config)
_content_cache: AsyncTTLCache[Optional[str]] | None = None
@@ -73,37 +79,64 @@ def _is_blocked(url: str) -> bool:
7379 return False
7480
7581
def _new_context_kwargs() -> dict:
    """Return the keyword arguments for creating a fresh BrowserContext.

    A new dict is built on every call, so callers may mutate the result
    without affecting later contexts.

    Returns:
        Dict with a desktop Chrome user agent, JavaScript enabled, and
        HTTPS certificate errors ignored.
    """
    desktop_chrome_ua = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
    return {
        "user_agent": desktop_chrome_ua,
        "java_script_enabled": True,
        "ignore_https_errors": True,
    }
92+
93+
async def _get_context() -> BrowserContext:
    """Return the shared Playwright browser context, creating it on demand.

    Chromium is launched lazily on the first call. Once the current context
    has served the configured number of pages
    (``playwright_context_recycle_pages``), it is closed and replaced with a
    fresh one to prevent memory leaks from long-lived contexts. The browser
    process itself is kept alive across recycles, because launching Chromium
    is expensive.

    Launching and recycling both happen under ``_launch_lock`` so that
    concurrent callers neither start duplicate browsers nor race on the
    context swap.

    Returns:
        The BrowserContext to open pages on.
    """
    global _browser, _context, _pw_instance, _page_count
    page_limit = get_settings().playwright_context_recycle_pages

    # Fast path: a context exists and has not reached its page budget.
    if _context is not None and _page_count < page_limit:
        return _context

    async with _launch_lock:
        if _context is not None:
            if _page_count < page_limit:
                # Another caller created or recycled the context while we
                # were waiting for the lock.
                return _context

            # Recycle: close the old context but keep the browser process.
            # NOTE(review): closing a context also closes pages still open
            # on it, so fetches in flight on the old context may fail here.
            # Confirm this is acceptable for concurrent callers.
            logger.info("Recycling browser context after %d pages", _page_count)
            try:
                await _context.close()
            except Exception:
                # Best effort: a failed close must not block the replacement.
                logger.warning("Failed to close old browser context", exc_info=True)
            _context = None
            _page_count = 0

        # Launch the browser process if needed.
        if _browser is None:
            logger.info("Launching Playwright headless Chromium browser")
            _pw_instance = await async_playwright().start()
            _browser = await _pw_instance.chromium.launch(headless=True)
            logger.info("Playwright Chromium browser launched successfully")

        fresh_context = await _browser.new_context(**_new_context_kwargs())
        _context = fresh_context
        _page_count = 0
        return fresh_context
102135
103136
104137async def close_client () -> None :
105138 """Shut down the Playwright browser and clear cache (called during app shutdown)."""
106- global _browser , _context , _pw_instance , _content_cache
139+ global _browser , _context , _pw_instance , _content_cache , _page_count
107140 logger .info ("Shutting down Playwright browser" )
108141 if _context is not None :
109142 await _context .close ()
@@ -114,6 +147,7 @@ async def close_client() -> None:
114147 if _pw_instance is not None :
115148 await _pw_instance .stop ()
116149 _pw_instance = None
150+ _page_count = 0
117151 if _content_cache is not None :
118152 logger .info (
119153 "Content cache stats: hits=%d misses=%d coalesced=%d hit_rate=%.1f%%" ,
@@ -151,7 +185,9 @@ async def fetch_page(url: str, timeout: float = 15.0) -> Optional[str]:
151185 """Fetch raw HTML from a URL using headless Chromium.
152186
153187 Skips known paywalled domains. Uses a real browser to avoid
154- TLS-fingerprint and JS-challenge bot detection.
188+ TLS-fingerprint and JS-challenge bot detection. Blocks heavy
189+ resource types (images, fonts, stylesheets, media) to save memory
190+ and bandwidth. Caps the extracted HTML size to prevent OOM.
155191
156192 Args:
157193 url: The URL to fetch
@@ -160,12 +196,28 @@ async def fetch_page(url: str, timeout: float = 15.0) -> Optional[str]:
160196 Returns:
161197 Raw HTML string, or None on failure
162198 """
199+ global _page_count
163200 if _is_blocked (url ):
164201 logger .debug ("Skipping blocked domain: %s" , url )
165202 return None
203+
204+ settings = get_settings ()
205+ max_html_bytes = settings .playwright_max_html_bytes
206+
166207 try :
167208 context = await _get_context ()
168209 page = await context .new_page ()
210+
211+ # Block heavy resource types to save memory and bandwidth
212+ await page .route (
213+ "**/*" ,
214+ lambda route : (
215+ route .abort ()
216+ if route .request .resource_type in _BLOCKED_RESOURCE_TYPES
217+ else route .continue_ ()
218+ ),
219+ )
220+
169221 logger .debug ("Playwright: navigating to %s" , url )
170222 try :
171223 response = await page .goto (
@@ -179,10 +231,16 @@ async def fetch_page(url: str, timeout: float = 15.0) -> Optional[str]:
179231 content_type = response .headers .get ("content-type" , "" )
180232 if "text/html" not in content_type and "application/xhtml" not in content_type :
181233 return None
234+ html = await page .content ()
235+ if len (html ) > max_html_bytes :
236+ logger .debug ("Playwright: truncating HTML from %d to %d bytes for %s" ,
237+ len (html ), max_html_bytes , url )
238+ html = html [:max_html_bytes ]
182239 logger .debug ("Playwright: successfully fetched %s" , url )
183- return await page . content ()
240+ return html
184241 finally :
185242 await page .close ()
243+ _page_count += 1
186244 except Exception as exc :
187245 logger .warning ("Playwright: failed to fetch %s: %s" , url , exc )
188246 return None
0 commit comments