Skip to content

Commit 4e1c406

Browse files
committed
fix: prevent memory leak by closing unused context
When scraping many URLs continuously, browser contexts accumulated in memory and were never cleaned up. The existing cleanup only ran when browsers went idle, which never happened under continuous load. See: #943. Key changes: - browser_manager.py: Add _context_refcounts tracking, cleanup_contexts(), and release_context() methods - async_crawler_strategy.py: Release context ref in finally block after crawl - deploy/docker/api.py: Trigger context cleanup after each request This fixes, or at least drastically improves, the memory leaks in my testing.
1 parent 0024c82 commit 4e1c406

File tree

3 files changed

+162
-10
lines changed

3 files changed

+162
-10
lines changed

crawl4ai/async_crawler_strategy.py

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1047,14 +1047,27 @@ async def get_delayed_content(delay: float = 5.0) -> str:
10471047
raise e
10481048

10491049
finally:
1050+
# Release the context reference so cleanup can work
1051+
if not self.browser_config.use_managed_browser:
1052+
try:
1053+
config_signature = self.browser_manager._make_config_signature(config)
1054+
await self.browser_manager.release_context(config_signature)
1055+
except Exception:
1056+
pass # Don't fail on cleanup
1057+
10501058
# If no session_id is given we should close the page
10511059
all_contexts = page.context.browser.contexts
1052-
total_pages = sum(len(context.pages) for context in all_contexts)
1060+
total_pages = sum(len(context.pages) for context in all_contexts)
10531061
if config.session_id:
10541062
pass
10551063
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
10561064
pass
10571065
else:
1066+
self.logger.info(
1067+
message="Closing page for URL: {url}",
1068+
tag="CLOSE",
1069+
params={"url": url},
1070+
)
10581071
# Detach listeners before closing to prevent potential errors during close
10591072
if config.capture_network_requests:
10601073
page.remove_listener("request", handle_request_capture)

crawl4ai/browser_manager.py

Lines changed: 130 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -611,12 +611,16 @@ def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: b
611611
# Keep track of contexts by a "config signature," so each unique config reuses a single context
612612
self.contexts_by_config = {}
613613
self._contexts_lock = asyncio.Lock()
614-
614+
615+
# Reference counting for contexts - tracks how many requests are using each context
616+
# Key: config_signature, Value: count of active requests using this context
617+
self._context_refcounts = {}
618+
615619
# Serialize context.new_page() across concurrent tasks to avoid races
616620
# when using a shared persistent context (context.pages may be empty
617621
# for all racers). Prevents 'Target page/context closed' errors.
618622
self._page_lock = asyncio.Lock()
619-
623+
620624
# Stealth adapter for stealth mode
621625
self._stealth_adapter = None
622626
if self.config.enable_stealth and not self.use_undetected:
@@ -1102,6 +1106,9 @@ async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
11021106
await self.setup_context(context, crawlerRunConfig)
11031107
self.contexts_by_config[config_signature] = context
11041108

1109+
# Increment reference count - this context is now in use
1110+
self._context_refcounts[config_signature] = self._context_refcounts.get(config_signature, 0) + 1
1111+
11051112
# Create a new page from the chosen context
11061113
page = await context.new_page()
11071114
await self._apply_stealth_to_page(page)
@@ -1137,11 +1144,131 @@ def _cleanup_expired_sessions(self):
11371144
for sid in expired_sessions:
11381145
asyncio.create_task(self.kill_session(sid))
11391146

1147+
async def cleanup_contexts(self, max_contexts: int = 5, force: bool = False):
1148+
"""
1149+
Clean up contexts to prevent memory growth.
1150+
Only closes contexts that have no active references AND no open pages (safe cleanup).
1151+
1152+
Args:
1153+
max_contexts: Maximum number of contexts to keep. Excess idle contexts
1154+
will be closed, starting with the oldest ones.
1155+
force: If True, close contexts even if they have pages (but never if refcount > 0).
1156+
Use with caution.
1157+
"""
1158+
async with self._contexts_lock:
1159+
# First, identify contexts that are safe to close:
1160+
# - No active references (refcount == 0)
1161+
# - No open pages (or force=True)
1162+
idle_contexts = []
1163+
active_contexts = []
1164+
1165+
for sig, ctx in list(self.contexts_by_config.items()):
1166+
try:
1167+
refcount = self._context_refcounts.get(sig, 0)
1168+
has_pages = hasattr(ctx, 'pages') and len(ctx.pages) > 0
1169+
1170+
# Context is safe to close only if refcount is 0
1171+
if refcount > 0:
1172+
# Context is actively being used by a request - never close
1173+
active_contexts.append((sig, ctx))
1174+
elif has_pages and not force:
1175+
# Has pages but no refs - might be finishing up, skip unless forced
1176+
active_contexts.append((sig, ctx))
1177+
else:
1178+
# refcount == 0 and (no pages or force=True) - safe to close
1179+
idle_contexts.append((sig, ctx))
1180+
except Exception:
1181+
# Context may be in bad state, only cleanup if no refs
1182+
if self._context_refcounts.get(sig, 0) == 0:
1183+
idle_contexts.append((sig, ctx))
1184+
else:
1185+
active_contexts.append((sig, ctx))
1186+
1187+
# Log context status for debugging
1188+
if self.logger:
1189+
self.logger.debug(
1190+
message="Context cleanup check: {total} total, {idle} idle (refcount=0), {active} active",
1191+
tag="CLEANUP",
1192+
params={
1193+
"total": len(self.contexts_by_config),
1194+
"idle": len(idle_contexts),
1195+
"active": len(active_contexts)
1196+
}
1197+
)
1198+
1199+
# Close idle contexts if we exceed max_contexts total
1200+
contexts_to_close = []
1201+
if len(self.contexts_by_config) > max_contexts:
1202+
# Calculate how many we need to close
1203+
excess = len(self.contexts_by_config) - max_contexts
1204+
# Only close from idle contexts (safe)
1205+
contexts_to_close = idle_contexts[:excess]
1206+
1207+
# If force=True and we still have too many, close active ones too
1208+
if force and len(self.contexts_by_config) - len(contexts_to_close) > max_contexts:
1209+
remaining_excess = len(self.contexts_by_config) - len(contexts_to_close) - max_contexts
1210+
contexts_to_close.extend(active_contexts[:remaining_excess])
1211+
1212+
# Perform cleanup
1213+
for sig, ctx in contexts_to_close:
1214+
try:
1215+
# If forcing and context has pages, close them first
1216+
if force and hasattr(ctx, 'pages'):
1217+
for page in list(ctx.pages):
1218+
try:
1219+
await page.close()
1220+
except Exception:
1221+
pass
1222+
1223+
# Remove from our tracking dicts
1224+
self.contexts_by_config.pop(sig, None)
1225+
self._context_refcounts.pop(sig, None)
1226+
1227+
# Close the context
1228+
await ctx.close()
1229+
1230+
if self.logger:
1231+
self.logger.info(
1232+
message="Cleaned up context: {sig}",
1233+
tag="CLEANUP",
1234+
params={"sig": sig[:8]}
1235+
)
1236+
except Exception as e:
1237+
# Still remove from tracking even if close fails
1238+
self.contexts_by_config.pop(sig, None)
1239+
self._context_refcounts.pop(sig, None)
1240+
if self.logger:
1241+
self.logger.warning(
1242+
message="Error closing context during cleanup: {error}",
1243+
tag="WARNING",
1244+
params={"error": str(e)}
1245+
)
1246+
1247+
return len(contexts_to_close) # Return count of cleaned contexts
1248+
1249+
async def release_context(self, config_signature: str):
1250+
"""
1251+
Decrement the reference count for a context after a crawl completes.
1252+
Call this when a crawl operation finishes (success or failure).
1253+
1254+
Args:
1255+
config_signature: The config signature of the context to release
1256+
"""
1257+
async with self._contexts_lock:
1258+
if config_signature in self._context_refcounts:
1259+
self._context_refcounts[config_signature] = max(0, self._context_refcounts[config_signature] - 1)
1260+
if self.logger:
1261+
self.logger.debug(
1262+
message="Released context ref: {sig}, remaining refs: {refs}",
1263+
tag="CLEANUP",
1264+
params={"sig": config_signature[:8], "refs": self._context_refcounts[config_signature]}
1265+
)
1266+
11401267
async def close(self):
11411268
"""Close all browser resources and clean up."""
11421269
if self.config.cdp_url:
11431270
return
1144-
1271+
11451272
if self.config.sleep_on_close:
11461273
await asyncio.sleep(0.5)
11471274

deploy/docker/api.py

Lines changed: 18 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -576,21 +576,33 @@ async def handle_crawl_request(
576576

577577
results = []
578578
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
579-
partial_func = partial(func,
580-
urls[0] if len(urls) == 1 else urls,
581-
config=crawler_config,
579+
partial_func = partial(func,
580+
urls[0] if len(urls) == 1 else urls,
581+
config=crawler_config,
582582
dispatcher=dispatcher)
583583
results = await partial_func()
584-
584+
585585
# Ensure results is always a list
586586
if not isinstance(results, list):
587587
results = [results]
588588

589+
# Clean up idle browser contexts to prevent memory leaks
590+
# Only closes contexts with no open pages (safe cleanup)
591+
try:
592+
if hasattr(crawler, 'crawler_strategy') and hasattr(crawler.crawler_strategy, 'browser_manager'):
593+
bm = crawler.crawler_strategy.browser_manager
594+
# Clean up idle contexts (keep at most 3 to allow some reuse)
595+
cleaned_count = await bm.cleanup_contexts(max_contexts=3)
596+
if cleaned_count > 0:
597+
logger.info(f"Browser cleanup: closed {cleaned_count} idle context(s)")
598+
except Exception as e:
599+
logger.warning(f"Browser context cleanup warning: {e}")
600+
589601
# await crawler.close()
590-
602+
591603
end_mem_mb = _get_memory_mb() # <--- Get memory after
592604
end_time = time.time()
593-
605+
594606
if start_mem_mb is not None and end_mem_mb is not None:
595607
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
596608
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory

0 commit comments

Comments
 (0)