Commit a046203

fix: prevent memory leak by closing unused context

When scraping many URLs continuously, browser contexts accumulated in memory and were never cleaned up. The existing cleanup only ran when browsers went idle, which never happened under continuous load. See: #943.

Key changes:
- browser_manager.py: add _context_refcounts tracking plus cleanup_contexts() and release_context() methods
- async_crawler_strategy.py: release the context ref in a finally block after each crawl
- deploy/docker/api.py: trigger context cleanup after each request

This fixes, or at least drastically improves, the memory leak in my testing.
1 parent: c85f56b
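
The fix is a straightforward acquire/release pattern layered over the existing per-config context cache: get_page() bumps a per-signature reference count, the crawl's finally block releases it, and a cleanup pass closes only contexts whose count has returned to zero. A minimal standalone sketch of that shape (all names here are illustrative, not the repo's API):

import asyncio

class ContextPool:
    """Toy model of the refcount scheme this commit adds (illustrative only)."""

    def __init__(self):
        self.contexts = {}    # signature -> context object
        self.refcounts = {}   # signature -> number of in-flight requests
        self.lock = asyncio.Lock()

    async def acquire(self, sig, factory):
        async with self.lock:
            if sig not in self.contexts:
                self.contexts[sig] = await factory()
            self.refcounts[sig] = self.refcounts.get(sig, 0) + 1
            return self.contexts[sig]

    async def release(self, sig):
        async with self.lock:
            self.refcounts[sig] = max(0, self.refcounts.get(sig, 0) - 1)

    async def cleanup(self, max_contexts: int = 5):
        async with self.lock:
            idle = [s for s, n in self.refcounts.items() if n == 0]
            for sig in idle[: max(0, len(self.contexts) - max_contexts)]:
                ctx = self.contexts.pop(sig)
                self.refcounts.pop(sig, None)
                await ctx.close()  # only ever closes unreferenced contexts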

File tree: 3 files changed, +153 -10 lines

crawl4ai/async_crawler_strategy.py

Lines changed: 9 additions & 1 deletion
@@ -1053,9 +1053,17 @@ async def get_delayed_content(delay: float = 5.0) -> str:
                 raise e
 
             finally:
+                # Release the context reference so cleanup can work
+                if not self.browser_config.use_managed_browser:
+                    try:
+                        config_signature = self.browser_manager._make_config_signature(config)
+                        await self.browser_manager.release_context(config_signature)
+                    except Exception:
+                        pass  # Don't fail on cleanup
+
                 # If no session_id is given we should close the page
                 all_contexts = page.context.browser.contexts
-                total_pages = sum(len(context.pages) for context in all_contexts)
+                total_pages = sum(len(context.pages) for context in all_contexts)
                 if config.session_id:
                     pass
                 elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
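
Placing the release in the finally block means the refcount drops on every exit path (success, exception, or cancellation), so a failed crawl cannot pin its context forever. A hedged sketch of the pairing this diff establishes; it assumes get_page() returns a (page, context) pair, and the wrapper function itself is hypothetical:

# Hypothetical wrapper showing the acquire/release balance set up by this diff.
async def crawl_with_release(manager, crawler_run_config, do_crawl):
    # get_page() increments the refcount for this config signature.
    page, context = await manager.get_page(crawler_run_config)
    try:
        return await do_crawl(page)
    finally:
        # Runs on success, exception, or cancellation, keeping the count balanced.
        sig = manager._make_config_signature(crawler_run_config)
        await manager.release_context(sig)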

crawl4ai/browser_manager.py

Lines changed: 126 additions & 3 deletions
@@ -611,12 +611,16 @@ def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: b
         # Keep track of contexts by a "config signature," so each unique config reuses a single context
         self.contexts_by_config = {}
         self._contexts_lock = asyncio.Lock()
-
+
+        # Reference counting for contexts - tracks how many requests are using each context
+        # Key: config_signature, Value: count of active requests using this context
+        self._context_refcounts = {}
+
         # Serialize context.new_page() across concurrent tasks to avoid races
         # when using a shared persistent context (context.pages may be empty
         # for all racers). Prevents 'Target page/context closed' errors.
         self._page_lock = asyncio.Lock()
-
+
         # Stealth adapter for stealth mode
         self._stealth_adapter = None
         if self.config.enable_stealth and not self.use_undetected:
@@ -1102,6 +1106,9 @@ async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
                 await self.setup_context(context, crawlerRunConfig)
                 self.contexts_by_config[config_signature] = context
 
+            # Increment reference count - this context is now in use
+            self._context_refcounts[config_signature] = self._context_refcounts.get(config_signature, 0) + 1
+
             # Create a new page from the chosen context
             page = await context.new_page()
             await self._apply_stealth_to_page(page)
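
The increment sits before context.new_page(): in that window the context is already in contexts_by_config but has no pages yet, so without a refcount a concurrent cleanup pass could judge it idle and close it out from under the request. A sketch of the interleaving this guards against (hypothetical timeline, assuming cleanup runs concurrently from another task):

# Hypothetical interleaving without the refcount:
#
#   task A: context created and stored in contexts_by_config   (no pages yet)
#   task B: await bm.cleanup_contexts()   # refcount 0, no pages -> closes it
#   task A: page = await context.new_page()   # raises: target/context closed
#
# With the increment placed before new_page(), task B sees refcount == 1
# and leaves the context alone.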
@@ -1137,11 +1144,127 @@ def _cleanup_expired_sessions(self):
         for sid in expired_sessions:
             asyncio.create_task(self.kill_session(sid))
 
+    async def cleanup_contexts(self, max_contexts: int = 5, force: bool = False):
+        """
+        Clean up contexts to prevent memory growth.
+        Only closes contexts that have no active references AND no open pages (safe cleanup).
+
+        Args:
+            max_contexts: Maximum number of contexts to keep. Excess idle contexts
+                will be closed, starting with the oldest ones.
+            force: If True, close contexts even if they have pages (but never if refcount > 0).
+                Use with caution.
+        """
+        async with self._contexts_lock:
+            # First, identify contexts that are safe to close:
+            # - No active references (refcount == 0)
+            # - No open pages (or force=True)
+            idle_contexts = []
+            active_contexts = []
+
+            for sig, ctx in list(self.contexts_by_config.items()):
+                try:
+                    refcount = self._context_refcounts.get(sig, 0)
+                    has_pages = hasattr(ctx, 'pages') and len(ctx.pages) > 0
+
+                    # Context is safe to close only if refcount is 0
+                    if refcount > 0:
+                        # Context is actively being used by a request - never close
+                        active_contexts.append((sig, ctx))
+                    elif has_pages and not force:
+                        # Has pages but no refs - might be finishing up, skip unless forced
+                        active_contexts.append((sig, ctx))
+                    else:
+                        # refcount == 0 and (no pages or force=True) - safe to close
+                        idle_contexts.append((sig, ctx))
+                except Exception:
+                    # Context may be in bad state, only cleanup if no refs
+                    if self._context_refcounts.get(sig, 0) == 0:
+                        idle_contexts.append((sig, ctx))
+                    else:
+                        active_contexts.append((sig, ctx))
+
+            # Log context status for debugging
+            self.logger.debug(
+                message="Context cleanup check: {total} total, {idle} idle (refcount=0), {active} active",
+                tag="CLEANUP",
+                params={
+                    "total": len(self.contexts_by_config),
+                    "idle": len(idle_contexts),
+                    "active": len(active_contexts)
+                }
+            )
+
+            # Close idle contexts if we exceed max_contexts total
+            contexts_to_close = []
+            if len(self.contexts_by_config) > max_contexts:
+                # Calculate how many we need to close
+                excess = len(self.contexts_by_config) - max_contexts
+                # Only close from idle contexts (safe)
+                contexts_to_close = idle_contexts[:excess]
+
+                # If force=True and we still have too many, close active ones too
+                if force and len(self.contexts_by_config) - len(contexts_to_close) > max_contexts:
+                    remaining_excess = len(self.contexts_by_config) - len(contexts_to_close) - max_contexts
+                    contexts_to_close.extend(active_contexts[:remaining_excess])
+
+            # Perform cleanup
+            for sig, ctx in contexts_to_close:
+                try:
+                    # If forcing and context has pages, close them first
+                    if force and hasattr(ctx, 'pages'):
+                        for page in list(ctx.pages):
+                            try:
+                                await page.close()
+                            except Exception:
+                                pass
+
+                    # Remove from our tracking dicts
+                    self.contexts_by_config.pop(sig, None)
+                    self._context_refcounts.pop(sig, None)
+
+                    # Close the context
+                    await ctx.close()
+
+                    self.logger.info(
+                        message="Cleaned up context: {sig}",
+                        tag="CLEANUP",
+                        params={"sig": sig[:8]}
+                    )
+                except Exception as e:
+                    # Still remove from tracking even if close fails
+                    self.contexts_by_config.pop(sig, None)
+                    self._context_refcounts.pop(sig, None)
+                    self.logger.warning(
+                        message="Error closing context during cleanup: {error}",
+                        tag="WARNING",
+                        params={"error": str(e)}
+                    )
+
+        return len(contexts_to_close)  # Return count of cleaned contexts
+
+    async def release_context(self, config_signature: str):
+        """
+        Decrement the reference count for a context after a crawl completes.
+        Call this when a crawl operation finishes (success or failure).
+
+        Args:
+            config_signature: The config signature of the context to release
+        """
+        async with self._contexts_lock:
+            if config_signature in self._context_refcounts:
+                self._context_refcounts[config_signature] = max(0, self._context_refcounts[config_signature] - 1)
+                self.logger.debug(
+                    message="Released context ref: {sig}, remaining refs: {refs}",
+                    tag="CLEANUP",
+                    params={"sig": config_signature[:8], "refs": self._context_refcounts[config_signature]}
+                )
+
     async def close(self):
         """Close all browser resources and clean up."""
         if self.config.cdp_url:
             return
-
+
         if self.config.sleep_on_close:
             await asyncio.sleep(0.5)
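
Because cleanup_contexts() only reaps entries with a zero refcount and no open pages, it is safe to call aggressively. A hedged usage sketch (the periodic sweeper is hypothetical, not part of this commit; bm is assumed to be a constructed BrowserManager):

import asyncio

async def sweep_contexts(bm, interval_s: float = 30.0):
    # Hypothetical background sweeper. Safe on any schedule: contexts with
    # refcount > 0, or with open pages (unless force=True), are never touched.
    while True:
        await asyncio.sleep(interval_s)
        cleaned = await bm.cleanup_contexts(max_contexts=5)
        if cleaned:
            print(f"reclaimed {cleaned} idle browser context(s)")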

deploy/docker/api.py

Lines changed: 18 additions & 6 deletions
@@ -579,21 +579,33 @@ async def handle_crawl_request(
 
     results = []
     func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
-    partial_func = partial(func,
-                           urls[0] if len(urls) == 1 else urls,
-                           config=crawler_config,
+    partial_func = partial(func,
+                           urls[0] if len(urls) == 1 else urls,
+                           config=crawler_config,
                            dispatcher=dispatcher)
     results = await partial_func()
-
+
     # Ensure results is always a list
     if not isinstance(results, list):
        results = [results]
 
+    # Clean up idle browser contexts to prevent memory leaks
+    # Only closes contexts with no open pages (safe cleanup)
+    try:
+        if hasattr(crawler, 'crawler_strategy') and hasattr(crawler.crawler_strategy, 'browser_manager'):
+            bm = crawler.crawler_strategy.browser_manager
+            # Clean up idle contexts (keep at most 3 to allow some reuse)
+            cleaned_count = await bm.cleanup_contexts(max_contexts=3)
+            if cleaned_count > 0:
+                logger.info(f"Browser cleanup: closed {cleaned_count} idle context(s)")
+    except Exception as e:
+        logger.warning(f"Browser context cleanup warning: {e}")
+
     # await crawler.close()
-
+
     end_mem_mb = _get_memory_mb()  # <--- Get memory after
     end_time = time.time()
-
+
     if start_mem_mb is not None and end_mem_mb is not None:
         mem_delta_mb = end_mem_mb - start_mem_mb  # <--- Calculate delta
         peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb)  # <--- Get peak memory