Skip to content

Commit a52322f

Browse files
feat(langchain): add per-thread sandbox isolation for chat applications
Add ThreadedSandboxManager to map conversation thread IDs to isolated sandbox environments with persistent filesystems:
- ThreadedSandboxManager class with get_backend(), delete_thread(), cleanup_idle(), and close() methods for lifecycle management
- create_threaded_backend_factory() for DeepAgents integration that reads thread_id from LangGraph config
- SandboxClient enhancements: claim_name parameter for custom/deterministic names, delete_on_exit flag, connect() class method, delete() method, and was_reconnected property

Includes proper error handling:
- Resource cleanup on backend creation failure
- Continues delete() even if __exit__() fails
- Logs kubectl commands for manual cleanup on failures

New tests cover creation failure, deletion exceptions, concurrent access, and call ordering (25 tests total for thread manager).

The example multi_thread_chat.py demonstrates multi-thread isolation with LangGraph checkpointer for message history.
1 parent 913524e commit a52322f

File tree

6 files changed

+1467
-4
lines changed

6 files changed

+1467
-4
lines changed

clients/python/agentic-sandbox-client/agentic_sandbox/sandbox_client.py

Lines changed: 154 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,24 @@ class ExecutionResult:
6767
class SandboxClient:
6868
"""
6969
A client for creating and interacting with a stateful Sandbox via a router.
70+
71+
Args:
72+
template_name: Name of the SandboxTemplate to claim.
73+
namespace: Kubernetes namespace for the sandbox (default: "default").
74+
gateway_name: Optional gateway name for production mode.
75+
gateway_namespace: Namespace where the gateway lives (default: "default").
76+
api_url: Direct API URL bypassing gateway discovery.
77+
server_port: Port the sandbox runtime listens on (default: 8888).
78+
sandbox_ready_timeout: Timeout waiting for sandbox readiness (default: 180s).
79+
gateway_ready_timeout: Timeout waiting for gateway IP (default: 180s).
80+
port_forward_ready_timeout: Timeout waiting for port-forward (default: 30s).
81+
enable_tracing: Enable OpenTelemetry tracing.
82+
trace_service_name: Service name for tracing (default: "sandbox-client").
83+
claim_name: Optional custom claim name. If provided and the claim already
84+
exists, the client reconnects to it (check was_reconnected property).
85+
If the claim does not exist, a new one is created with this name.
86+
delete_on_exit: Whether to delete the sandbox on context exit (default: True).
87+
Set to False for persistent sandboxes that survive across sessions.
7088
"""
7189

7290
def __init__(
@@ -82,6 +100,8 @@ def __init__(
82100
port_forward_ready_timeout: int = 30,
83101
enable_tracing: bool = False,
84102
trace_service_name: str = "sandbox-client",
103+
claim_name: str | None = None,
104+
delete_on_exit: bool = True,
85105
):
86106
self.trace_service_name = trace_service_name
87107
self.tracing_manager = None
@@ -105,11 +125,14 @@ def __init__(
105125
self.sandbox_ready_timeout = sandbox_ready_timeout
106126
self.gateway_ready_timeout = gateway_ready_timeout
107127
self.port_forward_ready_timeout = port_forward_ready_timeout
128+
self.delete_on_exit = delete_on_exit
129+
self._custom_claim_name = claim_name
108130

109131
self.port_forward_process: subprocess.Popen | None = None
110132

111133
self.claim_name: str | None = None
112134
self.sandbox_name: str | None = None
135+
self._reconnected: bool = False
113136
self.pod_name: str | None = None
114137
self.annotations: dict | None = None
115138

@@ -135,10 +158,133 @@ def is_ready(self) -> bool:
135158
"""Returns True if the sandbox is ready and the Gateway IP has been found."""
136159
return self.base_url is not None
137160

161+
@property
def was_reconnected(self) -> bool:
    """Tell whether this client attached to a pre-existing SandboxClaim.

    The flag starts out False and is flipped to True by ``_create_claim``
    when a custom ``claim_name`` was supplied and a claim with that name
    was already present. It is therefore only meaningful once the context
    manager has been entered.
    """
    return self._reconnected
169+
170+
@classmethod
def connect(
    cls,
    claim_name: str,
    template_name: str,
    namespace: str = "default",
    **kwargs,
) -> "SandboxClient":
    """Build a client that attaches to a named, persistent sandbox.

    Convenience constructor for reconnecting to a sandbox that was created
    with ``delete_on_exit=False``. If no claim with ``claim_name`` exists,
    one is created under that name when the context manager is entered.

    Note:
        ``delete_on_exit`` defaults to False here (unlike the regular
        constructor), so the sandbox persists after the context manager
        exits. Pass ``delete_on_exit=True`` explicitly to override.

    Args:
        claim_name: The name of the existing SandboxClaim.
        template_name: The template name (required for creating if missing).
        namespace: Kubernetes namespace (default: "default").
        **kwargs: Additional arguments passed to SandboxClient.

    Returns:
        SandboxClient configured to connect to the existing sandbox.

    Example:
        # First session - create persistent sandbox
        with SandboxClient(
            template_name="python",
            claim_name="my-sandbox",
            delete_on_exit=False
        ) as client:
            client.run("echo 'hello' > /app/state.txt")

        # Later session - reconnect (delete_on_exit=False is set automatically)
        with SandboxClient.connect(
            claim_name="my-sandbox",
            template_name="python"
        ) as client:
            result = client.run("cat /app/state.txt")
    """
    # Use setdefault instead of a hard-coded keyword: passing
    # delete_on_exit both explicitly and through **kwargs would raise
    # "TypeError: got multiple values for keyword argument".
    kwargs.setdefault("delete_on_exit", False)
    return cls(
        template_name=template_name,
        namespace=namespace,
        claim_name=claim_name,
        **kwargs,
    )
220+
221+
def delete(self) -> bool:
    """Remove this client's SandboxClaim from the cluster on demand.

    Intended for clients created with ``delete_on_exit=False`` that still
    need manual cleanup at some point.

    Returns:
        True when the delete call succeeded. False when no claim name is
        set, the claim was not found (HTTP 404), or any other error was
        raised; errors are logged rather than propagated.
    """
    target = self.claim_name
    if not target:
        logging.warning("No claim name set, nothing to delete")
        return False

    logging.info(f"Explicitly deleting SandboxClaim: {target}")
    deleted = False
    try:
        self.custom_objects_api.delete_namespaced_custom_object(
            group=CLAIM_API_GROUP,
            version=CLAIM_API_VERSION,
            namespace=self.namespace,
            plural=CLAIM_PLURAL_NAME,
            name=target,
        )
    except client.ApiException as api_err:
        # A missing claim is an expected outcome, so it is only a warning.
        if api_err.status == 404:
            logging.warning(f"SandboxClaim '{target}' not found")
        else:
            logging.error(f"Error deleting sandbox claim: {api_err}", exc_info=True)
    except Exception as err:
        logging.error(f"Unexpected error deleting sandbox claim: {err}", exc_info=True)
    else:
        deleted = True
    return deleted
252+
253+
def _claim_exists(self, claim_name: str) -> bool:
    """Report whether a SandboxClaim called ``claim_name`` is present.

    Looks the claim up in ``self.namespace`` through the custom objects
    API. A 404 response means "does not exist"; any other ApiException is
    re-raised to the caller.
    """
    try:
        self.custom_objects_api.get_namespaced_custom_object(
            group=CLAIM_API_GROUP,
            version=CLAIM_API_VERSION,
            namespace=self.namespace,
            plural=CLAIM_PLURAL_NAME,
            name=claim_name,
        )
    except client.ApiException as api_err:
        if api_err.status != 404:
            raise
        return False
    return True
268+
138269
@trace_span("create_claim")
139270
def _create_claim(self, trace_context_str: str = ""):
140-
"""Creates the SandboxClaim custom resource in the Kubernetes cluster."""
141-
self.claim_name = f"sandbox-claim-{os.urandom(4).hex()}"
271+
"""Creates the SandboxClaim custom resource in the Kubernetes cluster.
272+
273+
If a custom claim_name was provided and that claim already exists,
274+
this method will reconnect to it instead of creating a new one.
275+
"""
276+
if self._custom_claim_name:
277+
self.claim_name = self._custom_claim_name
278+
# Check if the claim already exists
279+
if self._claim_exists(self.claim_name):
280+
logging.info(
281+
f"Reconnecting to existing SandboxClaim '{self.claim_name}' "
282+
f"in namespace '{self.namespace}'..."
283+
)
284+
self._reconnected = True
285+
return
286+
else:
287+
self.claim_name = f"sandbox-claim-{os.urandom(4).hex()}"
142288

143289
span = trace.get_current_span()
144290
if span.is_recording():
@@ -363,8 +509,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
363509
except Exception as e:
364510
logging.error(f"Failed to stop port-forwarding: {e}")
365511

366-
# Delete the SandboxClaim
367-
if self.claim_name:
512+
# Delete the SandboxClaim only if delete_on_exit is True
513+
if self.claim_name and self.delete_on_exit:
368514
logging.info(f"Deleting SandboxClaim: {self.claim_name}")
369515
try:
370516
self.custom_objects_api.delete_namespaced_custom_object(
@@ -381,6 +527,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
381527
except Exception as e:
382528
logging.error(
383529
f"Unexpected error deleting sandbox claim: {e}", exc_info=True)
530+
elif self.claim_name and not self.delete_on_exit:
531+
logging.info(
532+
f"Keeping SandboxClaim '{self.claim_name}' alive (delete_on_exit=False)"
533+
)
384534

385535
# Cleanup Trace if it exists
386536
if self.tracing_manager:

clients/python/langchain-agent-sandbox/langchain_agent_sandbox/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,15 @@
33
SandboxPolicyWrapper,
44
WarmPoolBackend,
55
create_sandbox_backend_factory,
6+
create_threaded_backend_factory,
67
)
8+
from .thread_manager import ThreadedSandboxManager
79

810
__all__ = [
911
"AgentSandboxBackend",
1012
"SandboxPolicyWrapper",
1113
"WarmPoolBackend",
14+
"ThreadedSandboxManager",
1215
"create_sandbox_backend_factory",
16+
"create_threaded_backend_factory",
1317
]

clients/python/langchain-agent-sandbox/langchain_agent_sandbox/backend.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,97 @@ def factory(_runtime: Any) -> AgentSandboxBackend:
669669
return factory
670670

671671

672+
def _get_thread_manager_class() -> type:
    """Return the ThreadedSandboxManager class, importing it on first use.

    The import is deferred to call time so that this module and
    ``thread_manager`` do not import each other at load time.
    """
    from .thread_manager import ThreadedSandboxManager as manager_cls

    return manager_cls
676+
677+
678+
def create_threaded_backend_factory(
    template_name: str,
    manager: Optional[Any] = None,
    namespace: str = "default",
    **kwargs: Any,
) -> Callable[[Any], AgentSandboxBackend]:
    """Create a BackendFactory that provides per-thread sandbox isolation.

    This factory reads the thread_id from the LangGraph config and uses a
    ThreadedSandboxManager to provide isolated sandboxes per conversation thread.
    Each thread gets its own sandbox with persistent filesystem.

    Usage:
        from datetime import timedelta

        from deepagents import create_deep_agent
        from langgraph.checkpoint.memory import MemorySaver
        from langchain_agent_sandbox import (
            ThreadedSandboxManager,
            create_threaded_backend_factory,
        )

        # Create manager for lifecycle control
        manager = ThreadedSandboxManager(
            template_name="python-deepagent",
            idle_ttl=timedelta(hours=1),
        )

        # Create agent with threaded backend
        agent = create_deep_agent(
            model=model,
            backend=create_threaded_backend_factory("python-deepagent", manager=manager),
            checkpointer=MemorySaver(),  # For message history
        )

        # Same thread = same sandbox (filesystem persists)
        agent.invoke(msg1, config={"configurable": {"thread_id": "user-123"}})
        agent.invoke(msg2, config={"configurable": {"thread_id": "user-123"}})

        # Different thread = different sandbox
        agent.invoke(msg3, config={"configurable": {"thread_id": "user-456"}})

        # Cleanup when done
        manager.close()

    Args:
        template_name: Name of the SandboxTemplate to claim. Only used when
            no ``manager`` is supplied.
        manager: Optional ThreadedSandboxManager instance. If not provided,
            one will be created (but you won't have lifecycle control).
        namespace: Kubernetes namespace for the sandbox. Only used when no
            ``manager`` is supplied.
        **kwargs: Additional arguments passed to ThreadedSandboxManager when
            one is created here. Ignored (with a warning) if ``manager`` is
            supplied.

    Returns:
        A factory callable that accepts a ToolRuntime and returns an
        AgentSandboxBackend for the current thread.

    Note:
        The thread_id is read from `runtime.config.get("configurable", {}).get("thread_id")`.
        If no thread_id is found (missing or None), a default "default-thread" is used.

    Warning:
        If no manager is provided, an internal manager is created but you will
        have no way to call close() or delete_thread(). For production use,
        always pass an explicit manager instance for lifecycle control.
    """
    _manager = manager
    if _manager is None:
        # Create manager if not provided
        ThreadedSandboxManager = _get_thread_manager_class()
        _manager = ThreadedSandboxManager(
            template_name=template_name,
            namespace=namespace,
            **kwargs,
        )
    elif kwargs:
        # The supplied manager is already configured; make the dropped
        # options visible instead of discarding them silently.
        logger.warning(
            "Ignoring extra arguments %s because an explicit manager was provided",
            sorted(kwargs),
        )

    def factory(runtime: Any) -> AgentSandboxBackend:
        # Extract thread_id from LangGraph config; every level may be
        # missing or explicitly None.
        config = getattr(runtime, "config", None) or {}
        configurable = config.get("configurable") or {}
        thread_id = configurable.get("thread_id")
        if thread_id is None:
            # .get(key, default) would let an explicit None through, so the
            # fallback is applied by hand.
            thread_id = "default-thread"

        logger.debug("Creating backend for thread_id: %s", thread_id)
        return _manager.get_backend(thread_id)

    return factory
761+
762+
672763
class SandboxPolicyWrapper:
673764
"""Wraps AgentSandboxBackend with policy enforcement.
674765

0 commit comments

Comments
 (0)