Skip to content

Commit daeb330

Browse files
fix(agent-sandbox): wait for sandbox api reachability after ready
1 parent f7f6921 commit daeb330

File tree

2 files changed

+150
-1
lines changed

2 files changed

+150
-1
lines changed

clients/python/agentic-sandbox-client/k8s_agent_sandbox/sandbox_client.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
file I/O) with the sandbox environment, including optional OpenTelemetry tracing.
1818
"""
1919

20-
import json
2120
import os
2221
import sys
2322
import time
@@ -106,6 +105,8 @@ def __init__(
106105
sandbox_ready_timeout: int = 180,
107106
gateway_ready_timeout: int = 180,
108107
port_forward_ready_timeout: int = 30,
108+
api_ready_timeout: float = 15.0,
109+
api_probe_interval: float = 0.5,
109110
enable_tracing: bool = False,
110111
trace_service_name: str = "sandbox-client",
111112
claim_name: str | None = None,
@@ -133,6 +134,8 @@ def __init__(
133134
self.sandbox_ready_timeout = sandbox_ready_timeout
134135
self.gateway_ready_timeout = gateway_ready_timeout
135136
self.port_forward_ready_timeout = port_forward_ready_timeout
137+
self.api_ready_timeout = api_ready_timeout
138+
self.api_probe_interval = api_probe_interval
136139
self.delete_on_exit = delete_on_exit
137140
self._custom_claim_name = claim_name
138141

@@ -491,6 +494,54 @@ def _wait_for_gateway_ip(self):
491494
f" an IP within {self.gateway_ready_timeout} seconds."
492495
)
493496

497+
@trace_span("wait_for_api_ready")
498+
def _wait_for_api_ready(self):
499+
"""Wait until the configured API URL is DNS-resolvable and HTTP-reachable."""
500+
if not self.base_url:
501+
raise RuntimeError("Cannot wait for API; base_url is not configured.")
502+
503+
parsed_url = urllib.parse.urlparse(self.base_url)
504+
host = parsed_url.hostname
505+
if not host:
506+
raise RuntimeError(f"Invalid base URL, missing host: '{self.base_url}'")
507+
508+
port = parsed_url.port
509+
if port is None:
510+
port = 443 if parsed_url.scheme == "https" else 80
511+
512+
deadline = time.monotonic() + self.api_ready_timeout
513+
last_error: Exception | None = None
514+
515+
logging.info(f"Waiting for sandbox API reachability at '{self.base_url}'...")
516+
517+
while time.monotonic() < deadline:
518+
try:
519+
socket.getaddrinfo(host, port, type=socket.SOCK_STREAM)
520+
response = requests.get(
521+
self.base_url,
522+
timeout=1.5,
523+
allow_redirects=False,
524+
)
525+
if response.status_code < 500:
526+
logging.info(
527+
f"Sandbox API is reachable at '{self.base_url}' "
528+
f"(status {response.status_code})."
529+
)
530+
return
531+
last_error = RuntimeError(
532+
f"Received HTTP {response.status_code} from sandbox API."
533+
)
534+
except (socket.gaierror, requests.exceptions.RequestException) as e:
535+
last_error = e
536+
537+
time.sleep(self.api_probe_interval)
538+
539+
self.__exit__(None, None, None)
540+
raise TimeoutError(
541+
f"Sandbox API '{self.base_url}' did not become reachable within "
542+
f"{self.api_ready_timeout} seconds. Last error: {last_error}"
543+
)
544+
494545
def __enter__(self) -> 'SandboxClient':
495546
trace_context_str = ""
496547
# We can't use the "with trace..." context management. This is the equivalent.
@@ -515,6 +566,7 @@ def __enter__(self) -> 'SandboxClient':
515566
# Case 3: No Gateway, No URL -> Developer Mode (Port Forward to Router)
516567
self._start_and_wait_for_port_forward()
517568

569+
self._wait_for_api_ready()
518570
return self
519571

520572
def __exit__(self, exc_type, exc_val, exc_tb):

clients/python/agentic-sandbox-client/test_sandbox_client_unit.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import socket
1516
from unittest.mock import MagicMock
1617

1718
import pytest
@@ -25,6 +26,7 @@ def _build_client(
2526
custom_objects_api: MagicMock,
2627
template_name: str = "python-template",
2728
claim_name: str | None = None,
29+
api_url: str | None = None,
2830
) -> SandboxClient:
2931
def _raise_incluster() -> None:
3032
raise sandbox_client_module.config.ConfigException("not running in cluster")
@@ -39,6 +41,7 @@ def _raise_incluster() -> None:
3941
return SandboxClient(
4042
template_name=template_name,
4143
claim_name=claim_name,
44+
api_url=api_url,
4245
delete_on_exit=False,
4346
)
4447

@@ -92,3 +95,97 @@ def test_delete_clears_claim_name(monkeypatch: pytest.MonkeyPatch):
9295
assert sandbox_client.delete() is True
9396
assert sandbox_client.claim_name is None
9497
custom_objects_api.delete_namespaced_custom_object.assert_called_once()
98+
99+
100+
def test_enter_waits_for_api_ready(monkeypatch: pytest.MonkeyPatch):
    """With an explicit API URL, __enter__ runs the API probe and skips
    the gateway and port-forward paths."""
    client = _build_client(
        monkeypatch,
        MagicMock(),
        api_url="http://sb-abc.sandbox-router",
    )

    expected_once = (
        "_setup_claim",
        "_wait_for_sandbox_ready",
        "_wait_for_api_ready",
    )
    expected_never = (
        "_wait_for_gateway_ip",
        "_start_and_wait_for_port_forward",
    )

    # Replace every lifecycle step with a stub so only the call pattern
    # of __enter__ is observed.
    stubs = {name: MagicMock() for name in expected_once + expected_never}
    for name, stub in stubs.items():
        monkeypatch.setattr(client, name, stub)

    entered = client.__enter__()
    assert entered is client

    for name in expected_once:
        stubs[name].assert_called_once()
    for name in expected_never:
        stubs[name].assert_not_called()
128+
129+
130+
def test_wait_for_api_ready_retries_until_resolvable_and_reachable(
    monkeypatch: pytest.MonkeyPatch,
):
    """The probe retries through DNS and connection failures until the
    API answers with a non-5xx status."""
    target_url = "http://sb-abc.sandbox-router"
    client = _build_client(monkeypatch, MagicMock(), api_url=target_url)
    client.api_ready_timeout = 1
    client.api_probe_interval = 0

    dns_calls = 0
    http_calls = 0

    # `type` must keep its name: the code under test passes it by keyword.
    def fake_getaddrinfo(host: str, port: int, type: int):
        nonlocal dns_calls
        assert host == "sb-abc.sandbox-router"
        assert port == 80
        assert type == socket.SOCK_STREAM
        dns_calls += 1
        if dns_calls >= 3:
            return [("ok",)]
        # The first two lookups fail.
        raise socket.gaierror(-3, "temporary failure in name resolution")

    def fake_get(url: str, timeout: float, allow_redirects: bool):
        nonlocal http_calls
        assert url == target_url
        assert timeout == 1.5
        assert allow_redirects is False
        http_calls += 1
        if http_calls != 1:
            return MagicMock(status_code=404)
        # Only the very first HTTP attempt is refused.
        raise sandbox_client_module.requests.exceptions.ConnectionError(
            "connection refused"
        )

    patches = (
        (sandbox_client_module.socket, "getaddrinfo", fake_getaddrinfo),
        (sandbox_client_module.requests, "get", fake_get),
        (sandbox_client_module.time, "sleep", lambda _: None),
    )
    for owner, attr, replacement in patches:
        monkeypatch.setattr(owner, attr, replacement)

    client._wait_for_api_ready()

    assert dns_calls == 4
    assert http_calls == 2
174+
175+
176+
def test_wait_for_api_ready_times_out(monkeypatch: pytest.MonkeyPatch):
    """A zero timeout raises TimeoutError and calls __exit__ exactly once
    with three ``None`` arguments."""
    client = _build_client(
        monkeypatch,
        MagicMock(),
        api_url="http://sb-timeout.sandbox-router",
    )
    client.api_ready_timeout = 0

    exit_stub = MagicMock()
    monkeypatch.setattr(client, "__exit__", exit_stub)

    with pytest.raises(TimeoutError):
        client._wait_for_api_ready()

    exit_stub.assert_called_once_with(None, None, None)

0 commit comments

Comments
 (0)