Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from prometheus_client import Histogram

# Histogram of how long the sandbox client spends discovering its API endpoint
# during context-manager setup (gateway IP assignment or kubectl port-forward).
# Labels:
#   status - "success" or "failure", set by the caller when observing.
#   mode   - discovery strategy used; callers pass e.g. "gateway" or
#            "port_forward" (preconfigured URLs are not recorded here).
# NOTE(review): values are milliseconds; Prometheus convention prefers base
# units (seconds, `_seconds` suffix), but renaming would break existing
# dashboards/callers — left as-is deliberately.
DISCOVERY_LATENCY_MS = Histogram(
    "sandbox_client_discovery_latency_ms",
    "Total time in Gateway IP assignment or kubectl port-forward setup.",
    ["status", "mode"],
    # Bucket upper bounds in milliseconds: 100 ms up to 60 s.
    buckets=[100, 500, 1000, 5000, 10000, 30000, 60000]
)
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
SANDBOX_API_GROUP, SANDBOX_API_VERSION, SANDBOX_PLURAL_NAME,
POD_NAME_ANNOTATION,
)
from .metrics import DISCOVERY_LATENCY_MS

logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
Expand Down Expand Up @@ -335,17 +336,34 @@ def __enter__(self) -> 'SandboxClient':
self._wait_for_sandbox_ready()

# STRATEGY SELECTION
if self.base_url:
# Case 1: API URL provided manually (DNS / Internal) -> Do nothing, just use it.
logging.info(f"Using configured API URL: {self.base_url}")

elif self.gateway_name:
# Case 2: Gateway Name provided -> Production Mode (Discovery)
self._wait_for_gateway_ip()
start_time = time.time()
is_preconfigured = bool(self.base_url)
mode = "unknown"

else:
# Case 3: No Gateway, No URL -> Developer Mode (Port Forward to Router)
self._start_and_wait_for_port_forward()
try:
if is_preconfigured:
# Case 1: API URL provided manually (DNS / Internal) -> Do nothing, just use it.
mode = "preconfigured"
logging.info(f"Using configured API URL: {self.base_url}")
# We do not record discovery latency for pre-configured URL
else:
if self.gateway_name:
# Case 2: Gateway Name provided -> Production Mode (Discovery)
mode = "gateway"
self._wait_for_gateway_ip()
else:
# Case 3: No Gateway, No URL -> Developer Mode (Port Forward to Router)
mode = "port_forward"
self._start_and_wait_for_port_forward()

latency_ms = (time.time() - start_time) * 1000
DISCOVERY_LATENCY_MS.labels(status="success", mode=mode).observe(latency_ms)

except Exception:
if not is_preconfigured:
latency_ms = (time.time() - start_time) * 1000
DISCOVERY_LATENCY_MS.labels(status="failure", mode=mode).observe(latency_ms)
raise

return self

Expand Down
1 change: 1 addition & 0 deletions clients/python/agentic-sandbox-client/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ dependencies = [
"kubernetes",
"requests",
"pydantic",
"prometheus-client",
]

[project.urls]
Expand Down
20 changes: 20 additions & 0 deletions clients/python/agentic-sandbox-client/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pydantic import ValidationError
from k8s_agent_sandbox import SandboxClient
from k8s_agent_sandbox.sandbox_client import ExecutionResult, FileEntry
from k8s_agent_sandbox.metrics import DISCOVERY_LATENCY_MS

POD_NAME_ANNOTATION = "agents.x-k8s.io/pod-name"

Expand Down Expand Up @@ -164,6 +165,25 @@ async def main(template_name: str, gateway_name: str | None, api_url: str | None
sandbox._request = original_request
print("--- Pydantic Validation Tests Passed ---")

print("\n--- Testing Metrics ---")

# Count how many successful discovery latency metrics were recorded in total
total_discovery_metrics = 0
# We use collect() to safely access Prometheus metric values across all label combinations
for metric in DISCOVERY_LATENCY_MS.collect():
for sample in metric.samples:
if sample.name == "sandbox_client_discovery_latency_ms_count" and sample.labels.get("status") == "success":
total_discovery_metrics += sample.value

# As long as it's not preconfigured, it should have recorded discovery latency
if not api_url:
print(f"Total discovery latency metrics recorded: {total_discovery_metrics}")
assert total_discovery_metrics > 0, "Expected at least one discovery latency metric to be recorded"
else:
print("Skipping discovery latency check because api_url is preconfigured")

print("--- Metrics Tests Passed ---")

except Exception as e:
print(f"\n--- An error occurred during the test: {e} ---")
# The __exit__ method of the Sandbox class will handle cleanup.
Expand Down
142 changes: 142 additions & 0 deletions clients/python/agentic-sandbox-client/test_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import pytest
from unittest.mock import MagicMock, patch
import time

from k8s_agent_sandbox.sandbox_client import SandboxClient
from k8s_agent_sandbox.metrics import DISCOVERY_LATENCY_MS

@pytest.fixture
def mock_k8s_config():
    """Stub both kubeconfig loaders so no real cluster configuration is read."""
    with patch('k8s_agent_sandbox.sandbox_client.config.load_incluster_config'):
        with patch('k8s_agent_sandbox.sandbox_client.config.load_kube_config'):
            yield

@pytest.fixture
def mock_custom_objects_api():
    """Yield the patched CustomObjectsApi so no real K8s API client is built."""
    patcher = patch('k8s_agent_sandbox.sandbox_client.client.CustomObjectsApi')
    api_mock = patcher.start()
    try:
        yield api_mock
    finally:
        patcher.stop()

@pytest.fixture
def mock_create_claim():
    """Replace SandboxClient._create_claim so no claim reaches the cluster."""
    patcher = patch.object(SandboxClient, '_create_claim')
    stub = patcher.start()
    try:
        yield stub
    finally:
        patcher.stop()

@pytest.fixture
def mock_wait_ready():
    """Short-circuit SandboxClient._wait_for_sandbox_ready during setup."""
    with patch.object(SandboxClient, '_wait_for_sandbox_ready') as stub:
        yield stub

def _metric_sum(status, mode):
    """Return the cumulative observed sum for one (status, mode) label pair.

    ``labels()`` auto-creates the child, so this normally cannot raise; the
    guard (narrowed from the original bare ``except:``) only protects the
    test against internal prometheus_client API changes.
    """
    try:
        return DISCOVERY_LATENCY_MS.labels(status=status, mode=mode)._sum.get()
    except Exception:
        return 0.0


@pytest.mark.parametrize(
    "test_name, setup_kwargs, expected_url, should_fail, expected_mode",
    [
        (
            "dev_mode_success",
            {"template_name": "test-template"},
            "http://127.0.0.1:12345",
            False,
            "port_forward"
        ),
        (
            "dev_mode_failure",
            {"template_name": "test-template"},
            None,
            True,
            "port_forward"
        ),
        (
            "gateway_mode_success",
            {"template_name": "test-template", "gateway_name": "test-gw"},
            "http://10.0.0.1",
            False,
            "gateway"
        ),
        (
            "base_url_mode_no_metric",
            {"template_name": "test-template", "api_url": "http://custom-url"},
            "http://custom-url",
            False,
            "preconfigured"
        )
    ]
)
def test_discovery_latency_modes(
    test_name, setup_kwargs, expected_url, should_fail,
    expected_mode,
    mock_k8s_config, mock_custom_objects_api, mock_create_claim, mock_wait_ready
):
    """Verify DISCOVERY_LATENCY_MS is recorded per discovery mode.

    Success and failure paths must increment only their own label pair, and a
    preconfigured API URL must record nothing at all.
    """
    with patch('k8s_agent_sandbox.sandbox_client.subprocess.Popen') as mock_popen, \
         patch('k8s_agent_sandbox.sandbox_client.socket.socket') as mock_socket, \
         patch('k8s_agent_sandbox.sandbox_client.socket.create_connection'), \
         patch('k8s_agent_sandbox.sandbox_client.time.sleep'), \
         patch('k8s_agent_sandbox.sandbox_client.watch.Watch') as mock_watch:

        # Arrange mode-specific mocks based on the test case.
        if "dev_mode" in test_name:
            # Port-forward path: fake the kubectl subprocess and the local
            # socket used to pick a free port.
            mock_process = MagicMock()
            if should_fail:
                mock_process.poll.return_value = 1
                mock_process.communicate.return_value = (b"", b"Crash")
            else:
                mock_process.poll.return_value = None
            mock_popen.return_value = mock_process

            mock_sock_instance = MagicMock()
            mock_sock_instance.getsockname.return_value = ('0.0.0.0', 12345)
            mock_socket.return_value.__enter__.return_value = mock_sock_instance

        elif "gateway_mode" in test_name:
            # Gateway path: fake the watch stream delivering an assigned IP.
            mock_w_instance = MagicMock()
            mock_w_instance.stream.return_value = [{
                "type": "ADDED",
                "object": {
                    "status": {
                        "addresses": [{"value": "10.0.0.1"}]
                    }
                }
            }]
            mock_watch.return_value = mock_w_instance

        # Snapshot metric sums before the run.
        before_success = _metric_sum("success", expected_mode)
        before_failure = _metric_sum("failure", expected_mode)

        client = SandboxClient(**setup_kwargs)

        if should_fail:
            with pytest.raises(RuntimeError):
                with client:
                    pass
        else:
            with client:
                assert client.base_url == expected_url

        # Capture metrics after.
        after_success = _metric_sum("success", expected_mode)
        after_failure = _metric_sum("failure", expected_mode)

        if expected_mode == "preconfigured":
            # For preconfigured URLs, we never record a metric.
            assert after_success == before_success
            assert after_failure == before_failure
        elif should_fail:
            assert after_failure > before_failure
            assert after_success == before_success
        else:
            assert after_success > before_success
            assert after_failure == before_failure