Skip to content

Commit bc80ba5

Browse files
authored
feat: add lifespan rt metrics (alibaba#590)
1 parent b3b7754 commit bc80ba5

File tree

2 files changed

+132
-12
lines changed

2 files changed

+132
-12
lines changed

rock/sandbox/base_actor.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class BaseActor:
3737
_namespace = "default"
3838
_metrics_endpoint = ""
3939
_user_defined_tags: dict = {}
40+
_created_time: float = None
4041

4142
def __init__(
4243
self,
@@ -48,6 +49,7 @@ def __init__(
4849
self._gauges: dict[str, _Gauge] = {}
4950
if isinstance(config, DockerDeploymentConfig) and config.auto_clear_time:
5051
self._auto_clear_time_in_minutes = config.auto_clear_time
52+
self._created_time = time.monotonic()
5153
self._stop_time = datetime.datetime.now() + datetime.timedelta(minutes=self._auto_clear_time_in_minutes)
5254
# Initialize the user and environment info - can be overridden by subclasses
5355
self._role = "test"
@@ -103,6 +105,9 @@ def _init_monitor(self):
103105
self._gauges["net"] = self.meter.create_gauge(
104106
name="xrl_gateway.system.network", description="Network Usage", unit="1"
105107
)
108+
self._gauges["rt"] = self.meter.create_gauge(
109+
name="xrl_gateway.system.lifespan_rt", description="Life Span Rt", unit="1"
110+
)
106111

107112
async def _setup_monitor(self):
108113
if not env_vars.ROCK_MONITOR_ENABLE:
@@ -152,19 +157,20 @@ async def _collect_sandbox_metrics(self, sandbox_id: str):
152157
return
153158
logger.debug(f"sandbox [{sandbox_id}] metrics = {metrics}")
154159

160+
attributes = {
161+
"sandbox_id": sandbox_id,
162+
"env": self._env,
163+
"role": self._role,
164+
"host": self.host,
165+
"ip": self._ip,
166+
"user_id": self._user_id,
167+
"experiment_id": self._experiment_id,
168+
"namespace": self._namespace,
169+
}
170+
if self._user_defined_tags is not None:
171+
attributes.update(self._user_defined_tags)
172+
155173
if metrics.get("cpu") is not None:
156-
attributes = {
157-
"sandbox_id": sandbox_id,
158-
"env": self._env,
159-
"role": self._role,
160-
"host": self.host,
161-
"ip": self._ip,
162-
}
163-
if self._user_defined_tags is not None:
164-
attributes.update(self._user_defined_tags)
165-
attributes["user_id"] = self._user_id
166-
attributes["experiment_id"] = self._experiment_id
167-
attributes["namespace"] = self._namespace
168174
self._gauges["cpu"].set(metrics["cpu"], attributes=attributes)
169175
self._gauges["mem"].set(metrics["mem"], attributes=attributes)
170176
self._gauges["disk"].set(metrics["disk"], attributes=attributes)
@@ -173,6 +179,9 @@ async def _collect_sandbox_metrics(self, sandbox_id: str):
173179
logger.debug(f"Successfully reported metrics for sandbox: {sandbox_id}")
174180
else:
175181
logger.warning(f"No metrics returned for sandbox: {sandbox_id}")
182+
183+
life_span_rt = time.monotonic() - self._created_time
184+
self._gauges["rt"].set(life_span_rt, attributes=attributes)
176185
single_sandbox_report_rt = time.perf_counter() - start
177186
logger.debug(f"Single sandbox report rt:{single_sandbox_report_rt:.4f}s")
178187

tests/unit/test_base_actor.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
1+
import datetime
2+
from unittest.mock import MagicMock
3+
14
import pytest
25
import ray
36

47
from rock.deployments.config import LocalDeploymentConfig
58
from rock.logger import init_logger
9+
from rock.sandbox.base_actor import BaseActor
610
from rock.sandbox.sandbox_actor import SandboxActor
711

812
logger = init_logger(__name__)
@@ -131,3 +135,110 @@ async def test_user_defined_tags_with_empty_dict(ray_init_shutdown):
131135
logger.info(f"Empty dict set successfully: {result}")
132136
finally:
133137
ray.kill(sandbox_actor)
138+
139+
140+
class ConcreteBaseActor(BaseActor):
141+
"""Minimal concrete subclass used only for unit testing BaseActor."""
142+
143+
async def get_sandbox_statistics(self):
144+
return {"cpu": 10.0, "mem": 20.0, "disk": 30.0, "net": 40.0}
145+
146+
147+
def _make_actor() -> ConcreteBaseActor:
148+
"""Create a ConcreteBaseActor with lightweight mocked dependencies."""
149+
config = MagicMock()
150+
config.container_name = "test-container"
151+
config.auto_clear_time = None # skip DockerDeploymentConfig branch
152+
153+
deployment = MagicMock()
154+
deployment.__class__ = object # make isinstance(deployment, DockerDeployment) return False
155+
156+
actor = ConcreteBaseActor(config, deployment)
157+
actor.host = "127.0.0.1"
158+
# Pre-populate all gauges with mocks so tests can override selectively
159+
for key in ("cpu", "mem", "disk", "net", "rt"):
160+
actor._gauges[key] = MagicMock()
161+
return actor
162+
163+
164+
@pytest.mark.asyncio
165+
async def test_life_span_rt_gauge_is_set_during_metrics_collection():
166+
"""life_span_rt gauge must be set with the elapsed lifetime in seconds (a float) after collection."""
167+
actor = _make_actor()
168+
mock_rt_gauge = MagicMock()
169+
actor._gauges["rt"] = mock_rt_gauge
170+
171+
await actor._collect_sandbox_metrics("test-container")
172+
173+
assert mock_rt_gauge.set.called, "life_span_rt gauge.set() was never called"
174+
life_span_rt_value = mock_rt_gauge.set.call_args[0][0]
175+
assert isinstance(life_span_rt_value, float), f"Expected float, got {type(life_span_rt_value)}"
176+
assert life_span_rt_value >= 0, "life_span_rt must be non-negative"
177+
178+
179+
@pytest.mark.asyncio
180+
async def test_life_span_rt_increases_over_time():
181+
"""life_span_rt reported on a second call must be >= the first call's value."""
182+
actor = _make_actor()
183+
mock_rt_gauge = MagicMock()
184+
actor._gauges["rt"] = mock_rt_gauge
185+
186+
await actor._collect_sandbox_metrics("test-container")
187+
first_rt: datetime.timedelta = mock_rt_gauge.set.call_args[0][0]
188+
189+
await actor._collect_sandbox_metrics("test-container")
190+
second_rt: datetime.timedelta = mock_rt_gauge.set.call_args[0][0]
191+
192+
assert second_rt >= first_rt, f"life_span_rt should be non-decreasing: first={first_rt}, second={second_rt}"
193+
194+
195+
@pytest.mark.asyncio
196+
async def test_life_span_rt_attributes_contain_expected_keys():
197+
"""Attributes passed to life_span_rt gauge must include all standard dimension keys."""
198+
actor = _make_actor()
199+
actor._env = "prod"
200+
actor._role = "worker"
201+
actor._user_id = "user-42"
202+
actor._experiment_id = "exp-7"
203+
actor._namespace = "ns-test"
204+
actor.host = "10.0.0.1"
205+
206+
mock_rt_gauge = MagicMock()
207+
actor._gauges["rt"] = mock_rt_gauge
208+
209+
await actor._collect_sandbox_metrics("test-container")
210+
211+
attributes = mock_rt_gauge.set.call_args[1]["attributes"]
212+
expected_keys = {"sandbox_id", "env", "role", "host", "ip", "user_id", "experiment_id", "namespace"}
213+
assert expected_keys.issubset(attributes.keys()), f"Missing attribute keys: {expected_keys - attributes.keys()}"
214+
assert attributes["env"] == "prod"
215+
assert attributes["role"] == "worker"
216+
assert attributes["user_id"] == "user-42"
217+
assert attributes["experiment_id"] == "exp-7"
218+
assert attributes["namespace"] == "ns-test"
219+
220+
221+
@pytest.mark.asyncio
222+
async def test_life_span_rt_set_even_when_no_cpu_metrics():
223+
"""life_span_rt must be reported even when get_sandbox_statistics returns no cpu data."""
224+
225+
class NoCpuActor(BaseActor):
226+
async def get_sandbox_statistics(self):
227+
return {} # cpu key absent
228+
229+
config = MagicMock()
230+
config.container_name = "no-cpu-container"
231+
config.auto_clear_time = None
232+
deployment = MagicMock()
233+
deployment.__class__ = object
234+
235+
actor = NoCpuActor(config, deployment)
236+
actor.host = "127.0.0.1"
237+
for key in ("cpu", "mem", "disk", "net", "rt"):
238+
actor._gauges[key] = MagicMock()
239+
240+
mock_rt_gauge = actor._gauges["rt"]
241+
242+
await actor._collect_sandbox_metrics("no-cpu-container")
243+
244+
assert mock_rt_gauge.set.called, "life_span_rt gauge.set() must be called even when cpu metrics are absent"

0 commit comments

Comments
 (0)