Skip to content

Commit 3e95c79

Browse files
fregata and claude authored
fix(BA-5297): exclude unmeasurable metrics from utilization idle check (#10316)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 345b197 commit 3e95c79

File tree

3 files changed

+129
-23
lines changed

3 files changed

+129
-23
lines changed

changes/10316.fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Exclude unmeasurable metrics from utilization idle check instead of treating stat collection failures as 0% usage

src/ai/backend/manager/idle.py

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -138,15 +138,17 @@ class UtilizationResourceReport(UserDict[str, UtilizationExtraInfo]):
138138
@classmethod
139139
def from_avg_threshold(
140140
cls,
141-
avg_utils: Mapping[str, float],
141+
avg_utils: Mapping[str, float | None],
142142
thresholds: ResourceThresholds,
143143
exclusions: set[str],
144144
) -> UtilizationResourceReport:
145145
data: dict[str, UtilizationExtraInfo] = {}
146146
for metric_key, val in thresholds.items():
147147
if val.average is None or metric_key in exclusions:
148148
continue
149-
avg_util = avg_utils.get(metric_key, 0)
149+
avg_util = avg_utils.get(metric_key)
150+
if avg_util is None:
151+
continue
150152
data[metric_key] = UtilizationExtraInfo(float(avg_util), float(val.average))
151153
return cls(data)
152154

@@ -1119,13 +1121,15 @@ async def check_idleness(
11191121
# Update utilization time-series data.
11201122
raw_util_series = await self._redis_live.get_live_data(util_series_key)
11211123

1122-
def default_util_series() -> dict[str, list[float]]:
1124+
def default_util_series() -> dict[str, list[float | None]]:
11231125
return {resource: [] for resource in current_utilizations.keys()}
11241126

11251127
if raw_util_series is not None:
11261128
try:
1127-
raw_data: dict[str, list[float]] = msgpack.unpackb(raw_util_series, use_list=True)
1128-
util_series: dict[str, list[float]] = {
1129+
raw_data: dict[str, list[float | None]] = msgpack.unpackb(
1130+
raw_util_series, use_list=True
1131+
)
1132+
util_series: dict[str, list[float | None]] = {
11291133
metric_key: v for metric_key, v in raw_data.items()
11301134
}
11311135
except TypeError:
@@ -1159,13 +1163,13 @@ def default_util_series() -> dict[str, list[float]]:
11591163
ex=max(86400, int(self.time_window.total_seconds() * 2)),
11601164
)
11611165

1162-
def _avg(util_list: list[float]) -> float:
1163-
try:
1164-
return sum(util_list) / len(util_list)
1165-
except ZeroDivisionError:
1166-
return 0.0
1166+
def _avg(util_list: list[float | None]) -> float | None:
1167+
filtered = [v for v in util_list if v is not None]
1168+
if not filtered:
1169+
return None
1170+
return sum(filtered) / len(filtered)
11671171

1168-
avg_utils: Mapping[str, float] = {k: _avg(v) for k, v in util_series.items()}
1172+
avg_utils: Mapping[str, float | None] = {k: _avg(v) for k, v in util_series.items()}
11691173

11701174
util_avg_thresholds = UtilizationResourceReport.from_avg_threshold(
11711175
avg_utils, self.resource_thresholds, excluded_resources
@@ -1208,14 +1212,20 @@ async def get_current_utilization(
12081212
self,
12091213
kernel_ids: Sequence[KernelId],
12101214
occupied_slots: Mapping[str, Any],
1211-
) -> Mapping[str, float] | None:
1215+
) -> Mapping[str, float | None] | None:
12121216
"""
12131217
Return the current utilization key-value pairs of multiple kernels, possibly the
12141218
components of a cluster session. If there are multiple kernel_ids, this method
12151219
will return the averaged values over the kernels for each utilization.
1220+
1221+
When a metric is missing from some kernels' stats (e.g., CUDA plugin failure),
1222+
the metric is averaged only over the kernels that reported it. If no kernel
1223+
reported a metric, it is returned as None (not 0.0) so that the idle checker
1224+
can exclude it from the idle decision rather than treating it as idle.
12161225
"""
12171226
try:
1218-
utilizations: defaultdict[str, float] = defaultdict(float)
1227+
utilization_sums: defaultdict[str, float] = defaultdict(float)
1228+
utilization_counts: defaultdict[str, int] = defaultdict(int)
12191229
live_stat = {}
12201230
kernel_counter = 0
12211231
for kernel_id in kernel_ids:
@@ -1227,28 +1237,35 @@ async def get_current_utilization(
12271237
continue
12281238
live_stat = raw_live_stat
12291239
kernel_utils = {
1230-
k: float(nmget(live_stat, f"{k}.pct", 0.0))
1231-
for k in self.resource_names_to_check
1240+
k: nmget(live_stat, f"{k}.pct") for k in self.resource_names_to_check
12321241
}
12331242

12341243
for resource, val in kernel_utils.items():
1235-
utilizations[resource] = utilizations[resource] + val
1244+
if val is None:
1245+
continue
1246+
utilization_sums[resource] += float(val)
1247+
utilization_counts[resource] += 1
12361248

12371249
# NOTE: Manual calculation of mem utilization.
12381250
# mem.capacity does not report total amount of memory allocated to
12391251
# the container, and mem.pct always report >90% even when nothing is
12401252
# executing. So, we just replace it with the value of occupied slot.
12411253
mem_slots = float(occupied_slots.get("mem", 0))
12421254
mem_current = float(nmget(live_stat, "mem.current", 0.0))
1243-
utilizations["mem"] = (
1244-
utilizations["mem"] + mem_current / mem_slots * 100 if mem_slots > 0 else 0
1245-
)
1255+
if mem_slots > 0:
1256+
utilization_sums["mem"] += mem_current / mem_slots * 100
12461257

12471258
kernel_counter += 1
12481259
if kernel_counter == 0:
12491260
return None
1250-
divider = kernel_counter
1251-
return {k: v / divider for k, v in utilizations.items()}
1261+
result: dict[str, float | None] = {}
1262+
for resource in self.resource_names_to_check:
1263+
count = utilization_counts.get(resource, 0)
1264+
if count > 0:
1265+
result[resource] = utilization_sums[resource] / count
1266+
else:
1267+
result[resource] = None
1268+
return result
12521269
except Exception as e:
12531270
_msg = f"Unable to collect utilization for idleness check (kernels:{kernel_ids})"
12541271
log.warning(_msg, exc_info=e)

tests/unit/manager/test_idle_checker.py

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -804,8 +804,8 @@ async def test_utilization_current(
804804
expected_utilization = {
805805
"cpu_util": current_test_config.expected_cpu_util,
806806
"mem": current_test_config.expected_mem_util,
807-
"cuda_mem": 0.0,
808-
"cuda_util": 0.0,
807+
"cuda_mem": None,
808+
"cuda_util": None,
809809
}
810810

811811
# When
@@ -1235,3 +1235,91 @@ def mock_get_live_data_side_effect(key: str) -> bytes | None:
12351235
assert should_alive is insufficient_test_config.expected_alive
12361236
assert remaining == insufficient_test_config.expected_remaining
12371237
assert util_info is not None
1238+
1239+
# Test 5: Missing metrics should be excluded from idle decision
1240+
@pytest.fixture
1241+
async def missing_cpu_stat_checker(
1242+
self,
1243+
base_time: datetime,
1244+
valkey_live: AsyncMock,
1245+
valkey_stat: AsyncMock,
1246+
event_producer: AsyncMock,
1247+
mocker: Any,
1248+
) -> UtilizationIdleChecker:
1249+
"""UtilizationIdleChecker where cpu_util stat is missing from live_stat.
1250+
1251+
Simulates a scenario where stat collection fails for cpu_util
1252+
but memory is collected normally with sufficient utilization.
1253+
Uses OR operator so that if cpu_util were treated as 0.0
1254+
(old behavior), the session would be falsely terminated.
1255+
"""
1256+
elapsed_seconds = 50
1257+
time_window_seconds = 15
1258+
now = base_time + timedelta(seconds=elapsed_seconds)
1259+
valkey_live.get_server_time.return_value = now.timestamp()
1260+
mocker.patch("ai.backend.manager.idle.get_db_now", return_value=now)
1261+
1262+
# live_stat has Memory but NO cpu_util key
1263+
live_stat = {
1264+
"mem": {"current": "5.0", "pct": "10.0"},
1265+
}
1266+
valkey_stat.get_kernel_statistics.return_value = live_stat
1267+
1268+
util_first_collected = now.timestamp() - time_window_seconds
1269+
1270+
def get_live_data_side_effect(key: str) -> bytes | None:
1271+
if ".util_first_collected" in key:
1272+
return f"{util_first_collected:.06f}".encode()
1273+
if ".util_series" in key:
1274+
return msgpack.packb({"cpu_util": [], "mem": []})
1275+
if ".utilization_extra" in key:
1276+
return msgpack.packb({"resources": {}})
1277+
if ".utilization" in key:
1278+
return msgpack.packb(-1)
1279+
return None
1280+
1281+
valkey_live.get_live_data.side_effect = get_live_data_side_effect
1282+
1283+
checker = UtilizationIdleChecker(
1284+
IdleCheckerArgs(
1285+
event_producer=event_producer,
1286+
redis_live=valkey_live,
1287+
valkey_stat_client=valkey_stat,
1288+
)
1289+
)
1290+
await checker.populate_config({
1291+
"initial-grace-period": "0",
1292+
"resource-thresholds": {
1293+
"cpu_util": {"average": "10"},
1294+
"mem": {"average": "10"},
1295+
},
1296+
"thresholds-check-operator": "or",
1297+
"time-window": str(time_window_seconds),
1298+
})
1299+
return checker
1300+
1301+
async def test_missing_metrics_excluded_from_idle_decision(
1302+
self,
1303+
missing_cpu_stat_checker: UtilizationIdleChecker,
1304+
utilization_kernel_row: Any,
1305+
session_id: SessionId,
1306+
db_connection: AsyncMock,
1307+
) -> None:
1308+
"""Test that missing metrics (stat collection failure) are excluded from idle check.
1309+
1310+
With OR operator, ALL configured metrics must exceed their thresholds for
1311+
the session to stay alive. If missing cpu_util were treated as 0.0
1312+
(old behavior), the session would be falsely terminated because
1313+
0.0 < threshold (10%). With the fix, missing metrics are excluded from
1314+
the decision, so only memory (above 10%) is checked.
1315+
"""
1316+
# When
1317+
should_alive = await missing_cpu_stat_checker.check_idleness(
1318+
utilization_kernel_row,
1319+
db_connection,
1320+
mock_row(idle_timeout=15),
1321+
)
1322+
1323+
# Then - session should stay alive because cpu_util is excluded,
1324+
# not treated as 0.0
1325+
assert should_alive is True

0 commit comments

Comments (0)