@@ -138,15 +138,17 @@ class UtilizationResourceReport(UserDict[str, UtilizationExtraInfo]):
138138 @classmethod
139139 def from_avg_threshold (
140140 cls ,
141- avg_utils : Mapping [str , float ],
141+ avg_utils : Mapping [str , float | None ],
142142 thresholds : ResourceThresholds ,
143143 exclusions : set [str ],
144144 ) -> UtilizationResourceReport :
145145 data : dict [str , UtilizationExtraInfo ] = {}
146146 for metric_key , val in thresholds .items ():
147147 if val .average is None or metric_key in exclusions :
148148 continue
149- avg_util = avg_utils .get (metric_key , 0 )
149+ avg_util = avg_utils .get (metric_key )
150+ if avg_util is None :
151+ continue
150152 data [metric_key ] = UtilizationExtraInfo (float (avg_util ), float (val .average ))
151153 return cls (data )
152154
@@ -1119,13 +1121,15 @@ async def check_idleness(
11191121 # Update utilization time-series data.
11201122 raw_util_series = await self ._redis_live .get_live_data (util_series_key )
11211123
1122- def default_util_series () -> dict [str , list [float ]]:
1124+ def default_util_series () -> dict [str , list [float | None ]]:
11231125 return {resource : [] for resource in current_utilizations .keys ()}
11241126
11251127 if raw_util_series is not None :
11261128 try :
1127- raw_data : dict [str , list [float ]] = msgpack .unpackb (raw_util_series , use_list = True )
1128- util_series : dict [str , list [float ]] = {
1129+ raw_data : dict [str , list [float | None ]] = msgpack .unpackb (
1130+ raw_util_series , use_list = True
1131+ )
1132+ util_series : dict [str , list [float | None ]] = {
11291133 metric_key : v for metric_key , v in raw_data .items ()
11301134 }
11311135 except TypeError :
@@ -1159,13 +1163,13 @@ def default_util_series() -> dict[str, list[float]]:
11591163 ex = max (86400 , int (self .time_window .total_seconds () * 2 )),
11601164 )
11611165
1162- def _avg (util_list : list [float ]) -> float :
1163- try :
1164- return sum ( util_list ) / len ( util_list )
1165- except ZeroDivisionError :
1166- return 0.0
1166+ def _avg (util_list : list [float | None ]) -> float | None :
1167+ filtered = [ v for v in util_list if v is not None ]
1168+ if not filtered :
1169+ return None
1170+ return sum ( filtered ) / len ( filtered )
11671171
1168- avg_utils : Mapping [str , float ] = {k : _avg (v ) for k , v in util_series .items ()}
1172+ avg_utils : Mapping [str , float | None ] = {k : _avg (v ) for k , v in util_series .items ()}
11691173
11701174 util_avg_thresholds = UtilizationResourceReport .from_avg_threshold (
11711175 avg_utils , self .resource_thresholds , excluded_resources
@@ -1208,14 +1212,20 @@ async def get_current_utilization(
12081212 self ,
12091213 kernel_ids : Sequence [KernelId ],
12101214 occupied_slots : Mapping [str , Any ],
1211- ) -> Mapping [str , float ] | None :
1215+ ) -> Mapping [str , float | None ] | None :
12121216 """
12131217 Return the current utilization key-value pairs of multiple kernels, possibly the
12141218 components of a cluster session. If there are multiple kernel_ids, this method
12151219 will return the averaged values over the kernels for each utilization.
1220+
1221+ When a metric is missing from some kernels' stats (e.g., CUDA plugin failure),
1222+ the metric is averaged only over the kernels that reported it. If no kernel
1223+ reported a metric, it is returned as None (not 0.0) so that the idle checker
1224+ can exclude it from the idle decision rather than treating it as idle.
12161225 """
12171226 try :
1218- utilizations : defaultdict [str , float ] = defaultdict (float )
1227+ utilization_sums : defaultdict [str , float ] = defaultdict (float )
1228+ utilization_counts : defaultdict [str , int ] = defaultdict (int )
12191229 live_stat = {}
12201230 kernel_counter = 0
12211231 for kernel_id in kernel_ids :
@@ -1227,28 +1237,35 @@ async def get_current_utilization(
12271237 continue
12281238 live_stat = raw_live_stat
12291239 kernel_utils = {
1230- k : float (nmget (live_stat , f"{ k } .pct" , 0.0 ))
1231- for k in self .resource_names_to_check
1240+ k : nmget (live_stat , f"{ k } .pct" ) for k in self .resource_names_to_check
12321241 }
12331242
12341243 for resource , val in kernel_utils .items ():
1235- utilizations [resource ] = utilizations [resource ] + val
1244+ if val is None :
1245+ continue
1246+ utilization_sums [resource ] += float (val )
1247+ utilization_counts [resource ] += 1
12361248
12371249 # NOTE: Manual calculation of mem utilization.
12381250 # mem.capacity does not report total amount of memory allocated to
12391251 # the container, and mem.pct always report >90% even when nothing is
12401252 # executing. So, we just replace it with the value of occupied slot.
12411253 mem_slots = float (occupied_slots .get ("mem" , 0 ))
12421254 mem_current = float (nmget (live_stat , "mem.current" , 0.0 ))
1243- utilizations ["mem" ] = (
1244- utilizations ["mem" ] + mem_current / mem_slots * 100 if mem_slots > 0 else 0
1245- )
1255+ if mem_slots > 0 :
1256+ utilization_sums ["mem" ] += mem_current / mem_slots * 100
12461257
12471258 kernel_counter += 1
12481259 if kernel_counter == 0 :
12491260 return None
1250- divider = kernel_counter
1251- return {k : v / divider for k , v in utilizations .items ()}
1261+ result : dict [str , float | None ] = {}
1262+ for resource in self .resource_names_to_check :
1263+ count = utilization_counts .get (resource , 0 )
1264+ if count > 0 :
1265+ result [resource ] = utilization_sums [resource ] / count
1266+ else :
1267+ result [resource ] = None
1268+ return result
12521269 except Exception as e :
12531270 _msg = f"Unable to collect utilization for idleness check (kernels:{ kernel_ids } )"
12541271 log .warning (_msg , exc_info = e )
0 commit comments