@@ -273,6 +273,22 @@ def get_statistics_dict(self) -> Dict[str, Any]:
273273 },
274274 }
275275
276+ # Retrieve KV cache information.
277+ kv_cache_config = self .kwargs .get ("kv_cache_config" , KvCacheConfig ())
278+ if isinstance (kv_cache_config , KvCacheConfig ):
279+ kv_cache_dtype = kv_cache_config .dtype
280+ kv_cache_mem_percent = kv_cache_config .free_gpu_memory_fraction
281+ elif isinstance (kv_cache_config , dict ):
282+ kv_cache_dtype = kv_cache_config .get ("dtype" , "auto" )
283+ kv_cache_mem_percent = kv_cache_config .get (
284+ "free_gpu_memory_fraction" )
285+ else :
286+ raise ValueError (
287+ f"Invalid kv_cache_config type: { type (kv_cache_config )} ." )
288+
289+ kv_cache_mem_percent = f"{ kv_cache_mem_percent * 100.0 :.2f} %" \
290+ if kv_cache_mem_percent is not None else "None"
291+
276292 # Engine/Backend details
277293 if self .rt_cfg .backend not in ('pytorch' , '_autodeploy' ):
278294 config_path = self .rt_cfg .engine_dir / "config.json"
@@ -302,15 +318,6 @@ def get_statistics_dict(self) -> Dict[str, Any]:
302318 model = self .rt_cfg .model_path or self .rt_cfg .model
303319 model_config = ModelConfig .from_pretrained (model ,
304320 trust_remote_code = True )
305- kv_cache_config = self .kwargs .get ("kv_cache_config" ,
306- KvCacheConfig ())
307- if isinstance (kv_cache_config , KvCacheConfig ):
308- kv_cache_dtype = kv_cache_config .dtype
309- elif isinstance (kv_cache_config , dict ):
310- kv_cache_dtype = kv_cache_config .get ("dtype" , "auto" )
311- else :
312- raise ValueError (
313- f"Invalid kv_cache_config type: { type (kv_cache_config )} ." )
314321
315322 validate_and_set_kv_cache_quant (model_config , kv_cache_dtype )
316323
@@ -336,8 +343,7 @@ def get_statistics_dict(self) -> Dict[str, Any]:
336343 "max_batch_size" : self .rt_cfg .settings_config .max_batch_size ,
337344 "max_num_tokens" : self .rt_cfg .settings_config .max_num_tokens ,
338345 "scheduling_policy" : self .rt_cfg .settings_config .scheduler_policy ,
339- "kv_cache_percentage" :
340- self .rt_cfg .settings_config .kv_cache_percent * 100.0 ,
346+ "kv_cache_percentage" : kv_cache_mem_percent ,
341347 "issue_rate" : self .convert_rate_to_s (self .statistics .issue_rate_ns )
342348 }
343349
@@ -526,7 +532,7 @@ def report_statistics(self) -> None:
526532 f"Max Runtime Batch Size: { world_info ['max_batch_size' ]} \n "
527533 f"Max Runtime Tokens: { world_info ['max_num_tokens' ]} \n "
528534 f"Scheduling Policy: { world_info ['scheduling_policy' ]} \n "
529- f"KV Memory Percentage: { world_info ['kv_cache_percentage' ]:.2f } % \n "
535+ f"KV Memory Percentage: { world_info ['kv_cache_percentage' ]} \n "
530536 f"Issue Rate (req/sec): { world_info ['issue_rate' ]:.4E} \n "
531537 f"\n " )
532538
0 commit comments