@@ -202,40 +202,46 @@ def __init__(self,
202
202
#
203
203
# GPU cache
204
204
#
205
- # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
206
- # TODO: in 0.10, only enable if show_hidden_metrics=True
207
- gauge_gpu_cache_usage = self ._gauge_cls (
208
- name = "vllm:gpu_cache_usage_perc" ,
209
- documentation = (
210
- "GPU KV-cache usage. 1 means 100 percent usage."
211
- "DEPRECATED: Use vllm:kv_cache_usage_perc instead." ),
212
- multiprocess_mode = "mostrecent" ,
213
- labelnames = labelnames )
214
- self .gauge_gpu_cache_usage = make_per_engine (gauge_gpu_cache_usage ,
215
- engine_indexes ,
216
- model_name )
217
-
218
- # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
219
- # TODO: in 0.10, only enable if show_hidden_metrics=True
220
- counter_gpu_prefix_cache_queries = self ._counter_cls (
221
- name = "vllm:gpu_prefix_cache_queries" ,
222
- documentation = (
223
- "GPU prefix cache queries, in terms of number of queried"
224
- "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead." ),
225
- labelnames = labelnames )
226
- self .counter_gpu_prefix_cache_queries = make_per_engine (
227
- counter_gpu_prefix_cache_queries , engine_indexes , model_name )
228
-
229
- # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
230
- # TODO: in 0.10, only enable if show_hidden_metrics=True
231
- counter_gpu_prefix_cache_hits = self ._counter_cls (
232
- name = "vllm:gpu_prefix_cache_hits" ,
233
- documentation = (
234
- "GPU prefix cache hits, in terms of number of cached "
235
- "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead." ),
236
- labelnames = labelnames )
237
- self .counter_gpu_prefix_cache_hits = make_per_engine (
238
- counter_gpu_prefix_cache_hits , engine_indexes , model_name )
205
+ # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
206
+ # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
207
+ # TODO: remove in 0.12.0
208
+ if self .show_hidden_metrics :
209
+ gauge_gpu_cache_usage = self ._gauge_cls (
210
+ name = "vllm:gpu_cache_usage_perc" ,
211
+ documentation = (
212
+ "GPU KV-cache usage. 1 means 100 percent usage."
213
+ "DEPRECATED: Use vllm:kv_cache_usage_perc instead." ),
214
+ multiprocess_mode = "mostrecent" ,
215
+ labelnames = labelnames )
216
+ self .gauge_gpu_cache_usage = make_per_engine (
217
+ gauge_gpu_cache_usage , engine_indexes , model_name )
218
+
219
+ # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
220
+ # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
221
+ # TODO: remove in 0.12.0
222
+ if self .show_hidden_metrics :
223
+ counter_gpu_prefix_cache_queries = self ._counter_cls (
224
+ name = "vllm:gpu_prefix_cache_queries" ,
225
+ documentation = (
226
+ "GPU prefix cache queries, in terms of number of queried"
227
+ "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
228
+ ),
229
+ labelnames = labelnames )
230
+ self .counter_gpu_prefix_cache_queries = make_per_engine (
231
+ counter_gpu_prefix_cache_queries , engine_indexes , model_name )
232
+
233
+ # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
234
+ # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
235
+ # TODO: remove in 0.12.0
236
+ if self .show_hidden_metrics :
237
+ counter_gpu_prefix_cache_hits = self ._counter_cls (
238
+ name = "vllm:gpu_prefix_cache_hits" ,
239
+ documentation = (
240
+ "GPU prefix cache hits, in terms of number of cached "
241
+ "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead." ),
242
+ labelnames = labelnames )
243
+ self .counter_gpu_prefix_cache_hits = make_per_engine (
244
+ counter_gpu_prefix_cache_hits , engine_indexes , model_name )
239
245
240
246
gauge_kv_cache_usage = self ._gauge_cls (
241
247
name = "vllm:kv_cache_usage_perc" ,
@@ -509,15 +515,17 @@ def record(self,
509
515
self .gauge_scheduler_waiting [engine_idx ].set (
510
516
scheduler_stats .num_waiting_reqs )
511
517
512
- self .gauge_gpu_cache_usage [engine_idx ].set (
513
- scheduler_stats .kv_cache_usage )
518
+ if self .show_hidden_metrics :
519
+ self .gauge_gpu_cache_usage [engine_idx ].set (
520
+ scheduler_stats .kv_cache_usage )
514
521
self .gauge_kv_cache_usage [engine_idx ].set (
515
522
scheduler_stats .kv_cache_usage )
516
523
517
- self .counter_gpu_prefix_cache_queries [engine_idx ].inc (
518
- scheduler_stats .prefix_cache_stats .queries )
519
- self .counter_gpu_prefix_cache_hits [engine_idx ].inc (
520
- scheduler_stats .prefix_cache_stats .hits )
524
+ if self .show_hidden_metrics :
525
+ self .counter_gpu_prefix_cache_queries [engine_idx ].inc (
526
+ scheduler_stats .prefix_cache_stats .queries )
527
+ self .counter_gpu_prefix_cache_hits [engine_idx ].inc (
528
+ scheduler_stats .prefix_cache_stats .hits )
521
529
522
530
self .counter_prefix_cache_queries [engine_idx ].inc (
523
531
scheduler_stats .prefix_cache_stats .queries )
0 commit comments