@@ -80,53 +80,13 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             multiprocess_mode="livemostrecent",
         )

-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_scheduler_swapped = self._gauge_cls(
-                name="vllm:num_requests_swapped",
-                documentation=(
-                    "Number of requests swapped to CPU. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames,
             multiprocess_mode="sum")

-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_cpu_cache_usage = self._gauge_cls(
-                name="vllm:cpu_cache_usage_perc",
-                documentation=(
-                    "CPU KV-cache usage. 1 means 100 percent usage. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:cpu_prefix_cache_hit_rate",
-                documentation=(
-                    "CPU prefix cache block hit rate. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:gpu_prefix_cache_hit_rate",
-                documentation=("GPU prefix cache block hit rate. "
-                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
-                               "and vllm:gpu_prefix_cache_hits in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # Iteration stats
         self.counter_num_preemption = self._counter_cls(
             name="vllm:num_preemptions_total",
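
The docstring removed above points at the V1 replacement: instead of exporting a precomputed vllm:gpu_prefix_cache_hit_rate gauge, V1 exports raw query and hit counters and leaves the ratio to the query side. A minimal sketch of that pattern with prometheus_client, using the counter names from the diff; the documentation strings, label set, and helper function here are illustrative assumptions, not vLLM's exact definitions:

```python
from prometheus_client import Counter

# Raw counters replace the precomputed hit-rate gauge (names from the diff;
# docstrings and labels are assumptions for illustration).
counter_gpu_prefix_cache_queries = Counter(
    "vllm:gpu_prefix_cache_queries",
    "Number of GPU prefix cache blocks queried.",
    ["model_name"])
counter_gpu_prefix_cache_hits = Counter(
    "vllm:gpu_prefix_cache_hits",
    "Number of GPU prefix cache blocks that hit.",
    ["model_name"])

def record_prefix_cache_stats(model: str, queries: int, hits: int) -> None:
    # Export raw totals each scheduler step; the hit rate is derived at
    # query time (e.g. rate(hits)/rate(queries) in PromQL, where the
    # exposed series names carry prometheus_client's _total suffix).
    counter_gpu_prefix_cache_queries.labels(model_name=model).inc(queries)
    counter_gpu_prefix_cache_hits.labels(model_name=model).inc(hits)
```
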
@@ -200,36 +160,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
200160 "Histogram of time spent in DECODE phase for request." ,
201161 labelnames = labelnames ,
202162 buckets = request_latency_buckets )
-        # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_time_in_queue_request = self._histogram_cls(
-                name="vllm:time_in_queue_requests",
-                documentation=(
-                    "Histogram of time the request spent in the queue in seconds. "
-                    "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-                labelnames=labelnames,
-                buckets=request_latency_buckets)
-
-        # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_model_forward_time_request = self._histogram_cls(
-                name="vllm:model_forward_time_milliseconds",
-                documentation=(
-                    "Histogram of time spent in the model forward pass in ms. "
-                    "DEPRECATED: use prefill/decode/inference time metrics instead"),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
-            self.histogram_model_execute_time_request = self._histogram_cls(
-                name="vllm:model_execute_time_milliseconds",
-                documentation=(
-                    "Histogram of time spent in the model execute function in ms. "
-                    "DEPRECATED: use prefill/decode/inference time metrics instead"),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))

         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
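
The removed forward/execute-time histograms sized their buckets with build_1_2_3_5_8_buckets(3000). The diff does not show that helper; below is a sketch consistent with the 1-2-3-5-8 mantissa scheme its name implies, not necessarily vLLM's exact implementation:

```python
from typing import List

def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
    """Bucket bounds 1, 2, 3, 5, 8 scaled by successive powers of ten,
    stopping once a bound would exceed max_value."""
    mantissas = [1, 2, 3, 5, 8]
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissas:
            value = m * 10**exponent
            if value > max_value:
                return buckets
            buckets.append(value)
        exponent += 1

# build_1_2_3_5_8_buckets(3000) ->
# [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100, 200, 300, 500, 800,
#  1000, 2000, 3000]
```
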
@@ -580,20 +510,10 @@ def _log_prometheus(self, stats: Stats) -> None:
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                            stats.cpu_cache_usage_sys)
-            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                            stats.cpu_prefix_cache_hit_rate)
-            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric; in future this property of the lora
         # config may be extended to be dynamic.
         lora_info = {
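
The _log_gauge calls in the hunk above pair a metric with the latest value from Stats. A minimal sketch of what such helpers can look like, assuming the logger carries a labels dict of label name to value; the real methods on vLLM's stat logger may differ:

```python
from typing import Dict, Iterable

from prometheus_client import Gauge, Histogram

class LoggerSketch:
    """Illustrative stand-in for the stat logger; `labels` maps label
    name -> value (e.g. {"model_name": ...})."""

    def __init__(self, labels: Dict[str, str]) -> None:
        self.labels = labels

    def _log_gauge(self, gauge: Gauge, data: float) -> None:
        # Set the latest system-state reading for this label set.
        gauge.labels(**self.labels).set(data)

    def _log_histogram(self, histogram: Histogram,
                       data: Iterable[float]) -> None:
        # Observe one datapoint per finished request from the interval.
        for datum in data:
            histogram.labels(**self.labels).observe(datum)
```
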
@@ -631,15 +551,6 @@ def _log_prometheus(self, stats: Stats) -> None:
                             stats.time_prefill_requests)
         self._log_histogram(self.metrics.histogram_decode_time_request,
                             stats.time_decode_requests)
-        if self.metrics.show_hidden_metrics:
-            self._log_histogram(self.metrics.histogram_time_in_queue_request,
-                                stats.time_in_queue_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_forward_time_request,
-                stats.model_forward_time_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_execute_time_request,
-                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
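
The CollectionsCounter above tallies per-request finish reasons so they can be flushed as labelled counter increments. A hypothetical sketch of that flush step; the metric name and label set here are assumptions for illustration:

```python
from collections import Counter as CollectionsCounter
from typing import List

from prometheus_client import Counter

# Hypothetical labelled counter for finished requests (name and labels are
# assumptions, not taken from the diff).
counter_request_success = Counter(
    "vllm:request_success",
    "Count of successfully processed requests.",
    ["model_name", "finished_reason"])

def log_finished_reasons(model: str, finished_reasons: List[str]) -> None:
    # Tally reasons (e.g. "stop", "length") from the interval, then bump
    # the counter once per distinct reason.
    for reason, count in CollectionsCounter(finished_reasons).items():
        counter_request_success.labels(
            model_name=model, finished_reason=reason).inc(count)
```
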