@@ -49,12 +49,12 @@ def __init__(self, labels: List[str], max_model_len: int):
4949 description = "Number of generation tokens processed." ,
5050 kind = pb_utils .MetricFamily .COUNTER ,
5151 )
52- self .counter_preemption_tokens_family = pb_utils .MetricFamily (
52+ self .counter_num_preemption_family = pb_utils .MetricFamily (
5353 name = "vllm:num_preemptions_total" ,
5454 description = "Number of preemption tokens processed." ,
5555 kind = pb_utils .MetricFamily .COUNTER ,
5656 )
57- self .histogram_iteration_tokens_total_family = pb_utils .MetricFamily (
57+ self .histogram_iteration_tokens_family = pb_utils .MetricFamily (
5858 name = "vllm:iteration_tokens_total" ,
5959 description = "Histogram of number of tokens per engine_step." ,
6060 kind = pb_utils .MetricFamily .HISTOGRAM ,
@@ -124,33 +124,12 @@ def __init__(self, labels: List[str], max_model_len: int):
124124 description = "Number of requests waiting to be processed." ,
125125 kind = pb_utils .MetricFamily .GAUGE ,
126126 )
127- self .gauge_scheduler_swapped_family = pb_utils .MetricFamily (
128- name = "vllm:num_requests_swapped" ,
129- description = "Number of requests swapped to CPU." ,
130- kind = pb_utils .MetricFamily .GAUGE ,
131- )
132127 # KV cache usage as a fraction (1 means 100 percent)
133128 self .gauge_gpu_cache_usage_family = pb_utils .MetricFamily (
134129 name = "vllm:gpu_cache_usage_perc" ,
135130 description = "GPU KV-cache usage. 1 means 100 percent usage." ,
136131 kind = pb_utils .MetricFamily .GAUGE ,
137132 )
138- self .gauge_cpu_cache_usage_family = pb_utils .MetricFamily (
139- name = "vllm:cpu_cache_usage_perc" ,
140- description = "CPU KV-cache usage. 1 means 100 percent usage." ,
141- kind = pb_utils .MetricFamily .GAUGE ,
142- )
143- # Prefix caching block hit rate
144- self .gauge_cpu_prefix_cache_hit_rate_family = pb_utils .MetricFamily (
145- name = "vllm:cpu_prefix_cache_hit_rate" ,
146- description = "CPU prefix cache block hit rate." ,
147- kind = pb_utils .MetricFamily .GAUGE ,
148- )
149- self .gauge_gpu_prefix_cache_hit_rate_family = pb_utils .MetricFamily (
150- name = "vllm:gpu_prefix_cache_hit_rate" ,
151- description = "GPU prefix cache block hit rate." ,
152- kind = pb_utils .MetricFamily .GAUGE ,
153- )
154133
155134 # Initialize metrics
156135 # Iteration stats
@@ -160,17 +139,15 @@ def __init__(self, labels: List[str], max_model_len: int):
160139 self .counter_generation_tokens = self .counter_generation_tokens_family .Metric (
161140 labels = labels
162141 )
163- self .counter_preemption_tokens = self .counter_preemption_tokens_family .Metric (
142+ self .counter_num_preemption = self .counter_num_preemption_family .Metric (
164143 labels = labels
165144 )
166145
167146 # Use the same bucket boundaries from vLLM sample metrics as an example.
168147 # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
169- self .histogram_iteration_tokens_total = (
170- self .histogram_iteration_tokens_total_family .Metric (
171- labels = labels ,
172- buckets = [1 , 8 , 16 , 32 , 64 , 128 , 256 , 512 , 1024 , 2048 , 4096 , 8192 , 16384 ],
173- )
148+ self .histogram_iteration_tokens = self .histogram_iteration_tokens_family .Metric (
149+ labels = labels ,
150+ buckets = [1 , 8 , 16 , 32 , 64 , 128 , 256 , 512 , 1024 , 2048 , 4096 , 8192 , 16384 ],
174151 )
175152
176153 self .histogram_time_to_first_token = (
@@ -218,32 +195,55 @@ def __init__(self, labels: List[str], max_model_len: int):
218195 )
219196 # Request stats
220197 # Latency
198+ request_latency_buckets = [
199+ 0.3 ,
200+ 0.5 ,
201+ 0.8 ,
202+ 1.0 ,
203+ 1.5 ,
204+ 2.0 ,
205+ 2.5 ,
206+ 5.0 ,
207+ 10.0 ,
208+ 15.0 ,
209+ 20.0 ,
210+ 30.0 ,
211+ 40.0 ,
212+ 50.0 ,
213+ 60.0 ,
214+ 120.0 ,
215+ 240.0 ,
216+ 480.0 ,
217+ 960.0 ,
218+ 1920.0 ,
219+ 7680.0 ,
220+ ]
221221 self .histogram_e2e_time_request = self .histogram_e2e_time_request_family .Metric (
222222 labels = labels ,
223- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
223+ buckets = request_latency_buckets ,
224224 )
225225 self .histogram_prefill_time_request = (
226226 self .histogram_prefill_time_request_family .Metric (
227227 labels = labels ,
228- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
228+ buckets = request_latency_buckets ,
229229 )
230230 )
231231 self .histogram_decode_time_request = (
232232 self .histogram_decode_time_request_family .Metric (
233233 labels = labels ,
234- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
234+ buckets = request_latency_buckets ,
235235 )
236236 )
237237 self .histogram_inference_time_request = (
238238 self .histogram_inference_time_request_family .Metric (
239239 labels = labels ,
240- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
240+ buckets = request_latency_buckets ,
241241 )
242242 )
243243 self .histogram_queue_time_request = (
244244 self .histogram_queue_time_request_family .Metric (
245245 labels = labels ,
246- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
246+ buckets = request_latency_buckets ,
247247 )
248248 )
249249 # Metadata
@@ -265,29 +265,16 @@ def __init__(self, labels: List[str], max_model_len: int):
265265 )
266266 # System stats
267267 # Scheduler State
268- self .gauge_num_requests_running = self .gauge_scheduler_running_family .Metric (
269- labels = labels
270- )
271- self .gauge_num_requests_waiting = self .gauge_scheduler_waiting_family .Metric (
268+ self .gauge_scheduler_running = self .gauge_scheduler_running_family .Metric (
272269 labels = labels
273270 )
274- self .gauge_num_requests_swapped = self .gauge_scheduler_swapped_family .Metric (
271+ self .gauge_scheduler_waiting = self .gauge_scheduler_waiting_family .Metric (
275272 labels = labels
276273 )
277274 # KV cache usage as a fraction (1 means 100 percent)
278275 self .gauge_gpu_cache_usage = self .gauge_gpu_cache_usage_family .Metric (
279276 labels = labels
280277 )
281- self .gauge_cpu_cache_usage = self .gauge_cpu_cache_usage_family .Metric (
282- labels = labels
283- )
284- # Prefix caching block hit rate
285- self .gauge_cpu_prefix_cache_hit_rate = (
286- self .gauge_cpu_prefix_cache_hit_rate_family .Metric (labels = labels )
287- )
288- self .gauge_gpu_prefix_cache_hit_rate = (
289- self .gauge_gpu_prefix_cache_hit_rate_family .Metric (labels = labels )
290- )
291278
292279
293280class VllmStatLogger (VllmStatLoggerBase ):
@@ -394,19 +381,9 @@ def log(self, stats: VllmStats) -> None:
394381 (self .metrics .histogram_n_request , stats .n_requests ),
395382 ]
396383 gauge_metrics = [
397- (self .metrics .gauge_num_requests_running , stats .num_running_sys ),
398- (self .metrics .gauge_num_requests_waiting , stats .num_waiting_sys ),
399- (self .metrics .gauge_num_requests_swapped , stats .num_swapped_sys ),
384+ (self .metrics .gauge_scheduler_running , stats .num_running_sys ),
385+ (self .metrics .gauge_scheduler_waiting , stats .num_waiting_sys ),
400386 (self .metrics .gauge_gpu_cache_usage , stats .gpu_cache_usage_sys ),
401- (self .metrics .gauge_cpu_cache_usage , stats .cpu_cache_usage_sys ),
402- (
403- self .metrics .gauge_cpu_prefix_cache_hit_rate ,
404- stats .cpu_prefix_cache_hit_rate ,
405- ),
406- (
407- self .metrics .gauge_gpu_prefix_cache_hit_rate ,
408- stats .gpu_prefix_cache_hit_rate ,
409- ),
410387 ]
411388 for metric , data in counter_metrics :
412389 self ._log_counter (metric , data )
0 commit comments