9 changes: 1 addition & 8 deletions README.md
@@ -1,5 +1,5 @@
 <!--
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -284,13 +284,6 @@ vllm:request_generation_tokens_sum{model="vllm_model",version="1"} 16
 vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="1"} 0
 ...
 vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1
-# HELP vllm:request_params_best_of Histogram of the best_of request parameter.
-# TYPE vllm:request_params_best_of histogram
-vllm:request_params_best_of_count{model="vllm_model",version="1"} 1
-vllm:request_params_best_of_sum{model="vllm_model",version="1"} 1
-vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="1"} 1
-...
-vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="+Inf"} 1
 # HELP vllm:request_params_n Histogram of the n request parameter.
 # TYPE vllm:request_params_n histogram
 vllm:request_params_n_count{model="vllm_model",version="1"} 1
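The README change above removes the `vllm:request_params_best_of` histogram from the sample metrics output (newer vLLM no longer reports it) while keeping `vllm:request_params_n`. A quick way to confirm what a running server actually exports is to scrape the metrics endpoint directly; the sketch below assumes a local Triton server exposing metrics on the default port 8002.

```python
# Minimal sketch: check which vLLM histograms a live server still exports.
# The URL/port are Triton defaults and an assumption here; adjust as needed.
import requests

metrics_text = requests.get("http://localhost:8002/metrics", timeout=5).text

assert "vllm:request_params_best_of" not in metrics_text
assert "vllm:request_params_n_count" in metrics_text
print("best_of histogram removed; n histogram still exported")
```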
22 changes: 4 additions & 18 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -189,24 +189,10 @@ def test_custom_sampling_params(self):
             model_name=self.vllm_model_name,
         )
         metrics_dict = self.parse_vllm_metrics()
-        total_prompts = len(self.prompts)
 
-        # vllm:request_params_best_of
-        """
-        self.assertEqual(
-            metrics_dict["vllm:request_params_best_of_count"], total_prompts
-        )
-        self.assertEqual(
-            metrics_dict["vllm:request_params_best_of_sum"], best_of * total_prompts
-        )
-        self.assertEqual(
-            metrics_dict["vllm:request_params_best_of_bucket"], total_prompts
-        )
-        """
-        # vllm:request_params_n
-        self.assertEqual(metrics_dict["vllm:request_params_n_count"], total_prompts)
-        # self.assertEqual(metrics_dict["vllm:request_params_n_sum"], n * total_prompts)
-        self.assertEqual(metrics_dict["vllm:request_params_n_bucket"], total_prompts)
+        self.assertIn("vllm:request_params_n_count", metrics_dict)
+        self.assertIn("vllm:request_params_n_sum", metrics_dict)
+        self.assertIn("vllm:request_params_n_bucket", metrics_dict)
 
     def test_vllm_metrics_disabled(self):
         # Test vLLM metrics
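The rewritten test asserts only that the `vllm:request_params_n` series exist, rather than checking exact counts and sums that tied the test to version-specific vLLM behavior (the `best_of` assertions were already commented out before being deleted). The diff does not show how `parse_vllm_metrics` builds `metrics_dict`, so the helper below is an illustrative stand-in based on the `prometheus_client` parser, not the repo's actual implementation.

```python
# Illustrative stand-in for a parse_vllm_metrics-style helper; the function
# name and endpoint are assumptions, since the real helper is not in this diff.
import requests
from prometheus_client.parser import text_string_to_metric_families

def parse_metrics(url: str = "http://localhost:8002/metrics") -> dict:
    metrics = {}
    for family in text_string_to_metric_families(requests.get(url, timeout=5).text):
        for sample in family.samples:
            # Key by full sample name, e.g. "vllm:request_params_n_count";
            # presence of the key is all the assertIn checks need.
            metrics[sample.name] = sample.value
    return metrics
```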
6 changes: 2 additions & 4 deletions src/model.py
@@ -359,10 +359,8 @@ def _setup_metrics(self):
                 "version": self.args["model_version"],
             }
             # Add vLLM custom metrics
-            engine_config = self._llm_engine.engine.model_config
-            self._vllm_metrics = VllmStatLogger(
-                labels, engine_config.max_model_len, self.logger
-            )
+            vllm_config = self._llm_engine.engine.vllm_config
+            self._vllm_metrics = VllmStatLogger(labels, vllm_config, self.logger)
             self._llm_engine.add_logger("triton", self._vllm_metrics)
         except pb_utils.TritonModelException as e:
             if "metrics not supported" in str(e):
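Passing the whole `VllmConfig` instead of a bare `max_model_len` gives `VllmStatLogger` access to the full engine configuration, which the updated `super().__init__` call in `src/utils/metrics.py` below also requires. A minimal sketch of how the previously passed value is still reachable through the config object:

```python
from vllm.config import VllmConfig

def max_len_from_config(vllm_config: VllmConfig) -> int:
    # The value the logger used to receive directly is still available
    # through the config object passed to the new constructor.
    return vllm_config.model_config.max_model_len
```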
11 changes: 7 additions & 4 deletions src/utils/metrics.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,6 +29,7 @@
 from typing import Dict, List, Union
 
 import triton_python_backend_utils as pb_utils
+from vllm.config import VllmConfig
 from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
 from vllm.engine.metrics import Stats as VllmStats
 from vllm.engine.metrics import SupportsMetricsInfo, build_1_2_5_buckets
@@ -163,11 +164,13 @@ def __init__(self, labels: List[str], max_model_len: int):
 class VllmStatLogger(VllmStatLoggerBase):
     """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider."""
 
-    def __init__(self, labels: Dict, max_model_len: int, log_logger) -> None:
+    def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None:
         # Tracked stats over current local logging interval.
         # local_interval not used here. It's for vLLM logs to stdout.
-        super().__init__(local_interval=0)
-        self.metrics = TritonMetrics(labels, max_model_len)
+        super().__init__(local_interval=0, vllm_config=vllm_config)
+        self.metrics = TritonMetrics(
+            labels=labels, max_model_len=vllm_config.model_config.max_model_len
+        )
         self.log_logger = log_logger
 
         # Starting the metrics thread. It allows vLLM to keep making progress
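The import list includes `build_1_2_5_buckets`, the vLLM helper that `TritonMetrics` presumably uses with `max_model_len` to derive histogram bucket boundaries. As a rough sketch of the pattern the name describes (reconstructed here, not copied from vLLM):

```python
from typing import List

def build_1_2_5_buckets_sketch(max_value: int) -> List[int]:
    """Sketch of a 1-2-5 bucket generator: 1, 2, 5, 10, 20, 50, ...
    stopping once a boundary would exceed max_value."""
    mantissas = [1, 2, 5]
    buckets: List[int] = []
    exponent = 0
    while True:
        for m in mantissas:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1

# Example: build_1_2_5_buckets_sketch(4096)
# -> [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]
```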