9 changes: 1 addition & 8 deletions README.md
@@ -1,5 +1,5 @@
 <!--
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -284,13 +284,6 @@ vllm:request_generation_tokens_sum{model="vllm_model",version="1"} 16
 vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="1"} 0
 ...
 vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1
-# HELP vllm:request_params_best_of Histogram of the best_of request parameter.
-# TYPE vllm:request_params_best_of histogram
-vllm:request_params_best_of_count{model="vllm_model",version="1"} 1
-vllm:request_params_best_of_sum{model="vllm_model",version="1"} 1
-vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="1"} 1
-...
-vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="+Inf"} 1
 # HELP vllm:request_params_n Histogram of the n request parameter.
 # TYPE vllm:request_params_n histogram
 vllm:request_params_n_count{model="vllm_model",version="1"} 1
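The README change above removes the `vllm:request_params_best_of` histogram from the sample metrics output (newer vLLM no longer reports it) while keeping `vllm:request_params_n`. A quick way to confirm what a running server actually exports is to scrape the metrics endpoint directly; the sketch below assumes a local Triton server exposing metrics on the default port 8002.

```python
# Minimal sketch: check which vLLM histograms a live server still exports.
# The URL/port are Triton defaults and an assumption here; adjust as needed.
import requests

metrics_text = requests.get("http://localhost:8002/metrics", timeout=5).text

assert "vllm:request_params_best_of" not in metrics_text
assert "vllm:request_params_n_count" in metrics_text
print("best_of histogram removed; n histogram still exported")
```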
22 changes: 4 additions & 18 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -189,24 +189,10 @@ def test_custom_sampling_params(self):
             model_name=self.vllm_model_name,
         )
         metrics_dict = self.parse_vllm_metrics()
-        total_prompts = len(self.prompts)
 
-        # vllm:request_params_best_of
-        """
-        self.assertEqual(
-            metrics_dict["vllm:request_params_best_of_count"], total_prompts
-        )
-        self.assertEqual(
-            metrics_dict["vllm:request_params_best_of_sum"], best_of * total_prompts
-        )
-        self.assertEqual(
-            metrics_dict["vllm:request_params_best_of_bucket"], total_prompts
-        )
-        """
-        # vllm:request_params_n
-        self.assertEqual(metrics_dict["vllm:request_params_n_count"], total_prompts)
-        # self.assertEqual(metrics_dict["vllm:request_params_n_sum"], n * total_prompts)
-        self.assertEqual(metrics_dict["vllm:request_params_n_bucket"], total_prompts)
+        self.assertIn("vllm:request_params_n_count", metrics_dict)
+        self.assertIn("vllm:request_params_n_sum", metrics_dict)
+        self.assertIn("vllm:request_params_n_bucket", metrics_dict)
 
     def test_vllm_metrics_disabled(self):
         # Test vLLM metrics
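The rewritten test asserts only that the `vllm:request_params_n` series exist, rather than checking exact counts and sums that tied the test to version-specific vLLM behavior (the `best_of` assertions were already commented out before being deleted). The diff does not show how `parse_vllm_metrics` builds `metrics_dict`, so the helper below is an illustrative stand-in based on the `prometheus_client` parser, not the repo's actual implementation.

```python
# Illustrative stand-in for a parse_vllm_metrics-style helper; the function
# name and endpoint are assumptions, since the real helper is not in this diff.
import requests
from prometheus_client.parser import text_string_to_metric_families

def parse_metrics(url: str = "http://localhost:8002/metrics") -> dict:
    metrics = {}
    for family in text_string_to_metric_families(requests.get(url, timeout=5).text):
        for sample in family.samples:
            # Key by full sample name, e.g. "vllm:request_params_n_count";
            # presence of the key is all the assertIn checks need.
            metrics[sample.name] = sample.value
    return metrics
```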
6 changes: 2 additions & 4 deletions src/model.py
@@ -359,10 +359,8 @@ def _setup_metrics(self):
                 "version": self.args["model_version"],
             }
             # Add vLLM custom metrics
-            engine_config = self._llm_engine.engine.model_config
-            self._vllm_metrics = VllmStatLogger(
-                labels, engine_config.max_model_len, self.logger
-            )
+            vllm_config = self._llm_engine.engine.vllm_config
+            self._vllm_metrics = VllmStatLogger(labels, vllm_config, self.logger)
             self._llm_engine.add_logger("triton", self._vllm_metrics)
         except pb_utils.TritonModelException as e:
             if "metrics not supported" in str(e):
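Passing the whole `VllmConfig` instead of a bare `max_model_len` gives `VllmStatLogger` access to the full engine configuration, which the updated `super().__init__` call in `src/utils/metrics.py` below also requires. A minimal sketch of how the previously passed value is still reachable through the config object:

```python
from vllm.config import VllmConfig

def max_len_from_config(vllm_config: VllmConfig) -> int:
    # The value the logger used to receive directly is still available
    # through the config object passed to the new constructor.
    return vllm_config.model_config.max_model_len
```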
11 changes: 7 additions & 4 deletions src/utils/metrics.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,6 +29,7 @@
 from typing import Dict, List, Union
 
 import triton_python_backend_utils as pb_utils
+from vllm.config import VllmConfig
 from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
 from vllm.engine.metrics import Stats as VllmStats
 from vllm.engine.metrics import SupportsMetricsInfo, build_1_2_5_buckets
@@ -163,11 +164,13 @@ def __init__(self, labels: List[str], max_model_len: int):
 class VllmStatLogger(VllmStatLoggerBase):
     """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider."""
 
-    def __init__(self, labels: Dict, max_model_len: int, log_logger) -> None:
+    def __init__(self, labels: Dict, vllm_config: VllmConfig, log_logger) -> None:
         # Tracked stats over current local logging interval.
         # local_interval not used here. It's for vLLM logs to stdout.
-        super().__init__(local_interval=0)
-        self.metrics = TritonMetrics(labels, max_model_len)
+        super().__init__(local_interval=0, vllm_config=vllm_config)
+        self.metrics = TritonMetrics(
+            labels=labels, max_model_len=vllm_config.model_config.max_model_len
+        )
         self.log_logger = log_logger
 
         # Starting the metrics thread. It allows vLLM to keep making progress
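The import list includes `build_1_2_5_buckets`, the vLLM helper that `TritonMetrics` presumably uses with `max_model_len` to derive histogram bucket boundaries. As a rough sketch of the pattern the name describes (reconstructed here, not copied from vLLM):

```python
from typing import List

def build_1_2_5_buckets_sketch(max_value: int) -> List[int]:
    """Sketch of a 1-2-5 bucket generator: 1, 2, 5, 10, 20, 50, ...
    stopping once a boundary would exceed max_value."""
    mantissas = [1, 2, 5]
    buckets: List[int] = []
    exponent = 0
    while True:
        for m in mantissas:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1

# Example: build_1_2_5_buckets_sketch(4096)
# -> [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]
```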