
Commit e867687

Add first supported metrics

1 parent 05c5a8b commit e867687

File tree

src/metrics.py
src/model.py

2 files changed: +292 -0 lines changed

src/metrics.py

Lines changed: 286 additions & 0 deletions
@@ -0,0 +1,286 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from typing import Dict, Union

import triton_python_backend_utils as pb_utils
from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
from vllm.engine.metrics import Stats as VllmStats
from vllm.engine.metrics import SupportsMetricsInfo

# begin-metrics-definitions
class TritonMetrics:

    def __init__(self, labels):
        # System stats
        # Scheduler State
        self.gauge_scheduler_running_family = pb_utils.MetricFamily(
            name="vllm:num_requests_running",
            description="Number of requests currently running on GPU.",
            kind=pb_utils.MetricFamily.GAUGE)
        self.gauge_scheduler_waiting_family = pb_utils.MetricFamily(
            name="vllm:num_requests_waiting",
            description="Number of requests waiting to be processed.",
            kind=pb_utils.MetricFamily.GAUGE)
        self.gauge_scheduler_swapped_family = pb_utils.MetricFamily(
            name="vllm:num_requests_swapped",
            description="Number of requests swapped to CPU.",
            kind=pb_utils.MetricFamily.GAUGE)
        # KV Cache Usage in %
        self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
            name="vllm:gpu_cache_usage_perc",
            description="GPU KV-cache usage. 1 means 100 percent usage.",
            kind=pb_utils.MetricFamily.GAUGE)
        self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily(
            name="vllm:cpu_cache_usage_perc",
            description="CPU KV-cache usage. 1 means 100 percent usage.",
            kind=pb_utils.MetricFamily.GAUGE)

        # Iteration stats
        self.counter_num_preemption_family = pb_utils.MetricFamily(
            name="vllm:num_preemptions_total",
            description="Cumulative number of preemptions from the engine.",
            kind=pb_utils.MetricFamily.COUNTER)
        self.counter_prompt_tokens_family = pb_utils.MetricFamily(
            name="vllm:prompt_tokens_total",
            description="Number of prefill tokens processed.",
            kind=pb_utils.MetricFamily.COUNTER)
        self.counter_generation_tokens_family = pb_utils.MetricFamily(
            name="vllm:generation_tokens_total",
            description="Number of generation tokens processed.",
            kind=pb_utils.MetricFamily.COUNTER)
        # self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
        #     name="vllm:time_to_first_token_seconds",
        #     description="Histogram of time to first token in seconds.",
        #     kind=pb_utils.MetricFamily.HISTOGRAM,
        #     buckets=[
        #         0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
        #         0.75, 1.0, 2.5, 5.0, 7.5, 10.0
        #     ])
        # self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
        #     name="vllm:time_per_output_token_seconds",
        #     description="Histogram of time per output token in seconds.",
        #     kind=pb_utils.MetricFamily.HISTOGRAM,
        #     buckets=[
        #         0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
        #         1.0, 2.5
        #     ])

        # Request stats
        # Latency
        # self.histogram_e2e_time_request_family = pb_utils.MetricFamily(
        #     name="vllm:e2e_request_latency_seconds",
        #     description="Histogram of end to end request latency in seconds.",
        #     kind=pb_utils.MetricFamily.HISTOGRAM,
        #     buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
        # Metadata
        # self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily(
        #     name="vllm:request_prompt_tokens",
        #     description="Number of prefill tokens processed.",
        #     kind=pb_utils.MetricFamily.HISTOGRAM,
        #     buckets=build_1_2_5_buckets(max_model_len),
        # )
        # self.histogram_num_generation_tokens_request_family = \
        #     pb_utils.MetricFamily(
        #         name="vllm:request_generation_tokens",
        #         description="Number of generation tokens processed.",
        #         kind=pb_utils.MetricFamily.HISTOGRAM,
        #         buckets=build_1_2_5_buckets(max_model_len),
        #     )
        # self.histogram_best_of_request_family = pb_utils.MetricFamily(
        #     name="vllm:request_params_best_of",
        #     description="Histogram of the best_of request parameter.",
        #     kind=pb_utils.MetricFamily.HISTOGRAM,
        #     buckets=[1, 2, 5, 10, 20],
        # )
        # self.histogram_n_request_family = pb_utils.MetricFamily(
        #     name="vllm:request_params_n",
        #     description="Histogram of the n request parameter.",
        #     kind=pb_utils.MetricFamily.HISTOGRAM,
        #     buckets=[1, 2, 5, 10, 20],
        # )
        # self.counter_request_success_family = pb_utils.MetricFamily(
        #     name="vllm:request_success_total",
        #     description="Count of successfully processed requests.",
        #     kind=pb_utils.MetricFamily.COUNTER)

        # Speculative decoding stats
        # self.gauge_spec_decode_draft_acceptance_rate_family = pb_utils.MetricFamily(
        #     name="vllm:spec_decode_draft_acceptance_rate",
        #     description="Speculative token acceptance rate.",
        #     kind=pb_utils.MetricFamily.GAUGE)
        # self.gauge_spec_decode_efficiency_family = pb_utils.MetricFamily(
        #     name="vllm:spec_decode_efficiency",
        #     description="Speculative decoding system efficiency.",
        #     kind=pb_utils.MetricFamily.GAUGE)
        # self.counter_spec_decode_num_accepted_tokens_family = pb_utils.MetricFamily(
        #     name="vllm:spec_decode_num_accepted_tokens_total",
        #     description="Number of accepted tokens.",
        #     kind=pb_utils.MetricFamily.COUNTER)
        # self.counter_spec_decode_num_draft_tokens_family = pb_utils.MetricFamily(
        #     name="vllm:spec_decode_num_draft_tokens_total",
        #     description="Number of draft tokens.",
        #     kind=pb_utils.MetricFamily.COUNTER)
        # self.counter_spec_decode_num_emitted_tokens_family = pb_utils.MetricFamily(
        #     name="vllm:spec_decode_num_emitted_tokens_total",
        #     description="Number of emitted tokens.",
        #     kind=pb_utils.MetricFamily.COUNTER)

        # System stats
        # Scheduler State
        self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric(
            labels=labels
        )
        self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric(
            labels=labels
        )
        self.gauge_scheduler_swapped = self.gauge_scheduler_swapped_family.Metric(
            labels=labels
        )
        # KV Cache Usage in %
        self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
            labels=labels
        )
        self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric(
            labels=labels
        )

        # Iteration stats
        self.counter_num_preemption = self.counter_num_preemption_family.Metric(
            labels=labels
        )
        self.counter_prompt_tokens = self.counter_prompt_tokens_family.Metric(
            labels=labels
        )
        self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
            labels=labels
        )
        # self.histogram_time_to_first_token = self.histogram_time_to_first_token_family.Metric(
        #     labels=labels
        # )
        # self.histogram_time_per_output_token = self.histogram_time_per_output_token_family.Metric(
        #     labels=labels
        # )

        # Request stats
        # Latency
        # self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
        #     labels=labels
        # )
        # Metadata
        # self.histogram_num_prompt_tokens_request = self.histogram_num_prompt_tokens_request_family.Metric(
        #     labels=labels
        # )
        # self.histogram_num_generation_tokens_request = self.histogram_num_generation_tokens_request_family.Metric(
        #     labels=labels
        # )
        # self.histogram_best_of_request = self.histogram_best_of_request_family.Metric(
        #     labels=labels
        # )
        # self.histogram_n_request = self.histogram_n_request_family.Metric(
        #     labels=labels
        # )
        # self.counter_request_success = self.counter_request_success_family.Metric(
        #     labels=labels
        # )

        # Speculative decoding stats
        # self.gauge_spec_decode_draft_acceptance_rate = self.gauge_spec_decode_draft_acceptance_rate_family.Metric(
        #     labels=labels
        # )
        # self.gauge_spec_decode_efficiency = self.gauge_spec_decode_efficiency_family.Metric(
        #     labels=labels
        # )
        # self.counter_spec_decode_num_accepted_tokens = self.counter_spec_decode_num_accepted_tokens_family.Metric(
        #     labels=labels
        # )
        # self.counter_spec_decode_num_draft_tokens = self.counter_spec_decode_num_draft_tokens_family.Metric(
        #     labels=labels
        # )
        # self.counter_spec_decode_num_emitted_tokens = self.counter_spec_decode_num_emitted_tokens_family.Metric(
        #     labels=labels
        # )

class VllmStatLogger(VllmStatLoggerBase):
    """StatLogger is used as an adapter between the vLLM stats collector and the Triton metrics provider."""

    # local_interval is not used here; it is for vLLM's logging to stdout.
    def __init__(self, labels: Dict, local_interval: float = 0) -> None:
        # Tracked stats over current local logging interval.
        super().__init__(local_interval)
        self.metrics = TritonMetrics(labels=labels)

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        raise NotImplementedError

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Convenience function for logging to a gauge.
        gauge.set(data)

    def _log_counter(self, counter, data: Union[int, float]) -> None:
        # Convenience function for logging to a counter.
        counter.increment(data)

    # def _log_histogram(self, histogram, data: Union[List[int],
    #                                                 List[float]]) -> None:
    #     # Convenience function for logging a list to a histogram.
    #     for datum in data:
    #         histogram.labels(**self.labels).observe(datum)

    def log(self, stats: VllmStats) -> None:
        # self.maybe_update_spec_decode_metrics(stats)

        # System state data
        self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys)
        self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys)
        self._log_gauge(self.metrics.gauge_scheduler_swapped, stats.num_swapped_sys)
        self._log_gauge(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys)
        self._log_gauge(self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys)

        # Iteration level data
        self._log_counter(self.metrics.counter_num_preemption, stats.num_preemption_iter)
        self._log_counter(self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter)
        self._log_counter(self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter)
        # self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter)
        # self._log_histogram(self.metrics.histogram_time_per_output_token, stats.time_per_output_tokens_iter)

        # Request level data
        # Latency
        # self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests)
        # Metadata
        # self._log_histogram(self.metrics.histogram_num_prompt_tokens_request, stats.num_prompt_tokens_requests)
        # self._log_histogram(self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests)
        # self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests)
        # self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
        # self._log_histogram(self.metrics.counter_request_success, stats.finished_reason_requests)

        # Speculative decoding stats
        # if self.spec_decode_metrics is not None:
        #     self._log_gauge(self.metrics.gauge_spec_decode_draft_acceptance_rate, self.spec_decode_metrics.draft_acceptance_rate)
        #     self._log_gauge(self.metrics.gauge_spec_decode_efficiency, self.spec_decode_metrics.system_efficiency)
        #     self._log_counter(self.metrics.counter_spec_decode_num_accepted_tokens, self.spec_decode_metrics.accepted_tokens)
        #     self._log_counter(self.metrics.counter_spec_decode_num_draft_tokens, self.spec_decode_metrics.draft_tokens)
        #     self._log_counter(self.metrics.counter_spec_decode_num_emitted_tokens, self.spec_decode_metrics.emitted_tokens)
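Note: once the model is loaded, the gauges and counters above surface through Triton's Prometheus metrics endpoint alongside the server's built-in metrics. A minimal scraping sketch, assuming Triton's default metrics port 8002 and the labels set in src/model.py below; the values shown in the comment are illustrative, not real output:

# Minimal sketch: scrape Triton's metrics endpoint and print the vLLM
# metric families registered above. Assumes the default metrics port 8002.
import requests

resp = requests.get("http://localhost:8002/metrics", timeout=5)
for line in resp.text.splitlines():
    if line.startswith("vllm:"):
        print(line)

# Expected shape of the output (values illustrative):
# vllm:num_requests_running{model="vllm_metrics",version="1"} 1
# vllm:prompt_tokens_total{model="vllm_metrics",version="1"} 128
# vllm:generation_tokens_total{model="vllm_metrics",version="1"} 512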

src/model.py

Lines changed: 6 additions & 0 deletions

@@ -32,6 +32,7 @@
 import numpy as np
 import torch
+from metrics import VllmStatLogger
 import triton_python_backend_utils as pb_utils
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -151,6 +152,11 @@ def init_engine(self):
             AsyncEngineArgs(**self.vllm_engine_config)
         )

+        # Create vLLM custom metrics
+        labels = {"model": "vllm_metrics", "version": "1"}
+        logger = VllmStatLogger(labels=labels)
+        self.llm_engine.add_logger("triton", logger)
+
     def setup_lora(self):
         self.enable_lora = False
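Note: add_logger accepts any StatLoggerBase implementation, and vLLM invokes the registered logger's log(stats) as the engine steps. As a hypothetical illustration of the same hook (not part of this commit), a stand-in logger that prints a few of the stats instead of exporting Triton metrics:

# Hypothetical stand-in for VllmStatLogger, illustrating the StatLoggerBase
# interface used above; it prints stats rather than exporting Triton metrics.
from vllm.engine.metrics import StatLoggerBase, Stats, SupportsMetricsInfo


class PrintStatLogger(StatLoggerBase):

    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        pass  # no info-style metrics in this sketch

    def log(self, stats: Stats) -> None:
        print(f"running={stats.num_running_sys} "
              f"waiting={stats.num_waiting_sys} "
              f"prompt_tokens={stats.num_prompt_tokens_iter} "
              f"generation_tokens={stats.num_generation_tokens_iter}")


# Registered the same way as the Triton logger above:
# self.llm_engine.add_logger("print", PrintStatLogger(local_interval=0))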
