Skip to content

Commit d95bb2c

Browse files
committed
Minor update
1 parent 21e2356 commit d95bb2c

File tree

1 file changed

+5
-150
lines changed

1 file changed

+5
-150
lines changed

src/metrics.py

Lines changed: 5 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,14 @@
2424
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2525
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626

27-
from typing import Dict, Union
27+
from typing import Dict, List, Union
2828

2929
import triton_python_backend_utils as pb_utils
3030
from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
3131
from vllm.engine.metrics import Stats as VllmStats
3232
from vllm.engine.metrics import SupportsMetricsInfo
3333

3434

35-
# begin-metrics-definitions
3635
class TritonMetrics:
3736
def __init__(self, labels):
3837
# System stats
@@ -80,82 +79,6 @@ def __init__(self, labels):
8079
description="Number of generation tokens processed.",
8180
kind=pb_utils.MetricFamily.COUNTER,
8281
)
83-
# self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
84-
# name="vllm:time_to_first_token_seconds",
85-
# description="Histogram of time to first token in seconds.",
86-
# kind=pb_utils.MetricFamily.HISTOGRAM,
87-
# buckets=[
88-
# 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
89-
# 0.75, 1.0, 2.5, 5.0, 7.5, 10.0
90-
# ])
91-
# self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
92-
# name="vllm:time_per_output_token_seconds",
93-
# description="Histogram of time per output token in seconds.",
94-
# kind=pb_utils.MetricFamily.HISTOGRAM,
95-
# buckets=[
96-
# 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
97-
# 1.0, 2.5
98-
# ])
99-
100-
# Request stats
101-
# Latency
102-
# self.histogram_e2e_time_request_family = pb_utils.MetricFamily(
103-
# name="vllm:e2e_request_latency_seconds",
104-
# description="Histogram of end to end request latency in seconds.",
105-
# kind=pb_utils.MetricFamily.HISTOGRAM,
106-
# buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
107-
# # Metadata
108-
# self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily(
109-
# name="vllm:request_prompt_tokens",
110-
# description="Number of prefill tokens processed.",
111-
# kind=pb_utils.MetricFamily.HISTOGRAM,
112-
# buckets=build_1_2_5_buckets(max_model_len),
113-
# )
114-
# self.histogram_num_generation_tokens_request_family = \
115-
# pb_utils.MetricFamily(
116-
# name="vllm:request_generation_tokens",
117-
# description="Number of generation tokens processed.",
118-
# kind=pb_utils.MetricFamily.HISTOGRAM,
119-
# buckets=build_1_2_5_buckets(max_model_len),
120-
# )
121-
# self.histogram_best_of_request_family = pb_utils.MetricFamily(
122-
# name="vllm:request_params_best_of",
123-
# description="Histogram of the best_of request parameter.",
124-
# kind=pb_utils.MetricFamily.HISTOGRAM,
125-
# buckets=[1, 2, 5, 10, 20],
126-
# )
127-
# self.histogram_n_request_family = pb_utils.MetricFamily(
128-
# name="vllm:request_params_n",
129-
# description="Histogram of the n request parameter.",
130-
# kind=pb_utils.MetricFamily.HISTOGRAM,
131-
# buckets=[1, 2, 5, 10, 20],
132-
# )
133-
# self.counter_request_success_family = pb_utils.MetricFamily(
134-
# name="vllm:request_success_total",
135-
# description="Count of successfully processed requests.",
136-
# kind=pb_utils.MetricFamily.COUNTER)
137-
138-
# Speculative decoding stats
139-
# self.gauge_spec_decode_draft_acceptance_rate_family = pb_utils.MetricFamily(
140-
# name="vllm:spec_decode_draft_acceptance_rate",
141-
# description="Speculative token acceptance rate.",
142-
# kind=pb_utils.MetricFamily.GAUGE)
143-
# self.gauge_spec_decode_efficiency_family = pb_utils.MetricFamily(
144-
# name="vllm:spec_decode_efficiency",
145-
# description="Speculative decoding system efficiency.",
146-
# kind=pb_utils.MetricFamily.GAUGE)
147-
# self.counter_spec_decode_num_accepted_tokens_family = pb_utils.MetricFamily(
148-
# name="vllm:spec_decode_num_accepted_tokens_total",
149-
# description="Number of accepted tokens.",
150-
# kind=pb_utils.MetricFamily.COUNTER)
151-
# self.counter_spec_decode_num_draft_tokens_family = pb_utils.MetricFamily(
152-
# name="vllm:spec_decode_num_draft_tokens_total",
153-
# description="Number of draft tokens.",
154-
# kind=pb_utils.MetricFamily.COUNTER)
155-
# self.counter_spec_decode_num_emitted_tokens_family = pb_utils.MetricFamily(
156-
# name="vllm:spec_decode_num_emitted_tokens_total",
157-
# description="Number of emitted tokens.",
158-
# kind=pb_utils.MetricFamily.COUNTER)
15982

16083
# System stats
16184
# Scheduler State
@@ -186,51 +109,6 @@ def __init__(self, labels):
186109
self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
187110
labels=labels
188111
)
189-
# self.histogram_time_to_first_token = self.histogram_time_to_first_token_family.Metric(
190-
# labels=labels
191-
# )
192-
# self.histogram_time_per_output_token = self.histogram_time_per_output_token_family.Metric(
193-
# labels=labels
194-
# )
195-
196-
# Request stats
197-
# Latency
198-
# self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
199-
# labels=labels
200-
# )
201-
# # Metadata
202-
# self.histogram_num_prompt_tokens_request = self.histogram_num_prompt_tokens_request_family.Metric(
203-
# labels=labels
204-
# )
205-
# self.histogram_num_generation_tokens_request = self.histogram_num_generation_tokens_request_family.Metric(
206-
# labels=labels
207-
# )
208-
# self.histogram_best_of_request = self.histogram_best_of_request_family.Metric(
209-
# labels=labels
210-
# )
211-
# self.histogram_n_request = self.histogram_n_request_family.Metric(
212-
# labels=labels
213-
# )
214-
# self.counter_request_success = self.counter_request_success_family.Metric(
215-
# labels=labels
216-
# )
217-
218-
# Speculative decoding stats
219-
# self.gauge_spec_decode_draft_acceptance_rate_ = self.gauge_spec_decode_draft_acceptance_rate_family.Metric(
220-
# labels=labels
221-
# )
222-
# self.gauge_spec_decode_efficiency = self.gauge_spec_decode_efficiency_family.Metric(
223-
# labels=labels
224-
# )
225-
# self.counter_spec_decode_num_accepted_tokens = self.counter_spec_decode_num_accepted_tokens_family.Metric(
226-
# labels=labels
227-
# )
228-
# self.counter_spec_decode_num_draft_tokens = self.counter_spec_decode_num_draft_tokens_family.Metric(
229-
# labels=labels
230-
# )
231-
# self.counter_spec_decode_num_emitted_tokens = self.counter_spec_decode_num_emitted_tokens_family.Metric(
232-
# labels=labels
233-
# )
234112

235113

236114
class VllmStatLogger(VllmStatLoggerBase):
@@ -253,15 +131,12 @@ def _log_counter(self, counter, data: Union[int, float]) -> None:
253131
# Convenience function for logging to counter.
254132
counter.increment(data)
255133

256-
# def _log_histogram(self, histogram, data: Union[List[int],
257-
# List[float]]) -> None:
258-
# # Convenience function for logging list to histogram.
259-
# for datum in data:
260-
# histogram.labels(**self.labels).observe(datum)
134+
def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
135+
# Convenience function for logging list to histogram.
136+
for datum in data:
137+
histogram.observe(datum)
261138

262139
def log(self, stats: VllmStats) -> None:
263-
# self.maybe_update_spec_decode_metrics(stats)
264-
265140
# System state data
266141
self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys)
267142
self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys)
@@ -279,23 +154,3 @@ def log(self, stats: VllmStats) -> None:
279154
self._log_counter(
280155
self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
281156
)
282-
# self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter)
283-
# self._log_histogram(self.metrics.histogram_time_per_output_token, stats.time_per_output_tokens_iter)
284-
285-
# Request level data
286-
# Latency
287-
# self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests)
288-
# Metadata
289-
# self._log_histogram(self.metrics.histogram_num_prompt_tokens_request, stats.num_prompt_tokens_requests)
290-
# self._log_histogram(self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests)
291-
# self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests)
292-
# self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
293-
# self._log_histogram(self.metrics.counter_request_success, stats.finished_reason_requests)
294-
295-
# Speculative decoding stats
296-
# if self.spec_decode_metrics is not None:
297-
# self._log_gauge(self.metrics.gauge_spec_decode_draft_acceptance_rate, self.spec_decode_metrics.draft_acceptance_rate)
298-
# self._log_gauge(self.metrics.gauge_spec_decode_efficiency, self.spec_decode_metrics.system_efficiency)
299-
# self._log_counter(self.metrics.counter_spec_decode_num_accepted_tokens, self.spec_decode_metrics.accepted_tokens)
300-
# self._log_counter(self.metrics.counter_spec_decode_num_draft_tokens, self.spec_decode_metrics.draft_tokens)
301-
# self._log_counter(self.metrics.counter_spec_decode_num_emitted_tokens, self.spec_decode_metrics.emitted_tokens)

0 commit comments

Comments
 (0)