2424# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2525# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
27- from typing import Dict , Union
27+ from typing import Dict , List , Union
2828
2929import triton_python_backend_utils as pb_utils
3030from vllm .engine .metrics import StatLoggerBase as VllmStatLoggerBase
3131from vllm .engine .metrics import Stats as VllmStats
3232from vllm .engine .metrics import SupportsMetricsInfo
3333
3434
35- # begin-metrics-definitions
3635class TritonMetrics :
3736 def __init__ (self , labels ):
3837 # System stats
@@ -80,82 +79,6 @@ def __init__(self, labels):
8079 description = "Number of generation tokens processed." ,
8180 kind = pb_utils .MetricFamily .COUNTER ,
8281 )
83- # self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
84- # name="vllm:time_to_first_token_seconds",
85- # description="Histogram of time to first token in seconds.",
86- # kind=pb_utils.MetricFamily.HISTOGRAM,
87- # buckets=[
88- # 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
89- # 0.75, 1.0, 2.5, 5.0, 7.5, 10.0
90- # ])
91- # self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
92- # name="vllm:time_per_output_token_seconds",
93- # description="Histogram of time per output token in seconds.",
94- # kind=pb_utils.MetricFamily.HISTOGRAM,
95- # buckets=[
96- # 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
97- # 1.0, 2.5
98- # ])
99-
100- # Request stats
101- # Latency
102- # self.histogram_e2e_time_request_family = pb_utils.MetricFamily(
103- # name="vllm:e2e_request_latency_seconds",
104- # description="Histogram of end to end request latency in seconds.",
105- # kind=pb_utils.MetricFamily.HISTOGRAM,
106- # buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
107- # # Metadata
108- # self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily(
109- # name="vllm:request_prompt_tokens",
110- # description="Number of prefill tokens processed.",
111- # kind=pb_utils.MetricFamily.HISTOGRAM,
112- # buckets=build_1_2_5_buckets(max_model_len),
113- # )
114- # self.histogram_num_generation_tokens_request_family = \
115- # pb_utils.MetricFamily(
116- # name="vllm:request_generation_tokens",
117- # description="Number of generation tokens processed.",
118- # kind=pb_utils.MetricFamily.HISTOGRAM,
119- # buckets=build_1_2_5_buckets(max_model_len),
120- # )
121- # self.histogram_best_of_request_family = pb_utils.MetricFamily(
122- # name="vllm:request_params_best_of",
123- # description="Histogram of the best_of request parameter.",
124- # kind=pb_utils.MetricFamily.HISTOGRAM,
125- # buckets=[1, 2, 5, 10, 20],
126- # )
127- # self.histogram_n_request_family = pb_utils.MetricFamily(
128- # name="vllm:request_params_n",
129- # description="Histogram of the n request parameter.",
130- # kind=pb_utils.MetricFamily.HISTOGRAM,
131- # buckets=[1, 2, 5, 10, 20],
132- # )
133- # self.counter_request_success_family = pb_utils.MetricFamily(
134- # name="vllm:request_success_total",
135- # description="Count of successfully processed requests.",
136- # kind=pb_utils.MetricFamily.COUNTER)
137-
138- # Speculative decoding stats
139- # self.gauge_spec_decode_draft_acceptance_rate_family = pb_utils.MetricFamily(
140- # name="vllm:spec_decode_draft_acceptance_rate",
141- # description="Speculative token acceptance rate.",
142- # kind=pb_utils.MetricFamily.GAUGE)
143- # self.gauge_spec_decode_efficiency_family = pb_utils.MetricFamily(
144- # name="vllm:spec_decode_efficiency",
145- # description="Speculative decoding system efficiency.",
146- # kind=pb_utils.MetricFamily.GAUGE)
147- # self.counter_spec_decode_num_accepted_tokens_family = pb_utils.MetricFamily(
148- # name="vllm:spec_decode_num_accepted_tokens_total",
149- # description="Number of accepted tokens.",
150- # kind=pb_utils.MetricFamily.COUNTER)
151- # self.counter_spec_decode_num_draft_tokens_family = pb_utils.MetricFamily(
152- # name="vllm:spec_decode_num_draft_tokens_total",
153- # description="Number of draft tokens.",
154- # kind=pb_utils.MetricFamily.COUNTER)
155- # self.counter_spec_decode_num_emitted_tokens_family = pb_utils.MetricFamily(
156- # name="vllm:spec_decode_num_emitted_tokens_total",
157- # description="Number of emitted tokens.",
158- # kind=pb_utils.MetricFamily.COUNTER)
15982
16083 # System stats
16184 # Scheduler State
@@ -186,51 +109,6 @@ def __init__(self, labels):
186109 self .counter_generation_tokens = self .counter_generation_tokens_family .Metric (
187110 labels = labels
188111 )
189- # self.histogram_time_to_first_token = self.histogram_time_to_first_token_family.Metric(
190- # labels=labels
191- # )
192- # self.histogram_time_per_output_token = self.histogram_time_per_output_token_family.Metric(
193- # labels=labels
194- # )
195-
196- # Request stats
197- # Latency
198- # self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
199- # labels=labels
200- # )
201- # # Metadata
202- # self.histogram_num_prompt_tokens_request = self.histogram_num_prompt_tokens_request_family.Metric(
203- # labels=labels
204- # )
205- # self.histogram_num_generation_tokens_request = self.histogram_num_generation_tokens_request_family.Metric(
206- # labels=labels
207- # )
208- # self.histogram_best_of_request = self.histogram_best_of_request_family.Metric(
209- # labels=labels
210- # )
211- # self.histogram_n_request = self.histogram_n_request_family.Metric(
212- # labels=labels
213- # )
214- # self.counter_request_success = self.counter_request_success_family.Metric(
215- # labels=labels
216- # )
217-
218- # Speculative decoding stats
219- # self.gauge_spec_decode_draft_acceptance_rate_ = self.gauge_spec_decode_draft_acceptance_rate_family.Metric(
220- # labels=labels
221- # )
222- # self.gauge_spec_decode_efficiency = self.gauge_spec_decode_efficiency_family.Metric(
223- # labels=labels
224- # )
225- # self.counter_spec_decode_num_accepted_tokens = self.counter_spec_decode_num_accepted_tokens_family.Metric(
226- # labels=labels
227- # )
228- # self.counter_spec_decode_num_draft_tokens = self.counter_spec_decode_num_draft_tokens_family.Metric(
229- # labels=labels
230- # )
231- # self.counter_spec_decode_num_emitted_tokens = self.counter_spec_decode_num_emitted_tokens_family.Metric(
232- # labels=labels
233- # )
234112
235113
236114class VllmStatLogger (VllmStatLoggerBase ):
@@ -253,15 +131,12 @@ def _log_counter(self, counter, data: Union[int, float]) -> None:
253131 # Convenience function for logging to counter.
254132 counter .increment (data )
255133
256- # def _log_histogram(self, histogram, data: Union[List[int],
257- # List[float]]) -> None:
258- # # Convenience function for logging list to histogram.
259- # for datum in data:
260- # histogram.labels(**self.labels).observe(datum)
134+ def _log_histogram (self , histogram , data : Union [List [int ], List [float ]]) -> None :
135+ # Convenience function for logging list to histogram.
136+ for datum in data :
137+ histogram .observe (datum )
261138
262139 def log (self , stats : VllmStats ) -> None :
263- # self.maybe_update_spec_decode_metrics(stats)
264-
265140 # System state data
266141 self ._log_gauge (self .metrics .gauge_scheduler_running , stats .num_running_sys )
267142 self ._log_gauge (self .metrics .gauge_scheduler_waiting , stats .num_waiting_sys )
@@ -279,23 +154,3 @@ def log(self, stats: VllmStats) -> None:
279154 self ._log_counter (
280155 self .metrics .counter_generation_tokens , stats .num_generation_tokens_iter
281156 )
282- # self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter)
283- # self._log_histogram(self.metrics.histogram_time_per_output_token, stats.time_per_output_tokens_iter)
284-
285- # Request level data
286- # Latency
287- # self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests)
288- # Metadata
289- # self._log_histogram(self.metrics.histogram_num_prompt_tokens_request, stats.num_prompt_tokens_requests)
290- # self._log_histogram(self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests)
291- # self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests)
292- # self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
293- # self._log_histogram(self.metrics.counter_request_success, stats.finished_reason_requests)
294-
295- # Speculative decoding stats
296- # if self.spec_decode_metrics is not None:
297- # self._log_gauge(self.metrics.gauge_spec_decode_draft_acceptance_rate, self.spec_decode_metrics.draft_acceptance_rate)
298- # self._log_gauge(self.metrics.gauge_spec_decode_efficiency, self.spec_decode_metrics.system_efficiency)
299- # self._log_counter(self.metrics.counter_spec_decode_num_accepted_tokens, self.spec_decode_metrics.accepted_tokens)
300- # self._log_counter(self.metrics.counter_spec_decode_num_draft_tokens, self.spec_decode_metrics.draft_tokens)
301- # self._log_counter(self.metrics.counter_spec_decode_num_emitted_tokens, self.spec_decode_metrics.emitted_tokens)
0 commit comments