2929import  triton_python_backend_utils  as  pb_utils 
3030from  vllm .engine .metrics  import  StatLoggerBase  as  VllmStatLoggerBase 
3131from  vllm .engine .metrics  import  Stats  as  VllmStats 
32- from  vllm .engine .metrics  import  SupportsMetricsInfo 
32+ from  vllm .engine .metrics  import  SupportsMetricsInfo ,  build_1_2_5_buckets 
3333
3434
3535class  TritonMetrics :
36-     def  __init__ (self , labels ):
36+     def  __init__ (self , labels :  List [ str ],  max_model_len :  int ):
3737        # Initialize metric families 
3838        # Iteration stats 
3939        self .counter_prompt_tokens_family  =  pb_utils .MetricFamily (
@@ -56,6 +56,34 @@ def __init__(self, labels):
5656            description = "Histogram of time per output token in seconds." ,
5757            kind = pb_utils .MetricFamily .HISTOGRAM ,
5858        )
59+         # Request stats 
60+         #   Latency 
61+         self .histogram_e2e_time_request_family  =  pb_utils .MetricFamily (
62+             name = "vllm:e2e_request_latency_seconds" ,
63+             description = "Histogram of end to end request latency in seconds." ,
64+             kind = pb_utils .MetricFamily .HISTOGRAM ,
65+         )
66+         #   Metadata 
67+         self .histogram_num_prompt_tokens_request_family  =  pb_utils .MetricFamily (
68+             name = "vllm:request_prompt_tokens" ,
69+             description = "Number of prefill tokens processed." ,
70+             kind = pb_utils .MetricFamily .HISTOGRAM ,
71+         )
72+         self .histogram_num_generation_tokens_request_family  =  pb_utils .MetricFamily (
73+             name = "vllm:request_generation_tokens" ,
74+             description = "Number of generation tokens processed." ,
75+             kind = pb_utils .MetricFamily .HISTOGRAM ,
76+         )
77+         self .histogram_best_of_request_family  =  pb_utils .MetricFamily (
78+             name = "vllm:request_params_best_of" ,
79+             description = "Histogram of the best_of request parameter." ,
80+             kind = pb_utils .MetricFamily .HISTOGRAM ,
81+         )
82+         self .histogram_n_request_family  =  pb_utils .MetricFamily (
83+             name = "vllm:request_params_n" ,
84+             description = "Histogram of the n request parameter." ,
85+             kind = pb_utils .MetricFamily .HISTOGRAM ,
86+         )
5987
6088        # Initialize metrics 
6189        # Iteration stats 
@@ -65,7 +93,7 @@ def __init__(self, labels):
6593        self .counter_generation_tokens  =  self .counter_generation_tokens_family .Metric (
6694            labels = labels 
6795        )
68-         # Use the same bucket boundaries from vLLM sample metrics. 
96+         # Use the same bucket boundaries from vLLM sample metrics as an example . 
6997        # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96 
7098        self .histogram_time_to_first_token  =  (
7199            self .histogram_time_to_first_token_family .Metric (
@@ -110,16 +138,43 @@ def __init__(self, labels):
110138                ],
111139            )
112140        )
141+         # Request stats 
142+         #   Latency 
143+         self .histogram_e2e_time_request  =  self .histogram_e2e_time_request_family .Metric (
144+             labels = labels ,
145+             buckets = [1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ],
146+         )
147+         #   Metadata 
148+         self .histogram_num_prompt_tokens_request  =  (
149+             self .histogram_num_prompt_tokens_request_family .Metric (
150+                 labels = labels ,
151+                 buckets = build_1_2_5_buckets (max_model_len ),
152+             )
153+         )
154+         self .histogram_num_generation_tokens_request  =  (
155+             self .histogram_num_generation_tokens_request_family .Metric (
156+                 labels = labels ,
157+                 buckets = build_1_2_5_buckets (max_model_len ),
158+             )
159+         )
160+         self .histogram_best_of_request  =  self .histogram_best_of_request_family .Metric (
161+             labels = labels ,
162+             buckets = [1 , 2 , 5 , 10 , 20 ],
163+         )
164+         self .histogram_n_request  =  self .histogram_n_request_family .Metric (
165+             labels = labels ,
166+             buckets = [1 , 2 , 5 , 10 , 20 ],
167+         )
113168
114169
115170class  VllmStatLogger (VllmStatLoggerBase ):
116171    """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider.""" 
117172
    # local_interval is not used by this adapter — in vLLM it only controls the
    # engine's own periodic logging to stdout — so it is pinned to 0 below.
    def __init__(self, labels: Dict, max_model_len: int) -> None:
        """Initialize the Triton-side metrics store for vLLM stats.

        Args:
            labels: Labels attached to every Triton metric reported by this
                logger (forwarded verbatim to TritonMetrics).
            max_model_len: Maximum model context length; TritonMetrics uses it
                to size the token-count histogram buckets.
        """
        super().__init__(local_interval=0)
        self.metrics = TritonMetrics(labels, max_model_len)
123178
    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
        """Intentional no-op: info-type vLLM stats are not forwarded to Triton.

        Required override of the VllmStatLoggerBase interface.
        NOTE(review): `type` shadows the builtin, but the parameter name is
        fixed by the vLLM base-class signature — do not rename.
        """
        pass
@@ -159,16 +214,35 @@ def log(self, stats: VllmStats) -> None:
159214        Returns: 
160215            None 
161216        """ 
162-         self ._log_counter (
163-             self .metrics .counter_prompt_tokens , stats .num_prompt_tokens_iter 
164-         )
165-         self ._log_counter (
166-             self .metrics .counter_generation_tokens , stats .num_generation_tokens_iter 
167-         )
168-         self ._log_histogram (
169-             self .metrics .histogram_time_to_first_token , stats .time_to_first_tokens_iter 
170-         )
171-         self ._log_histogram (
172-             self .metrics .histogram_time_per_output_token ,
173-             stats .time_per_output_tokens_iter ,
174-         )
217+         # The list of vLLM metrics reporting to Triton is also documented here. 
218+         # https://github.com/triton-inference-server/vllm_backend/blob/main/README.md#triton-metrics 
219+         counter_metrics  =  [
220+             (self .metrics .counter_prompt_tokens , stats .num_prompt_tokens_iter ),
221+             (self .metrics .counter_generation_tokens , stats .num_generation_tokens_iter ),
222+         ]
223+         histogram_metrics  =  [
224+             (
225+                 self .metrics .histogram_time_to_first_token ,
226+                 stats .time_to_first_tokens_iter ,
227+             ),
228+             (
229+                 self .metrics .histogram_time_per_output_token ,
230+                 stats .time_per_output_tokens_iter ,
231+             ),
232+             (self .metrics .histogram_e2e_time_request , stats .time_e2e_requests ),
233+             (
234+                 self .metrics .histogram_num_prompt_tokens_request ,
235+                 stats .num_prompt_tokens_requests ,
236+             ),
237+             (
238+                 self .metrics .histogram_num_generation_tokens_request ,
239+                 stats .num_generation_tokens_requests ,
240+             ),
241+             (self .metrics .histogram_best_of_request , stats .best_of_requests ),
242+             (self .metrics .histogram_n_request , stats .n_requests ),
243+         ]
244+ 
245+         for  metric , data  in  counter_metrics :
246+             self ._log_counter (metric , data )
247+         for  metric , data  in  histogram_metrics :
248+             self ._log_histogram (metric , data )
0 commit comments