@@ -32,6 +32,7 @@ use crate::{
32
32
validation:: { RequestSize , ValidationError } ,
33
33
GenerateParameters , GenerateRequest ,
34
34
} ;
35
+ use crate :: metrics:: { increment_counter, increment_labeled_counter, observe_histogram} ;
35
36
use crate :: pb:: fmaas:: tokenize_response:: Offset ;
36
37
37
38
/// Whether to fail if sampling parameters are provided in greedy-mode requests
@@ -67,8 +68,6 @@ pub(crate) async fn start_grpc_server<F: Future<Output = ()> + Send + 'static>(
67
68
let grpc_service = GenerationServicer {
68
69
state : shared_state,
69
70
tokenizer,
70
- input_counter : metrics:: register_counter!( "tgi_request_input_count" ) ,
71
- tokenize_input_counter : metrics:: register_counter!( "tgi_tokenize_request_input_count" ) ,
72
71
} ;
73
72
let grpc_server = builder
74
73
. add_service ( GenerationServiceServer :: new ( grpc_service) )
@@ -92,8 +91,6 @@ async fn load_pem(path: String, name: &str) -> Vec<u8> {
92
91
pub struct GenerationServicer {
93
92
state : ServerState ,
94
93
tokenizer : AsyncTokenizer ,
95
- input_counter : metrics:: Counter ,
96
- tokenize_input_counter : metrics:: Counter ,
97
94
}
98
95
99
96
#[ tonic:: async_trait]
@@ -124,20 +121,20 @@ impl GenerationService for GenerationServicer {
124
121
let br = request. into_inner ( ) ;
125
122
let batch_size = br. requests . len ( ) ;
126
123
let kind = if batch_size == 1 { "single" } else { "batch" } ;
127
- metrics :: increment_counter! ( "tgi_request_count" , "kind" => kind) ;
124
+ increment_labeled_counter ( "tgi_request_count" , & [ ( "kind" , kind) ] , 1 ) ;
128
125
if batch_size == 0 {
129
126
return Ok ( Response :: new ( BatchedGenerationResponse {
130
127
responses : vec ! [ ] ,
131
128
} ) ) ;
132
129
}
133
- self . input_counter . increment ( batch_size as u64 ) ;
130
+ increment_counter ( "tgi_request_input_count" , batch_size as u64 ) ;
134
131
// Limit concurrent requests by acquiring a permit from the semaphore
135
132
let _permit = self
136
133
. state
137
134
. limit_concurrent_requests
138
135
. try_acquire_many ( batch_size as u32 )
139
136
. map_err ( |_| {
140
- metrics :: increment_counter! ( "tgi_request_failure" , "err" => "conc_limit" ) ;
137
+ increment_labeled_counter ( "tgi_request_failure" , & [ ( "err" , "conc_limit" ) ] , 1 ) ;
141
138
tracing:: error!( "Model is overloaded" ) ;
142
139
Status :: resource_exhausted ( "Model is overloaded" )
143
140
} ) ?;
@@ -217,11 +214,11 @@ impl GenerationService for GenerationServicer {
217
214
}
218
215
. map_err ( |err| match err {
219
216
InferError :: RequestQueueFull ( ) => {
220
- metrics :: increment_counter! ( "tgi_request_failure" , "err" => "queue_full" ) ;
217
+ increment_labeled_counter ( "tgi_request_failure" , & [ ( "err" , "queue_full" ) ] , 1 ) ;
221
218
Status :: resource_exhausted ( err. to_string ( ) )
222
219
}
223
220
_ => {
224
- metrics :: increment_counter! ( "tgi_request_failure" , "err" => "generate" ) ;
221
+ increment_labeled_counter ( "tgi_request_failure" , & [ ( "err" , "generate" ) ] , 1 ) ;
225
222
tracing:: error!( "{err}" ) ;
226
223
Status :: from_error ( Box :: new ( err) )
227
224
}
@@ -254,15 +251,15 @@ impl GenerationService for GenerationServicer {
254
251
) -> Result < Response < Self :: GenerateStreamStream > , Status > {
255
252
let start_time = Instant :: now ( ) ;
256
253
let request = request. extract_context ( ) ;
257
- metrics :: increment_counter! ( "tgi_request_count" , "kind" => "stream" ) ;
258
- self . input_counter . increment ( 1 ) ;
254
+ increment_labeled_counter ( "tgi_request_count" , & [ ( "kind" , "stream" ) ] , 1 ) ;
255
+ increment_counter ( "tgi_request_input_count" , 1 ) ;
259
256
let permit = self
260
257
. state
261
258
. limit_concurrent_requests
262
259
. clone ( )
263
260
. try_acquire_owned ( )
264
261
. map_err ( |_| {
265
- metrics :: increment_counter! ( "tgi_request_failure" , "err" => "conc_limit" ) ;
262
+ increment_labeled_counter ( "tgi_request_failure" , & [ ( "err" , "conc_limit" ) ] , 1 ) ;
266
263
tracing:: error!( "Model is overloaded" ) ;
267
264
Status :: resource_exhausted ( "Model is overloaded" )
268
265
} ) ?;
@@ -292,7 +289,7 @@ impl GenerationService for GenerationServicer {
292
289
|ctx, count, reason, request_id, times, out, err| {
293
290
let _enter = ctx. span . enter ( ) ;
294
291
if let Some ( e) = err {
295
- metrics :: increment_counter! ( "tgi_request_failure" , "err" => "generate" ) ;
292
+ increment_labeled_counter ( "tgi_request_failure" , & [ ( "err" , "generate" ) ] , 1 ) ;
296
293
tracing:: error!(
297
294
"Streaming response failed after {count} tokens, \
298
295
output so far: '{:?}': {e}",
@@ -322,11 +319,11 @@ impl GenerationService for GenerationServicer {
322
319
. await
323
320
. map_err ( |err| match err {
324
321
InferError :: RequestQueueFull ( ) => {
325
- metrics :: increment_counter! ( "tgi_request_failure" , "err" => "queue_full" ) ;
322
+ increment_labeled_counter ( "tgi_request_failure" , & [ ( "err" , "queue_full" ) ] , 1 ) ;
326
323
Status :: resource_exhausted ( err. to_string ( ) )
327
324
}
328
325
_ => {
329
- metrics :: increment_counter! ( "tgi_request_failure" , "err" => "unknown" ) ;
326
+ increment_labeled_counter ( "tgi_request_failure" , & [ ( "err" , "unknown" ) ] , 1 ) ;
330
327
tracing:: error!( "{err}" ) ;
331
328
Status :: from_error ( Box :: new ( err) )
332
329
}
@@ -341,9 +338,9 @@ impl GenerationService for GenerationServicer {
341
338
request : Request < BatchedTokenizeRequest > ,
342
339
) -> Result < Response < BatchedTokenizeResponse > , Status > {
343
340
let br = request. into_inner ( ) ;
344
- metrics :: increment_counter! ( "tgi_tokenize_request_count" ) ;
341
+ increment_counter ( "tgi_tokenize_request_count" , 1 ) ;
345
342
let start_time = Instant :: now ( ) ;
346
- self . tokenize_input_counter . increment ( br. requests . len ( ) as u64 ) ;
343
+ increment_counter ( "tgi_tokenize_request_input_count" , br. requests . len ( ) as u64 ) ;
347
344
348
345
let truncate_to = match br. truncate_input_tokens {
349
346
0 => u32:: MAX ,
@@ -378,8 +375,8 @@ impl GenerationService for GenerationServicer {
378
375
. await ?;
379
376
380
377
let token_total: u32 = responses. iter ( ) . map ( |tr| tr. token_count ) . sum ( ) ;
381
- metrics :: histogram! ( "tgi_tokenize_request_tokens" , token_total as f64 ) ;
382
- metrics :: histogram! (
378
+ observe_histogram ( "tgi_tokenize_request_tokens" , token_total as f64 ) ;
379
+ observe_histogram (
383
380
"tgi_tokenize_request_duration" ,
384
381
start_time. elapsed ( ) . as_secs_f64 ( )
385
382
) ;
@@ -428,12 +425,12 @@ impl GenerationServicer {
428
425
Err ( err) => Err ( err) ,
429
426
}
430
427
. map_err ( |err| {
431
- metrics :: increment_counter! ( "tgi_request_failure" , "err" => "validation" ) ;
428
+ increment_labeled_counter ( "tgi_request_failure" , & [ ( "err" , "validation" ) ] , 1 ) ;
432
429
tracing:: error!( "{err}" ) ;
433
430
Status :: invalid_argument ( err. to_string ( ) )
434
431
} )
435
432
. map ( |requests| {
436
- metrics :: histogram! (
433
+ observe_histogram (
437
434
"tgi_request_validation_duration" ,
438
435
start_time. elapsed ( ) . as_secs_f64 ( )
439
436
) ;
@@ -474,27 +471,27 @@ fn log_response(
474
471
span. record ( "total_time" , format ! ( "{total_time:?}" ) ) ;
475
472
span. record ( "input_toks" , input_tokens) ;
476
473
477
- metrics :: histogram! (
474
+ observe_histogram (
478
475
"tgi_request_inference_duration" ,
479
476
inference_time. as_secs_f64 ( )
480
477
) ;
481
- metrics :: histogram! (
478
+ observe_histogram (
482
479
"tgi_request_mean_time_per_token_duration" ,
483
480
time_per_token. as_secs_f64 ( )
484
481
) ;
485
482
}
486
483
487
484
// Metrics
488
485
match reason {
489
- Error => metrics :: increment_counter! ( "tgi_request_failure" , "err" => "generate" ) ,
486
+ Error => increment_labeled_counter ( "tgi_request_failure" , & [ ( "err" , "generate" ) ] , 1 ) ,
490
487
Cancelled => ( ) , // recorded where cancellation is detected
491
488
_ => {
492
- metrics :: increment_counter! (
493
- "tgi_request_success" , "stop_reason" => reason. as_str_name( ) , "kind" => kind
489
+ increment_labeled_counter (
490
+ "tgi_request_success" , & [ ( "stop_reason" , reason. as_str_name ( ) ) , ( "kind" , kind) ] , 1
494
491
) ;
495
- metrics :: histogram! ( "tgi_request_duration" , total_time. as_secs_f64( ) ) ;
496
- metrics :: histogram! ( "tgi_request_generated_tokens" , generated_tokens as f64 ) ;
497
- metrics :: histogram! (
492
+ observe_histogram ( "tgi_request_duration" , total_time. as_secs_f64 ( ) ) ;
493
+ observe_histogram ( "tgi_request_generated_tokens" , generated_tokens as f64 ) ;
494
+ observe_histogram (
498
495
"tgi_request_total_tokens" ,
499
496
( generated_tokens as usize + input_tokens) as f64
500
497
) ;
0 commit comments