63
63
"GenerativeRequestsStatsProgressAggregator" ,
64
64
"SchedulerStatsAggregator" ,
65
65
"add_aggregate_metric" ,
66
-
67
66
]
68
67
69
68
@@ -187,7 +186,7 @@ def __call__(
187
186
"worker_resolve_start_delay" ,
188
187
agg_state ,
189
188
request_info .scheduler_timings .resolve_start ,
190
- request_info .scheduler_timings .dequeued ,
189
+ request_info .scheduler_timings .scheduled ,
191
190
)
192
191
add_aggregate_metric (
193
192
"worker_resolve_time" ,
@@ -226,7 +225,7 @@ def __call__(
226
225
request_info .request_timings .request_start ,
227
226
)
228
227
add_aggregate_metric (
229
- "request_targeted_delay " ,
228
+ "request_targeted_start_delay " ,
230
229
agg_state ,
231
230
request_info .request_timings .request_start ,
232
231
request_info .scheduler_timings .targeted_start ,
@@ -324,6 +323,34 @@ def __call__(
324
323
if response is None :
325
324
return None
326
325
326
+ if (
327
+ request_info .status == "completed"
328
+ and request_info .request_timings .request_end is not None
329
+ ):
330
+ agg_state ["requests_per_second" ] = scheduler_state .successful_requests / (
331
+ request_info .request_timings .request_end - scheduler_state .start_time
332
+ )
333
+ add_aggregate_metric (
334
+ "request_latency" ,
335
+ agg_state ,
336
+ request_info .request_timings .request_end ,
337
+ request_info .request_timings .request_start ,
338
+ )
339
+
340
+ if (
341
+ request_info .status == "completed"
342
+ and request_info .request_timings .first_iteration is not None
343
+ and request_info .request_timings .last_iteration is not None
344
+ and response .output_tokens
345
+ ):
346
+ add_aggregate_metric (
347
+ "time_per_output_token" ,
348
+ agg_state ,
349
+ request_info .request_timings .last_iteration ,
350
+ request_info .request_timings .request_start ,
351
+ count = response .output_tokens ,
352
+ )
353
+
327
354
if (
328
355
request_info .request_timings .first_iteration is not None
329
356
and request_info .request_timings .request_start is not None
@@ -339,44 +366,57 @@ def __call__(
339
366
request_info .request_timings .first_iteration is not None
340
367
and request_info .request_timings .last_iteration is not None
341
368
and response .output_tokens is not None
369
+ and response .output_tokens > 1
342
370
):
343
371
add_aggregate_metric (
344
- "time_per_output_token " ,
372
+ "inter_token_latency " ,
345
373
agg_state ,
346
374
request_info .request_timings .last_iteration ,
347
- request_info .request_timings .request_start ,
348
- count = response .output_tokens ,
375
+ request_info .request_timings .first_iteration ,
376
+ count = response .output_tokens - 1 ,
349
377
)
350
378
351
- if response .output_tokens > 1 :
352
- add_aggregate_metric (
353
- "inter_token_latency" ,
354
- agg_state ,
355
- request_info .request_timings .last_iteration ,
356
- request_info .request_timings .first_iteration ,
357
- count = response .output_tokens - 1 ,
358
- )
359
-
360
379
if response .prompt_tokens is not None :
361
380
add_aggregate_metric (
362
381
"prompt_tokens" ,
363
382
agg_state ,
364
383
response .prompt_tokens ,
365
384
)
385
+ if request_info .request_timings .request_end is not None :
386
+ agg_state ["prompt_tokens_per_second" ] = agg_state [
387
+ "prompt_tokens_total"
388
+ ] / (
389
+ request_info .request_timings .request_end
390
+ - scheduler_state .start_time
391
+ )
366
392
367
393
if response .output_tokens is not None :
368
394
add_aggregate_metric (
369
395
"output_tokens" ,
370
396
agg_state ,
371
397
response .output_tokens ,
372
398
)
399
+ if request_info .request_timings .request_end is not None :
400
+ agg_state ["output_tokens_per_second" ] = agg_state [
401
+ "output_tokens_total"
402
+ ] / (
403
+ request_info .request_timings .request_end
404
+ - scheduler_state .start_time
405
+ )
373
406
374
407
if response .total_tokens is not None :
375
408
add_aggregate_metric (
376
409
"total_tokens" ,
377
410
agg_state ,
378
411
response .total_tokens ,
379
412
)
413
+ if request_info .request_timings .request_end is not None :
414
+ agg_state ["total_tokens_per_second" ] = agg_state [
415
+ "total_tokens_total"
416
+ ] / (
417
+ request_info .request_timings .request_end
418
+ - scheduler_state .start_time
419
+ )
380
420
381
421
return agg_state
382
422
@@ -411,6 +451,8 @@ class GenerativeRequestsAggregator(
411
451
default = None ,
412
452
description = "Cooldown duration in seconds to ignore at benchmark end" ,
413
453
)
454
+ _in_cooldown : bool = False
455
+ _in_warmup : bool = False
414
456
415
457
def __call__ (
416
458
self ,
@@ -433,44 +475,45 @@ def __call__(
433
475
:param scheduler_state: Current scheduler execution state.
434
476
:return: None, as this aggregator only collects for final compilation.
435
477
"""
478
+ status = {
479
+ "requests_in_warmup" : False ,
480
+ "requests_in_cooldown" : False ,
481
+ }
482
+
436
483
if (
437
484
response is None
438
485
or request_info .status not in {"completed" , "canceled" , "errored" }
439
486
or (request_info .status == "canceled" and request_info .started_at is None )
440
487
):
441
488
# Ignore requests that don't have a response yet.
442
489
# Ignore requests that were canceled before they started.
443
- return None
490
+ return status
444
491
445
492
if (
446
493
self .warmup_requests is not None
447
494
and self .warmup_requests >= scheduler_state .processed_requests
448
- ):
449
- return None
450
-
451
- if (
495
+ ) or (
452
496
self .warmup_duration is not None
453
497
and request_info .request_timings .request_end is not None
454
498
and (
455
499
scheduler_state .start_time + self .warmup_duration
456
500
>= request_info .request_timings .request_end
457
501
)
458
502
):
459
- return None
503
+ status ["requests_in_warmup" ] = True
504
+
505
+ return status
460
506
461
507
if (
462
508
self .cooldown_requests is not None
463
509
and scheduler_state .remaining_requests is not None
464
510
and self .cooldown_requests >= scheduler_state .remaining_requests
465
- ):
466
- return None
467
-
468
- if (
511
+ ) or (
469
512
self .cooldown_duration is not None
470
513
and scheduler_state .remaining_duration is not None
471
514
and self .cooldown_duration >= scheduler_state .remaining_duration
472
515
):
473
- return None
516
+ return status [ "requests_in_cooldown" ]
474
517
475
518
if "completed" not in agg_state :
476
519
agg_state ["completed" ] = []
@@ -484,7 +527,7 @@ def __call__(
484
527
else :
485
528
agg_state ["errored" ].append ((response , request , request_info ))
486
529
487
- return None
530
+ return status
488
531
489
532
def compile (
490
533
self , agg_state : dict [str , Any ], scheduler_state : SchedulerState
@@ -625,6 +668,22 @@ def compile(
625
668
],
626
669
)
627
670
),
671
+ total_token_count = (
672
+ StatusDistributionSummary .from_values (
673
+ value_types = [
674
+ type_
675
+ for type_ , req in zip (total_types , total )
676
+ if req .prompt_tokens is not None
677
+ or req .output_tokens is not None
678
+ ],
679
+ values = (
680
+ (req .prompt_tokens or 0 ) + (req .output_tokens or 0 )
681
+ for req in total
682
+ if req .prompt_tokens is not None
683
+ or req .output_tokens is not None
684
+ ),
685
+ )
686
+ ),
628
687
time_to_first_token_ms = (
629
688
StatusDistributionSummary .from_values (
630
689
value_types = [
0 commit comments