Skip to content

Commit 92b41e4

Browse files
committed
Refactor and updates for progress and outputs to work with the new structure
1 parent 764ea4a commit 92b41e4

File tree

9 files changed

+1440
-1391
lines changed

9 files changed

+1440
-1391
lines changed

src/guidellm/benchmark/aggregator.py

Lines changed: 86 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@
6363
"GenerativeRequestsStatsProgressAggregator",
6464
"SchedulerStatsAggregator",
6565
"add_aggregate_metric",
66-
6766
]
6867

6968

@@ -187,7 +186,7 @@ def __call__(
187186
"worker_resolve_start_delay",
188187
agg_state,
189188
request_info.scheduler_timings.resolve_start,
190-
request_info.scheduler_timings.dequeued,
189+
request_info.scheduler_timings.scheduled,
191190
)
192191
add_aggregate_metric(
193192
"worker_resolve_time",
@@ -226,7 +225,7 @@ def __call__(
226225
request_info.request_timings.request_start,
227226
)
228227
add_aggregate_metric(
229-
"request_targeted_delay",
228+
"request_targeted_start_delay",
230229
agg_state,
231230
request_info.request_timings.request_start,
232231
request_info.scheduler_timings.targeted_start,
@@ -324,6 +323,34 @@ def __call__(
324323
if response is None:
325324
return None
326325

326+
if (
327+
request_info.status == "completed"
328+
and request_info.request_timings.request_end is not None
329+
):
330+
agg_state["requests_per_second"] = scheduler_state.successful_requests / (
331+
request_info.request_timings.request_end - scheduler_state.start_time
332+
)
333+
add_aggregate_metric(
334+
"request_latency",
335+
agg_state,
336+
request_info.request_timings.request_end,
337+
request_info.request_timings.request_start,
338+
)
339+
340+
if (
341+
request_info.status == "completed"
342+
and request_info.request_timings.first_iteration is not None
343+
and request_info.request_timings.last_iteration is not None
344+
and response.output_tokens
345+
):
346+
add_aggregate_metric(
347+
"time_per_output_token",
348+
agg_state,
349+
request_info.request_timings.last_iteration,
350+
request_info.request_timings.request_start,
351+
count=response.output_tokens,
352+
)
353+
327354
if (
328355
request_info.request_timings.first_iteration is not None
329356
and request_info.request_timings.request_start is not None
@@ -339,44 +366,57 @@ def __call__(
339366
request_info.request_timings.first_iteration is not None
340367
and request_info.request_timings.last_iteration is not None
341368
and response.output_tokens is not None
369+
and response.output_tokens > 1
342370
):
343371
add_aggregate_metric(
344-
"time_per_output_token",
372+
"inter_token_latency",
345373
agg_state,
346374
request_info.request_timings.last_iteration,
347-
request_info.request_timings.request_start,
348-
count=response.output_tokens,
375+
request_info.request_timings.first_iteration,
376+
count=response.output_tokens - 1,
349377
)
350378

351-
if response.output_tokens > 1:
352-
add_aggregate_metric(
353-
"inter_token_latency",
354-
agg_state,
355-
request_info.request_timings.last_iteration,
356-
request_info.request_timings.first_iteration,
357-
count=response.output_tokens - 1,
358-
)
359-
360379
if response.prompt_tokens is not None:
361380
add_aggregate_metric(
362381
"prompt_tokens",
363382
agg_state,
364383
response.prompt_tokens,
365384
)
385+
if request_info.request_timings.request_end is not None:
386+
agg_state["prompt_tokens_per_second"] = agg_state[
387+
"prompt_tokens_total"
388+
] / (
389+
request_info.request_timings.request_end
390+
- scheduler_state.start_time
391+
)
366392

367393
if response.output_tokens is not None:
368394
add_aggregate_metric(
369395
"output_tokens",
370396
agg_state,
371397
response.output_tokens,
372398
)
399+
if request_info.request_timings.request_end is not None:
400+
agg_state["output_tokens_per_second"] = agg_state[
401+
"output_tokens_total"
402+
] / (
403+
request_info.request_timings.request_end
404+
- scheduler_state.start_time
405+
)
373406

374407
if response.total_tokens is not None:
375408
add_aggregate_metric(
376409
"total_tokens",
377410
agg_state,
378411
response.total_tokens,
379412
)
413+
if request_info.request_timings.request_end is not None:
414+
agg_state["total_tokens_per_second"] = agg_state[
415+
"total_tokens_total"
416+
] / (
417+
request_info.request_timings.request_end
418+
- scheduler_state.start_time
419+
)
380420

381421
return agg_state
382422

@@ -411,6 +451,8 @@ class GenerativeRequestsAggregator(
411451
default=None,
412452
description="Cooldown duration in seconds to ignore at benchmark end",
413453
)
454+
_in_cooldown: bool = False
455+
_in_warmup: bool = False
414456

415457
def __call__(
416458
self,
@@ -433,44 +475,45 @@ def __call__(
433475
:param scheduler_state: Current scheduler execution state.
434476
:return: None, as this aggregator only collects for final compilation.
435477
"""
478+
status = {
479+
"requests_in_warmup": False,
480+
"requests_in_cooldown": False,
481+
}
482+
436483
if (
437484
response is None
438485
or request_info.status not in {"completed", "canceled", "errored"}
439486
or (request_info.status == "canceled" and request_info.started_at is None)
440487
):
441488
# Ignore requests that don't have a response yet.
442489
# Ignore requests that were canceled before they started.
443-
return None
490+
return status
444491

445492
if (
446493
self.warmup_requests is not None
447494
and self.warmup_requests >= scheduler_state.processed_requests
448-
):
449-
return None
450-
451-
if (
495+
) or (
452496
self.warmup_duration is not None
453497
and request_info.request_timings.request_end is not None
454498
and (
455499
scheduler_state.start_time + self.warmup_duration
456500
>= request_info.request_timings.request_end
457501
)
458502
):
459-
return None
503+
status["requests_in_warmup"] = True
504+
505+
return status
460506

461507
if (
462508
self.cooldown_requests is not None
463509
and scheduler_state.remaining_requests is not None
464510
and self.cooldown_requests >= scheduler_state.remaining_requests
465-
):
466-
return None
467-
468-
if (
511+
) or (
469512
self.cooldown_duration is not None
470513
and scheduler_state.remaining_duration is not None
471514
and self.cooldown_duration >= scheduler_state.remaining_duration
472515
):
473-
return None
516+
return status["requests_in_cooldown"]
474517

475518
if "completed" not in agg_state:
476519
agg_state["completed"] = []
@@ -484,7 +527,7 @@ def __call__(
484527
else:
485528
agg_state["errored"].append((response, request, request_info))
486529

487-
return None
530+
return status
488531

489532
def compile(
490533
self, agg_state: dict[str, Any], scheduler_state: SchedulerState
@@ -625,6 +668,22 @@ def compile(
625668
],
626669
)
627670
),
671+
total_token_count=(
672+
StatusDistributionSummary.from_values(
673+
value_types=[
674+
type_
675+
for type_, req in zip(total_types, total)
676+
if req.prompt_tokens is not None
677+
or req.output_tokens is not None
678+
],
679+
values=(
680+
(req.prompt_tokens or 0) + (req.output_tokens or 0)
681+
for req in total
682+
if req.prompt_tokens is not None
683+
or req.output_tokens is not None
684+
),
685+
)
686+
),
628687
time_to_first_token_ms=(
629688
StatusDistributionSummary.from_values(
630689
value_types=[

0 commit comments

Comments
 (0)