diff --git a/src/guidellm/benchmark/__init__.py b/src/guidellm/benchmark/__init__.py index a4676c7e..76324a65 100644 --- a/src/guidellm/benchmark/__init__.py +++ b/src/guidellm/benchmark/__init__.py @@ -1,19 +1,31 @@ -from .aggregator import AggregatorT, BenchmarkAggregator, GenerativeBenchmarkAggregator -from .benchmark import ( +from .aggregator import ( + Aggregator, + AggregatorState, + CompilableAggregator, + GenerativeRequestsAggregator, + GenerativeStatsProgressAggregator, + InjectExtrasAggregator, + SchedulerStatsAggregator, + SerializableAggregator, +) +from .benchmarker import Benchmarker +from .entrypoints import benchmark_generative_text, reimport_benchmarks_report +from .objects import ( Benchmark, - BenchmarkArgs, BenchmarkMetrics, - BenchmarkRunStats, + BenchmarkSchedulerStats, BenchmarkT, GenerativeBenchmark, + GenerativeBenchmarksReport, GenerativeMetrics, - GenerativeTextErrorStats, - GenerativeTextResponseStats, - StatusBreakdown, + GenerativeRequestStats, +) +from .output import ( + GenerativeBenchmarkerConsole, + GenerativeBenchmarkerCSV, + GenerativeBenchmarkerHTML, + GenerativeBenchmarkerOutput, ) -from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker -from .entrypoints import benchmark_generative_text, reimport_benchmarks_report -from .output import GenerativeBenchmarksConsole, GenerativeBenchmarksReport from .profile import ( AsyncProfile, ConcurrentProfile, @@ -22,46 +34,45 @@ SweepProfile, SynchronousProfile, ThroughputProfile, - create_profile, ) from .progress import ( - BenchmarkerProgressDisplay, - BenchmarkerTaskProgressState, - GenerativeTextBenchmarkerProgressDisplay, - GenerativeTextBenchmarkerTaskProgressState, + BenchmarkerProgress, + BenchmarkerProgressGroup, + GenerativeConsoleBenchmarkerProgress, ) __all__ = [ - "AggregatorT", + "Aggregator", + "AggregatorState", "AsyncProfile", "Benchmark", - "BenchmarkAggregator", - "BenchmarkArgs", "BenchmarkMetrics", - "BenchmarkRunStats", + "BenchmarkSchedulerStats", "BenchmarkT", "Benchmarker", - "BenchmarkerProgressDisplay", - "BenchmarkerResult", - "BenchmarkerTaskProgressState", + "BenchmarkerProgress", + "BenchmarkerProgressGroup", + "CompilableAggregator", "ConcurrentProfile", "GenerativeBenchmark", - "GenerativeBenchmarkAggregator", - "GenerativeBenchmarker", - "GenerativeBenchmarksConsole", + "GenerativeBenchmarkerCSV", + "GenerativeBenchmarkerConsole", + "GenerativeBenchmarkerHTML", + "GenerativeBenchmarkerOutput", "GenerativeBenchmarksReport", + "GenerativeConsoleBenchmarkerProgress", "GenerativeMetrics", - "GenerativeTextBenchmarkerProgressDisplay", - "GenerativeTextBenchmarkerTaskProgressState", - "GenerativeTextErrorStats", - "GenerativeTextResponseStats", + "GenerativeRequestStats", + "GenerativeRequestsAggregator", + "GenerativeStatsProgressAggregator", + "InjectExtrasAggregator", "Profile", "ProfileType", - "StatusBreakdown", + "SchedulerStatsAggregator", + "SerializableAggregator", "SweepProfile", "SynchronousProfile", "ThroughputProfile", "benchmark_generative_text", - "create_profile", "reimport_benchmarks_report", ] diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index 9e6ffd68..9db93a12 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -1,760 +1,1260 @@ -import time +""" +Benchmark result aggregation and compilation interfaces. 
+ +Provides protocols and implementations for collecting, processing, and compiling +benchmark data from scheduler executions into final metrics and statistics. + +Classes: + Aggregator: Protocol for processing benchmark data updates. + CompilableAggregator: Protocol for aggregators that can compile final results. + SchedulerStatsAggregator: Aggregates scheduler timing and performance metrics. + GenerativeRequestsStatsProgressAggregator: Tracks generation metrics during run. + GenerativeRequestsAggregator: Compiles complete generative benchmark results. + +Functions: + add_aggregate_metric: Helper for accumulating timing and count metrics. + +Type Variables: + RequestT: Generic request object type. + ResponseT: Generic response object type. + RequestTimingsT: Generic request timing object type. +""" + +from __future__ import annotations + +import math +import random from abc import ABC, abstractmethod -from pathlib import Path from typing import ( Any, + ClassVar, Generic, Literal, - Optional, - TypeVar, - Union, + Protocol, + runtime_checkable, ) -from pydantic import Field +from pydantic import Field, PrivateAttr -from guidellm.backend import ResponseSummary -from guidellm.benchmark.benchmark import ( - BenchmarkArgs, - BenchmarkRunStats, - BenchmarkT, - GenerativeBenchmark, - GenerativeTextErrorStats, - GenerativeTextResponseStats, +from guidellm.backends import ( + GenerationRequest, + GenerationResponse, ) -from guidellm.objects import ( - RunningStats, - StandardBaseModel, - StatusBreakdown, - TimeRunningStats, +from guidellm.benchmark.objects import ( + BenchmarkSchedulerStats, + GenerativeMetrics, + GenerativeRequestStats, ) -from guidellm.request import ( - GenerationRequest, - GenerativeRequestLoaderDescription, - RequestLoaderDescription, +from guidellm.scheduler import ( RequestT, ResponseT, -) -from guidellm.scheduler import ( - GenerativeRequestsWorkerDescription, - SchedulerRequestResult, - WorkerDescription, + ScheduledRequestInfo, + SchedulerState, ) from guidellm.settings import settings -from guidellm.utils import check_load_processor +from guidellm.utils import ( + InfoMixin, + PydanticClassRegistryMixin, + StatusBreakdown, + StatusDistributionSummary, + all_defined, + safe_divide, + safe_getattr, +) __all__ = [ - "AggregatorT", - "BenchmarkAggregator", - "GenerativeBenchmarkAggregator", + "Aggregator", + "AggregatorState", + "CompilableAggregator", + "GenerativeRequestsAggregator", + "GenerativeStatsProgressAggregator", + "InjectExtrasAggregator", + "SchedulerStatsAggregator", + "SerializableAggregator", ] -class SchedulerRunningStats(StandardBaseModel): +class AggregatorState(dict[str, Any]): + def add_metric( + self, + key: str, + value: int | float | None, + start_val: int | float | None = 0.0, + count: int | None = 1, + duration: float | None = None, + duration_div: Literal["total", "avg"] = "total", + prefix: str | None = None, + ): + """ + Add timing or count metrics to aggregation state. 
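+
+        The value delta (``value - start_val``) is accumulated into ``{key}_total``
+        and ``{key}_count``, and the running ``{key}_avg`` is refreshed; when
+        ``duration`` is provided, ``{key}_duration`` and ``{key}_rate`` are set.
+
+        :param key: Base name for the metric entries written into the state.
+        :param value: Measured value; the update is skipped when undefined.
+        :param start_val: Baseline subtracted from ``value`` before accumulating.
+        :param count: Number of observations represented by this update.
+        :param duration: Elapsed time used to derive ``{key}_rate``, if given.
+        :param duration_div: Whether the rate divides the running total or the
+            running average by ``duration``.
+        :param prefix: Optional prefix prepended to ``key`` for namespaced metrics.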
+ """ + if prefix: + self.add_metric( + key=f"{prefix}_{key}", + value=value, + start_val=start_val, + count=count, + duration=duration, + duration_div=duration_div, + ) + return + + if not all_defined(value, start_val, count): + return + + delta_val = value - start_val + self[f"{key}_total"] = self.get(f"{key}_total", 0) + delta_val + self[f"{key}_count"] = self.get(f"{key}_count", 0) + count + self[f"{key}_avg"] = safe_divide( + self.get(f"{key}_total"), self.get(f"{key}_count") + ) + + if all_defined(duration): + self[f"{key}_duration"] = duration + self[f"{key}_rate"] = safe_divide( + self.get(f"{key}_{duration_div}"), duration + ) + + def set_metric( + self, + key: str, + value: int | float | None, + type_: Literal["total", "count", "avg", "duration", "rate"], + prefix: str | None = None, + ): + if prefix: + self.set_metric( + key=f"{prefix}_{key}", + value=value, + type_=type_, + prefix=None, + ) + return + + self[f"{key}_{type_}"] = value + + def get_metric( + self, + key: str, + type_: Literal["total", "count", "avg", "duration", "rate"], + default: int | float | None = None, + prefix: str | None = None, + ) -> int | float | None: + if prefix: + return self.get_metric( + key=f"{prefix}_{key}", + type_=type_, + default=default, + ) + + return self.get(f"{key}_{type_}", default) + + +@runtime_checkable +class Aggregator(Protocol[ResponseT, RequestT]): """ - The metrics for the scheduler stored as running statistics for easy calculations - of rates, averages, totals, etc. + Protocol for processing benchmark data updates during execution. + + Defines the interface for aggregators that collect and process request/response + data from scheduler executions. Implementations update aggregation state with + each completed request for eventual compilation into final metrics. """ - created_requests: RunningStats = Field( - description=( - "The running statistics for the number of requests created for this " - "benchmark run. This includes all requests created, regardless of " - "their status." - ), - default_factory=RunningStats, - ) - queued_requests: RunningStats = Field( - description=( - "The running statistics for the number of requests pending in queue " - "for this benchmark run. This includes requests that are waiting to " - "be scheduled." - ), - default_factory=RunningStats, - ) - scheduled_requests: RunningStats = Field( - description=( - "The running statistics for the number of requests scheduled (actively " - "running but waiting for the desired start time) for this benchmark run." - ), - default_factory=RunningStats, - ) - processing_requests: RunningStats = Field( - description=( - "The running statistics for the number of requests actively being " - "processed by the worker for this benchmark run." - ), - default_factory=RunningStats, - ) - completed_requests: RunningStats = Field( - description=( - "The running statistics for the number of requests completed for this " - "benchmark run. This includes requests within the warmup and cooldown " - "period, if any, along with the final results." - ), - default_factory=RunningStats, - ) + def __call__( + self, + state: AggregatorState, + response: ResponseT | None, + request: RequestT, + request_info: ScheduledRequestInfo, + scheduler_state: SchedulerState, + ) -> dict[str, Any] | None: + """ + Process a completed request and update aggregation state. + + :param state: Current aggregation state to update in-place. + :param response: Response generated for the request, if successful. + :param request: The processed request object. 
+ :param request_info: Scheduling metadata and timing information. + :param scheduler_state: Current scheduler execution state. + :return: Optional intermediate updates for progress reporting. + """ -class RequestsRunningStats(StandardBaseModel): +@runtime_checkable +class CompilableAggregator(Protocol[ResponseT, RequestT]): """ - The metrics for requests that have succeeded, been canceled, or errored stored - as running statistics for easy calculations of rates, averages, totals, etc. + Protocol for aggregators that compile final results from aggregated state. + + Extends the Aggregator protocol with the ability to transform accumulated + state into final benchmark results and metrics after execution completes. """ - totals: StatusBreakdown[RunningStats, RunningStats, RunningStats, RunningStats] = ( - Field( - description=( - "The running statistics for the total number of requests that " - "completed within the benchmark run." - ), - default_factory=lambda: StatusBreakdown( - successful=RunningStats(), - errored=RunningStats(), - incomplete=RunningStats(), - total=RunningStats(), - ), - ) - ) - queued_time: TimeRunningStats = Field( - description=( - "The running statistics for the time spent in queue for all requests that " - "completed within the benchmark run. This is the time from when the " - "request was created to when it was dequeued by the worker." - ), - default_factory=TimeRunningStats, - ) - scheduled_time_delay: TimeRunningStats = Field( - description=( - "The running statistics for the time spent from when a request was " - "dequeued by the worker to when it was actually scheduled by the worker" - "for all requests that completed within the benchmark run. " - "This should be as close to 0 as possible, any additional time is " - "overheads from the system or the worker." - ), - default_factory=TimeRunningStats, - ) - scheduled_time_sleep: TimeRunningStats = Field( - description=( - "The running statistics for the time for each request spent sleeping til " - "the desired start time was reached for all requests that completed within " - "the benchmark run. This is the time from when the request was scheduled " - "to when the desired start time was reached. " - ), - default_factory=TimeRunningStats, - ) - worker_start_delay: TimeRunningStats = Field( - description=( - "The running statistics for the time delay between when the request was " - "scheduled and when the worker actually started processing subtracting any " - "sleep time for all requests that completed within the benchmark run. " - "This should be as close to 0 as possible, any additional time is " - "overheads from the system or the worker." - ), - default_factory=TimeRunningStats, - ) - worker_time: TimeRunningStats = Field( - description=( - "The running statistics for the time spent processing all requests that " - "completed within the benchmark run. This is the time from when the " - "request was started to when it was completed." - ), - default_factory=TimeRunningStats, - ) - worker_start_time_targeted_delay: TimeRunningStats = Field( - description=( - "The running statistics for the delay between the targeted start time and " - "the actual start time for requests that completed within the benchmark " - "run. This represents delays from the best case desired start time. " - "For async strategies, this represents delays from the ideal system. " - "For sync strategies, since those are doubled in queue, this should be " - "as close to the time for a request to be processed as possible." 
- ), - default_factory=TimeRunningStats, - ) - request_start_time_delay: TimeRunningStats = Field( - description=( - "The running statistics for the delay between the actual request being " - "made and the time the worker started on the request for all requests " - "that completed within the benchmark run. This time should be as close to " - "0 as possible, any additional time is overhead from the system or " - "the worker." - ), - default_factory=TimeRunningStats, - ) - request_start_time_targeted_delay: TimeRunningStats = Field( - description=( - "The running statistics for the delay between the targeted start time and " - "the actual start time for all requests that completed within the " - "benchmark run. This represents delays from the best case desired start " - "time. For async strategies, this represents delays from the ideal system. " - "For sync strategies, since those are duplicated in queue, this should be " - "as close to the time for a request to be processed." - ), - default_factory=TimeRunningStats, - ) - request_time_delay: TimeRunningStats = Field( - description=( - "The running statistics for the delay in time between the total request " - "time and the worker time. This should be as close to 0 as possible, any " - "additional time is overhead from the system or the worker. " - ), - default_factory=TimeRunningStats, - ) - request_time: TimeRunningStats = Field( - description=( - "The running statistics for the time spent processing all requests that " - "completed within the benchmark run. This is the time from when the " - "request was created to when it was completed." - ), - default_factory=TimeRunningStats, - ) + def __call__( + self, + state: AggregatorState, + response: ResponseT | None, + request: RequestT, + request_info: ScheduledRequestInfo, + scheduler_state: SchedulerState, + ) -> dict[str, Any] | None: + """ + Process a completed request and update aggregation state. + :param state: Current aggregation state to update in-place. + :param response: Response generated for the request, if successful. + :param request: The processed request object. + :param request_info: Scheduling metadata and timing information. + :param scheduler_state: Current scheduler execution state. + :return: Optional intermediate updates for progress reporting. + """ -class BenchmarkAggregator( - ABC, StandardBaseModel, Generic[BenchmarkT, RequestT, ResponseT] + def compile( + self, state: AggregatorState, scheduler_state: SchedulerState + ) -> dict[str, Any]: + """ + Compile aggregated state into final benchmark results. + + :param agg_state: The accumulated aggregation state. + :param scheduler_state: Final scheduler execution state. + :return: Compiled benchmark results and metrics. + """ + + +class SerializableAggregator( + PydanticClassRegistryMixin[type["SerializableAggregator"]], + ABC, + Generic[ResponseT, RequestT], ): + schema_discriminator: ClassVar[str] = "type_" + + @classmethod + def __pydantic_schema_base_type__(cls) -> type[SerializableAggregator]: + if cls.__name__ == "SerializableAggregator": + return cls + + return SerializableAggregator + + @classmethod + @abstractmethod + def validated_kwargs(cls, *args, **kwargs) -> dict[str, Any]: + """ + Validate and process arguments for constraint creation. + + Must be implemented by subclasses to handle their specific parameter patterns. 
+ + :param args: Positional arguments passed to the constraint + :param kwargs: Keyword arguments passed to the constraint + :return: Validated dictionary of parameters for constraint creation + :raises NotImplementedError: Must be implemented by subclasses + """ + ... + + @classmethod + def resolve( + cls, + aggregators: dict[ + str, + Any | dict[str, Any] | Aggregator | CompilableAggregator, + ], + ) -> dict[str, Aggregator | CompilableAggregator]: + """ + Resolve mixed aggregator specifications to callable aggregators. + + :param aggregators: Dictionary mapping aggregator keys to specifications + :return: Dictionary mapping aggregator keys to callable functions + :raises ValueError: If any key is not registered in the factory + """ + resolved = {} + + for key, val in aggregators.items(): + if isinstance(val, (Aggregator, CompilableAggregator)): + resolved[key] = val + else: + aggregator_class = cls.get_registered_object(key) + kwargs = aggregator_class.validated_kwargs(**val) + resolved[key] = aggregator_class(**kwargs) + + return resolved + + type_: Literal["aggregator"] = Field(default="aggregator", description="") + + @abstractmethod + def __call__( + self, + state: AggregatorState, + response: ResponseT | None, + request: RequestT, + request_info: ScheduledRequestInfo, + scheduler_state: SchedulerState, + ) -> dict[str, Any] | None: + """ + Process a completed request and update aggregation state. + + :param agg_state: Current aggregation state to update in-place. + :param response: Response generated for the request, if successful. + :param request: The processed request object. + :param request_info: Scheduling metadata and timing information. + :param scheduler_state: Current scheduler execution state. + :return: Optional intermediate updates for progress reporting. + """ + + @abstractmethod + def compile( + self, state: AggregatorState, scheduler_state: SchedulerState + ) -> dict[str, Any]: + """ + Compile aggregated state into final benchmark results. + + :param agg_state: The accumulated aggregation state. + :param scheduler_state: Final scheduler execution state. + :return: Compiled benchmark results and metrics. + """ + + +@SerializableAggregator.register("inject_extras") +class InjectExtrasAggregator(SerializableAggregator[ResponseT, RequestT], InfoMixin): """ - A pydantic base class representing the base class for aggregating benchmark results. - The purpose is to receive and process results from a Benchmarker as it iterates - through a Scheduler for an individual benchmark run. - As results are added, lightweight statistics are updated and stored for immediate - progress and informational updates to the caller. - Once the benchmark run is complete, the `compile` method is called to finalize - the benchmark and return a Benchmark object with all the results and statistics - fully calculated. + Aggregator for injecting extra metadata into the output. """ - type_: Literal["benchmark_aggregator"] = "benchmark_aggregator" - run_id: str = Field( - description=( - "The unique identifier for the encompasing benchmark run that this " - "benchmark was a part of." - ) - ) - args: BenchmarkArgs = Field( - description=( - "The arguments used to create the benchmark run that this benchmark was " - "a part of." - ) - ) - worker_description: Union[ - GenerativeRequestsWorkerDescription, WorkerDescription - ] = Field( - description=( - "The description and specifics for the worker used to resolve requests " - "for this benchmark." 
- ), - discriminator="type_", - ) - request_loader_description: Union[ - GenerativeRequestLoaderDescription, RequestLoaderDescription - ] = Field( - description=( - "The description and specifics for the request loader used to create " - "requests for this benchmark." - ), - discriminator="type_", - ) - extras: dict[str, Any] = Field( - description=( - "Any additional information or metadata that was passed for this benchmark." - ) - ) - in_warmup: bool = Field( - description=( - "A flag to indicate if the benchmark is currently in the warmup phase." - ), - default=False, - exclude=True, - ) - in_cooldown: bool = Field( - description=( - "A flag to indicate if the benchmark is currently in the cooldown phase." - ), - default=False, - exclude=True, - ) - scheduler_stats: SchedulerRunningStats = Field( - description=( - "The running statistics for the scheduler for this benchmark run. " - "This includes all requests created, regardless of their status." - ), - default_factory=SchedulerRunningStats, - ) - requests_stats: RequestsRunningStats = Field( - description=( - "The running statistics for the requests for this benchmark run. " - "This includes all requests created, regardless of their status." - ), - default_factory=RequestsRunningStats, - ) - results: StatusBreakdown[ - list[SchedulerRequestResult[RequestT, ResponseT]], - list[SchedulerRequestResult[RequestT, ResponseT]], - list[SchedulerRequestResult[RequestT, ResponseT]], - None, - ] = Field( - description=( - "The completed requests for this benchmark run broken down by status" - "and excluding warmup and cooldown requests." - ), - default_factory=lambda: StatusBreakdown( # type: ignore[arg-type] - successful=[], - errored=[], - incomplete=[], - total=None, - ), - ) + @classmethod + def validated_kwargs(cls, extras: dict[str, Any], **_kwargs) -> dict[str, Any]: + return {"extras": extras} - def add_result( + type_: Literal["inject_extras"] = Field(default="inject_extras") + extras: dict[str, Any] | None = Field(default_factory=None) + + def __call__( self, - result: SchedulerRequestResult[RequestT, ResponseT], - ) -> bool: + state: AggregatorState, + response: ResponseT | None, + request: RequestT, + request_info: ScheduledRequestInfo, + scheduler_state: SchedulerState, + ) -> dict[str, Any] | None: """ - Add a result to the aggregator. This will update the internal statistics - and add the result to the list of results if it is not within the warmup or - cooldown period. - - :param result: The result to add to the aggregator. - :return: True if the result was added, False if it was added because it - did not fit within the warmup or cooldown period, was not requested, - or is not finished + Inject extra metadata into the aggregation state. + + :param agg_state: Current aggregation state to update. + :param response: Response generated for the request, if successful. + :param request: The processed request object. + :param request_info: Scheduling metadata and timing information. + :param scheduler_state: Current scheduler execution state. + :return: Updated aggregation state with injected extras. 
""" - # Add scheduler statistics - self.scheduler_stats.created_requests += max( - 0, result.run_info.created_requests - ) - self.scheduler_stats.queued_requests += max(0, result.run_info.queued_requests) - self.scheduler_stats.scheduled_requests += max( - 0, result.run_info.scheduled_requests - ) - self.scheduler_stats.processing_requests += max( - 0, result.run_info.processing_requests - ) - self.scheduler_stats.completed_requests += max( - 0, result.run_info.completed_requests - ) + _ = (state, response, request, request_info, scheduler_state) # unused + return None - if result.type_ != "request_complete" or ( - result.request_info.canceled and not result.request_info.requested - ): - # If the result is not completed yet, don't add to the results - # If the result was canceled and not started, ignore it - return False + def compile( + self, state: AggregatorState, scheduler_state: SchedulerState + ) -> dict[str, Any]: + _ = (state, scheduler_state) # unused + return {"extras": self.extras} if self.extras else {} - # Add request statistics - self.requests_stats.totals.total += 1 - if result.request_info.canceled: - self.requests_stats.totals.incomplete += 1 - elif result.request_info.errored: - self.requests_stats.totals.errored += 1 - elif result.request_info.completed: - self.requests_stats.totals.successful += 1 - else: - raise ValueError( - "Unexpected state: request_info must be either " - "completed, canceled, or errored. " - f"Got {result.request_info}" - ) - self.requests_stats.queued_time.update( - result.request_info.dequeued_time - result.request_info.queued_time - ) - self.requests_stats.scheduled_time_delay.update( - result.request_info.scheduled_time - result.request_info.dequeued_time +@SerializableAggregator.register("scheduler_stats") +class SchedulerStatsAggregator(SerializableAggregator[ResponseT, RequestT], InfoMixin): + """ + Aggregates scheduler timing and performance metrics. + + Collects timing data for various scheduler phases including queuing, + resolution, and processing delays to generate performance statistics. + """ + + @classmethod + def validated_kwargs(cls, *_args, **_kwargs) -> dict[str, Any]: + return {} + + type_: Literal["scheduler_stats"] = Field(default="scheduler_stats") + + def __call__( + self, + state: AggregatorState, + response: ResponseT | None, + request: RequestT, + request_info: ScheduledRequestInfo, + scheduler_state: SchedulerState, + ) -> dict[str, Any] | None: + """ + Aggregate scheduler timing metrics for a completed request. + + :param agg_state: Current aggregation state to update. + :param response: Response generated for the request, if successful. + :param request: The processed request object. + :param request_info: Scheduling metadata and timing information. + :param scheduler_state: Current scheduler execution state. + :return: Updated aggregation state for intermediate reporting. 
+ """ + _ = (response, request, scheduler_state) # unused + if request_info.status not in ("completed", "errored", "cancelled"): + # Only compile scheduler stats for processed requests + return None + + state["updated_scheduler_stats"] = True + state.add_metric( + key="queued_time", + value=request_info.scheduler_timings.dequeued, + start_val=request_info.scheduler_timings.queued, ) - sleep_time = max( - 0.0, - result.request_info.targeted_start_time - - result.request_info.scheduled_time, + state.add_metric( + key="worker_resolve_start_delay", + value=request_info.scheduler_timings.resolve_start, + start_val=request_info.scheduler_timings.scheduled_at, ) - self.requests_stats.scheduled_time_sleep.update(sleep_time) - time_to_worker_start = ( - result.request_info.worker_start - result.request_info.scheduled_time + state.add_metric( + key="worker_resolve_time", + value=request_info.scheduler_timings.resolve_end, + start_val=request_info.scheduler_timings.resolve_start, ) - self.requests_stats.worker_start_delay.update(time_to_worker_start - sleep_time) - self.requests_stats.worker_time.update( - result.request_info.worker_end - result.request_info.worker_start + state.add_metric( + key="worker_resolve_end_delay", + value=request_info.scheduler_timings.resolve_end, + start_val=safe_getattr(request_info.request_timings, "request_end"), ) - self.requests_stats.worker_start_time_targeted_delay.update( - result.request_info.worker_start - result.request_info.targeted_start_time + state.add_metric( + key="finalized_delay", + value=request_info.scheduler_timings.finalized, + start_val=request_info.scheduler_timings.resolve_end, ) - self.requests_stats.request_start_time_delay.update( - result.request_info.worker_start - result.request_info.targeted_start_time + state.add_metric( + key="worker_targeted_start_delay", + value=request_info.scheduler_timings.resolve_start, + start_val=request_info.scheduler_timings.targeted_start, ) - self.requests_stats.request_start_time_targeted_delay.update( - result.request_info.worker_start - result.request_info.targeted_start_time + state.add_metric( + key="request_start_delay", + value=request_info.scheduler_timings.resolve_start, + start_val=safe_getattr(request_info.request_timings, "request_start"), ) - self.requests_stats.request_time_delay.update( - (result.request_info.worker_end - result.request_info.worker_start) - - (result.request_info.worker_end - result.request_info.worker_start) + state.add_metric( + key="request_time", + value=safe_getattr(request_info.request_timings, "request_end"), + start_val=safe_getattr(request_info.request_timings, "request_start"), ) - self.requests_stats.request_time.update( - result.request_info.worker_end - result.request_info.worker_start + state.add_metric( + key="request_targeted_start_delay", + value=safe_getattr(request_info.request_timings, "request_start"), + start_val=request_info.scheduler_timings.targeted_start, ) - # Add result to the list of results provided we are not in warmup or cooldown - total_completed = self.requests_stats.totals.total.total - global_start_time = self.requests_stats.totals.total.start_time + return state - in_warmup_number = ( - self.args.warmup_number and total_completed <= self.args.warmup_number - ) - in_warmup_duration = ( - self.args.warmup_duration - and result.request_info.worker_start - <= (global_start_time + self.args.warmup_duration) - ) + def compile( + self, state: AggregatorState, scheduler_state: SchedulerState + ) -> dict[Literal["scheduler_stats"], 
BenchmarkSchedulerStats]: + """ + Compile scheduler timing metrics into benchmark statistics. + + :param agg_state: Accumulated timing data and counts. + :param scheduler_state: Final scheduler execution state. + :return: Dictionary containing compiled scheduler statistics. + """ + return { + "run_stats": BenchmarkSchedulerStats( + start_time=scheduler_state.start_time, + end_time=scheduler_state.end_time, + requests_made=StatusBreakdown[int, int, int, int]( + successful=scheduler_state.successful_requests, + incomplete=scheduler_state.cancelled_requests, + errored=scheduler_state.errored_requests, + total=( + scheduler_state.successful_requests + + scheduler_state.cancelled_requests + + scheduler_state.errored_requests + ), + ), + queued_time_avg=state.get_metric( + key="queued_time", type_="avg", default=0.0 + ), + worker_resolve_start_delay_avg=state.get_metric( + key="worker_resolve_start_delay", type_="avg", default=0.0 + ), + worker_resolve_time_avg=state.get_metric( + key="worker_resolve_time", type_="avg", default=0.0 + ), + worker_resolve_end_delay_avg=state.get_metric( + key="worker_resolve_end_delay", type_="avg" + ), + finalized_delay_avg=state.get_metric( + key="finalized_delay", type_="avg", default=0.0 + ), + worker_targeted_start_delay_avg=state.get_metric( + key="worker_targeted_start_delay", type_="avg", default=0.0 + ), + request_start_delay_avg=state.get_metric( + key="request_start_delay", type_="avg", default=0.0 + ), + request_time_avg=state.get_metric( + key="request_time", type_="avg", default=0.0 + ), + request_targeted_start_delay_avg=state.get_metric( + key="request_targeted_start_delay", type_="avg", default=0.0 + ), + ), + } - if in_warmup_number or in_warmup_duration: - self.in_warmup = True - return True - self.in_warmup = False - in_cooldown_number = ( - self.args.cooldown_number - and self.args.max_number - and total_completed > self.args.max_number - self.args.cooldown_number - ) - in_cooldown_duration = ( - self.args.cooldown_duration - and self.args.max_duration - and result.request_info.worker_start - > global_start_time + self.args.max_duration - self.args.cooldown_duration +@SerializableAggregator.register("generative_stats_progress") +class GenerativeStatsProgressAggregator( + SerializableAggregator[GenerationResponse, GenerationRequest] +): + """ + Tracks generative model metrics during benchmark execution. + + Aggregates token-level metrics including time to first token, inter-token + latency, and token counts for real-time progress monitoring. + """ + + @classmethod + def validated_kwargs(cls, *_args, **_kwargs) -> dict[str, Any]: + return {} + + type_: Literal["generative_stats_progress"] = Field( + default="generative_stats_progress" + ) + + def __call__( + self, + state: AggregatorState, + response: GenerationResponse | None, + request: GenerationRequest, + request_info: ScheduledRequestInfo, + scheduler_state: SchedulerState, + ) -> dict[str, Any] | None: + """ + Aggregate generative model metrics for a completed request. + + :param agg_state: Current aggregation state to update. + :param response: Generation response with token and timing data. + :param request: The processed generation request. + :param request_info: Scheduling metadata and timing information. + :param scheduler_state: Current scheduler execution state. + :return: Updated aggregation state for progress reporting. 
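+
+        Tracks request rate and concurrency, request latency, time to first token,
+        inter-token latency, time per output token, and prompt/output/total token
+        counts, both overall and broken down by request status.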
+ """ + _ = (request,) # unused + if request_info.status not in {"completed", "errored", "cancelled"}: + # Only compile progress stats for processed requests + return None + + state["updated_generative_stats"] = True + start_time = scheduler_state.start_time + end_time = ( + safe_getattr(request_info.request_timings, "request_end") + or request_info.scheduler_timings.resolve_end ) + duration = end_time - start_time if end_time else None - if in_cooldown_number or in_cooldown_duration: - self.in_cooldown = True - return True + for prefix in (request_info.status, None): + requests_count = ( + scheduler_state.processed_requests + if prefix is None + else scheduler_state.successful_requests + if request_info.status == "completed" + else scheduler_state.cancelled_requests + if request_info.status == "cancelled" + else scheduler_state.errored_requests + ) - self.in_cooldown = False + # Requests per Second + if duration is not None: + state.set_metric( + key="requests", + value=safe_divide(requests_count, duration), + type_="rate", + prefix=prefix, + ) - if result.request_info.canceled: - self.results.incomplete.append(result) - elif result.request_info.errored: - self.results.errored.append(result) - elif result.request_info.completed: - self.results.successful.append(result) - else: - raise ValueError( - "Unexpected state: request_info must be either " - "completed, canceled, or errored. " - f"Got {result.request_info}" + # Request Concurrency + state.set_metric( + key="requests", + value=scheduler_state.processing_requests, + type_="avg", + prefix=prefix, ) - return True + # Request Latency + state.add_metric( + key="request_latency", + value=safe_getattr(request_info.request_timings, "request_end"), + start_val=safe_getattr(request_info.request_timings, "request_start"), + prefix=prefix, + ) - @abstractmethod - def compile(self) -> BenchmarkT: - """ - Compile the benchmark results and statistics into a Benchmark object. - This is required to be implemented by subclasses to finalize the benchmark - and return the compiled object. 
+ # Time to First Token + state.add_metric( + key="time_to_first_token", + value=safe_getattr(request_info.request_timings, "first_iteration"), + start_val=safe_getattr(request_info.request_timings, "request_start"), + prefix=prefix, + ) + + output_tokens = safe_getattr(response, "output_tokens") + prompt_tokens = safe_getattr(response, "prompt_tokens") + + # Inter Token Latency + state.add_metric( + key="inter_token_latency", + value=safe_getattr(request_info.request_timings, "last_iteration"), + start_val=safe_getattr(request_info.request_timings, "first_iteration"), + count=( + output_tokens - 1 if output_tokens and output_tokens > 1 else None + ), + prefix=prefix, + ) + + # Time per Output Token + state.add_metric( + key="time_per_output_token", + value=safe_getattr(request_info.request_timings, "request_start"), + start_val=safe_getattr(request_info.request_timings, "last_iteration"), + count=output_tokens, + prefix=prefix, + ) + + # Prompt Tokens + state.add_metric( + key="prompt_tokens", + value=prompt_tokens, + duration=duration, + prefix=prefix, + ) + + # Output Tokens + state.add_metric( + key="output_tokens", + value=output_tokens, + duration=duration, + prefix=prefix, + ) + + # Total Tokens + state.add_metric( + key="total_tokens", + value=( + prompt_tokens + output_tokens + if all_defined(prompt_tokens, output_tokens) + else prompt_tokens + if all_defined(prompt_tokens) + else output_tokens + if all_defined(output_tokens) + else None + ), + duration=duration, + prefix=prefix, + ) + + return state + + def compile( + self, state: AggregatorState, scheduler_state: SchedulerState + ) -> dict[str, Any]: """ - ... + Compile progress metrics into final results. + GenerativeStatsProgressAggregator is primarily for progress tracking, + so compilation returns the aggregated state as-is. -AggregatorT = TypeVar("AggregatorT", bound=BenchmarkAggregator) + :param agg_state: The accumulated aggregation state. + :param scheduler_state: Final scheduler execution state. + :return: The aggregated state as final results. + """ + _ = (state, scheduler_state) # unused + return {} -class GenerativeRequestsRunningStats(RequestsRunningStats): +@SerializableAggregator.register("generative_requests") +class GenerativeRequestsAggregator( + SerializableAggregator[GenerationResponse, GenerationRequest], +): """ - The metrics for generative requests that have succeeded, been canceled, or errored - stored as running statistics for easy calculations of rates, averages, totals, etc. + Compiles complete generative benchmark results with warmup/cooldown filtering. + + Aggregates request data during execution and compiles comprehensive metrics + including timing distributions, token statistics, and throughput measurements. + Supports filtering warmup and cooldown periods from final results. """ - time_to_first_token: TimeRunningStats = Field( - description=( - "The running statistics for the time from the start of the request to the " - "first token being generated for all requests that completed within the " - "benchmark run." - ), - default_factory=TimeRunningStats, - ) - inter_token_latency: TimeRunningStats = Field( - description=( - "The running statistics for the time between each token being generated " - "for all requests that completed within the benchmark run." - ), - default_factory=TimeRunningStats, - ) - prompt_tokens: RunningStats = Field( - description=( - "The running statistics for the token count for the prompt for all " - "requests that completed, if available in the response." 
- ), - default_factory=RunningStats, - ) - output_tokens: RunningStats = Field( - description=( - "The running statistics for the token count for the output for all " - "requests that completed, if available in the response." - ), - default_factory=RunningStats, - ) - total_tokens: RunningStats = Field( - description=( - "The running statistics for the total token count for all requests that " - "completed, if available in the response." - ), - default_factory=RunningStats, - ) + @classmethod + def validated_kwargs( + cls, + request_samples: int | None = 20, + warmup: int | float | None = None, + cooldown: int | float | None = None, + **_kwargs, + ) -> dict[str, Any]: + return { + "request_samples": request_samples, + "warmup": warmup, + "cooldown": cooldown, + } + type_: Literal["generative_requests"] = Field(default="generative_requests") -class GenerativeBenchmarkAggregator( - BenchmarkAggregator[GenerativeBenchmark, GenerationRequest, ResponseSummary] -): - type_: Literal["generative_benchmark_aggregator"] = ( - "generative_benchmark_aggregator" # type: ignore[assignment] - ) - processor: Optional[Union[str, Path, Any]] = Field( - description=( - "The tokenizer to use for calculating token counts when none are " - "avaiable that match the preferred source." - ) + request_samples: int | None = Field(default=20, description="") + warmup: int | float | None = Field( + default=None, + description="Number of warmup requests to ignore at benchmark start", ) - processor_args: Optional[dict[str, Any]] = Field( - description=( - "Additional arguments to pass to the tokenizer if it requires " - "any specific configuration for loading or processing." - ), - ) - worker_description: GenerativeRequestsWorkerDescription = Field( - description=( - "The description and specifics for the worker used to resolve requests " - "for this benchmark." - ), - discriminator="type_", - ) - request_loader_description: GenerativeRequestLoaderDescription = Field( - description=( - "The description and specifics for the request loader used to create " - "requests for this benchmark." - ), - discriminator="type_", - ) - requests_stats: GenerativeRequestsRunningStats = Field( - description=( - "The running statistics for the requests for this benchmark run. " - "This includes all requests created, regardless of their status." - ), - default_factory=GenerativeRequestsRunningStats, + cooldown: int | float | None = Field( + default=None, + description="Number of cooldown requests to ignore at benchmark end", ) + _in_cooldown: bool = PrivateAttr(False) + _in_warmup: bool = PrivateAttr(False) - def add_result( - self, result: SchedulerRequestResult[GenerationRequest, ResponseSummary] - ) -> bool: + def __call__( + self, + state: AggregatorState, + response: GenerationResponse | None, + request: GenerationRequest, + request_info: ScheduledRequestInfo, + scheduler_state: SchedulerState, + ) -> dict[str, Any] | None: """ - Add a result to the aggregator. This will update the internal statistics - and add the result to the list of results if it is not within the warmup or - cooldown period. + Collect completed requests for final compilation. - :param result: The result to add to the aggregator. + Filters requests based on warmup/cooldown settings and categorizes by + completion status for comprehensive benchmark analysis. + + :param agg_state: Current aggregation state to update. + :param response: Generation response data. + :param request: The processed generation request. 
+ :param request_info: Scheduling metadata and timing information. + :param scheduler_state: Current scheduler execution state. + :return: None, as this aggregator only collects for final compilation. """ - if not super().add_result(result): - return False + # Skip invalid requests + if request_info.status not in {"completed", "canceled", "errored"} or ( + request_info.status == "canceled" + and safe_getattr(request_info.scheduler_timings, "resolve_start") is None + # Canceled requests that never started should not be kept + ): + return None - if result.request is None: - raise ValueError("Request is None, cannot add result.") + status = { + "updated_generative_requests": True, + "requests_in_warmup": False, + "requests_in_cooldown": False, + } - if result.response is None: - raise ValueError("Response is None, cannot add result.") + if self._is_in_warmup(request_info, scheduler_state): + status["requests_in_warmup"] = True + return status - self.requests_stats.request_start_time_delay.update( - result.response.start_time - result.request_info.worker_start - ) - self.requests_stats.request_start_time_targeted_delay.update( - result.response.start_time - result.request_info.targeted_start_time - ) - self.requests_stats.request_time_delay.update( - (result.response.start_time - result.request_info.worker_start) - + result.request_info.worker_end - - result.response.end_time - ) - self.requests_stats.request_time.update( - result.response.end_time - result.response.start_time - ) - if result.response.first_iter_time: - self.requests_stats.time_to_first_token.update( - result.response.first_iter_time - result.response.start_time - ) - if result.response.last_iter_time and result.response.first_iter_time: - self.requests_stats.inter_token_latency.update( - result.response.last_iter_time - result.response.first_iter_time, - count=(result.response.output_tokens or 1) - 1, - ) - self.requests_stats.prompt_tokens += result.response.request_prompt_tokens or 0 - self.requests_stats.output_tokens += result.response.request_output_tokens or 0 - total_tokens = (result.response.request_prompt_tokens or 0) + ( - result.response.request_output_tokens or 0 - ) - self.requests_stats.total_tokens += total_tokens + if self._is_in_cooldown(request_info, scheduler_state): + status["requests_in_cooldown"] = True + return status - return True + if "completed" not in state: + state["completed"] = [] + state["errored"] = [] + state["incomplete"] = [] - def compile(self) -> GenerativeBenchmark: + # Categorize request by status + if request_info.status == "completed": + state["completed"].append((response, request, request_info)) + elif request_info.status == "canceled": + state["incomplete"].append((response, request, request_info)) + else: + state["errored"].append((response, request, request_info)) + + return status + + def compile( + self, + state: AggregatorState, + scheduler_state: SchedulerState, # noqa: ARG002 + ) -> dict[str, Any]: """ - Compile the benchmark results and statistics into a GenerativeBenchmark object. - This is required to be implemented by subclasses to finalize the benchmark - and return the compiled object. + Compile aggregated requests into comprehensive benchmark results. + + Transforms collected request data into detailed metrics including timing + distributions, token statistics, throughput measurements, and status breakdowns. + + :param agg_state: Accumulated request data categorized by completion status. + :param scheduler_state: Final scheduler execution state. 
+ :return: Complete benchmark results with metrics and request statistics. """ - successful, incomplete, errored = self._compile_results() - - return GenerativeBenchmark.from_stats( - run_id=self.run_id, - successful=successful, - incomplete=incomplete, - errored=errored, - args=self.args, - run_stats=BenchmarkRunStats( - start_time=self.requests_stats.totals.total.start_time, - end_time=time.time(), - requests_made=StatusBreakdown( - successful=int(self.requests_stats.totals.successful.total), - errored=int(self.requests_stats.totals.errored.total), - incomplete=int(self.requests_stats.totals.incomplete.total), - total=int(self.requests_stats.totals.total.total), - ), - queued_time_avg=self.requests_stats.queued_time.mean, - scheduled_time_delay_avg=self.requests_stats.scheduled_time_delay.mean, - scheduled_time_sleep_avg=self.requests_stats.scheduled_time_sleep.mean, - worker_start_delay_avg=self.requests_stats.worker_start_delay.mean, - worker_time_avg=self.requests_stats.worker_time.mean, - worker_start_time_targeted_delay_avg=self.requests_stats.worker_start_time_targeted_delay.mean, - request_start_time_delay_avg=self.requests_stats.request_start_time_delay.mean, - request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean, - request_time_delay_avg=self.requests_stats.request_time_delay.mean, - request_time_avg=self.requests_stats.request_time.mean, - ), - worker=self.worker_description, - requests_loader=self.request_loader_description, - extras=self.extras, + successful: list[GenerativeRequestStats] = [ + self._create_generative_request_stats(response, request, request_info) + for (response, request, request_info) in state.get("completed", []) + ] + incomplete: list[GenerativeRequestStats] = [ + self._create_generative_request_stats(response, request, request_info) + for (response, request, request_info) in state.get("incomplete", []) + ] + errored: list[GenerativeRequestStats] = [ + self._create_generative_request_stats(response, request, request_info) + for (response, request, request_info) in state.get("errored", []) + ] + + # Use all requests for metrics calculations (not sampled) + total: list[GenerativeRequestStats] = successful + incomplete + errored + total_types: list[Literal["successful", "incomplete", "error"]] = [ + *["successful"] * len(successful), + *["incomplete"] * len(incomplete), + *["error"] * len(errored), + ] + start_time = min( + [math.inf] + + [ + req.scheduler_info.request_timings.request_start + for req in total + if req.scheduler_info.request_timings.request_start is not None + ] + ) + end_time = max( + [-1 * math.inf] + + [ + req.scheduler_info.request_timings.request_end + for req in total + if req.scheduler_info.request_timings.request_end is not None + ] ) - def _compile_results( - self, - ) -> tuple[ - list[GenerativeTextResponseStats], - list[GenerativeTextErrorStats], - list[GenerativeTextErrorStats], - ]: - successful: list[GenerativeTextResponseStats] = [ - GenerativeTextResponseStats( - request_id=result.request.request_id, - request_type=result.request.request_type, - scheduler_info=result.request_info, - prompt=str(result.request.content), - prompt_tokens=self._compile_tokens_count( - value=str(result.request.content), - requests_tokens=result.response.request_prompt_tokens, - response_tokens=result.response.response_prompt_tokens, - preferred_tokens_source=settings.preferred_prompt_tokens_source, - errored=False, + return { + "start_time": start_time, + "end_time": end_time, + "request_totals": 
StatusBreakdown[int, int, int, int]( + successful=len(successful), + incomplete=len(incomplete), + errored=len(errored), + total=len(total), + ), + "requests": StatusBreakdown[ + list[GenerativeRequestStats], + list[GenerativeRequestStats], + list[GenerativeRequestStats], + list[GenerativeRequestStats], + ]( + successful=self._sample_request_stats(successful, self.request_samples), + incomplete=self._sample_request_stats(incomplete, self.request_samples), + errored=self._sample_request_stats(errored, self.request_samples), + ), + "metrics": GenerativeMetrics( + requests_per_second=self._calculate_requests_per_second( + statuses=total_types, requests=total ), - output=result.response.value, - output_tokens=self._compile_tokens_count( - value=result.response.value, - requests_tokens=result.response.request_output_tokens, - response_tokens=result.response.response_output_tokens, - preferred_tokens_source=settings.preferred_output_tokens_source, - errored=False, + request_concurrency=self._calculate_request_concurrency( + statuses=total_types, requests=total ), - start_time=result.response.start_time, - end_time=result.response.end_time, - first_token_time=result.response.first_iter_time or -1.0, - last_token_time=result.response.last_iter_time or -1.0, - ) - for result in self.results.successful - if result.request and result.response - ] - incomplete: list[GenerativeTextErrorStats] = [ - GenerativeTextErrorStats( - error=result.response.error or "", - request_id=result.request.request_id, - request_type=result.request.request_type, - scheduler_info=result.request_info, - prompt=str(result.request.content), - prompt_tokens=self._compile_tokens_count( - value=str(result.request.content), - requests_tokens=result.response.request_prompt_tokens, - response_tokens=result.response.response_prompt_tokens, - preferred_tokens_source=settings.preferred_prompt_tokens_source, - errored=True, + request_latency=self._calculate_request_latency( + statuses=total_types, requests=total ), - output=result.response.value, - output_tokens=self._compile_tokens_count( - value=result.response.value, - requests_tokens=result.response.request_output_tokens, - response_tokens=result.response.response_output_tokens, - preferred_tokens_source=settings.preferred_output_tokens_source, - errored=True, + prompt_token_count=self._calculate_prompt_token_count( + statuses=total_types, requests=total ), - start_time=result.response.start_time, - end_time=result.response.end_time, - first_token_time=result.response.first_iter_time, - last_token_time=result.response.last_iter_time, - ) - for result in self.results.incomplete - if result.request and result.response - ] - error: list[GenerativeTextErrorStats] = [ - GenerativeTextErrorStats( - error=result.response.error or "", - request_id=result.request.request_id, - request_type=result.request.request_type, - scheduler_info=result.request_info, - prompt=str(result.request.content), - prompt_tokens=self._compile_tokens_count( - value=str(result.request.content), - requests_tokens=result.response.request_prompt_tokens, - response_tokens=result.response.response_prompt_tokens, - preferred_tokens_source=settings.preferred_prompt_tokens_source, - errored=True, + output_token_count=self._calculate_output_token_count( + statuses=total_types, requests=total + ), + total_token_count=self._calculate_total_token_count( + statuses=total_types, requests=total + ), + time_to_first_token_ms=self._calculate_time_to_first_token_ms( + statuses=total_types, requests=total + ), + 
time_per_output_token_ms=self._calculate_time_per_output_token_ms( + statuses=total_types, requests=total + ), + inter_token_latency_ms=self._calculate_inter_token_latency_ms( + statuses=total_types, requests=total ), - output=result.response.value, - output_tokens=self._compile_tokens_count( - value=result.response.value, - requests_tokens=result.response.request_output_tokens, - response_tokens=result.response.response_output_tokens, - preferred_tokens_source=settings.preferred_output_tokens_source, - errored=True, + output_tokens_per_second=self._calculate_output_tokens_per_second( + statuses=total_types, requests=total ), - start_time=result.response.start_time, - end_time=result.response.end_time, - first_token_time=result.response.first_iter_time, - last_token_time=result.response.last_iter_time, + tokens_per_second=self._calculate_tokens_per_second( + statuses=total_types, requests=total + ), + ), + } + + def _is_in_warmup( + self, + request_info: ScheduledRequestInfo, + scheduler_state: SchedulerState, + ) -> bool: + """Check if the current request is within the warmup period.""" + if self.warmup is None: + return False + + if 0 < self.warmup < 1: # Percentage-based warmup + return ( + scheduler_state.remaining_fraction is not None + and scheduler_state.remaining_fraction > (1 - self.warmup) + ) + + if self.warmup >= 1: # Count/time-based warmup + if scheduler_state.processed_requests < self.warmup: + return True + + current_time = request_info.scheduler_timings.targeted_start + return ( + current_time is not None + and (current_time - scheduler_state.start_time) < self.warmup ) - for result in self.results.errored - if result.request and result.response - ] - return successful, incomplete, error + return False - def _compile_tokens_count( + def _is_in_cooldown( self, - value: str, - requests_tokens: Optional[int], - response_tokens: Optional[int], - preferred_tokens_source: Optional[Literal["request", "response", "local"]], - errored: bool, - ) -> int: - if not errored and preferred_tokens_source == "response" and response_tokens: - return response_tokens or 0 - - if not errored and preferred_tokens_source == "request" and requests_tokens: - return requests_tokens or 0 - - if preferred_tokens_source in {"response", "request"} and ( - self.processor is None or errored or response_tokens or requests_tokens - ): - # we had a preferred tokens source that isn't local and we either - # have the data to return something or we don't have the ability - # to calculate locally - return response_tokens or requests_tokens or 0 - - self.processor = check_load_processor( - self.processor, - processor_args=self.processor_args, - error_msg="Processor/Tokenizer is required for calculating token counts.", + request_info: ScheduledRequestInfo, + scheduler_state: SchedulerState, + ) -> bool: + """Check if the current request is within the cooldown period.""" + if self.cooldown is None: + return False + + if 0 < self.cooldown < 1: # Percentage-based cooldown + return ( + scheduler_state.remaining_fraction is not None + and scheduler_state.remaining_fraction < self.cooldown + ) + + if self.cooldown >= 1: # Count/time-based cooldown + if scheduler_state.remaining_requests <= self.cooldown: + return True + + current_time = ( + request_info.scheduler_timings.resolve_end + or request_info.scheduler_timings.targeted_start + ) + return ( + current_time is not None + and scheduler_state.remaining_duration is not None + and scheduler_state.remaining_duration < self.cooldown + ) + + return False + + @classmethod 
+ def _create_generative_request_stats( + cls, + response: GenerationResponse, + request: GenerationRequest, + request_info: ScheduledRequestInfo, + ) -> GenerativeRequestStats: + prompt_tokens = response.preferred_prompt_tokens( + settings.preferred_prompt_tokens_source + ) + output_tokens = response.preferred_output_tokens( + settings.preferred_output_tokens_source + ) + + return GenerativeRequestStats( + request_id=request.request_id, + request_type=request.request_type, + prompt=str(request.content), + request_args=response.request_args, + output=response.value, + iterations=response.iterations, + prompt_tokens=prompt_tokens, + output_tokens=output_tokens, + total_tokens=( + prompt_tokens + output_tokens + if prompt_tokens is not None and output_tokens is not None + else None + ), + scheduler_info=request_info, + ) + + @classmethod + def _sample_request_stats( + cls, stats: list[GenerativeRequestStats], sample_size: int | None + ) -> list[GenerativeRequestStats]: + if sample_size is None or sample_size <= 0 or not stats: + return stats + + return random.sample(stats, min(sample_size, len(stats))) + + @classmethod + def _calculate_requests_per_second( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_times = [] + + for status, request in zip(statuses, requests): + if not all_defined( + safe_getattr(request.scheduler_info.request_timings, "request_start"), + safe_getattr(request.scheduler_info.request_timings, "request_end"), + ): + continue + + filtered_statuses.append(status) + filtered_times.append( + ( + request.scheduler_info.request_timings.request_start, + request.scheduler_info.request_timings.request_end, + ) + ) + + return StatusDistributionSummary.from_request_times( + request_types=filtered_statuses, + requests=filtered_times, + distribution_type="rate", + ) + + @classmethod + def _calculate_request_concurrency( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_times = [] + + for status, request in zip(statuses, requests): + if not all_defined( + safe_getattr(request.scheduler_info.request_timings, "request_start"), + safe_getattr(request.scheduler_info.request_timings, "request_end"), + ): + continue + + filtered_statuses.append(status) + filtered_times.append( + ( + request.scheduler_info.request_timings.request_start, + request.scheduler_info.request_timings.request_end, + ) + ) + + return StatusDistributionSummary.from_request_times( + request_types=filtered_statuses, + requests=filtered_times, + distribution_type="concurrency", + ) + + @classmethod + def _calculate_request_latency( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_values = [] + + for status, request in zip(statuses, requests): + if not all_defined(request.request_latency): + continue + + filtered_statuses.append(status) + filtered_values.append(request.request_latency) + + return StatusDistributionSummary.from_values( + value_types=filtered_statuses, + values=filtered_values, + ) + + @classmethod + def _calculate_prompt_token_count( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + 
filtered_values = [] + + for status, request in zip(statuses, requests): + if not all_defined(request.prompt_tokens): + continue + + filtered_statuses.append(status) + filtered_values.append(request.prompt_tokens) + + return StatusDistributionSummary.from_values( + value_types=filtered_statuses, + values=filtered_values, + ) + + @classmethod + def _calculate_output_token_count( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_values = [] + + for status, request in zip(statuses, requests): + if not all_defined(request.output_tokens): + continue + + filtered_statuses.append(status) + filtered_values.append(request.output_tokens) + + return StatusDistributionSummary.from_values( + value_types=filtered_statuses, + values=filtered_values, + ) + + @classmethod + def _calculate_total_token_count( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_values = [] + + for status, request in zip(statuses, requests): + if not all_defined(request.total_tokens): + continue + + filtered_statuses.append(status) + filtered_values.append(request.total_tokens) + + return StatusDistributionSummary.from_values( + value_types=filtered_statuses, + values=filtered_values, + ) + + @classmethod + def _calculate_time_to_first_token_ms( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_values = [] + + for status, request in zip(statuses, requests): + if not all_defined(request.time_to_first_token_ms): + continue + + filtered_statuses.append(status) + filtered_values.append(request.time_to_first_token_ms) + + return StatusDistributionSummary.from_values( + value_types=filtered_statuses, + values=filtered_values, + ) + + @classmethod + def _calculate_time_per_output_token_ms( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_values = [] + filtered_weights = [] + + for status, request in zip(statuses, requests): + if not all_defined(request.time_to_first_token_ms): + continue + + # Add time to first token separately to better reflect in distribution + filtered_statuses.append(status) + filtered_values.append(request.time_to_first_token_ms) + filtered_weights.append(1) + + if not all_defined(request.inter_token_latency_ms): + continue + + # Add tokens after the first token to get the full distribution + filtered_statuses.append(status) + filtered_values.append(request.inter_token_latency_ms) + filtered_weights.append(request.output_tokens - 1) + + return StatusDistributionSummary.from_values( + value_types=filtered_statuses, + values=filtered_values, + weights=filtered_weights, + ) + + @classmethod + def _calculate_inter_token_latency_ms( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_values = [] + filtered_weights = [] + + for status, request in zip(statuses, requests): + if not all_defined(request.inter_token_latency_ms): + continue + + filtered_statuses.append(status) + filtered_values.append(request.inter_token_latency_ms) + 
filtered_weights.append(request.output_tokens - 1) + + return StatusDistributionSummary.from_values( + value_types=filtered_statuses, + values=filtered_values, + weights=filtered_weights, + ) + + @classmethod + def _calculate_output_tokens_per_second( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_request_times = [] + filtered_first_iter_times = [] + filtered_iter_counts = [] + + for status, request in zip(statuses, requests): + if not all_defined(request.output_tokens_per_second): + continue + + filtered_statuses.append(status) + filtered_request_times.append( + ( + request.scheduler_info.request_timings.request_start, + request.scheduler_info.request_timings.request_end, + ) + ) + filtered_first_iter_times.append( + request.scheduler_info.request_timings.first_iteration + ) + filtered_iter_counts.append(request.output_tokens) + + return StatusDistributionSummary.from_iterable_request_times( + request_types=filtered_statuses, + requests=filtered_request_times, + first_iter_times=filtered_first_iter_times, + iter_counts=filtered_iter_counts, + ) + + @classmethod + def _calculate_tokens_per_second( + cls, + statuses: list[Literal["successful", "incomplete", "error"]], + requests: list[GenerativeRequestStats], + ) -> StatusDistributionSummary: + filtered_statuses = [] + filtered_request_times = [] + filtered_first_iter_times = [] + filtered_iter_counts = [] + filtered_first_iter_counts = [] + + for status, request in zip(statuses, requests): + if not all_defined(request.tokens_per_second): + continue + + filtered_statuses.append(status) + filtered_request_times.append( + ( + request.scheduler_info.request_timings.request_start, + request.scheduler_info.request_timings.request_end, + ) + ) + filtered_first_iter_times.append( + request.scheduler_info.request_timings.first_iteration + ) + filtered_iter_counts.append(request.output_tokens - 1) + filtered_first_iter_counts.append(request.prompt_tokens + 1) + + return StatusDistributionSummary.from_iterable_request_times( + request_types=filtered_statuses, + requests=filtered_request_times, + first_iter_times=filtered_first_iter_times, + iter_counts=filtered_iter_counts, + first_iter_counts=filtered_first_iter_counts, ) - return len(self.processor.tokenize(value)) diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py deleted file mode 100644 index 02eea02b..00000000 --- a/src/guidellm/benchmark/benchmark.py +++ /dev/null @@ -1,837 +0,0 @@ -import random -import uuid -from typing import Any, Literal, Optional, TypeVar, Union - -from pydantic import Field, computed_field - -from guidellm.benchmark.profile import ( - AsyncProfile, - ConcurrentProfile, - Profile, - SweepProfile, - SynchronousProfile, - ThroughputProfile, -) -from guidellm.objects import ( - StandardBaseModel, - StatusBreakdown, - StatusDistributionSummary, -) -from guidellm.request import ( - GenerativeRequestLoaderDescription, - RequestLoaderDescription, -) -from guidellm.scheduler import ( - AsyncConstantStrategy, - AsyncPoissonStrategy, - ConcurrentStrategy, - GenerativeRequestsWorkerDescription, - SchedulerRequestInfo, - SchedulingStrategy, - SynchronousStrategy, - ThroughputStrategy, - WorkerDescription, -) - -__all__ = [ - "Benchmark", - "BenchmarkArgs", - "BenchmarkMetrics", - "BenchmarkRunStats", - "BenchmarkT", - "GenerativeBenchmark", - "GenerativeMetrics", - "GenerativeTextErrorStats", - 
"GenerativeTextResponseStats", - "StatusBreakdown", -] - - -class BenchmarkArgs(StandardBaseModel): - """ - A serializable model representing the arguments used to specify a benchmark run - and how data was collected for it. - """ - - profile: Union[ - AsyncProfile, - SweepProfile, - ConcurrentProfile, - ThroughputProfile, - SynchronousProfile, - Profile, - ] = Field( - description=( - "The profile used for the entire benchmark run that the strategy for " - "this benchmark was pulled from." - ), - discriminator="type_", - ) - strategy_index: int = Field( - description=( - "The index of the strategy in the profile that was used for this benchmark." - ) - ) - strategy: Union[ - ConcurrentStrategy, - SchedulingStrategy, - ThroughputStrategy, - SynchronousStrategy, - AsyncPoissonStrategy, - AsyncConstantStrategy, - SchedulingStrategy, - ] = Field( - description="The scheduling strategy used to run this benchmark. ", - discriminator="type_", - ) - max_number: Optional[int] = Field( - description="The maximum number of requests to run for this benchmark, if any." - ) - max_duration: Optional[float] = Field( - description="The maximum duration in seconds to run this benchmark, if any." - ) - warmup_number: Optional[int] = Field( - description=( - "The number of requests to run for the warmup phase of this benchmark, " - "if any. These are requests that were not included in the final results." - ) - ) - warmup_duration: Optional[float] = Field( - description=( - "The duration in seconds to run for the warmup phase of this benchmark, " - "if any. These are requests that were not included in the final results." - ) - ) - cooldown_number: Optional[int] = Field( - description=( - "The number of requests to run for the cooldown phase of this benchmark, " - "if any. These are requests that were not included in the final results." - ) - ) - cooldown_duration: Optional[float] = Field( - description=( - "The duration in seconds to run for the cooldown phase of this benchmark, " - "if any. These are requests that were not included in the final results." - ) - ) - - -class BenchmarkRunStats(StandardBaseModel): - """ - A serializable model representing the run process statistics for the - entire benchmark run across all requests including warmup and cooldown. - """ - - start_time: float = Field( - description="The start time of the benchmark run.", - ) - end_time: float = Field( - description="The end time of the benchmark run.", - ) - requests_made: StatusBreakdown[int, int, int, int] = Field( - description=( - "The number of requests made for the benchmark run broken down by " - "status including successful, incomplete, errored, and the sum of all three" - ) - ) - queued_time_avg: float = Field( - description=( - "The average time spent in the queue for each request in the benchmark " - "run until it was dequeued by a worker." - ) - ) - scheduled_time_delay_avg: float = Field( - description=( - "The average time delay between when a request was dequeued and when it " - "was scheduled to be processed by a worker in the benchmark run. " - "This should be as close to 0 as possible, any additional time is " - "overheads from the system or the worker." - ) - ) - scheduled_time_sleep_avg: float = Field( - description=( - "The average time spent sleeping til the desired start time was reached " - "after being scheduled by the worker in the benchmark run." 
- ) - ) - worker_start_delay_avg: float = Field( - description=( - "The average time delay between when a request was scheduled and when " - "the worker started processing it in the benchmark run. " - "This should be as close to 0 as possible, any additional time is " - "overheads from the system or the worker." - ) - ) - worker_time_avg: float = Field( - description=( - "The average time taken by the worker to process each request in the " - "benchmark run. This includes the time to generate the response and " - "any additional processing time." - ) - ) - worker_start_time_targeted_delay_avg: float = Field( - description=( - "The average time delay between when a request was targeted to start " - "and when the worker actually started processing it in the benchmark " - "run. For async strategies, this represents delays from the ideal " - "system. For sync strategies, since those are doubled in queue, " - "this should be as close to the time for a request to be processed " - "as possible. Any additional time is overhead from the system or " - "the worker." - ) - ) - request_start_time_delay_avg: float = Field( - description=( - "The average time delay between the actual request being made " - "and the time the worker started on the request for all requests " - "that completed within the benchmark run. This time should be as close " - "to 0 as possible, any additional time is overhead from the system or " - "the worker." - ) - ) - request_start_time_targeted_delay_avg: float = Field( - description=( - "The average time delay between when the targeted start time and " - "the actual start time for each request in the benchmark run. " - "For async strategies, this represents delays from the ideal " - "system. For sync strategies, this should be as close to the " - "time for a request to be processed as possible. Any additional " - "time is overhead from the system or the worker." - ) - ) - request_time_delay_avg: float = Field( - description=( - "The average time delay between the total request time and the " - "worker time. This should be as close to 0 as possible, any additional " - "time is overhead from the system or the worker. " - ) - ) - request_time_avg: float = Field( - description=( - "The average time spent processing all requests in the benchmark run. " - "This is the time from when the actual request was started to when " - "it was completed." - ) - ) - - -class BenchmarkMetrics(StandardBaseModel): - """ - A serializable model representing the metrics for a benchmark run. - """ - - requests_per_second: StatusDistributionSummary = Field( - description="The distribution of requests per second for the benchmark.", - ) - request_concurrency: StatusDistributionSummary = Field( - description="The distribution of requests concurrency for the benchmark.", - ) - - -class Benchmark(StandardBaseModel): - """ - The base serializable model representing a benchmark run and its results. - Specific benchmarker implementations should extend this model to include - additional information or metadata as needed. - - Note, requests_per_second and request_concurrency are kept at this level - and are expected to be populated by the subclass implementation to ensure - the logic for Profiles can include more complicated logic for determining - what rates and concurrency values to use for subsequent strategies. 
- """ - - type_: Literal["benchmark"] = "benchmark" - id_: str = Field( - default_factory=lambda: str(uuid.uuid4()), - description="The unique identifier for the benchmark.", - ) - run_id: str = Field( - description=( - "The unique identifier for the encompasing benchmark run that this " - "benchmark was a part of." - ) - ) - args: BenchmarkArgs = Field( - description=( - "The arguments used to specify how to run the benchmark and collect data." - ) - ) - run_stats: BenchmarkRunStats = Field( - description=( - "The process statistics for the entire benchmark run across all requests." - ) - ) - worker: Union[WorkerDescription] = Field( - description=( - "The description and specifics for the worker used to resolve requests " - "for this benchmark." - ), - ) - request_loader: Union[RequestLoaderDescription] = Field( - description=( - "The description and specifics for the request loader used to create " - "requests for this benchmark." - ), - ) - extras: dict[str, Any] = Field( - description=( - "Any additional information or metadata that was passed for this benchmark." - ) - ) - metrics: BenchmarkMetrics = Field( - description=( - "The metrics for the benchmark run represented as a distribution of " - "various per-request statistics." - ), - ) - - -BenchmarkT = TypeVar("BenchmarkT", bound=Benchmark) - - -class GenerativeTextResponseStats(StandardBaseModel): - """ - A serializable model representing the request values, response values, and - statistics for a generative text response. - """ - - type_: Literal["generative_text_response"] = "generative_text_response" - request_id: Optional[str] = Field( - description="The unique identifier for the request.", - ) - request_type: Literal["text_completions", "chat_completions"] = Field( - description="The type of request made to the generative backend." - ) - scheduler_info: SchedulerRequestInfo = Field( - description=( - "The info about the request from the scheduler about how it was run." - ), - ) - prompt: str = Field( - description="The text prompt used for the generative request.", - ) - output: str = Field( - description="The generated text output from the generative request.", - ) - prompt_tokens: int = Field( - description="The number of tokens in the prompt text.", - ) - output_tokens: int = Field( - description="The number of tokens in the generated output text.", - ) - start_time: float = Field( - description="The time the request started.", - ) - end_time: float = Field( - description="The time the request ended.", - ) - first_token_time: float = Field( - description="The time the first token was received.", - ) - last_token_time: float = Field( - description="The time the last token was received.", - ) - - @computed_field # type: ignore[misc] - @property - def request_latency(self) -> float: - """ - :return: The duration of the request in seconds from the start to the end. - """ - return self.end_time - self.start_time - - @computed_field # type: ignore[misc] - @property - def time_to_first_token_ms(self) -> float: - """ - :return: The time in milliseconds from the start of the request to the first - token received. - """ - return 1000 * (self.first_token_time - self.start_time) - - @computed_field # type: ignore[misc] - @property - def time_per_output_token_ms(self) -> float: - """ - :return: The average time in milliseconds per output token generated. - This includes the time to generate the first token and all other tokens. 
- """ - if self.output_tokens == 0: - return 0.0 - - return ( - 1000 * (self.last_token_time - self.first_token_time) / self.output_tokens - ) - - @computed_field # type: ignore[misc] - @property - def inter_token_latency_ms(self) -> float: - """ - :return: The average time in milliseconds between generating tokens in the - output text. Note, does not include the time to generate the first token. - """ - if self.output_tokens <= 1: - return 0.0 - - return ( - 1000 - * (self.last_token_time - self.first_token_time) - / (self.output_tokens - 1) - ) - - @computed_field # type: ignore[misc] - @property - def tokens_per_second(self) -> float: - """ - :return: The average number of tokens generated per second in the prompt and - output text. - """ - if (latency := self.request_latency) == 0.0: - return 0.0 - - return (self.prompt_tokens + self.output_tokens) / latency - - @computed_field # type: ignore[misc] - @property - def output_tokens_per_second(self) -> float: - """ - :return: The average number of output tokens generated per second. - """ - if (latency := self.request_latency) == 0.0: - return 0.0 - - return self.output_tokens / latency - - -class GenerativeTextErrorStats(GenerativeTextResponseStats): - """ - A serializable model representing the request values, response values, and - statistics for a generative text response that errored. - Extends and overrides the GenerativeTextResponseStats model to include the - error message and optional properties given the error occurred. - """ - - type_: Literal["generative_text_error"] = "generative_text_error" # type: ignore[assignment] - error: str = Field( - description=( - "The error message for the error that occurred while making the request." - ) - ) - output: Optional[str] = Field( # type: ignore[assignment] - default=None, - description=( - "The generated text output from the generative request, if any, " - "before the error occurred." - ), - ) - first_token_time: Optional[float] = Field( # type: ignore[assignment] - default=None, - description=( - "The time the first token was received, if any, before the error occurred." - ), - ) - last_token_time: Optional[float] = Field( # type: ignore[assignment] - default=None, - description=( - "The time the last token was received, if any, before the error occurred." - ), - ) - - @computed_field # type: ignore[misc] - @property - def time_to_first_token_ms(self) -> Optional[float]: # type: ignore[override] - """ - :return: The time in milliseconds from the start of the request to the first - token received. None if the first token was not received. - """ - if self.first_token_time is None: - return None - - return super().time_to_first_token_ms - - @computed_field # type: ignore[misc] - @property - def time_per_output_token_ms(self) -> Optional[float]: # type: ignore[override] - """ - :return: The average time in milliseconds per output token generated. - This includes the time to generate the first token and all other tokens. - None if the output_tokens is None or 0. - """ - if ( - self.output_tokens is None - or self.output_tokens == 0 - or self.first_token_time is None - or self.last_token_time is None - ): - return None - - return super().time_per_output_token_ms - - @computed_field # type: ignore[misc] - @property - def inter_token_latency_ms(self) -> Optional[float]: # type: ignore[override] - """ - :return: The average time in milliseconds between generating tokens in the - output text. Note, does not include the time to generate the first token. 
- None if there were no output_tokens or the first token was not received. - """ - if ( - self.output_tokens is None - or self.first_token_time is None - or self.last_token_time is None - ): - return None - - return super().inter_token_latency_ms - - @computed_field # type: ignore[misc] - @property - def output_tokens_per_second(self) -> Optional[float]: # type: ignore[override] - """ - :return: The average number of tokens generated per second in the output text. - Note, does not include the time to generate the first token. None if there - were no output_tokens or the first token was not received. - """ - if self.inter_token_latency_ms is None: - return None - - return super().output_tokens_per_second - - -class GenerativeMetrics(BenchmarkMetrics): - """ - A serializable model representing the metrics for a generative benchmark run. - """ - - request_latency: StatusDistributionSummary = Field( - description="The distribution of latencies for the completed requests.", - ) - prompt_token_count: StatusDistributionSummary = Field( - description=( - "The distribution of token counts in the prompts for completed, " - "errored, and all requests." - ) - ) - output_token_count: StatusDistributionSummary = Field( - description=( - "The distribution of token counts in the outputs for completed, " - "errored, and all requests." - ) - ) - time_to_first_token_ms: StatusDistributionSummary = Field( - description=( - "The distribution of latencies to receiving the first token in " - "milliseconds for completed, errored, and all requests." - ), - ) - time_per_output_token_ms: StatusDistributionSummary = Field( - description=( - "The distribution of latencies per output token in milliseconds for " - "completed, errored, and all requests. " - "This includes the time to generate the first token and all other tokens." - ), - ) - inter_token_latency_ms: StatusDistributionSummary = Field( - description=( - "The distribution of latencies between tokens in milliseconds for " - "completed, errored, and all requests." - ), - ) - output_tokens_per_second: StatusDistributionSummary = Field( - description=( - "The distribution of output tokens per second for completed, " - "errored, and all requests." - ), - ) - tokens_per_second: StatusDistributionSummary = Field( - description=( - "The distribution of tokens per second, including prompt and output tokens " - "for completed, errored, and all requests." - ), - ) - - -class GenerativeBenchmark(Benchmark): - """ - A serializable model representing a benchmark run and its results for generative - requests and responses. Includes the completed and errored requests, the start - and end times for the benchmark, and the statistics for the requests and responses. - """ - - type_: Literal["generative_benchmark"] = "generative_benchmark" # type: ignore[assignment] - start_time: float = Field( - description="The start time of the first request for the benchmark.", - ) - end_time: float = Field( - description="The end time of the last request for the benchmark.", - ) - - @computed_field # type: ignore[misc] - @property - def duration(self) -> float: - """ - :return: The duration of the benchmark in seconds from the start of the - first request to the end of the last request. - """ - return self.end_time - self.start_time - - worker: GenerativeRequestsWorkerDescription = Field( - description=( - "The description and specifics for the worker used to resolve requests " - "for this benchmark." 
- ), - ) - request_loader: GenerativeRequestLoaderDescription = Field( - description=( - "The description and specifics for the request loader used to create " - "requests for this benchmark." - ), - ) - metrics: GenerativeMetrics = Field( - description=( - "The metrics for the benchmark run represented as a distribution of " - "various per-request statistics." - ), - ) - # Output is ordered so keep the requests at the end for better readability in files - request_totals: StatusBreakdown[int, int, int, int] = Field( - description=( - "The number of requests made for the benchmark broken down by status " - "including successful, incomplete, errored, and the sum of all three" - ) - ) - request_samples: Optional[StatusBreakdown[int, int, int, None]] = Field( - description=( - "The number of requests that were randomly sampled for " - "the benchmark. None if no sampling was applied." - ), - default=None, - ) - requests: StatusBreakdown[ - list[GenerativeTextResponseStats], - list[GenerativeTextErrorStats], - list[GenerativeTextErrorStats], - None, - ] = Field( - description=( - "The breakdown of requests for the benchmark run including successful, " - "incomplete, and errored requests." - ), - ) - - def set_sample_size(self, sample_size: Optional[int]) -> "GenerativeBenchmark": - """ - Set the sample size for the benchmark. This will randomly sample the - requests for each status type to the given sample size or the maximum - number of requests for that status type, whichever is smaller. - This is applied to requests.successful, requests.errored, and - requests.incomplete. - If None, no sampling is applied and the state is kept. - - :param sample_size: The number of requests to sample for each status type. - :return: The benchmark with the sampled requests. - :raises ValueError: If the sample size is invalid. - """ - - if sample_size is not None: - if sample_size < 0 or not isinstance(sample_size, int): - raise ValueError( - f"Sample size must be non-negative integer, given {sample_size}" - ) - - sample_size = min(sample_size, len(self.requests.successful)) - error_sample_size = min(sample_size, len(self.requests.errored)) - incomplete_sample_size = min(sample_size, len(self.requests.incomplete)) - - self.requests.successful = random.sample( - self.requests.successful, sample_size - ) - self.requests.errored = random.sample( - self.requests.errored, error_sample_size - ) - self.requests.incomplete = random.sample( - self.requests.incomplete, incomplete_sample_size - ) - self.request_samples = StatusBreakdown( - successful=len(self.requests.successful), - incomplete=len(self.requests.incomplete), - errored=len(self.requests.errored), - ) - - return self - - @staticmethod - def from_stats( - run_id: str, - successful: list[GenerativeTextResponseStats], - incomplete: list[GenerativeTextErrorStats], - errored: list[GenerativeTextErrorStats], - args: BenchmarkArgs, - run_stats: BenchmarkRunStats, - worker: GenerativeRequestsWorkerDescription, - requests_loader: GenerativeRequestLoaderDescription, - extras: Optional[dict[str, Any]], - ) -> "GenerativeBenchmark": - """ - Create a GenerativeBenchmark instance from the given statistics and metadata. - Given the completed and errored requests, the benchmark will fill in the - remaining statistics for the various metrics required for a benchmark. - This is the preferred method for creating a GenerativeBenchmark instance - to ensure all statistics are properly calculated and populated. - - :param run_id: The unique identifier for the benchmark run. 
- :param completed: The list of completed requests. - :param errored: The list of errored requests. - :param args: The arguments used to specify how to run the benchmark - and collect data. - :param run_stats: The process statistics for the entire benchmark run across - all requests. - :param worker: The description and specifics for the worker used to resolve - requests. - :param requests_loader: The description and specifics for the request loader - used to create requests. - :param extras: Any additional information or metadata that was passed for - this benchmark. - :return: A GenerativeBenchmark instance with the given statistics and metadata - populated and calculated - """ - total = successful + incomplete + errored - total_types: list[Literal["successful", "incomplete", "error"]] = [ - *["successful"] * len(successful), # type: ignore[list-item] - *["incomplete"] * len(incomplete), # type: ignore[list-item] - *["error"] * len(errored), # type: ignore[list-item] - ] - start_time = min(req.start_time for req in total) - end_time = max(req.end_time for req in total) - - total_with_prompt, total_types_with_prompt = ( - zip(*filtered) - if ( - filtered := list( - filter(lambda val: bool(val[0].prompt), zip(total, total_types)) - ) - ) - else ([], []) - ) - total_with_output_first, total_types_with_output_first = ( - zip(*filtered) - if ( - filtered := list( - filter( - lambda val: bool(val[0].output_tokens > 0), - zip(total, total_types), - ) - ) - ) - else ([], []) - ) - total_with_output_multi, total_types_with_output_multi = ( - zip(*filtered) - if ( - filtered := list( - filter( - lambda val: bool(val[0].output_tokens > 1), - zip(total, total_types), - ) - ) - ) - else ([], []) - ) - - return GenerativeBenchmark( - run_id=run_id, - args=args, - run_stats=run_stats, - extras=extras or {}, - start_time=start_time, - end_time=end_time, - worker=worker, - request_loader=requests_loader, - metrics=GenerativeMetrics( - requests_per_second=StatusDistributionSummary.from_request_times( - request_types=total_types, - requests=[(req.start_time, req.end_time) for req in total], - distribution_type="rate", - ), - request_concurrency=StatusDistributionSummary.from_request_times( - request_types=total_types, - requests=[(req.start_time, req.end_time) for req in total], - distribution_type="concurrency", - ), - request_latency=StatusDistributionSummary.from_values( - value_types=total_types, - values=[req.request_latency for req in total], - ), - prompt_token_count=StatusDistributionSummary.from_values( - value_types=list(total_types_with_prompt), - values=[req.prompt_tokens for req in total_with_prompt], - ), - output_token_count=StatusDistributionSummary.from_values( - value_types=list(total_types_with_output_first), - values=[req.output_tokens for req in total_with_output_first], - ), - time_to_first_token_ms=StatusDistributionSummary.from_values( - value_types=list(total_types_with_output_first), - values=[ - req.time_to_first_token_ms or 0 - for req in total_with_output_first - ], - ), - time_per_output_token_ms=StatusDistributionSummary.from_values( - value_types=list(total_types_with_output_first), - values=[ - req.time_per_output_token_ms or 0 - for req in total_with_output_first - ], - weights=[req.output_tokens for req in total_with_output_first], - ), - inter_token_latency_ms=StatusDistributionSummary.from_values( - value_types=list(total_types_with_output_multi), - values=[ - req.inter_token_latency_ms or 0 - for req in total_with_output_multi - ], - weights=[req.output_tokens - 1 for 
req in total_with_output_multi], - ), - output_tokens_per_second=StatusDistributionSummary.from_iterable_request_times( - request_types=list(total_types_with_output_first), - requests=[ - (req.start_time, req.end_time) - for req in total_with_output_first - ], - first_iter_times=[ - req.first_token_time or req.start_time - for req in total_with_output_first - ], - iter_counts=[req.output_tokens for req in total_with_output_first], - ), - tokens_per_second=StatusDistributionSummary.from_iterable_request_times( - request_types=list(total_types_with_output_first), - requests=[ - (req.start_time, req.end_time) - for req in total_with_output_first - ], - first_iter_times=[ - req.first_token_time or req.start_time - for req in total_with_output_first - ], - iter_counts=[req.output_tokens for req in total_with_output_first], - first_iter_counts=[ - # prompt tokens + first token - req.prompt_tokens + 1 - for req in total_with_output_first - ], - ), - ), - request_totals=StatusBreakdown( - successful=len(successful), - incomplete=len(incomplete), - errored=len(errored), - total=len(total), - ), - requests=StatusBreakdown( - successful=successful, - incomplete=incomplete, - errored=errored, - ), - ) diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index 0e34e322..ae591c23 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -1,334 +1,266 @@ -import time +""" +Benchmark execution orchestration and lifecycle management. + +Provides the core benchmarking engine that coordinates request scheduling, +data aggregation, and result compilation across different execution strategies +and environments. + +Classes: + Benchmarker: Abstract benchmark orchestrator for request processing workflows. + +Type Variables: + BenchmarkT: Generic benchmark result type. + RequestT: Generic request object type. + RequestTimingsT: Generic request timing object type. + ResponseT: Generic response object type. 
+""" + +from __future__ import annotations + import uuid -from abc import ABC, abstractmethod -from collections.abc import AsyncGenerator, Iterable -from pathlib import Path +from abc import ABC +from collections.abc import AsyncIterator, Iterable from typing import ( Any, Generic, - Literal, - Optional, - Union, ) -from pydantic import Field -from transformers import PreTrainedTokenizerBase # type: ignore # noqa: PGH003 - -from guidellm.backend import Backend, ResponseSummary from guidellm.benchmark.aggregator import ( - AggregatorT, - BenchmarkT, - GenerativeBenchmarkAggregator, + Aggregator, + AggregatorState, + CompilableAggregator, ) -from guidellm.benchmark.benchmark import BenchmarkArgs, GenerativeBenchmark +from guidellm.benchmark.objects import BenchmarkerDict, BenchmarkT, SchedulerDict from guidellm.benchmark.profile import Profile -from guidellm.objects import StandardBaseModel -from guidellm.request import ( - GenerationRequest, - GenerativeRequestLoaderDescription, - RequestLoaderDescription, +from guidellm.scheduler import ( + BackendInterface, + Constraint, + Environment, + NonDistributedEnvironment, RequestT, ResponseT, -) -from guidellm.scheduler import ( - GenerativeRequestsWorker, - RequestsWorker, Scheduler, - SchedulerRequestResult, + SchedulerState, SchedulingStrategy, ) +from guidellm.utils import InfoMixin, ThreadSafeSingletonMixin +from guidellm.utils.pydantic_utils import StandardBaseDict -__all__ = ["Benchmarker", "BenchmarkerResult", "GenerativeBenchmarker"] +__all__ = ["Benchmarker"] -class BenchmarkerResult( - StandardBaseModel, Generic[AggregatorT, BenchmarkT, RequestT, ResponseT] +class Benchmarker( + Generic[BenchmarkT, RequestT, ResponseT], + ABC, + ThreadSafeSingletonMixin, ): - type_: Literal[ - "run_start", - "run_complete", - "scheduler_start", - "scheduler_update", - "scheduler_complete", - "benchmark_compiled", - ] - start_time: float - end_number: int - profile: Profile - current_index: int - current_strategy: Optional[SchedulingStrategy] = None - current_aggregator: Optional[AggregatorT] = None - current_benchmark: Optional[BenchmarkT] = None - current_result: Optional[SchedulerRequestResult[RequestT, ResponseT]] = None - - -class BenchmarkerStrategyLimits(StandardBaseModel): - requests_loader_size: Optional[int] = Field( - description="Size of the request loader.", - ) - max_number_per_strategy: Optional[int] = Field( - description="Maximum number of requests to process per strategy.", - ge=0, - ) - max_duration_per_strategy: Optional[float] = Field( - description="Maximum duration (in seconds) to process requests per strategy.", - ge=0, - ) - warmup_percent_per_strategy: Optional[float] = Field( - description="Percentage of requests to use for warmup.", - ge=0, - le=1, - ) - cooldown_percent_per_strategy: Optional[float] = Field( - description="Percentage of requests to use for cooldown.", - ge=0, - le=1, - ) - - @property - def max_number(self) -> Optional[int]: - if self.max_number_per_strategy is not None: - return self.max_number_per_strategy - - if self.requests_loader_size is not None: - return self.requests_loader_size - - return None - - @property - def max_duration(self) -> Optional[float]: - return self.max_duration_per_strategy + """ + Abstract benchmark orchestrator for request processing workflows. 
- @property - def warmup_number(self) -> Optional[int]: - if self.warmup_percent_per_strategy is None or self.max_number is None: - return None + Coordinates the execution of benchmarking runs across different scheduling + strategies, aggregating metrics and compiling results. Manages the complete + benchmark lifecycle from request submission through result compilation. - return int(self.warmup_percent_per_strategy * self.max_number) - - @property - def warmup_duration(self) -> Optional[float]: - if self.warmup_percent_per_strategy is None or self.max_duration is None: - return None - - return self.warmup_percent_per_strategy * self.max_duration - - @property - def cooldown_number(self) -> Optional[int]: - if self.cooldown_percent_per_strategy is None or self.max_number is None: - return None - - return int(self.cooldown_percent_per_strategy * self.max_number) - - @property - def cooldown_duration(self) -> Optional[float]: - if self.cooldown_percent_per_strategy is None or self.max_duration is None: - return None - - return self.cooldown_percent_per_strategy * self.max_duration - - -class Benchmarker(Generic[AggregatorT, BenchmarkT, RequestT, ResponseT], ABC): - def __init__( - self, - worker: RequestsWorker[RequestT, ResponseT], - request_loader: Iterable[RequestT], - requests_loader_description: RequestLoaderDescription, - benchmark_save_extras: Optional[dict[str, Any]] = None, - ): - self.worker = worker - self.scheduler: Scheduler[RequestT, ResponseT] = Scheduler( - worker=worker, request_loader=request_loader - ) - self.requests_loader_description = requests_loader_description - self.benchmark_save_extras = benchmark_save_extras + Implements thread-safe singleton pattern to ensure consistent state across + concurrent benchmark operations. + """ async def run( self, + requests: Iterable[RequestT | Iterable[RequestT | tuple[RequestT, float]]], + backend: BackendInterface[RequestT, ResponseT], profile: Profile, - max_number_per_strategy: Optional[int], - max_duration_per_strategy: Optional[float], - warmup_percent_per_strategy: Optional[float], - cooldown_percent_per_strategy: Optional[float], - ) -> AsyncGenerator[ - BenchmarkerResult[AggregatorT, BenchmarkT, RequestT, ResponseT], None + benchmark_class: type[BenchmarkT], + benchmark_aggregators: dict[ + str, + Aggregator[ResponseT, RequestT] | CompilableAggregator[ResponseT, RequestT], + ], + environment: Environment | None = None, + ) -> AsyncIterator[ + tuple[ + AggregatorState | None, + BenchmarkT | None, + SchedulingStrategy, + SchedulerState | None, + ] ]: - try: - requests_loader_size = len(self.scheduler.request_loader) # type: ignore[arg-type] - except Exception: # noqa: BLE001 - requests_loader_size = None - - strategy_limits = BenchmarkerStrategyLimits( - requests_loader_size=requests_loader_size, - max_number_per_strategy=max_number_per_strategy, - max_duration_per_strategy=max_duration_per_strategy, - warmup_percent_per_strategy=warmup_percent_per_strategy, - cooldown_percent_per_strategy=cooldown_percent_per_strategy, - ) - start_time = time.time() - end_number = len(profile.strategy_types) - current_index = -1 - run_id = str(uuid.uuid4()) - - yield BenchmarkerResult( - type_="run_start", - start_time=start_time, - end_number=end_number, - profile=profile, - current_index=current_index, - current_strategy=None, - current_aggregator=None, - current_benchmark=None, - current_result=None, - ) - - while scheduling_strategy := profile.next_strategy(): - current_index += 1 - aggregator = self.create_benchmark_aggregator( - 
run_id=run_id, + """ + Execute benchmark runs across multiple scheduling strategies. + + Orchestrates the complete benchmark workflow: iterates through scheduling + strategies from the profile, executes requests through the scheduler, + aggregates metrics, and compiles final benchmark results. + + :param requests: Request datasets for processing across strategies. + :param backend: Backend interface for request processing. + :param profile: Benchmark profile defining strategies and constraints. + :param environment: Execution environment for coordination. + :param benchmark_aggregators: Metric aggregation functions by name. + :param benchmark_class: Class for constructing final benchmark objects. + :yield: Tuples of (metrics_update, benchmark_result, strategy, state). + :raises Exception: If benchmark execution or compilation fails. + """ + with self.thread_lock: + if environment is None: + environment = NonDistributedEnvironment() + + run_id = str(uuid.uuid4()) + strategies_generator = profile.strategies_generator() + strategy, constraints = next(strategies_generator) + + while strategy is not None: + yield None, None, strategy, None + aggregators_state = { + key: AggregatorState() for key in benchmark_aggregators + } + + async for ( + response, + request, + request_info, + scheduler_state, + ) in Scheduler[RequestT, ResponseT]().run( + requests=requests, + backend=backend, + strategy=strategy, + env=environment, + **constraints, + ): + aggregators_update = AggregatorState() + for key, aggregator in benchmark_aggregators.items(): + update = aggregator( + aggregators_state[key], + response, + request, + request_info, + scheduler_state, + ) + if update: + aggregators_update.update(update) + yield aggregators_update, None, strategy, scheduler_state + + benchmark_kwargs = self._compile_benchmark_kwargs( + run_id=run_id, + run_index=len(profile.completed_strategies), + profile=profile, + requests=requests, + backend=backend, + environment=environment, + aggregators=benchmark_aggregators, + aggregators_state=aggregators_state, + strategy=strategy, + constraints=constraints, + scheduler_state=scheduler_state, + ) + benchmark = benchmark_class(**benchmark_kwargs) + yield None, benchmark, strategy, None + + try: + strategy, constraints = strategies_generator.send(benchmark) + except StopIteration: + strategy = None + constraints = None + + @classmethod + def _compile_benchmark_kwargs( + cls, + run_id: str, + run_index: int, + profile: Profile, + requests: Iterable[RequestT | Iterable[RequestT | tuple[RequestT, float]]], + backend: BackendInterface[RequestT, ResponseT], + environment: Environment, + aggregators: dict[ + str, + Aggregator[ResponseT, RequestT] | CompilableAggregator[ResponseT, RequestT], + ], + aggregators_state: dict[str, dict[str, Any]], + strategy: SchedulingStrategy, + constraints: dict[str, Any | dict[str, Any] | Constraint], + scheduler_state: SchedulerState | None, + ) -> dict[str, Any]: + """ + Compile benchmark construction parameters from execution results. + + Aggregates metadata from scheduler execution and compiles it into + structured parameters for benchmark object construction. + + :param run_id: Unique identifier for the benchmark run. + :param run_index: Index of this strategy in the benchmark profile. + :param profile: Benchmark profile containing strategy configuration. + :param requests: Request datasets used for the benchmark. + :param backend: Backend interface used for request processing. + :param environment: Execution environment for coordination. 
+ :param aggregators: Metric aggregation functions by name. + :param aggregators_state: Current state of metric aggregators. + :param strategy: Scheduling strategy that was executed. + :param constraints: Runtime constraints applied during execution. + :param scheduler_state: Final state of scheduler execution. + :return: Dictionary of parameters for benchmark object construction. + :raises ValueError: If aggregator output conflicts with existing keys. + """ + benchmark_kwargs = { + "run_id": run_id, + "run_index": run_index, + "scheduler": SchedulerDict( + strategy=strategy, + constraints={ + key: InfoMixin.extract_from_obj(val) + for key, val in constraints.items() + }, + state=scheduler_state, + ), + "benchmarker": BenchmarkerDict( profile=profile, - strategy_index=current_index, - strategy=scheduling_strategy, - limits=strategy_limits, + requests=InfoMixin.extract_from_obj(requests), + backend=backend.info, + environment=environment.info, + aggregators={ + key: InfoMixin.extract_from_obj(aggregator) + for key, aggregator in aggregators.items() + }, + ), + "env_args": StandardBaseDict(), + "extras": StandardBaseDict(), + } + + def _combine( + existing: dict[str, Any] | StandardBaseDict, + addition: dict[str, Any] | StandardBaseDict, + ) -> dict[str, Any] | StandardBaseDict: + if not isinstance(existing, (dict, StandardBaseDict)): + raise ValueError( + f"Existing value {existing} (type: {type(existing).__name__}) " + f"is not a valid type for merging." + ) + if not isinstance(addition, (dict, StandardBaseDict)): + raise ValueError( + f"Addition value {addition} (type: {type(addition).__name__}) " + f"is not a valid type for merging." + ) + + add_kwargs = ( + addition if isinstance(addition, dict) else addition.model_dump() ) - async for result in self.scheduler.run( - scheduling_strategy=scheduling_strategy, - max_number=max_number_per_strategy, - max_duration=max_duration_per_strategy, - ): - if result.type_ == "run_start": - yield BenchmarkerResult( - type_="scheduler_start", - start_time=start_time, - end_number=end_number, - profile=profile, - current_index=current_index, - current_strategy=scheduling_strategy, - current_aggregator=aggregator, - current_benchmark=None, - current_result=None, - ) - elif result.type_ == "run_complete": - yield BenchmarkerResult( - type_="scheduler_complete", - start_time=start_time, - end_number=end_number, - profile=profile, - current_index=current_index, - current_strategy=scheduling_strategy, - current_aggregator=aggregator, - current_benchmark=None, - current_result=None, - ) - elif isinstance(result, SchedulerRequestResult): - aggregator.add_result(result) + if isinstance(existing, dict): + return {**add_kwargs, **existing} - yield BenchmarkerResult( - type_="scheduler_update", - start_time=start_time, - end_number=end_number, - profile=profile, - current_index=current_index, - current_strategy=scheduling_strategy, - current_aggregator=aggregator, - current_benchmark=None, - current_result=result, - ) - else: - raise ValueError(f"Unexpected result type: {type(result)}") + return existing.__class__(**{**add_kwargs, **existing.model_dump()}) - benchmark: BenchmarkT = aggregator.compile() - profile.completed_strategy( - average_rate=benchmark.metrics.requests_per_second.successful.mean, - average_concurrency=benchmark.metrics.request_concurrency.successful.mean, - ) - - yield BenchmarkerResult( - type_="benchmark_compiled", - start_time=start_time, - end_number=end_number, - profile=profile, - current_index=current_index, - 
current_strategy=scheduling_strategy, - current_aggregator=None, - current_benchmark=benchmark, - current_result=None, - ) + for key, aggregator in aggregators.items(): + if not isinstance(aggregator, CompilableAggregator): + continue - yield BenchmarkerResult( - type_="run_complete", - start_time=start_time, - end_number=end_number, - profile=profile, - current_index=current_index, - current_strategy=None, - current_aggregator=None, - current_benchmark=None, - current_result=None, - ) + compiled = aggregator.compile(aggregators_state[key], scheduler_state) - @abstractmethod - def create_benchmark_aggregator( - self, - run_id: str, - profile: Profile, - strategy_index: int, - strategy: SchedulingStrategy, - limits: BenchmarkerStrategyLimits, - ) -> AggregatorT: ... - - -class GenerativeBenchmarker( - Benchmarker[ - GenerativeBenchmarkAggregator, - GenerativeBenchmark, - GenerationRequest, - ResponseSummary, - ], -): - def __init__( - self, - backend: Backend, - request_loader: Iterable[GenerationRequest], - request_loader_description: GenerativeRequestLoaderDescription, - benchmark_save_extras: Optional[dict[str, Any]] = None, - processor: Optional[Union[str, Path, PreTrainedTokenizerBase]] = None, - processor_args: Optional[dict[str, Any]] = None, - ): - super().__init__( - worker=GenerativeRequestsWorker(backend), - request_loader=request_loader, - requests_loader_description=request_loader_description, - benchmark_save_extras=benchmark_save_extras, - ) - self.processor = processor - self.processor_args = processor_args + for field_name, field_val in compiled.items(): + if field_name in benchmark_kwargs: + # If the key already exists, merge the values + benchmark_kwargs[field_name] = _combine( + benchmark_kwargs[field_name], field_val + ) + else: + benchmark_kwargs[field_name] = field_val - def create_benchmark_aggregator( - self, - run_id: str, - profile: Profile, - strategy_index: int, - strategy: SchedulingStrategy, - limits: BenchmarkerStrategyLimits, - ) -> GenerativeBenchmarkAggregator: - return GenerativeBenchmarkAggregator( - run_id=run_id, - args=BenchmarkArgs( - profile=profile, - strategy_index=strategy_index, - strategy=strategy, - max_number=limits.max_number, - max_duration=limits.max_duration, - warmup_number=limits.warmup_number, - warmup_duration=limits.warmup_duration, - cooldown_number=limits.cooldown_number, - cooldown_duration=limits.cooldown_duration, - ), - worker_description=self.worker.description, # type: ignore[arg-type] - request_loader_description=self.requests_loader_description, # type: ignore[arg-type] - extras=self.benchmark_save_extras or {}, - processor=self.processor, - processor_args=self.processor_args, - ) + return benchmark_kwargs diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 2ef85c3e..60077ee8 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -1,23 +1,56 @@ +from __future__ import annotations + from collections.abc import Iterable from pathlib import Path -from typing import Any, Literal, Optional, Union +from typing import Any, Literal from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict from transformers import ( # type: ignore[import] PreTrainedTokenizerBase, ) -from guidellm.backend import Backend, BackendType -from guidellm.benchmark.benchmarker import GenerativeBenchmarker +from guidellm.backends import ( + Backend, + BackendType, + GenerationRequest, + GenerationResponse, +) +from guidellm.benchmark.aggregator import 
( + Aggregator, + CompilableAggregator, + GenerativeRequestsAggregator, + GenerativeStatsProgressAggregator, + SchedulerStatsAggregator, + SerializableAggregator, +) +from guidellm.benchmark.benchmarker import Benchmarker +from guidellm.benchmark.objects import GenerativeBenchmark, GenerativeBenchmarksReport from guidellm.benchmark.output import ( - GenerativeBenchmarksConsole, - GenerativeBenchmarksReport, + GenerativeBenchmarkerConsole, + GenerativeBenchmarkerOutput, +) +from guidellm.benchmark.profile import Profile, ProfileType +from guidellm.benchmark.progress import ( + BenchmarkerProgress, + BenchmarkerProgressGroup, ) -from guidellm.benchmark.profile import ProfileType, create_profile -from guidellm.benchmark.progress import GenerativeTextBenchmarkerProgressDisplay from guidellm.benchmark.scenario import GenerativeTextScenario, Scenario from guidellm.request import GenerativeRequestLoader -from guidellm.scheduler import StrategyType +from guidellm.scheduler import ( + ConstraintInitializer, + NonDistributedEnvironment, + StrategyType, +) +from guidellm.utils import Console, InfoMixin + +__all__ = [ + "benchmark_generative_text", + "benchmark_with_scenario", + "reimport_benchmarks_report", +] + + +_CURRENT_WORKING_DIR = Path.cwd() async def benchmark_with_scenario(scenario: Scenario, **kwargs): @@ -31,135 +64,250 @@ async def benchmark_with_scenario(scenario: Scenario, **kwargs): raise ValueError(f"Unsupported Scenario type {type(scenario)}") -async def benchmark_generative_text( +# @validate_call(config={"arbitrary_types_allowed": True}) +async def benchmark_generative_text( # noqa: C901 target: str, - backend_type: BackendType, - backend_args: Optional[dict[str, Any]], - model: Optional[str], - processor: Optional[Optional[Union[str, Path, PreTrainedTokenizerBase]]], - processor_args: Optional[dict[str, Any]], - data: Union[ - str, - Path, - Iterable[Union[str, dict[str, Any]]], - Dataset, - DatasetDict, - IterableDataset, - IterableDatasetDict, - ], - data_args: Optional[dict[str, Any]], - data_sampler: Optional[Literal["random"]], - rate_type: Union[StrategyType, ProfileType], - rate: Optional[Union[float, list[float]]], - max_seconds: Optional[float], - max_requests: Optional[int], - warmup_percent: Optional[float], - cooldown_percent: Optional[float], - output_path: Optional[Union[str, Path]], - output_extras: Optional[dict[str, Any]], - output_sampling: Optional[int], - random_seed: int, - show_progress: bool = True, - show_progress_scheduler_stats: bool = False, - output_console: bool = True, -) -> tuple[GenerativeBenchmarksReport, Optional[Path]]: - console = GenerativeBenchmarksConsole(enabled=show_progress) - console.print_line("Creating backend...") - backend = Backend.create( - backend_type, target=target, model=model, **(backend_args or {}) - ) - await backend.validate() - console.print_line( - f"Backend {backend_type} connected to {target} for model {backend.model}." 
- ) + data: ( + Iterable[str] + | Iterable[dict[str, Any]] + | Dataset + | DatasetDict + | IterableDataset + | IterableDatasetDict + | str + | Path + ), + profile: StrategyType | ProfileType | Profile, + rate: float | list[float] | None = None, + random_seed: int = 42, + # Backend configuration + backend: BackendType | Backend = "openai_http", + backend_kwargs: dict[str, Any] | None = None, + model: str | None = None, + # Data configuration + processor: str | Path | PreTrainedTokenizerBase | None = None, + processor_args: dict[str, Any] | None = None, + data_args: dict[str, Any] | None = None, + data_sampler: Literal["random"] | None = None, + # Output configuration + output_path: str | Path | None = _CURRENT_WORKING_DIR, + output_formats: ( + tuple[str, ...] + | list[str] + | dict[str, str | dict[str, Any] | GenerativeBenchmarkerOutput] + | None + ) = ("console", "json", "html", "csv"), + # Updates configuration + progress: tuple[str, ...] | list[str] | list[BenchmarkerProgress] | None = None, + print_updates: bool = False, + # Aggregators configuration + add_aggregators: ( + dict[str, str | dict[str, Any] | Aggregator | CompilableAggregator] | None + ) = None, + warmup: float | None = None, + cooldown: float | None = None, + request_samples: int | None = 20, + # Constraints configuration + max_seconds: int | float | None = None, + max_requests: int | None = None, + max_errors: int | None = None, + max_error_rate: float | None = None, + max_global_error_rate: float | None = None, + **constraints: dict[str, ConstraintInitializer | Any], +) -> tuple[GenerativeBenchmarksReport, dict[str, Any]]: + console = Console(quiet=not print_updates) - if processor is None: - processor = backend.model - - console.print_line("Creating request loader...") - request_loader = GenerativeRequestLoader( - data=data, - data_args=data_args, - processor=processor, - processor_args=processor_args, - shuffle=data_sampler == "random", - iter_type=( - "finite" # assume a finite dataset is our limit - if max_requests is None and max_seconds is None - else "infinite" # default to infinite so we don't run out of data - ), - random_seed=random_seed, - ) - unique_requests = request_loader.num_unique_items(raise_err=False) - console.print_line( - f"Created loader with {unique_requests} unique requests from {data}.\n\n" - if unique_requests > 0 - else f"Created loader with unknown number unique requests from {data}.\n\n" - ) + with console.print_update_step( + title=f"Initializing backend {backend}" + ) as console_step: + backend = ( + Backend.create( + backend, target=target, model=model, **(backend_kwargs or {}) + ) + if not isinstance(backend, Backend) + else backend + ) + console_step.update(f"{backend.__class__.__name__} backend initialized") + await backend.process_startup() + await backend.validate() + console_step.finish( + title=f"{backend.__class__.__name__} backend initialized", + details=backend.info, + status_level="success", + ) - profile = create_profile(rate_type=rate_type, rate=rate) - benchmarker = GenerativeBenchmarker( - backend=backend, - request_loader=request_loader, - request_loader_description=request_loader.description, - benchmark_save_extras=output_extras, - processor=processor, - processor_args=processor_args, - ) - progress = ( - GenerativeTextBenchmarkerProgressDisplay( - display_scheduler_stats=show_progress_scheduler_stats + with console.print_update_step(title="Resolving processor") as console_step: + if processor is not None: + console_step.finish( + title="Processor resolved", + 
details=f"Using processor '{processor}'", + status_level="success", + ) + elif model is not None: + console_step.finish( + title="Processor resolved", + details=f"Using model '{model}' as processor", + status_level="success", + ) + processor = model + else: + console_step.update( + title="Resolving processor from backend.default_model", + status_level="info", + ) + processor = await backend.default_model() + console_step.finish( + title="Processor resolved", + details=( + f"Using model '{processor}' from backend " + f"{backend.__class__.__name__} as processor" + ), + status_level="success", + ) + await backend.process_shutdown() + + with console.print_update_step( + title=f"Initializing request loader from {data}" + ) as console_step: + request_loader = GenerativeRequestLoader( + data=data, + data_args=data_args, + processor=processor, + processor_args=processor_args, + shuffle=data_sampler == "random", + random_seed=random_seed, + ) + unique_requests = request_loader.num_unique_items(raise_err=False) + console_step.finish( + title=( + f"Request loader initialized with {unique_requests} unique requests " + f"from {data}" + ), + details=InfoMixin.extract_from_obj(request_loader), + status_level="success", + ) + + with console.print_update_step( + title=f"Resolving profile {profile}" + ) as console_step: + for key, val in { + "max_seconds": max_seconds, + "max_requests": max_requests, + "max_errors": max_errors, + "max_error_rate": max_error_rate, + "max_global_error_rate": max_global_error_rate, + }.items(): + if val is not None: + constraints[key] = val + if not isinstance(profile, Profile): + profile = Profile.create( + rate_type=profile, + rate=rate, + random_seed=random_seed, + constraints={**constraints}, + ) + elif constraints: + raise ValueError( + "Constraints must be empty when providing a Profile instance. 
" + f"Provided constraints: {constraints} ; provided profile: {profile}" + ) + console_step.finish( + title=f"{profile.__class__.__name__} profile resolved", + details=InfoMixin.extract_from_obj(profile), + status_level="success", + ) + + with console.print_update_step( + title="Creating benchmark aggregators" + ) as console_step: + aggregators = { + "scheduler_stats": SchedulerStatsAggregator(), + "requests_progress": GenerativeStatsProgressAggregator(), + "requests": GenerativeRequestsAggregator( + request_samples=request_samples, + warmup=warmup, + cooldown=cooldown, + ), + **SerializableAggregator.resolve(add_aggregators or {}), + } + console_step.finish( + title="Benchmark aggregators created", + details={key: str(val) for key, val in aggregators.items()}, + status_level="success", + ) + + with console.print_update_step(title="Resolving output formats") as console_step: + output_formats = GenerativeBenchmarkerOutput.resolve( + output_formats=(output_formats or {}), output_path=output_path + ) + console_step.finish( + title="Output formats resolved", + details={key: str(val) for key, val in output_formats.items()}, + status_level="success", ) - if show_progress - else None + + progress_group = BenchmarkerProgressGroup( + instances=progress or [], enabled=bool(progress) ) report = GenerativeBenchmarksReport() + console.print_update( + title="Setup complete, starting benchmarks...", status="success" + ) + console.print("\n\n") - async for result in benchmarker.run( - profile=profile, - max_number_per_strategy=max_requests, - max_duration_per_strategy=max_seconds, - warmup_percent_per_strategy=warmup_percent, - cooldown_percent_per_strategy=cooldown_percent, + async for ( + _aggregator_update, + benchmark, + _strategy, + _scheduler_state, + ) in progress_group( + profile, + Benchmarker[ + GenerativeBenchmark, + GenerationRequest, + GenerationResponse, + ]().run( + requests=request_loader, + backend=backend, + profile=profile, + environment=NonDistributedEnvironment(), + benchmark_aggregators=aggregators, + benchmark_class=GenerativeBenchmark, + ), ): - if progress: - progress.update(result) - - if result.type_ == "benchmark_compiled": - if result.current_benchmark is None: - raise ValueError("Current benchmark is None") - report.benchmarks.append( - result.current_benchmark.set_sample_size(output_sampling) - ) + if benchmark: + report.benchmarks.append(benchmark) - if output_console: - console.benchmarks = report.benchmarks - console.print_full_report() + output_format_results = {} + for key, output in output_formats.items(): + output_result = await output.finalize(report) + output_format_results[key] = output_result - if output_path: - console.print_line("\nSaving benchmarks report...") - saved_path = report.save_file(output_path) - console.print_line(f"Benchmarks report saved to {saved_path}") - else: - saved_path = None - - console.print_line("\nBenchmarking complete.") + console.print("\n\n") + console.print_update( + title=f"Benchmarking complete, generated {len(report.benchmarks)} benchmark(s)", + status="success", + ) + for key, value in output_format_results.items(): + console.print_update(title=f" {key:<8}: {value}", status="debug") - return report, saved_path + return report, output_format_results -def reimport_benchmarks_report(file: Path, output_path: Optional[Path]) -> None: +def reimport_benchmarks_report(file: Path, output_path: Path | None) -> None: """ The command-line entry point for re-importing and displaying an existing benchmarks report. 
Can also specify an output path to save a copy of the report.
    Assumes the file provided exists.
    """
-    console = GenerativeBenchmarksConsole(enabled=True)
     report = GenerativeBenchmarksReport.load_file(file)
-    console.benchmarks = report.benchmarks
-    console.print_full_report()
+    console_output = GenerativeBenchmarkerConsole()
+    console_output.finalize(report)
+    console = Console()

     if output_path:
-        console.print_line("\nSaving benchmarks report...")
-        saved_path = report.save_file(output_path)
-        console.print_line(f"Benchmarks report saved to {saved_path}")
+        with console.print_update_step(
+            title=f"Saving benchmarks report to {output_path}..."
+        ) as console_step:
+            saved_path = report.save_file(output_path)
+            console_step.finish(title=f"Benchmarks report saved to {saved_path}")
diff --git a/src/guidellm/benchmark/objects.py b/src/guidellm/benchmark/objects.py
new file mode 100644
index 00000000..8afabba9
--- /dev/null
+++ b/src/guidellm/benchmark/objects.py
@@ -0,0 +1,473 @@
+"""
+Benchmark data models and metrics for performance measurement and analysis.
+
+Provides comprehensive data structures for capturing, storing, and analyzing
+benchmark results from scheduler executions. Includes timing measurements,
+token statistics, and performance metrics for generative AI workloads.
+
+Classes:
+    BenchmarkSchedulerStats: Scheduler timing and performance statistics.
+    BenchmarkMetrics: Core benchmark metrics and distributions.
+    BenchmarkRequestStats: Individual request processing statistics.
+    Benchmark: Base benchmark result container with generic metrics.
+    GenerativeRequestStats: Request statistics for generative AI workloads.
+    GenerativeMetrics: Comprehensive metrics for generative benchmarks.
+    GenerativeBenchmark: Complete generative benchmark results and analysis.
+    GenerativeBenchmarksReport: Container for multiple benchmark results.
+
+Type Variables:
+    BenchmarkMetricsT: Generic benchmark metrics type.
+    BenchmarkRequestStatsT: Generic request statistics type.
+    BenchmarkT: Generic benchmark container type.
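+
+Example (illustrative; assumes a report was previously saved to disk):
+    report = GenerativeBenchmarksReport.load_file("benchmarks.json")
+    print(len(report.benchmarks))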
+""" + +from __future__ import annotations + +import json +import uuid +from pathlib import Path +from typing import Any, ClassVar, Generic, Literal, TypeVar + +import yaml +from pydantic import Field, computed_field + +from guidellm.benchmark.profile import ( + Profile, +) +from guidellm.scheduler import ( + ScheduledRequestInfo, + SchedulerState, + SchedulingStrategy, +) +from guidellm.utils import ( + StandardBaseDict, + StandardBaseModel, + StatusBreakdown, + StatusDistributionSummary, +) + +__all__ = [ + "Benchmark", + "BenchmarkMetrics", + "BenchmarkSchedulerStats", + "BenchmarkT", + "GenerativeBenchmark", + "GenerativeBenchmarksReport", + "GenerativeMetrics", + "GenerativeRequestStats", +] + + +class BenchmarkSchedulerStats(StandardBaseDict): + """Scheduler timing and performance statistics.""" + + start_time: float = Field( + description="Unix timestamp when the benchmark run started" + ) + end_time: float = Field(description="Unix timestamp when the benchmark run ended") + requests_made: StatusBreakdown[int, int, int, int] = Field( + description="Request counts by status: successful, incomplete, errored, total" + ) + queued_time_avg: float = Field( + description="Avg time requests spent in the queue (seconds)" + ) + worker_resolve_start_delay_avg: float = Field( + description="Avg delay before worker begins resolving req after dequeue (sec)" + ) + worker_resolve_time_avg: float = Field( + description="Avg time for worker to resolve requests (seconds)" + ) + worker_resolve_end_delay_avg: float = Field( + description="Avg delay after request end till worker resolves (seconds)" + ) + finalized_delay_avg: float = Field( + description="Avg delay after resolve til finalized with in scheduler (sec)" + ) + worker_targeted_start_delay_avg: float = Field( + description="Avg delay from targeted start to actual worker start (seconds)" + ) + request_start_delay_avg: float = Field( + description="Avg delay after resolve til request start (seconds)" + ) + request_time_avg: float = Field(description="Avg request processing time (seconds)") + request_targeted_start_delay_avg: float = Field( + description="Avg delay from targeted start to actual request start" + ) + + +class SchedulerDict(StandardBaseDict): + """Scheduler configuration and execution state dictionary.""" + + strategy: SchedulingStrategy + constraints: dict[str, dict[str, Any]] + state: SchedulerState + + +class BenchmarkerDict(StandardBaseDict): + """Benchmarker configuration and component settings dictionary.""" + + profile: Profile + requests: dict[str, Any] + backend: dict[str, Any] + environment: dict[str, Any] + aggregators: dict[str, dict[str, Any]] + + +class BenchmarkMetrics(StandardBaseDict): + """Core benchmark metrics and statistical distributions.""" + + requests_per_second: StatusDistributionSummary = Field( + description="Distribution of requests per second across benchmark execution" + ) + request_concurrency: StatusDistributionSummary = Field( + description="Distribution of concurrent request counts during execution" + ) + request_latency: StatusDistributionSummary = Field( + description="Distribution of request latencies for completed requests" + ) + + +BenchmarkMetricsT = TypeVar("BenchmarkMetricsT", bound=BenchmarkMetrics) + + +class BenchmarkRequestStats(StandardBaseDict): + """Individual request processing statistics and scheduling metadata.""" + + scheduler_info: ScheduledRequestInfo = Field( + description="Scheduler metadata and timing information for the request" + ) + + +BenchmarkRequestStatsT = 
TypeVar("BenchmarkRequestStatsT", bound=BenchmarkRequestStats) + + +class Benchmark(StandardBaseDict, Generic[BenchmarkMetricsT, BenchmarkRequestStatsT]): + """Base benchmark result container with execution metadata.""" + + type_: Literal["benchmark"] = "benchmark" + id_: str = Field( + default_factory=lambda: str(uuid.uuid4()), + description="Unique identifier for this benchmark execution", + ) + run_id: str = Field( + description="Identifier for the benchmarker run containing this benchmark" + ) + run_index: int = Field( + description="Sequential index of this benchmark within the benchmarker run" + ) + scheduler: SchedulerDict = Field( + description="Scheduler configuration and execution state" + ) + benchmarker: BenchmarkerDict = Field( + description="Benchmarker configuration and component settings" + ) + env_args: StandardBaseDict = Field( + description="Environment arguments and runtime configuration" + ) + extras: StandardBaseDict = Field( + description="Additional metadata and custom benchmark parameters" + ) + run_stats: BenchmarkSchedulerStats = Field( + description="Scheduler timing and performance statistics" + ) + start_time: float = Field( + default=-1.0, description="Unix timestamp when the first request was initiated" + ) + end_time: float = Field( + default=-1.0, description="Unix timestamp when the last request completed" + ) + + @computed_field # type: ignore[misc] + @property + def duration(self) -> float: + """ + Benchmark execution duration in seconds. + + :return: Time elapsed from first request start to last request completion. + """ + return self.end_time - self.start_time + + metrics: BenchmarkMetricsT = Field( + description="Performance metrics and statistical distributions" + ) + request_totals: StatusBreakdown[int, int, int, int] = Field( + description="Request counts by status: successful, incomplete, errored, total" + ) + requests: StatusBreakdown[ + list[BenchmarkRequestStatsT], + list[BenchmarkRequestStatsT], + list[BenchmarkRequestStatsT], + None, + ] = Field( + description="Request details grouped by status: successful, incomplete, errored" + ) + + +BenchmarkT = TypeVar("BenchmarkT", bound=Benchmark) + + +class GenerativeRequestStats(BenchmarkRequestStats): + """Request statistics for generative AI text generation workloads.""" + + type_: Literal["generative_request_stats"] = "generative_request_stats" + request_id: str = Field(description="Unique identifier for the request") + request_type: Literal["text_completions", "chat_completions"] = Field( + description="Type of generative request: text or chat completion" + ) + prompt: str = Field(description="Input text prompt for generation") + request_args: dict[str, Any] = Field( + description="Generation parameters and configuration options" + ) + output: str | None = Field( + description="Generated text output, if request completed successfully" + ) + iterations: int = Field( + description="Number of processing iterations for the request" + ) + prompt_tokens: int | None = Field( + description="Number of tokens in the input prompt" + ) + output_tokens: int | None = Field( + description="Number of tokens in the generated output" + ) + + @computed_field # type: ignore[misc] + @property + def total_tokens(self) -> int | None: + """ + Total token count including prompt and output tokens. + + :return: Sum of prompt and output tokens, or None if either is unavailable. 
+ """ + if self.prompt_tokens is None and self.output_tokens is None: + return None + + return (self.prompt_tokens or 0) + (self.output_tokens or 0) + + @computed_field # type: ignore[misc] + @property + def request_latency(self) -> float | None: + """ + End-to-end request processing latency in seconds. + + :return: Duration from request start to completion, or None if unavailable. + """ + if ( + not self.scheduler_info.request_timings.request_end + or not self.scheduler_info.request_timings.request_start + ): + return None + + return ( + self.scheduler_info.request_timings.request_end + - self.scheduler_info.request_timings.request_start + ) + + @computed_field # type: ignore[misc] + @property + def time_to_first_token_ms(self) -> float | None: + """ + Time to first token generation in milliseconds. + + :return: Latency from request start to first token, or None if unavailable. + """ + if ( + not self.scheduler_info.request_timings.first_iteration + or not self.scheduler_info.request_timings.request_start + ): + return None + + return 1000 * ( + self.scheduler_info.request_timings.first_iteration + - self.scheduler_info.request_timings.request_start + ) + + @computed_field # type: ignore[misc] + @property + def time_per_output_token_ms(self) -> float | None: + """ + Average time per output token in milliseconds. + + Includes time for first token and all subsequent tokens. + + :return: Average milliseconds per output token, or None if unavailable. + """ + if ( + not self.scheduler_info.request_timings.request_start + or not self.scheduler_info.request_timings.last_iteration + or not self.output_tokens + ): + return None + + return ( + 1000 + * ( + self.scheduler_info.request_timings.last_iteration + - self.scheduler_info.request_timings.request_start + ) + / self.output_tokens + ) + + @computed_field # type: ignore[misc] + @property + def inter_token_latency_ms(self) -> float | None: + """ + Average inter-token latency in milliseconds. + + Measures time between token generations, excluding first token. + + :return: Average milliseconds between tokens, or None if unavailable. + """ + if ( + not self.scheduler_info.request_timings.first_iteration + or not self.scheduler_info.request_timings.last_iteration + or not self.output_tokens + or self.output_tokens <= 1 + ): + return None + + return ( + 1000 + * ( + self.scheduler_info.request_timings.last_iteration + - self.scheduler_info.request_timings.first_iteration + ) + / (self.output_tokens - 1) + ) + + @computed_field # type: ignore[misc] + @property + def tokens_per_second(self) -> float | None: + """ + Overall token throughput including prompt and output tokens. + + :return: Total tokens per second, or None if unavailable. + """ + if not (latency := self.request_latency) or not (tokens := self.total_tokens): + return None + + return tokens / latency + + @computed_field # type: ignore[misc] + @property + def output_tokens_per_second(self) -> float | None: + """ + Output token generation throughput. + + :return: Output tokens per second, or None if unavailable. 
+ """ + if not (latency := self.request_latency) or not self.output_tokens: + return None + + return self.output_tokens / latency + + +class GenerativeMetrics(BenchmarkMetrics): + """Comprehensive metrics for generative AI benchmarks.""" + + prompt_token_count: StatusDistributionSummary = Field( + description="Distribution of prompt token counts by request status" + ) + output_token_count: StatusDistributionSummary = Field( + description="Distribution of output token counts by request status" + ) + total_token_count: StatusDistributionSummary = Field( + description="Distribution of total token counts by request status" + ) + time_to_first_token_ms: StatusDistributionSummary = Field( + description="Distribution of first token latencies in milliseconds" + ) + time_per_output_token_ms: StatusDistributionSummary = Field( + description="Distribution of average time per output token in milliseconds" + ) + inter_token_latency_ms: StatusDistributionSummary = Field( + description="Distribution of inter-token latencies in milliseconds" + ) + output_tokens_per_second: StatusDistributionSummary = Field( + description="Distribution of output token generation rates" + ) + tokens_per_second: StatusDistributionSummary = Field( + description="Distribution of total token throughput including prompt and output" + ) + + +class GenerativeBenchmark(Benchmark[GenerativeMetrics, GenerativeRequestStats]): + """Complete generative AI benchmark results with specialized metrics.""" + + type_: Literal["generative_benchmark"] = "generative_benchmark" # type: ignore[assignment] + + +class GenerativeBenchmarksReport(StandardBaseModel): + """Container for multiple benchmark results with load/save functionality.""" + + DEFAULT_FILE: ClassVar[str] = "benchmarks.json" + + @staticmethod + def load_file( + path: str | Path, type_: Literal["json", "yaml"] | None = None + ) -> GenerativeBenchmarksReport: + """ + Load a report from a file. + + :param path: The path to load the report from. + :param type_: File type override, auto-detected from extension if None. + :return: The loaded report. + :raises ValueError: If file type is unsupported. + """ + path = Path(path) if not isinstance(path, Path) else path + + if path.is_dir(): + path = path / GenerativeBenchmarksReport.DEFAULT_FILE + + path.parent.mkdir(parents=True, exist_ok=True) + path_suffix = path.suffix.lower()[1:] + + with path.open("r") as file: + if (type_ or path_suffix) == "json": + model_dict = json.loads(file.read()) + elif (type_ or path_suffix) in ["yaml", "yml"]: + model_dict = yaml.safe_load(file) + else: + raise ValueError(f"Unsupported file type: {type_} for {path}.") + + return GenerativeBenchmarksReport.model_validate(model_dict) + + benchmarks: list[GenerativeBenchmark] = Field( + description="The list of completed benchmarks contained within the report.", + default_factory=list, + ) + + def save_file( + self, path: str | Path | None, type_: Literal["json", "yaml"] | None = None + ) -> Path: + """ + Save the report to a file. + + :param path: The path to save the report to. + :param type_: File type override, auto-detected from extension if None. + :return: The path to the saved report. + :raises ValueError: If file type is unsupported. 
+ """ + if path is None: + path = Path.cwd() + elif not isinstance(path, Path): + path = Path(path) + + if path.is_dir(): + path = path / GenerativeBenchmarksReport.DEFAULT_FILE + + path.parent.mkdir(parents=True, exist_ok=True) + path_suffix = path.suffix.lower()[1:] + model_dict = self.model_dump() + + if (type_ or path_suffix) == "json": + save_str = json.dumps(model_dict) + elif (type_ or path_suffix) in ["yaml", "yml"]: + save_str = yaml.dump(model_dict) + else: + raise ValueError(f"Unsupported file type: {type_} for {path}.") + + with path.open("w") as file: + file.write(save_str) + + return path diff --git a/src/guidellm/benchmark/output.py b/src/guidellm/benchmark/output.py index 6759f16f..95b51d70 100644 --- a/src/guidellm/benchmark/output.py +++ b/src/guidellm/benchmark/output.py @@ -1,429 +1,318 @@ +from __future__ import annotations + import csv import json import math +from abc import ABC, abstractmethod from collections import OrderedDict -from copy import deepcopy from datetime import datetime from pathlib import Path -from typing import Any, Literal, Optional, Union +from typing import Any, ClassVar -import yaml -from pydantic import Field +from pydantic import BaseModel, ConfigDict, Field from rich.console import Console from rich.padding import Padding from rich.text import Text -from guidellm.benchmark.benchmark import GenerativeBenchmark, GenerativeMetrics +from guidellm.benchmark.objects import ( + GenerativeBenchmark, + GenerativeBenchmarksReport, + GenerativeMetrics, +) from guidellm.benchmark.profile import ( AsyncProfile, ConcurrentProfile, SweepProfile, ThroughputProfile, ) -from guidellm.objects import ( - DistributionSummary, - StandardBaseModel, - StatusDistributionSummary, -) from guidellm.presentation import UIDataBuilder from guidellm.presentation.injector import create_report -from guidellm.scheduler import strategy_display_str from guidellm.settings import settings -from guidellm.utils import Colors, split_text_list_by_length -from guidellm.utils.dict import recursive_key_update -from guidellm.utils.text import camelize_str +from guidellm.utils import ( + Colors, + DistributionSummary, + RegistryMixin, + StatusDistributionSummary, + safe_format_timestamp, + split_text_list_by_length, +) __all__ = [ - "GenerativeBenchmarksConsole", - "GenerativeBenchmarksReport", + "GenerativeBenchmarkerCSV", + "GenerativeBenchmarkerConsole", + "GenerativeBenchmarkerHTML", + "GenerativeBenchmarkerOutput", ] -class GenerativeBenchmarksReport(StandardBaseModel): - """ - A pydantic model representing a completed benchmark report. - Contains a list of benchmarks along with convenience methods for finalizing - and saving the report. - """ - - @staticmethod - def load_file(path: Union[str, Path]) -> "GenerativeBenchmarksReport": - """ - Load a report from a file. The file type is determined by the file extension. - If the file is a directory, it expects a file named benchmarks.json under the - directory. - - :param path: The path to load the report from. - :return: The loaded report. 
- """ - path, type_ = GenerativeBenchmarksReport._file_setup(path) - - if type_ == "json": - with path.open("r") as file: - model_dict = json.load(file) - - return GenerativeBenchmarksReport.model_validate(model_dict) - - if type_ == "yaml": - with path.open("r") as file: - model_dict = yaml.safe_load(file) - - return GenerativeBenchmarksReport.model_validate(model_dict) - - if type_ == "csv": - raise ValueError(f"CSV file type is not supported for loading: {path}.") - - if type_ == "html": - raise ValueError(f"HTML file type is not supported for loading: {path}.") - - raise ValueError(f"Unsupported file type: {type_} for {path}.") - - benchmarks: list[GenerativeBenchmark] = Field( - description="The list of completed benchmarks contained within the report.", - default_factory=list, +class GenerativeBenchmarkerOutput( + BaseModel, RegistryMixin[type["GenerativeBenchmarkerOutput"]], ABC +): + model_config = ConfigDict( + extra="ignore", + arbitrary_types_allowed=True, + validate_assignment=True, + from_attributes=True, + use_enum_values=True, ) - def set_sample_size( - self, sample_size: Optional[int] - ) -> "GenerativeBenchmarksReport": + @classmethod + @abstractmethod + def validated_kwargs(cls, *args, **kwargs) -> dict[str, Any]: """ - Set the sample size for each benchmark in the report. In doing this, it will - reduce the contained requests of each benchmark to the sample size. - If sample size is None, it will return the report as is. + Validate and process arguments for constraint creation. - :param sample_size: The sample size to set for each benchmark. - If None, the report will be returned as is. - :return: The report with the sample size set for each benchmark. - """ - - if sample_size is not None: - for benchmark in self.benchmarks: - benchmark.set_sample_size(sample_size) + Must be implemented by subclasses to handle their specific parameter patterns. - return self - - def save_file(self, path: Union[str, Path]) -> Path: + :param args: Positional arguments passed to the constraint + :param kwargs: Keyword arguments passed to the constraint + :return: Validated dictionary of parameters for constraint creation + :raises NotImplementedError: Must be implemented by subclasses """ - Save the report to a file. The file type is determined by the file extension. - If the file is a directory, it will save the report to a file named - benchmarks.json under the directory. + ... - :param path: The path to save the report to. - :return: The path to the saved report. - """ - path, type_ = GenerativeBenchmarksReport._file_setup(path) - - if type_ == "json": - return self.save_json(path) - - if type_ == "yaml": - return self.save_yaml(path) - - if type_ == "csv": - return self.save_csv(path) - - if type_ == "html": - return self.save_html(path) - - raise ValueError(f"Unsupported file type: {type_} for {path}.") - - def save_json(self, path: Union[str, Path]) -> Path: - """ - Save the report to a JSON file containing all of the report data which is - reloadable using the pydantic model. If the file is a directory, it will save - the report to a file named benchmarks.json under the directory. - - :param path: The path to save the report to. - :return: The path to the saved report. - """ - path, type_ = GenerativeBenchmarksReport._file_setup(path, "json") - - if type_ != "json": - raise ValueError( - f"Unsupported file type for saving a JSON: {type_} for {path}." 
- ) - - model_dict = self.model_dump() - model_json = json.dumps(model_dict) - - with path.open("w") as file: - file.write(model_json) - - return path - - def save_yaml(self, path: Union[str, Path]) -> Path: - """ - Save the report to a YAML file containing all of the report data which is - reloadable using the pydantic model. If the file is a directory, it will save - the report to a file named benchmarks.yaml under the directory. - - :param path: The path to save the report to. - :return: The path to the saved report. - """ - - path, type_ = GenerativeBenchmarksReport._file_setup(path, "yaml") - - if type_ != "yaml": - raise ValueError( - f"Unsupported file type for saving a YAML: {type_} for {path}." - ) - - model_dict = self.model_dump() - model_yaml = yaml.dump(model_dict) - - with path.open("w") as file: - file.write(model_yaml) - - return path - - def save_csv(self, path: Union[str, Path]) -> Path: - """ - Save the report to a CSV file containing the summarized statistics and values - for each report. Note, this data is not reloadable using the pydantic model. - If the file is a directory, it will save the report to a file named - benchmarks.csv under the directory. - - :param path: The path to save the report to. - :return: The path to the saved report. - """ - path, type_ = GenerativeBenchmarksReport._file_setup(path, "csv") - - if type_ != "csv": - raise ValueError( - f"Unsupported file type for saving a CSV: {type_} for {path}." + @classmethod + def resolve( + cls, + output_formats: ( + tuple[str, ...] + | list[str] + | dict[ + str, + Any | dict[str, Any] | GenerativeBenchmarkerOutput, + ] + | None + ), + output_path: str | Path | None, + ) -> dict[str, GenerativeBenchmarkerOutput]: + if not output_formats: + return {} + + if isinstance(output_formats, (list, tuple)): + # support list of output keys: ["csv", "json"] + # support list of files: ["path/to/file.json", "path/to/file.csv"] + formats_list = output_formats + output_formats = {} + for output_format in formats_list: + if not isinstance(output_format, str): + raise TypeError( + f"Expected string format, got {type(output_format)} for " + f"{output_format} in {formats_list}" + ) + try: + if cls.is_registered(output_format): + output_formats[output_format] = {} + else: + # treat it as a file save location + path = Path(output_format) + format_type = path.suffix[1:].lower() + output_formats[format_type] = {"output_path": path} + + except Exception as err: + raise ValueError( + f"Failed to resolve output format '{output_format}': {err}" + ) from err + + resolved = {} + + for key, val in output_formats.items(): + if isinstance(val, GenerativeBenchmarkerOutput): + resolved[key] = val + else: + output_class = cls.get_registered_object(key) + kwargs = {"output_path": output_path} + + if isinstance(val, dict): + kwargs.update(val) + kwargs = output_class.validated_kwargs(**kwargs) + else: + kwargs = output_class.validated_kwargs(val, **kwargs) + + resolved[key] = output_class(**kwargs) + + return resolved + + @abstractmethod + async def finalize(self, report: GenerativeBenchmarksReport) -> Any: ... 
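The registry contract above (register, validated_kwargs, finalize) is what GenerativeBenchmarkerOutput.resolve uses to turn format keys and file paths into output instances, so additional formats can be plugged in without touching the resolution logic. The sketch below is illustrative only and not part of this patch: the "txt" key, the GenerativeBenchmarkerTXT name, the benchmarks.txt default file, and the summary line format are invented for the example, while the method signatures mirror the CSV and serialized outputs defined in this module.

```python
from __future__ import annotations

from pathlib import Path
from typing import Any

from pydantic import Field

from guidellm.benchmark.objects import GenerativeBenchmarksReport
from guidellm.benchmark.output import GenerativeBenchmarkerOutput


@GenerativeBenchmarkerOutput.register("txt")  # hypothetical format key
class GenerativeBenchmarkerTXT(GenerativeBenchmarkerOutput):
    """Illustrative plain-text summary output, one line per benchmark."""

    output_path: Path = Field(default_factory=lambda: Path.cwd())

    @classmethod
    def validated_kwargs(
        cls, output_path: str | Path | None = None, **_kwargs
    ) -> dict[str, Any]:
        # Mirror the built-in outputs: only forward output_path when provided.
        return {"output_path": Path(output_path)} if output_path is not None else {}

    async def finalize(self, report: GenerativeBenchmarksReport) -> Path:
        # Resolve a directory to a default file name, then write one summary
        # line per benchmark using fields defined on GenerativeBenchmark.
        path = self.output_path
        if path.is_dir():
            path = path / "benchmarks.txt"
        path.parent.mkdir(parents=True, exist_ok=True)
        lines = [
            f"{benchmark.id_}: {benchmark.request_totals.successful} successful, "
            f"{benchmark.request_totals.errored} errored requests"
            for benchmark in report.benchmarks
        ]
        path.write_text("\n".join(lines))
        return path
```

With such a class registered, passing output_formats=("console", "txt") to benchmark_generative_text would be resolved to an instance of it by GenerativeBenchmarkerOutput.resolve, following the same path the built-in console, csv, and html formats take.
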
+ + +@GenerativeBenchmarkerOutput.register(["json", "yaml"]) +class GenerativeBenchmarkerSerialized(GenerativeBenchmarkerOutput): + @classmethod + def validated_kwargs( + cls, output_path: str | Path | None, **_kwargs + ) -> dict[str, Any]: + new_kwargs = {} + if output_path is not None: + new_kwargs["output_path"] = ( + Path(output_path) if not isinstance(output_path, Path) else output_path ) + return new_kwargs - with path.open("w", newline="") as file: - writer = csv.writer(file) - headers: list[str] = [] - rows: list[list[Union[str, float, list[float]]]] = [] - - for benchmark in self.benchmarks: - benchmark_headers: list[str] = [] - benchmark_values: list[Union[str, float, list[float]]] = [] - - desc_headers, desc_values = self._benchmark_desc_headers_and_values( - benchmark - ) - benchmark_headers += desc_headers - benchmark_values += desc_values + output_path: Path = Field(default_factory=lambda: Path.cwd()) - for status in StatusDistributionSummary.model_fields: - status_headers, status_values = ( - self._benchmark_status_headers_and_values(benchmark, status) - ) - benchmark_headers += status_headers - benchmark_values += status_values + async def finalize(self, report: GenerativeBenchmarksReport) -> Path: + return report.save_file(self.output_path) - benchmark_extra_headers, benchmark_extra_values = ( - self._benchmark_extras_headers_and_values(benchmark) - ) - benchmark_headers += benchmark_extra_headers - benchmark_values += benchmark_extra_values - if not headers: - headers = benchmark_headers - rows.append(benchmark_values) +@GenerativeBenchmarkerOutput.register("console") +class GenerativeBenchmarkerConsole(GenerativeBenchmarkerOutput): + """Console output formatter for benchmark results with rich formatting.""" - writer.writerow(headers) - for row in rows: - writer.writerow(row) + @classmethod + def validated_kwargs(cls, *_args, **_kwargs) -> dict[str, Any]: + return {} - return path + console: Console = Field(default_factory=Console) - def save_html(self, path: Union[str, Path]) -> Path: + async def finalize(self, report: GenerativeBenchmarksReport) -> str: """ - Download html, inject report data and save to a file. + Print the complete benchmark report to the console. - :param path: The path to create the report at. - :return: The path to the report. + :param report: The completed benchmark report. 
+ :return: """ + self._print_benchmarks_metadata(report.benchmarks) + self._print_benchmarks_info(report.benchmarks) + self._print_benchmarks_stats(report.benchmarks) - data_builder = UIDataBuilder(self.benchmarks) - data = data_builder.to_dict() - camel_data = recursive_key_update(deepcopy(data), camelize_str) - ui_api_data = {} - for k, v in camel_data.items(): - key = f"window.{k} = {{}};" - value = f"window.{k} = {json.dumps(v, indent=2)};\n" - ui_api_data[key] = value - return create_report(ui_api_data, path) - - @staticmethod - def _file_setup( - path: Union[str, Path], - default_file_type: Literal["json", "yaml", "csv", "html"] = "json", - ) -> tuple[Path, Literal["json", "yaml", "csv", "html"]]: - path = Path(path) if not isinstance(path, Path) else path + return "printed to console" - if path.is_dir(): - path = path / f"benchmarks.{default_file_type}" - - path.parent.mkdir(parents=True, exist_ok=True) - path_suffix = path.suffix.lower() - - if path_suffix == ".json": - return path, "json" - - if path_suffix in [".yaml", ".yml"]: - return path, "yaml" - - if path_suffix in [".csv"]: - return path, "csv" - - if path_suffix in [".html"]: - return path, "html" + def _print_benchmarks_metadata(self, benchmarks: list[GenerativeBenchmark]): + start_time = benchmarks[0].run_stats.start_time + end_time = benchmarks[-1].run_stats.end_time + duration = end_time - start_time - raise ValueError( - f"Unsupported file extension: {path_suffix} for {path}; " - "expected json, yaml, csv, or html." - ) + self._print_section_header("Benchmarks Metadata") + self._print_labeled_line("Run id", str(benchmarks[0].run_id)) + self._print_labeled_line("Duration", f"{duration:.1f} seconds") + self._print_labeled_line("Profile", self._get_profile_str(benchmarks[0])) - @staticmethod - def _benchmark_desc_headers_and_values( - benchmark: GenerativeBenchmark, - ) -> tuple[list[str], list[Union[str, float]]]: + def _print_benchmarks_info(self, benchmarks: list[GenerativeBenchmark]): + sections = { + "Metadata": (0, 3), + "Requests Made": (4, 6), + "Prompt Tok/Req": (7, 9), + "Output Tok/Req": (10, 12), + "Prompt Tok Total": (13, 15), + "Output Tok Total": (16, 18), + } headers = [ - "Type", - "Run Id", - "Id", - "Name", + "Benchmark", "Start Time", "End Time", - "Duration", - ] - values: list[Union[str, float]] = [ - benchmark.type_, - benchmark.run_id, - benchmark.id_, - strategy_display_str(benchmark.args.strategy), - datetime.fromtimestamp(benchmark.start_time).strftime("%Y-%m-%d %H:%M:%S"), - datetime.fromtimestamp(benchmark.end_time).strftime("%Y-%m-%d %H:%M:%S"), - benchmark.duration, - ] - - if len(headers) != len(values): - raise ValueError("Headers and values length mismatch.") - - return headers, values - - @staticmethod - def _benchmark_extras_headers_and_values( - benchmark: GenerativeBenchmark, - ) -> tuple[list[str], list[str]]: - headers = ["Args", "Worker", "Request Loader", "Extras"] - values: list[str] = [ - json.dumps(benchmark.args.model_dump()), - json.dumps(benchmark.worker.model_dump()), - json.dumps(benchmark.request_loader.model_dump()), - json.dumps(benchmark.extras), - ] - - if len(headers) != len(values): - raise ValueError("Headers and values length mismatch.") - - return headers, values - - @staticmethod - def _benchmark_status_headers_and_values( - benchmark: GenerativeBenchmark, status: str - ) -> tuple[list[str], list[Union[float, list[float]]]]: - headers = [ - f"{status.capitalize()} Requests", - ] - values = [ - getattr(benchmark.request_totals, status), + "Duration (s)", + 
"Comp", + "Inc", + "Err", + "Comp", + "Inc", + "Err", + "Comp", + "Inc", + "Err", + "Comp", + "Inc", + "Err", + "Comp", + "Inc", + "Err", ] - for metric in GenerativeMetrics.model_fields: - metric_headers, metric_values = ( - GenerativeBenchmarksReport._benchmark_status_metrics_stats( - benchmark, status, metric - ) + rows = [] + for benchmark in benchmarks: + rows.append( + [ + str(benchmark.scheduler.strategy), + safe_format_timestamp(benchmark.start_time), + safe_format_timestamp(benchmark.end_time), + f"{(benchmark.end_time - benchmark.start_time):.1f}", + f"{benchmark.request_totals.successful:.0f}", + f"{benchmark.request_totals.incomplete:.0f}", + f"{benchmark.request_totals.errored:.0f}", + f"{benchmark.metrics.prompt_token_count.successful.mean:.1f}", + f"{benchmark.metrics.prompt_token_count.incomplete.mean:.1f}", + f"{benchmark.metrics.prompt_token_count.errored.mean:.1f}", + f"{benchmark.metrics.output_token_count.successful.mean:.1f}", + f"{benchmark.metrics.output_token_count.incomplete.mean:.1f}", + f"{benchmark.metrics.output_token_count.errored.mean:.1f}", + f"{benchmark.metrics.prompt_token_count.successful.total_sum:.0f}", + f"{benchmark.metrics.prompt_token_count.incomplete.total_sum:.0f}", + f"{benchmark.metrics.prompt_token_count.errored.total_sum:.0f}", + f"{benchmark.metrics.output_token_count.successful.total_sum:.0f}", + f"{benchmark.metrics.output_token_count.incomplete.total_sum:.0f}", + f"{benchmark.metrics.output_token_count.errored.total_sum:.0f}", + ] ) - headers += metric_headers - values += metric_values - if len(headers) != len(values): - raise ValueError("Headers and values length mismatch.") - - return headers, values + self._print_table(headers, rows, "Benchmarks Info", sections) - @staticmethod - def _benchmark_status_metrics_stats( - benchmark: GenerativeBenchmark, - status: str, - metric: str, - ) -> tuple[list[str], list[Union[float, list[float]]]]: - status_display = status.capitalize() - metric_display = metric.replace("_", " ").capitalize() - status_dist_summary: StatusDistributionSummary = getattr( - benchmark.metrics, metric - ) - dist_summary: DistributionSummary = getattr(status_dist_summary, status) + def _print_benchmarks_stats(self, benchmarks: list[GenerativeBenchmark]): + sections = { + "Metadata": (0, 0), + "Request Stats": (1, 2), + "Out Tok/sec": (3, 3), + "Tot Tok/sec": (4, 4), + "Req Latency (sec)": (5, 7), + "TTFT (ms)": (8, 10), + "ITL (ms)": (11, 13), + "TPOT (ms)": (14, 16), + } headers = [ - f"{status_display} {metric_display} mean", - f"{status_display} {metric_display} median", - f"{status_display} {metric_display} std dev", - ( - f"{status_display} {metric_display} " - "[min, 0.1, 1, 5, 10, 25, 75, 90, 95, 99, max]" - ), - ] - values: list[Union[float, list[float]]] = [ - dist_summary.mean, - dist_summary.median, - dist_summary.std_dev, - [ - dist_summary.min, - dist_summary.percentiles.p001, - dist_summary.percentiles.p01, - dist_summary.percentiles.p05, - dist_summary.percentiles.p10, - dist_summary.percentiles.p25, - dist_summary.percentiles.p75, - dist_summary.percentiles.p90, - dist_summary.percentiles.p95, - dist_summary.percentiles.p99, - dist_summary.max, - ], + "Benchmark", + "Per Second", + "Concurrency", + "mean", + "mean", + "mean", + "median", + "p99", + "mean", + "median", + "p99", + "mean", + "median", + "p99", + "mean", + "median", + "p99", ] - if len(headers) != len(values): - raise ValueError("Headers and values length mismatch.") - - return headers, values - - -class GenerativeBenchmarksConsole: - """ - A 
class for outputting progress and benchmark results to the console. - Utilizes the rich library for formatting, enabling colored and styled output. - """ - - def __init__(self, enabled: bool = True): - """ - :param enabled: Whether to enable console output. Defaults to True. - If False, all console output will be suppressed. - """ - self.enabled = enabled - self.benchmarks: Optional[list[GenerativeBenchmark]] = None - self.console = Console() + rows = [] + for benchmark in benchmarks: + rows.append( + [ + str(benchmark.scheduler.strategy), + f"{benchmark.metrics.requests_per_second.successful.mean:.2f}", + f"{benchmark.metrics.request_concurrency.successful.mean:.2f}", + f"{benchmark.metrics.output_tokens_per_second.successful.mean:.1f}", + f"{benchmark.metrics.tokens_per_second.successful.mean:.1f}", + f"{benchmark.metrics.request_latency.successful.mean:.2f}", + f"{benchmark.metrics.request_latency.successful.median:.2f}", + f"{benchmark.metrics.request_latency.successful.percentiles.p99:.2f}", + f"{benchmark.metrics.time_to_first_token_ms.successful.mean:.1f}", + f"{benchmark.metrics.time_to_first_token_ms.successful.median:.1f}", + f"{benchmark.metrics.time_to_first_token_ms.successful.percentiles.p99:.1f}", + f"{benchmark.metrics.inter_token_latency_ms.successful.mean:.1f}", + f"{benchmark.metrics.inter_token_latency_ms.successful.median:.1f}", + f"{benchmark.metrics.inter_token_latency_ms.successful.percentiles.p99:.1f}", + f"{benchmark.metrics.time_per_output_token_ms.successful.mean:.1f}", + f"{benchmark.metrics.time_per_output_token_ms.successful.median:.1f}", + f"{benchmark.metrics.time_per_output_token_ms.successful.percentiles.p99:.1f}", + ] + ) - @property - def benchmarks_profile_str(self) -> str: - """ - :return: A string representation of the profile used for the benchmarks. - """ - profile = self.benchmarks[0].args.profile if self.benchmarks else None + self._print_table(headers, rows, "Benchmarks Stats", sections) + def _get_profile_str(self, benchmark: GenerativeBenchmark) -> str: + profile = benchmark.benchmarker.profile if profile is None: return "None" profile_args = OrderedDict( { "type": profile.type_, - "strategies": profile.strategy_types, + "strategies": getattr(profile, "strategy_types", []), } ) @@ -434,22 +323,13 @@ def benchmarks_profile_str(self) -> str: elif isinstance(profile, AsyncProfile): profile_args["max_concurrency"] = str(profile.max_concurrency) profile_args["rate"] = str(profile.rate) - profile_args["initial_burst"] = str(profile.initial_burst) elif isinstance(profile, SweepProfile): profile_args["sweep_size"] = str(profile.sweep_size) return ", ".join(f"{key}={value}" for key, value in profile_args.items()) - @property - def benchmarks_args_str(self) -> str: - """ - :return: A string representation of the arguments used for the benchmarks. - """ - args = self.benchmarks[0].args if self.benchmarks else None - - if args is None: - return "None" - + def _get_args_str(self, benchmark: GenerativeBenchmark) -> str: + args = benchmark.args args_dict = OrderedDict( { "max_number": args.max_number, @@ -460,111 +340,45 @@ def benchmarks_args_str(self) -> str: "cooldown_duration": args.cooldown_duration, } ) - return ", ".join(f"{key}={value}" for key, value in args_dict.items()) - @property - def benchmarks_worker_desc_str(self) -> str: - """ - :return: A string representation of the worker used for the benchmarks. 
- """ - return str(self.benchmarks[0].worker) if self.benchmarks else "None" - - @property - def benchmarks_request_loader_desc_str(self) -> str: - """ - :return: A string representation of the request loader used for the benchmarks. - """ - return str(self.benchmarks[0].request_loader) if self.benchmarks else "None" - - @property - def benchmarks_extras_str(self) -> str: - """ - :return: A string representation of the extras used for the benchmarks. - """ - extras = self.benchmarks[0].extras if self.benchmarks else None - - if not extras: - return "None" - - return ", ".join(f"{key}={value}" for key, value in extras.items()) - - def print_section_header(self, title: str, indent: int = 0, new_lines: int = 2): - """ - Print out a styled section header to the console. - The title is underlined, bolded, and colored with the INFO color. - - :param title: The title of the section. - :param indent: The number of spaces to indent the title. - Defaults to 0. - :param new_lines: The number of new lines to print before the title. - Defaults to 2. - """ - self.print_line( - value=f"{title}:", - style=f"bold underline {Colors.INFO}", + def _print_section_header(self, title: str, indent: int = 0, new_lines: int = 2): + self._print_line( + f"{title}:", + f"bold underline {Colors.info}", indent=indent, new_lines=new_lines, ) - def print_labeled_line( + def _print_labeled_line( self, label: str, value: str, indent: int = 4, new_lines: int = 0 ): - """ - Print out a styled, labeled line (label: value) to the console. - The label is bolded and colored with the INFO color, - and the value is italicized. - - :param label: The label of the line. - :param value: The value of the line. - :param indent: The number of spaces to indent the line. - Defaults to 4. - :param new_lines: The number of new lines to print before the line. - Defaults to 0. - """ - self.print_line( - value=[label + ":", value], - style=["bold " + Colors.INFO, "italic"], + self._print_line( + [label + ":", value], + ["bold " + Colors.info, "italic"], new_lines=new_lines, indent=indent, ) - def print_line( + def _print_line( self, - value: Union[str, list[str]], - style: Union[str, list[str]] = "", + value: str | list[str], + style: str | list[str] = "", indent: int = 0, new_lines: int = 0, ): - """ - Print out a a value to the console as a line with optional indentation. - - :param value: The value to print. - :param style: The style to apply to the value. - Defaults to none. - :param indent: The number of spaces to indent the line. - Defaults to 0. - :param new_lines: The number of new lines to print before the value. - Defaults to 0. - """ - if not self.enabled: - return - text = Text() - for _ in range(new_lines): text.append("\n") if not isinstance(value, list): value = [value] - if not isinstance(style, list): style = [style for _ in range(len(value))] if len(value) != len(style): raise ValueError( - f"Value and style length mismatch. Value length: {len(value)}, " - f"Style length: {len(style)}." 
+ f"Value and style length mismatch: {len(value)} vs {len(style)}" ) for val, sty in zip(value, style): @@ -572,128 +386,81 @@ def print_line( self.console.print(Padding.indent(text, indent)) - def print_table( + def _print_table( self, headers: list[str], rows: list[list[Any]], title: str, - sections: Optional[dict[str, tuple[int, int]]] = None, - max_char_per_col: int = 2**10, + sections: dict[str, tuple[int, int]] | None = None, + max_char_per_col: int = 1024, indent: int = 0, new_lines: int = 2, ): - """ - Print a table to the console with the given headers and rows. - - :param headers: The headers of the table. - :param rows: The rows of the table. - :param title: The title of the table. - :param sections: The sections of the table grouping columns together. - This is a mapping of the section display name to a tuple of the start and - end column indices. If None, no sections are added (default). - :param max_char_per_col: The maximum number of characters per column. - :param indent: The number of spaces to indent the table. - Defaults to 0. - :param new_lines: The number of new lines to print before the table. - Defaults to 0. - """ - if rows and any(len(row) != len(headers) for row in rows): raise ValueError( - f"Headers and rows length mismatch. Headers length: {len(headers)}, " - f"Row length: {len(rows[0]) if rows else 'N/A'}." + "Headers and rows length mismatch: " + f"{len(headers)} vs {len(rows[0]) if rows else 'N/A'}" ) - max_characters_per_column = self.calculate_max_chars_per_column( + max_chars_per_column = self._calculate_max_chars_per_column( headers, rows, sections, max_char_per_col ) - self.print_section_header(title, indent=indent, new_lines=new_lines) - self.print_table_divider( - max_characters_per_column, include_separators=False, indent=indent - ) + self._print_section_header(title, indent=indent, new_lines=new_lines) + self._print_table_divider(max_chars_per_column, False, indent) if sections: - self.print_table_sections( - sections, max_characters_per_column, indent=indent - ) - self.print_table_row( - split_text_list_by_length(headers, max_characters_per_column), - style=f"bold {Colors.INFO}", - indent=indent, - ) - self.print_table_divider( - max_characters_per_column, include_separators=True, indent=indent + self._print_table_sections(sections, max_chars_per_column, indent) + self._print_table_row( + split_text_list_by_length(headers, max_chars_per_column), + f"bold {Colors.info}", + indent, ) + self._print_table_divider(max_chars_per_column, True, indent) for row in rows: - self.print_table_row( - split_text_list_by_length(row, max_characters_per_column), - style="italic", - indent=indent, + self._print_table_row( + split_text_list_by_length(row, max_chars_per_column), + "italic", + indent, ) - self.print_table_divider( - max_characters_per_column, include_separators=False, indent=indent - ) + self._print_table_divider(max_chars_per_column, False, indent) - def calculate_max_chars_per_column( + def _calculate_max_chars_per_column( self, headers: list[str], rows: list[list[Any]], - sections: Optional[dict[str, tuple[int, int]]], + sections: dict[str, tuple[int, int]] | None, max_char_per_col: int, ) -> list[int]: - """ - Calculate the maximum number of characters per column in the table. - This is done by checking the length of the headers, rows, and optional sections - to ensure all columns are accounted for and spaced correctly. - - :param headers: The headers of the table. - :param rows: The rows of the table. 
- :param sections: The sections of the table grouping columns together. - This is a mapping of the section display name to a tuple of the start and - end column indices. If None, no sections are added (default). - :param max_char_per_col: The maximum number of characters per column. - :return: A list of the maximum number of characters per column. - """ - max_characters_per_column = [] + """Calculate maximum characters per column for table formatting.""" + max_chars_per_column = [] for ind in range(len(headers)): - max_characters_per_column.append(min(len(headers[ind]), max_char_per_col)) - + max_chars_per_column.append(min(len(headers[ind]), max_char_per_col)) for row in rows: - max_characters_per_column[ind] = max( - max_characters_per_column[ind], len(str(row[ind])) + max_chars_per_column[ind] = max( + max_chars_per_column[ind], len(str(row[ind])) ) if not sections: - return max_characters_per_column + return max_chars_per_column - for section in sections: - start_col, end_col = sections[section] - min_section_len = len(section) + ( - end_col - start_col - ) # ensure we have enough space for separators + for section, (start_col, end_col) in sections.items(): + min_section_len = len(section) + (end_col - start_col) chars_in_columns = sum( - max_characters_per_column[start_col : end_col + 1] + max_chars_per_column[start_col : end_col + 1] ) + 2 * (end_col - start_col) if min_section_len > chars_in_columns: add_chars_per_col = math.ceil( (min_section_len - chars_in_columns) / (end_col - start_col + 1) ) for col in range(start_col, end_col + 1): - max_characters_per_column[col] += add_chars_per_col + max_chars_per_column[col] += add_chars_per_col - return max_characters_per_column + return max_chars_per_column - def print_table_divider( + def _print_table_divider( self, max_chars_per_column: list[int], include_separators: bool, indent: int = 0 ): - """ - Print a divider line for the table (top and bottom of table with '=' characters) - - :param max_chars_per_column: The maximum number of characters per column. - :param include_separators: Whether to include separators between columns. - :param indent: The number of spaces to indent the line. - Defaults to 0. - """ + """Print table divider line.""" if include_separators: columns = [ settings.table_headers_border_char * max_chars @@ -706,29 +473,15 @@ def print_table_divider( settings.table_border_char * (max_chars + 2) for max_chars in max_chars_per_column ] - columns[-1] = columns[-1][:-2] - self.print_line(value=columns, style=Colors.INFO, indent=indent) + self._print_line(columns, Colors.info, indent) - def print_table_sections( + def _print_table_sections( self, sections: dict[str, tuple[int, int]], max_chars_per_column: list[int], indent: int = 0, ): - """ - Print the sections of the table with corresponding separators to the columns - the sections are mapped to to ensure it is compliant with a CSV format. - For example, a section named "Metadata" with columns 0-3 will print this: - Metadata ,,,, - Where the spaces plus the separators at the end will span the columns 0-3. - All columns must be accounted for in the sections. - - :param sections: The sections of the table. - :param max_chars_per_column: The maximum number of characters per column. - :param indent: The number of spaces to indent the line. - Defaults to 0. 
- """ section_tuples = [(start, end, name) for name, (start, end) in sections.items()] section_tuples.sort(key=lambda x: x[0]) @@ -752,30 +505,23 @@ def print_table_sections( end_col - start_col + 1 ) num_separators = end_col - start_col - line_values.append(section) - line_styles.append("bold " + Colors.INFO) - line_values.append( - " " * (section_length - len(section) - num_separators - 2) + line_values.extend( + [ + section, + " " * (section_length - len(section) - num_separators - 2), + settings.table_column_separator_char * num_separators, + settings.table_column_separator_char + " ", + ] ) - line_styles.append("") - line_values.append(settings.table_column_separator_char * num_separators) - line_styles.append("") - line_values.append(settings.table_column_separator_char + " ") - line_styles.append(Colors.INFO) + line_styles.extend(["bold " + Colors.info, "", "", Colors.info]) + line_values = line_values[:-1] line_styles = line_styles[:-1] - self.print_line(value=line_values, style=line_styles, indent=indent) + self._print_line(line_values, line_styles, indent) - def print_table_row( + def _print_table_row( self, column_lines: list[list[str]], style: str, indent: int = 0 ): - """ - Print a single row of a table to the console. - - :param column_lines: The lines of text to print for each column. - :param indent: The number of spaces to indent the line. - Defaults to 0. - """ for row in range(len(column_lines[0])): print_line = [] print_styles = [] @@ -787,212 +533,203 @@ def print_table_row( " ", ] ) - print_styles.extend([style, Colors.INFO, ""]) + print_styles.extend([style, Colors.info, ""]) print_line = print_line[:-2] print_styles = print_styles[:-2] - self.print_line(value=print_line, style=print_styles, indent=indent) + self._print_line(print_line, print_styles, indent) - def print_benchmarks_metadata(self): - """ - Print out the metadata of the benchmarks to the console including the run id, - duration, profile, args, worker, request loader, and extras. - """ - if not self.benchmarks: - raise ValueError( - "No benchmarks to print metadata for. Please set benchmarks first." 
- ) +@GenerativeBenchmarkerOutput.register("csv") +class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput): + """CSV output formatter for benchmark results.""" - start_time = self.benchmarks[0].run_stats.start_time - end_time = self.benchmarks[-1].run_stats.end_time - duration = end_time - start_time + DEFAULT_FILE: ClassVar[str] = "benchmarks.csv" - self.print_section_header(title="Benchmarks Metadata") - self.print_labeled_line( - label="Run id", - value=str(self.benchmarks[0].run_id), - ) - self.print_labeled_line( - label="Duration", - value=f"{duration:.1f} seconds", - ) - self.print_labeled_line( - label="Profile", - value=self.benchmarks_profile_str, - ) - self.print_labeled_line( - label="Args", - value=self.benchmarks_args_str, - ) - self.print_labeled_line( - label="Worker", - value=self.benchmarks_worker_desc_str, - ) - self.print_labeled_line( - label="Request Loader", - value=self.benchmarks_request_loader_desc_str, - ) - self.print_labeled_line( - label="Extras", - value=self.benchmarks_extras_str, - ) + @classmethod + def validated_kwargs( + cls, output_path: str | Path | None, **_kwargs + ) -> dict[str, Any]: + new_kwargs = {} + if output_path is not None: + new_kwargs["output_path"] = ( + Path(output_path) if not isinstance(output_path, Path) else output_path + ) + return new_kwargs + + output_path: Path = Field(default_factory=lambda: Path.cwd()) - def print_benchmarks_info(self): + async def finalize(self, report: GenerativeBenchmarksReport) -> Path: """ - Print out the benchmark information to the console including the start time, - end time, duration, request totals, and token totals for each benchmark. + Save the benchmark report as a CSV file. + + :param report: The completed benchmark report. + :return: Path to the saved CSV file. """ - if not self.benchmarks: - raise ValueError( - "No benchmarks to print info for. Please set benchmarks first." 
- ) + output_path = self.output_path + if output_path.is_dir(): + output_path = output_path / GenerativeBenchmarkerCSV.DEFAULT_FILE + output_path.parent.mkdir(parents=True, exist_ok=True) - sections = { - "Metadata": (0, 3), - "Requests Made": (4, 6), - "Prompt Tok/Req": (7, 9), - "Output Tok/Req": (10, 12), - "Prompt Tok Total": (13, 15), - "Output Tok Total": (16, 18), - } + with output_path.open("w", newline="") as file: + writer = csv.writer(file) + headers: list[str] = [] + rows: list[list[str | float | list[float]]] = [] + + for benchmark in report.benchmarks: + benchmark_headers: list[str] = [] + benchmark_values: list[str | float | list[float]] = [] + + # Add status-based metrics + for status in StatusDistributionSummary.model_fields: + status_headers, status_values = ( + self._get_benchmark_status_headers_and_values(benchmark, status) + ) + benchmark_headers.extend(status_headers) + benchmark_values.extend(status_values) + + # Add extra fields + extras_headers, extras_values = ( + self._get_benchmark_extras_headers_and_values(benchmark) + ) + benchmark_headers.extend(extras_headers) + benchmark_values.extend(extras_values) + + if not headers: + headers = benchmark_headers + rows.append(benchmark_values) + + writer.writerow(headers) + for row in rows: + writer.writerow(row) + + return output_path + + def _get_benchmark_desc_headers_and_values( + self, benchmark: GenerativeBenchmark + ) -> tuple[list[str], list[str | float]]: + """Get description headers and values for a benchmark.""" headers = [ - "Benchmark", + "Type", + "Run Id", + "Id", + "Name", "Start Time", "End Time", - "Duration (s)", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", + "Duration", ] - rows = [] + values: list[str | float] = [ + benchmark.type_, + benchmark.run_id, + benchmark.id_, + str(benchmark.scheduler.strategy), + datetime.fromtimestamp(benchmark.start_time).strftime("%Y-%m-%d %H:%M:%S"), + datetime.fromtimestamp(benchmark.end_time).strftime("%Y-%m-%d %H:%M:%S"), + benchmark.duration, + ] + return headers, values - for benchmark in self.benchmarks: - rows.append( - [ - strategy_display_str(benchmark.args.strategy), - f"{datetime.fromtimestamp(benchmark.start_time).strftime('%H:%M:%S')}", - f"{datetime.fromtimestamp(benchmark.end_time).strftime('%H:%M:%S')}", - f"{(benchmark.end_time - benchmark.start_time):.1f}", - f"{benchmark.request_totals.successful:.0f}", - f"{benchmark.request_totals.incomplete:.0f}", - f"{benchmark.request_totals.errored:.0f}", - f"{benchmark.metrics.prompt_token_count.successful.mean:.1f}", - f"{benchmark.metrics.prompt_token_count.incomplete.mean:.1f}", - f"{benchmark.metrics.prompt_token_count.errored.mean:.1f}", - f"{benchmark.metrics.output_token_count.successful.mean:.1f}", - f"{benchmark.metrics.output_token_count.incomplete.mean:.1f}", - f"{benchmark.metrics.output_token_count.errored.mean:.1f}", - f"{benchmark.metrics.prompt_token_count.successful.total_sum:.0f}", - f"{benchmark.metrics.prompt_token_count.incomplete.total_sum:.0f}", - f"{benchmark.metrics.prompt_token_count.errored.total_sum:.0f}", - f"{benchmark.metrics.output_token_count.successful.total_sum:.0f}", - f"{benchmark.metrics.output_token_count.incomplete.total_sum:.0f}", - f"{benchmark.metrics.output_token_count.errored.total_sum:.0f}", - ] + def _get_benchmark_status_headers_and_values( + self, benchmark: GenerativeBenchmark, status: str + ) -> tuple[list[str], list[float | list[float]]]: + """Get status-based 
metrics headers and values for a benchmark.""" + headers = [f"{status.capitalize()} Requests"] + values = [getattr(benchmark.request_totals, status)] + + for metric in GenerativeMetrics.model_fields: + metric_headers, metric_values = self._get_benchmark_status_metrics_stats( + benchmark, status, metric ) + headers.extend(metric_headers) + values.extend(metric_values) - self.print_table( - headers=headers, rows=rows, title="Benchmarks Info", sections=sections - ) + return headers, values - def print_benchmarks_stats(self): - """ - Print out the benchmark statistics to the console including the requests per - second, request concurrency, output tokens per second, total tokens per second, - request latency, time to first token, inter token latency, and time per output - token for each benchmark. - """ - if not self.benchmarks: - raise ValueError( - "No benchmarks to print stats for. Please set benchmarks first." - ) + def _get_benchmark_status_metrics_stats( + self, benchmark: GenerativeBenchmark, status: str, metric: str + ) -> tuple[list[str], list[float | list[float]]]: + """Get statistical metrics for a specific status and metric.""" + status_display = status.capitalize() + metric_display = metric.replace("_", " ").capitalize() + status_dist_summary: StatusDistributionSummary = getattr( + benchmark.metrics, metric + ) + dist_summary: DistributionSummary = getattr(status_dist_summary, status) - sections = { - "Metadata": (0, 0), - "Request Stats": (1, 2), - "Out Tok/sec": (3, 3), - "Tot Tok/sec": (4, 4), - "Req Latency (sec)": (5, 7), - "TTFT (ms)": (8, 10), - "ITL (ms)": (11, 13), - "TPOT (ms)": (14, 16), - } headers = [ - "Benchmark", - "Per Second", - "Concurrency", - "mean", - "mean", - "mean", - "median", - "p99", - "mean", - "median", - "p99", - "mean", - "median", - "p99", - "mean", - "median", - "p99", + f"{status_display} {metric_display} mean", + f"{status_display} {metric_display} median", + f"{status_display} {metric_display} std dev", + ( + f"{status_display} {metric_display} " + "[min, 0.1, 1, 5, 10, 25, 75, 90, 95, 99, max]" + ), ] - rows = [] + values: list[float | list[float]] = [ + dist_summary.mean, + dist_summary.median, + dist_summary.std_dev, + [ + dist_summary.min, + dist_summary.percentiles.p001, + dist_summary.percentiles.p01, + dist_summary.percentiles.p05, + dist_summary.percentiles.p10, + dist_summary.percentiles.p25, + dist_summary.percentiles.p75, + dist_summary.percentiles.p90, + dist_summary.percentiles.p95, + dist_summary.percentiles.p99, + dist_summary.max, + ], + ] + return headers, values - for benchmark in self.benchmarks: - rows.append( - [ - strategy_display_str(benchmark.args.strategy), - f"{benchmark.metrics.requests_per_second.successful.mean:.2f}", - f"{benchmark.metrics.request_concurrency.successful.mean:.2f}", - f"{benchmark.metrics.output_tokens_per_second.successful.mean:.1f}", - f"{benchmark.metrics.tokens_per_second.successful.mean:.1f}", - f"{benchmark.metrics.request_latency.successful.mean:.2f}", - f"{benchmark.metrics.request_latency.successful.median:.2f}", - f"{benchmark.metrics.request_latency.successful.percentiles.p99:.2f}", - f"{benchmark.metrics.time_to_first_token_ms.successful.mean:.1f}", - f"{benchmark.metrics.time_to_first_token_ms.successful.median:.1f}", - f"{benchmark.metrics.time_to_first_token_ms.successful.percentiles.p99:.1f}", - f"{benchmark.metrics.inter_token_latency_ms.successful.mean:.1f}", - f"{benchmark.metrics.inter_token_latency_ms.successful.median:.1f}", - 
f"{benchmark.metrics.inter_token_latency_ms.successful.percentiles.p99:.1f}", - f"{benchmark.metrics.time_per_output_token_ms.successful.mean:.1f}", - f"{benchmark.metrics.time_per_output_token_ms.successful.median:.1f}", - f"{benchmark.metrics.time_per_output_token_ms.successful.percentiles.p99:.1f}", - ] + +@GenerativeBenchmarkerOutput.register("html") +class GenerativeBenchmarkerHTML(GenerativeBenchmarkerOutput): + """HTML output formatter for benchmark results.""" + + DEFAULT_FILE: ClassVar[str] = "benchmarks.html" + + @classmethod + def validated_kwargs( + cls, output_path: str | Path | None, **_kwargs + ) -> dict[str, Any]: + new_kwargs = {} + if output_path is not None: + new_kwargs["output_path"] = ( + Path(output_path) if not isinstance(output_path, Path) else output_path ) + return new_kwargs - self.print_table( - headers=headers, - rows=rows, - title="Benchmarks Stats", - sections=sections, - ) + output_path: Path = Field(default_factory=lambda: Path.cwd()) - def print_full_report(self): + async def finalize(self, report: GenerativeBenchmarksReport) -> Path: """ - Print out the benchmark statistics to the console. - Temporarily enables the console if it's disabled. + Save the benchmark report as an HTML file. - Format: - - Metadata - - Info - - Stats + :param report: The completed benchmark report. + :return: Path to the saved HTML file. """ - orig_enabled = self.enabled - self.enabled = True - self.print_benchmarks_metadata() - self.print_benchmarks_info() - self.print_benchmarks_stats() - self.enabled = orig_enabled + import humps + + output_path = self.output_path + if output_path.is_dir(): + output_path = output_path / GenerativeBenchmarkerHTML.DEFAULT_FILE + output_path.parent.mkdir(parents=True, exist_ok=True) + + data_builder = UIDataBuilder(report.benchmarks) + data = data_builder.to_dict() + camel_data = humps.camelize(data) + + ui_api_data = {} + for key, value in camel_data.items(): + placeholder_key = f"window.{humps.decamelize(key)} = {{}};" + replacement_value = ( + f"window.{humps.decamelize(key)} = {json.dumps(value, indent=2)};\n" + ) + ui_api_data[placeholder_key] = replacement_value + + create_report(ui_api_data, output_path) + + return output_path diff --git a/src/guidellm/benchmark/profile.py b/src/guidellm/benchmark/profile.py index ca25fc24..042179ba 100644 --- a/src/guidellm/benchmark/profile.py +++ b/src/guidellm/benchmark/profile.py @@ -1,20 +1,52 @@ -from collections.abc import Sequence -from typing import Literal, Optional, Union +""" +Benchmarking profile configurations for coordinating multi-strategy execution. + +Provides configurable profile abstractions for orchestrating sequential and +parallel execution of different scheduling strategies during benchmarking, +with automatic strategy generation and constraint management. + +Classes: + Profile: Abstract base for multi-strategy benchmarking profiles. + SynchronousProfile: Single synchronous strategy execution profile. + ConcurrentProfile: Fixed-concurrency strategy execution profile. + ThroughputProfile: Maximum throughput strategy execution profile. + AsyncProfile: Rate-based asynchronous strategy execution profile. + SweepProfile: Adaptive multi-strategy sweep execution profile. + +Type Aliases: + ProfileType: Literal type for supported profile configurations. 
+""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections.abc import Generator +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Literal, +) import numpy as np -from pydantic import Field, computed_field +from pydantic import Field, computed_field, field_serializer, field_validator -from guidellm.objects import StandardBaseModel from guidellm.scheduler import ( AsyncConstantStrategy, AsyncPoissonStrategy, ConcurrentStrategy, + Constraint, + ConstraintInitializer, + ConstraintsInitializerFactory, SchedulingStrategy, StrategyType, SynchronousStrategy, ThroughputStrategy, ) -from guidellm.settings import settings +from guidellm.utils import PydanticClassRegistryMixin + +if TYPE_CHECKING: + from guidellm.benchmark.objects import Benchmark __all__ = [ "AsyncProfile", @@ -24,386 +56,661 @@ "SweepProfile", "SynchronousProfile", "ThroughputProfile", - "create_profile", ] ProfileType = Literal["synchronous", "concurrent", "throughput", "async", "sweep"] -class Profile(StandardBaseModel): +class Profile( + PydanticClassRegistryMixin["type[Profile]"], + ABC, +): + """ + Abstract base for multi-strategy benchmarking execution profiles. + + Coordinates sequential execution of scheduling strategies with automatic + strategy generation, constraint management, and completion tracking for + comprehensive benchmarking workflows. + """ + + schema_discriminator: ClassVar[str] = "type_" + + @classmethod + def __pydantic_schema_base_type__(cls) -> type[Profile]: + if cls.__name__ == "Profile": + return cls + + return Profile + + @classmethod + def create( + cls, + rate_type: str, + rate: float | int | list[float | int] | None, + random_seed: int = 42, + **kwargs: Any, + ) -> Profile: + """ + Create a profile instance based on the specified type. + + :param rate_type: The type of profile to create. + :param rate: Rate parameter for profile configuration. + :param random_seed: Random seed for stochastic strategies. + :param kwargs: Additional arguments for profile configuration. + :return: Configured profile instance for the specified type. + :raises ValueError: If the profile type is not registered. + """ + profile_class: type[Profile] = cls.get_registered_object(rate_type) + resolved_kwargs = profile_class.resolve_args( + rate_type=rate_type, rate=rate, random_seed=random_seed, **kwargs + ) + + return profile_class(**resolved_kwargs) + + @classmethod + @abstractmethod + def resolve_args( + cls, + rate_type: str, + rate: float | int | list[float, int] | None, + random_seed: int, + **kwargs: Any, + ) -> dict[str, Any]: + """ + Resolve and validate arguments for profile construction. + + :param rate_type: The type of the profile. + :param rate: Rate parameter for configuration. + :param random_seed: Random seed for stochastic strategies. + :param kwargs: Additional arguments to resolve. + :return: Dictionary of resolved arguments for profile construction. + """ + ... 
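    # --- Illustrative usage sketch (editorial note, not part of this patch) ---
    # The registry hooks above define the intended flow: ``Profile.create`` looks up
    # a registered subclass by ``rate_type`` and feeds its ``resolve_args`` output to
    # the constructor, while ``strategies_generator`` (defined below) drives execution
    # by yielding (strategy, constraints) pairs and receiving each completed Benchmark
    # back via ``send``. A caller could therefore look roughly like the following;
    # ``run_benchmark`` is a hypothetical helper standing in for the benchmarker's
    # execution step and is not part of this diff:
    #
    #     profile = Profile.create(rate_type="concurrent", rate=[2, 4, 8])
    #     generator = profile.strategies_generator()
    #     try:
    #         strategy, constraints = next(generator)
    #         while True:
    #             benchmark = run_benchmark(strategy, constraints)  # hypothetical
    #             strategy, constraints = generator.send(benchmark)
    #     except StopIteration:
    #         pass  # all strategies for this profile have completed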
+ type_: Literal["profile"] = Field( - description="The type of benchmarking profile to use.", + description="The type of benchmarking profile to use", ) - completed_strategies: int = Field( - default=0, - description="The number of scheduling strategies generated so far.", - ) - measured_rates: list[float] = Field( + completed_strategies: list[SchedulingStrategy] = Field( default_factory=list, - description=("The average rates measured for the strategies that have run."), + description="The strategies that have completed execution", ) - measured_concurrencies: list[float] = Field( - default_factory=list, - description=( - "The average concurrency measured for the strategies that have run." - ), + constraints: dict[str, Any | dict[str, Any] | ConstraintInitializer] | None = Field( + default=None, + description="Runtime constraints to apply during strategy execution", ) - def completed_strategy(self, average_rate: float, average_concurrency: float): - self.measured_rates.append(average_rate) - self.measured_concurrencies.append(average_concurrency) - self.completed_strategies += 1 - @computed_field # type: ignore[misc] @property def strategy_types(self) -> list[StrategyType]: - return [] + """ + :return: List of all strategy types expected to be executed or have been + executed in this profile. By default, this returns just the + completed strategies. + """ + return [strat.type_ for strat in self.completed_strategies] + + def strategies_generator( + self, + ) -> Generator[ + tuple[ + SchedulingStrategy | None, + dict[str, Any | dict[str, Any] | Constraint] | None, + ], + Benchmark | None, + None, + ]: + """ + Generate strategies and constraints for sequential profile execution. + + :return: Generator yielding (strategy, constraints) tuples and + receiving benchmark results from each execution. + """ + prev_strategy: SchedulingStrategy | None = None + prev_benchmark: Benchmark | None = None + + while ( + strategy := self.next_strategy(prev_strategy, prev_benchmark) + ) is not None: + constraints = self.next_strategy_constraints( + strategy, prev_strategy, prev_benchmark + ) + prev_benchmark = yield ( + strategy, + constraints, + ) + prev_strategy = strategy + self.completed_strategies.append(prev_strategy) + + @abstractmethod + def next_strategy( + self, + prev_strategy: SchedulingStrategy | None, + prev_benchmark: Benchmark | None, + ) -> SchedulingStrategy | None: + """ + Generate the next strategy to execute in the profile sequence. + + :param prev_strategy: The previously completed strategy. + :param prev_benchmark: Benchmark results from the previous strategy. + :return: Next strategy to execute, or None if profile is complete. + """ + ... + + def next_strategy_constraints( + self, + next_strategy: SchedulingStrategy | None, + prev_strategy: SchedulingStrategy | None, + prev_benchmark: Benchmark | None, + ) -> dict[str, Any | dict[str, Any] | Constraint] | None: + """ + Generate constraints for the next strategy execution. + + :param next_strategy: The next strategy to be executed. + :param prev_strategy: The previously completed strategy. + :param prev_benchmark: Benchmark results from the previous strategy. + :return: Constraints dictionary for the next strategy, or None. 
+ """ + _ = (prev_strategy, prev_benchmark) # unused + return ( + ConstraintsInitializerFactory.resolve(self.constraints) + if next_strategy and self.constraints + else None + ) - def next_strategy(self) -> Optional[SchedulingStrategy]: - return None + @field_validator("constraints", mode="before") + @classmethod + def _constraints_validator( + cls, value: Any + ) -> dict[str, Any | dict[str, Any] | ConstraintInitializer] | None: + if value is None: + return None + if not isinstance(value, dict): + raise ValueError("Constraints must be a dictionary") + return { + key: ( + val + if not isinstance(val, ConstraintInitializer) + else ConstraintsInitializerFactory.deserialize(initializer_dict=val) + ) + for key, val in value.items() + } + + @field_serializer + def _constraints_serializer( + self, + constraints: dict[str, Any | dict[str, Any] | ConstraintInitializer] | None, + ) -> dict[str, Any | dict[str, Any]] | None: + if constraints is None: + return None + + return { + key: ( + val + if not isinstance(val, ConstraintInitializer) + else ConstraintsInitializerFactory.serialize(initializer=val) + ) + for key, val in constraints.items() + } + + +@Profile.register("synchronous") class SynchronousProfile(Profile): + """Single synchronous strategy execution profile.""" + type_: Literal["synchronous"] = "synchronous" # type: ignore[assignment] + @classmethod + def resolve_args( + cls, + rate_type: str, + rate: float | int | list[float, int] | None, + random_seed: int, + **kwargs: Any, + ) -> dict[str, Any]: + """ + Resolve arguments for synchronous profile construction. + + :param rate_type: The type/strategy of the profile (ignored). + :param rate: Rate parameter (must be None, will be stripped). + :param random_seed: Random seed (ignored and stripped). + :param kwargs: Additional arguments to pass through. + :return: Dictionary of resolved arguments. + :raises ValueError: If rate is not None. + """ + _ = (rate_type, random_seed) # unused + if rate is not None: + raise ValueError("SynchronousProfile does not accept a rate parameter") + + return kwargs + @property def strategy_types(self) -> list[StrategyType]: + """ + :return: The single synchronous strategy type. + """ return [self.type_] - def next_strategy(self) -> Optional[SchedulingStrategy]: - if self.completed_strategies >= 1: + def next_strategy( + self, + prev_strategy: SchedulingStrategy | None, + prev_benchmark: Benchmark | None, + ) -> SynchronousStrategy | None: + """ + Generate synchronous strategy or None if already completed. + + :param prev_strategy: The previously completed strategy (unused). + :param prev_benchmark: Benchmark results from the previous strategy (unused). + :return: SynchronousStrategy for the first execution, None afterward. + """ + _ = (prev_strategy, prev_benchmark) # unused + if len(self.completed_strategies) >= 1: return None return SynchronousStrategy() - @staticmethod - def from_standard_args( - rate_type: Union[StrategyType, ProfileType], - rate: Optional[Union[float, Sequence[float]]], - **kwargs, - ) -> "SynchronousProfile": - if rate_type != "synchronous": - raise ValueError("Rate type must be 'synchronous' for synchronous profile.") - - if rate is not None: - raise ValueError( - "Rate does not apply to synchronous profile, it must be set to None." - ) - - if kwargs: - raise ValueError( - "No additional arguments are allowed for synchronous profile." 
- ) - - return SynchronousProfile() - +@Profile.register("concurrent") class ConcurrentProfile(Profile): + """Fixed-concurrency strategy execution profile with configurable stream counts.""" + type_: Literal["concurrent"] = "concurrent" # type: ignore[assignment] - streams: Union[int, Sequence[int]] = Field( - description="The number of concurrent streams to use.", + streams: int | list[int] = Field( + description="Number of concurrent streams for request scheduling", + gt=0, + ) + startup_duration: float = Field( + default=0.0, + description=( + "Duration in seconds for distributing startup requests " + "before completion-based timing" + ), + ge=0, ) + @classmethod + def resolve_args( + cls, + rate_type: str, + rate: float | int | list[float, int] | None, + random_seed: int, + **kwargs: Any, + ) -> dict[str, Any]: + """ + Resolve arguments for concurrent profile construction. + + :param rate_type: The type/strategy of the profile (ignored). + :param rate: Rate parameter, remapped to streams. + :param random_seed: Random seed (ignored and stripped). + :param kwargs: Additional arguments to pass through. + :return: Dictionary of resolved arguments. + :raises ValueError: If rate is None. + """ + _ = (rate_type, random_seed) # unused + kwargs["streams"] = rate + return kwargs + @property def strategy_types(self) -> list[StrategyType]: - num_strategies = len(self.streams) if isinstance(self.streams, Sequence) else 1 - + """Get concurrent strategy types for each configured stream count.""" + num_strategies = len(self.streams) if isinstance(self.streams, list) else 1 return [self.type_] * num_strategies - def next_strategy(self) -> Optional[SchedulingStrategy]: - streams = self.streams if isinstance(self.streams, Sequence) else [self.streams] - - if self.completed_strategies >= len(streams): + def next_strategy( + self, + prev_strategy: SchedulingStrategy | None, + prev_benchmark: Benchmark | None, + ) -> ConcurrentStrategy | None: + """ + Generate concurrent strategy for the next stream count. + + :param prev_strategy: The previously completed strategy (unused). + :param prev_benchmark: Benchmark results from the previous strategy (unused). + :return: ConcurrentStrategy with next stream count, or None if complete. + """ + _ = (prev_strategy, prev_benchmark) # unused + streams = self.streams if isinstance(self.streams, list) else [self.streams] + + if len(self.completed_strategies) >= len(streams): return None return ConcurrentStrategy( - streams=streams[self.completed_strategies], + streams=streams[len(self.completed_strategies)], + startup_duration=self.startup_duration, ) - @staticmethod - def from_standard_args( - rate_type: Union[StrategyType, ProfileType], - rate: Optional[Union[float, Sequence[float]]], - **kwargs, - ) -> "ConcurrentProfile": - if rate_type != "concurrent": - raise ValueError("Rate type must be 'concurrent' for concurrent profile.") - - if not rate: - raise ValueError("Rate (streams) must be provided for concurrent profile.") - - if not isinstance(rate, Sequence): - rate = [rate] - - if not all(stream.is_integer() and stream > 0 for stream in rate): - raise ValueError( - f"All rate values (streams) must be positive integers, received {rate}" - ) - - if kwargs: - raise ValueError( - "No additional arguments are allowed for concurrent profile." - ) - - return ConcurrentProfile(streams=[int(rat) for rat in rate]) - +@Profile.register("throughput") class ThroughputProfile(Profile): + """ + Maximum throughput strategy execution profile with optional concurrency limits. 
+ """ + type_: Literal["throughput"] = "throughput" # type: ignore[assignment] - max_concurrency: Optional[int] = Field( + max_concurrency: int | None = Field( default=None, - description="The maximum number of concurrent requests that can be scheduled.", + description="Maximum number of concurrent requests to schedule", + gt=0, + ) + startup_duration: float = Field( + default=0.0, + description=( + "Duration in seconds for distributing startup requests " + "before full throughput scheduling" + ), + ge=0, ) + @classmethod + def resolve_args( + cls, + rate_type: str, + rate: float | int | list[float, int] | None, + random_seed: int, + **kwargs: Any, + ) -> dict[str, Any]: + """ + Resolve arguments for throughput profile construction. + + :param rate_type: The type/strategy of the profile (ignored). + :param rate: Rate parameter to remap to max_concurrency. + :param random_seed: Random seed (ignored and stripped). + :param kwargs: Additional arguments to pass through. + :return: Dictionary of resolved arguments. + """ + _ = (rate_type, random_seed) # unused + # Remap rate to max_concurrency, strip out random_seed + kwargs.pop("random_seed", None) + if rate is not None: + kwargs["max_concurrency"] = rate + return kwargs + @property def strategy_types(self) -> list[StrategyType]: + """Get the single throughput strategy type.""" return [self.type_] - def next_strategy(self) -> Optional[SchedulingStrategy]: - if self.completed_strategies >= 1: + def next_strategy( + self, + prev_strategy: SchedulingStrategy | None, + prev_benchmark: Benchmark | None, + ) -> ThroughputStrategy | None: + """ + Generate throughput strategy or None if already completed. + + :param prev_strategy: The previously completed strategy (unused). + :param prev_benchmark: Benchmark results from the previous strategy (unused). + :return: ThroughputStrategy for the first execution, None afterward. + """ + _ = (prev_strategy, prev_benchmark) # unused + if len(self.completed_strategies) >= 1: return None return ThroughputStrategy( max_concurrency=self.max_concurrency, + startup_duration=self.startup_duration, ) - @staticmethod - def from_standard_args( - rate_type: Union[StrategyType, ProfileType], - rate: Optional[Union[float, Sequence[float]]], - **kwargs, - ) -> "ThroughputProfile": - if rate_type != "throughput": - raise ValueError("Rate type must be 'throughput' for throughput profile.") - - if rate is not None: - raise ValueError( - "Rate does not apply to throughput profile, it must be set to None." - ) - return ThroughputProfile(**kwargs) +@Profile.register(["async", "constant", "poisson"]) +class AsyncProfile(Profile): + """ + Rate-based asynchronous strategy execution profile with configurable patterns. + """ - -class AsyncProfile(ThroughputProfile): - type_: Literal["async"] = "async" # type: ignore[assignment] + type_: Literal["async", "constant", "poisson"] = "async" # type: ignore[assignment] strategy_type: Literal["constant", "poisson"] = Field( - description="The type of asynchronous strategy to use.", + description="Type of asynchronous strategy pattern to use", ) - rate: Union[float, Sequence[float]] = Field( - description="The rate of requests per second to use.", + rate: float | list[float] = Field( + description="Request scheduling rate in requests per second", + gt=0, ) - initial_burst: bool = Field( - default=True, + startup_duration: float = Field( + default=0.0, description=( - "True to send an initial burst of requests (math.floor(self.rate)) " - "to reach target rate. 
False to not send an initial burst." + "Duration in seconds for distributing startup requests " + "to converge quickly to desired rate" ), + ge=0, + ) + max_concurrency: int | None = Field( + default=None, + description="Maximum number of concurrent requests to schedule", + gt=0, ) random_seed: int = Field( default=42, - description=( - "The random seed to use for the asynchronous strategy. " - "This is used to generate random numbers for the Poisson strategy." - ), + description="Random seed for Poisson distribution strategy", ) + @classmethod + def resolve_args( + cls, + rate_type: str, + rate: float | int | list[float, int] | None, + random_seed: int, + **kwargs: Any, + ) -> dict[str, Any]: + """ + Resolve arguments for async profile construction. + + :param rate_type: The type/strategy of the profile. + :param rate: Rate parameter for the profile. + :param random_seed: Random seed for stochastic strategies. + :param kwargs: Additional arguments to pass through. + :return: Dictionary of resolved arguments. + :raises ValueError: If rate is None. + """ + if rate is None: + raise ValueError("AsyncProfile requires a rate parameter") + + kwargs["type_"] = ( + rate_type + if rate_type in ["async", "constant", "poisson"] + else kwargs.get("type_", "async") + ) + kwargs["strategy_type"] = ( + rate_type + if rate_type in ["constant", "poisson"] + else kwargs.get("strategy_type", "constant") + ) + kwargs["rate"] = rate + kwargs["random_seed"] = random_seed + return kwargs + @property def strategy_types(self) -> list[StrategyType]: - num_strategies = len(self.rate) if isinstance(self.rate, Sequence) else 1 - + """Get async strategy types for each configured rate.""" + num_strategies = len(self.rate) if isinstance(self.rate, list) else 1 return [self.strategy_type] * num_strategies - def next_strategy(self) -> Optional[SchedulingStrategy]: - rate = self.rate if isinstance(self.rate, Sequence) else [self.rate] - - if self.completed_strategies >= len(rate): + def next_strategy( + self, + prev_strategy: SchedulingStrategy | None, + prev_benchmark: Benchmark | None, + ) -> AsyncConstantStrategy | AsyncPoissonStrategy | None: + """ + Generate async strategy for the next configured rate. + + :param prev_strategy: The previously completed strategy (unused). + :param prev_benchmark: Benchmark results from the previous strategy (unused). + :return: AsyncConstantStrategy or AsyncPoissonStrategy for next rate, + or None if all rates completed. + :raises ValueError: If strategy_type is neither 'constant' nor 'poisson'. 
+ """ + _ = (prev_strategy, prev_benchmark) # unused + rate = self.rate if isinstance(self.rate, list) else [self.rate] + + if len(self.completed_strategies) >= len(rate): return None + current_rate = rate[len(self.completed_strategies)] + if self.strategy_type == "constant": return AsyncConstantStrategy( - rate=rate[self.completed_strategies], - initial_burst=self.initial_burst, + rate=current_rate, + startup_duration=self.startup_duration, max_concurrency=self.max_concurrency, ) elif self.strategy_type == "poisson": return AsyncPoissonStrategy( - rate=rate[self.completed_strategies], - initial_burst=self.initial_burst, + rate=current_rate, + startup_duration=self.startup_duration, max_concurrency=self.max_concurrency, random_seed=self.random_seed, ) else: raise ValueError(f"Invalid strategy type: {self.strategy_type}") - @staticmethod - def from_standard_args( # type: ignore[override] - rate_type: Union[StrategyType, ProfileType], - rate: Optional[Union[float, Sequence[float]]], - random_seed: int, - **kwargs, - ) -> "AsyncProfile": - if rate_type not in ("async", "constant", "poisson"): - raise ValueError( - "Rate type must be in ('async', 'constant', 'poisson') " - f"for async profile. Received: {rate_type}" - ) - - if not rate: - raise ValueError("Rate must be provided for async profile.") - - if not isinstance(rate, Sequence): - rate = [rate] - - if not all(isinstance(r, (float, int)) and r > 0 for r in rate): - raise ValueError( - f"All rate values must be positive numbers, received {rate}" - ) - - if rate_type == "async": - rate_type = "constant" # default to constant if not specified - return AsyncProfile( - strategy_type=rate_type, # type: ignore[arg-type] - rate=rate, - random_seed=random_seed, - **kwargs, - ) +@Profile.register("sweep") +class SweepProfile(Profile): + """ + Adaptive multi-strategy sweep execution profile with rate discovery. + """ - -class SweepProfile(AsyncProfile): type_: Literal["sweep"] = "sweep" # type: ignore[assignment] sweep_size: int = Field( - description="The number of strategies to generate for the sweep.", + description="Number of strategies to generate for the sweep", + ge=2, + ) + strategy_type: Literal["constant", "poisson"] = "constant" + startup_duration: float = Field( + default=0.0, + description=( + "Duration in seconds for distributing startup requests " + "to converge quickly to desired rate" + ), + ge=0, + ) + max_concurrency: int | None = Field( + default=None, + description="Maximum number of concurrent requests to schedule", + gt=0, ) - rate: float = -1 - rate_type: Literal["constant", "poisson"] = "constant" + random_seed: int = Field( + default=42, + description="Random seed for Poisson distribution strategy", + ) + synchronous_rate: float = Field( + default=-1.0, + description="Measured rate from synchronous strategy execution", + ) + throughput_rate: float = Field( + default=-1.0, + description="Measured rate from throughput strategy execution", + ) + async_rates: list[float] = Field( + default_factory=list, + description="Generated rates for async strategy sweep", + ) + measured_rates: list[float] = Field( + default_factory=list, + description="Calculated interpolated rates between synchronous and throughput", + ) + + @classmethod + def resolve_args( + cls, + rate_type: str, + rate: float | int | list[float, int] | None, + random_seed: int, + **kwargs: Any, + ) -> dict[str, Any]: + """ + Resolve arguments for sweep profile construction. + + :param rate_type: The type/strategy for async strategies in the sweep. 
+ :param rate: Rate parameter (ignored for sweep). + :param random_seed: Random seed for stochastic strategies. + :param kwargs: Additional arguments to pass through. + :return: Dictionary of resolved arguments. + """ + kwargs["sweep_size"] = kwargs.get("sweep_size", rate) + kwargs["random_seed"] = random_seed + if rate_type in ["constant", "poisson"]: + kwargs["strategy_type"] = rate_type + return kwargs @property def strategy_types(self) -> list[StrategyType]: - return ( - ["synchronous"] + ["throughput"] + [self.rate_type] * (self.sweep_size - 2) # type: ignore[return-value] - ) - - def next_strategy(self) -> Optional[SchedulingStrategy]: - if self.completed_strategies >= self.sweep_size: - return None - - if self.completed_strategies == 0: + """Get strategy types for the complete sweep sequence.""" + types = ["synchronous", "throughput"] + types += [self.strategy_type] * (self.sweep_size - len(types)) + return types + + def next_strategy( + self, + prev_strategy: SchedulingStrategy | None, + prev_benchmark: Benchmark | None, + ) -> ( + AsyncConstantStrategy + | AsyncPoissonStrategy + | SynchronousProfile + | ThroughputProfile + | None + ): + """ + Generate the next strategy in the adaptive sweep sequence. + + Executes synchronous and throughput strategies first to measure + baseline rates, then generates interpolated rates for async strategies. + + :param prev_strategy: The previously completed strategy. + :param prev_benchmark: Benchmark results from the previous strategy. + :return: Next strategy in sweep sequence, or None if complete. + :raises ValueError: If strategy_type is neither 'constant' nor 'poisson'. + """ + if prev_strategy is None: return SynchronousStrategy() - if self.completed_strategies == 1: + if prev_strategy.type_ == "synchronous": + self.synchronous_rate = ( + prev_benchmark.metrics.requests_per_second.successful.mean + ) + return ThroughputStrategy( max_concurrency=self.max_concurrency, + startup_duration=self.startup_duration, ) - min_rate = self.measured_rates[0] - max_rate = self.measured_rates[1] - rates = np.linspace(min_rate, max_rate, self.sweep_size - 1)[1:] + if prev_strategy.type_ == "throughput": + self.throughput_rate = ( + prev_benchmark.metrics.requests_per_second.successful.mean + ) + self.measured_rates = list( + np.linspace( + self.synchronous_rate, + self.throughput_rate, + self.sweep_size - 1, + ) + )[1:] # don't rerun synchronous - if self.rate_type == "constant": + if len(self.completed_strategies) >= self.sweep_size: + return None + + next_rate_index = len( + [ + strat + for strat in self.completed_strategies + if strat.type_ == self.strategy_type + ] + ) + + if self.strategy_type == "constant": return AsyncConstantStrategy( - rate=rates[self.completed_strategies - 2], - initial_burst=self.initial_burst, + rate=self.measured_rates[next_rate_index], + startup_duration=self.startup_duration, max_concurrency=self.max_concurrency, ) - elif self.rate_type == "poisson": + elif self.strategy_type == "poisson": return AsyncPoissonStrategy( - rate=rates[self.completed_strategies - 2], - initial_burst=self.initial_burst, + rate=self.measured_rates[next_rate_index], + startup_duration=self.startup_duration, max_concurrency=self.max_concurrency, + random_seed=self.random_seed, ) else: - raise ValueError(f"Invalid strategy type: {self.rate_type}") - - @staticmethod - def from_standard_args( # type: ignore[override] - rate_type: Union[StrategyType, ProfileType], - rate: Optional[Union[float, Sequence[float]]], - random_seed: int, - **kwargs, - ) -> 
"SweepProfile": - if rate_type != "sweep": - raise ValueError("Rate type must be 'sweep' for sweep profile.") - - if "sweep_size" in kwargs: - raise ValueError("Sweep size must not be provided, use rate instead.") - - if isinstance(rate, Sequence): - if len(rate) != 1: - raise ValueError( - "Rate must be a single value for sweep profile, received " - f"{len(rate)} values." - ) - rate = rate[0] - - if not rate: - rate = settings.default_sweep_number - - if not rate: - raise ValueError( - "Rate (sweep_size) must be provided for concurrent profile." - ) - - if ( - not isinstance(rate, (int, float)) - or (isinstance(rate, float) and not rate.is_integer()) - or rate <= 1 - ): - raise ValueError( - f"Rate (sweep_size) must be a positive integer > 1, received {rate} " - f"with type {type(rate)}" - ) - - if not kwargs: - kwargs = {} - - if "strategy_type" not in kwargs: - kwargs["strategy_type"] = "constant" - - return SweepProfile(sweep_size=int(rate), random_seed=random_seed, **kwargs) - - -def create_profile( - rate_type: Union[StrategyType, ProfileType], - rate: Optional[Union[float, Sequence[float]]], - random_seed: int = 42, - **kwargs, -) -> "Profile": - if rate_type == "synchronous": - return SynchronousProfile.from_standard_args( - rate_type=rate_type, - rate=rate, - **kwargs, - ) - - if rate_type == "concurrent": - return ConcurrentProfile.from_standard_args( - rate_type=rate_type, - rate=rate, - **kwargs, - ) - - if rate_type == "throughput": - return ThroughputProfile.from_standard_args( - rate_type=rate_type, - rate=rate, - **kwargs, - ) - - if rate_type in ("async", "constant", "poisson"): - return AsyncProfile.from_standard_args( - rate_type=rate_type, - rate=rate, - random_seed=random_seed, - **kwargs, - ) - - if rate_type == "sweep": - return SweepProfile.from_standard_args( - rate_type=rate_type, - rate=rate, - random_seed=random_seed, - **kwargs, - ) - - raise ValueError(f"Invalid profile type: {rate_type}") + raise ValueError(f"Invalid strategy type: {self.strategy_type}") diff --git a/src/guidellm/benchmark/progress.py b/src/guidellm/benchmark/progress.py index 1232107b..f93b3a83 100644 --- a/src/guidellm/benchmark/progress.py +++ b/src/guidellm/benchmark/progress.py @@ -1,8 +1,27 @@ -import math -import time +""" +Benchmark progress tracking and console display abstractions. + +Provides progress tracking interfaces and implementations for monitoring benchmark +execution, displaying real-time statistics, and managing UI updates during +generative benchmarking operations. + +Classes: + BenchmarkerProgress: Abstract base for benchmark progress tracking. + BenchmarkerProgressGroup: Composite progress handler for multiple instances. + GenerativeConsoleBenchmarkerProgress: Console-based progress display. + +Type Variables: + BenchmarkT: Generic benchmark object type. 
+""" + +from __future__ import annotations + +import asyncio +from abc import ABC, abstractmethod +from collections.abc import AsyncIterable, AsyncIterator, Iterable from dataclasses import dataclass from datetime import datetime -from typing import Generic, Optional, TypeVar, Union +from typing import Any, Generic, Literal from rich.console import Group from rich.live import Live @@ -10,7 +29,6 @@ from rich.progress import ( BarColumn, Progress, - ProgressColumn, SpinnerColumn, TaskID, TaskProgressColumn, @@ -19,145 +37,631 @@ TimeRemainingColumn, ) -from guidellm.benchmark.aggregator import ( - BenchmarkAggregator, - GenerativeBenchmarkAggregator, -) -from guidellm.benchmark.benchmark import Benchmark, GenerativeBenchmark -from guidellm.benchmark.benchmarker import BenchmarkerResult +from guidellm.benchmark.aggregator import AggregatorState +from guidellm.benchmark.objects import BenchmarkT, GenerativeBenchmark +from guidellm.benchmark.profile import Profile from guidellm.scheduler import ( + SchedulerState, SchedulingStrategy, StrategyType, - strategy_display_str, ) -from guidellm.utils import Colors +from guidellm.utils import Colors, format_value_display __all__ = [ - "BenchmarkerProgressDisplay", - "BenchmarkerTaskProgressState", - "GenerativeTextBenchmarkerProgressDisplay", - "GenerativeTextBenchmarkerTaskProgressState", + "BenchmarkerProgress", + "BenchmarkerProgressGroup", + "GenerativeConsoleBenchmarkerProgress", ] -@dataclass -class BenchmarkerTaskProgressState: - display_scheduler_stats: bool - - task_id: TaskID - strategy: Union[StrategyType, SchedulingStrategy] - started: bool = False - compiling: bool = False - ended: bool = False - - start_time: Optional[float] = None - max_number: Optional[float] = None - max_duration: Optional[float] = None - in_warmup: bool = False - in_cooldown: bool = False - - requests_rate: float = 0 - request_latency: float = 0 - requests_processing: float = 0 - requests_successful: float = 0 - requests_incomplete: float = 0 - requests_errored: float = 0 +class BenchmarkerProgress(Generic[BenchmarkT], ABC): + """ + Abstract base class for tracking and displaying benchmark progress. + + Provides lifecycle hooks for monitoring benchmark execution stages including + initialization, start, updates, completion, and finalization. Supports + enable/disable functionality for conditional progress tracking. + """ + + def __init__(self, enabled: bool = True): + """ + Initialize progress tracker. - worker_overheads_time_ms: float = 0.0 - backend_overheads_time_ms: float = 0.0 - requests_sleep_time_ms: float = 0.0 - requests_targeted_start_time_delay_ms: float = 0.0 + :param enabled: Whether to enable progress tracking and display. + """ + self._enabled = enabled + self.profile: Profile = None + self.current_strategy: SchedulingStrategy = None @property - def description(self) -> str: - return strategy_display_str(self.strategy) + def enabled(self) -> bool: + """ + :return: Whether progress tracking is currently enabled. + """ + return self._enabled + + @enabled.setter + def enabled(self, value: bool) -> None: + """ + :param value: True to enable progress tracking, False to disable. + :raises RuntimeError: If called after progress run has started. 
+ """ + if self.profile is not None: + raise RuntimeError( + "Cannot change enabled state after __call__ for progress run" + ) + + self._enabled = value + + def __call__( + self, + profile: Profile, + agen: AsyncIterable[ + tuple[ + AggregatorState | None, + BenchmarkT | None, + SchedulingStrategy, + SchedulerState | None, + ] + ], + ) -> AsyncIterator[ + tuple[ + AggregatorState | None, + BenchmarkT | None, + SchedulingStrategy, + SchedulerState | None, + ] + ]: + """ + Track progress through benchmark execution pipeline. + + Wraps the provided async generator to monitor benchmark progress, + calling appropriate lifecycle hooks based on execution state. + + :param profile: Benchmark profile configuration. + :param agen: Async generator yielding benchmark execution updates. + :return: Async iterator forwarding original updates with progress tracking. + """ + + async def aiterator() -> AsyncIterator[ + tuple[ + AggregatorState | None, + BenchmarkT | None, + SchedulingStrategy, + SchedulerState | None, + ] + ]: + self.profile = profile + if self.enabled: + await self.on_initialize(profile) + + async for aggregator_update, benchmark, strategy, scheduler_state in agen: + if self.enabled: + await self.on_raw_update( + profile, + aggregator_update, + benchmark, + strategy, + scheduler_state, + ) + + if self.current_strategy != strategy: + self.current_strategy = strategy + await self.on_benchmark_start(strategy) + elif benchmark is not None: + await self.on_benchmark_complete(benchmark) + self.current_strategy = None + else: + await self.on_benchmark_update( + aggregator_update, scheduler_state + ) + + yield aggregator_update, benchmark, strategy, scheduler_state + + if self.enabled: + await self.on_finalize() + + return aiterator() + + @abstractmethod + async def on_initialize(self, profile: Profile): + """ + Initialize progress tracking for benchmark profile. + + :param profile: Benchmark profile configuration. + """ + + @abstractmethod + async def on_benchmark_start(self, strategy: SchedulingStrategy): + """ + Handle start of new benchmark strategy execution. + + :param strategy: Scheduling strategy being executed. + """ + + @abstractmethod + async def on_benchmark_update( + self, aggregator_update: AggregatorState, scheduler_state: SchedulerState + ): + """ + Handle benchmark execution progress update. + + :param aggregator_update: Current benchmark metrics and statistics. + :param scheduler_state: Current scheduler execution state. + """ + + @abstractmethod + async def on_benchmark_complete(self, benchmark: BenchmarkT): + """ + Handle completion of benchmark strategy execution. + + :param benchmark: Completed benchmark results. + """ + + @abstractmethod + async def on_finalize(self): + """Finalize progress tracking and cleanup resources.""" + + async def on_raw_update( + self, + profile: Profile, + aggregator_update: AggregatorState | None, + benchmark: BenchmarkT | None, + strategy: SchedulingStrategy, + scheduler_state: SchedulerState | None, + ): + """ + Handle raw benchmark execution update. + + Optional hook for accessing all execution state updates. Default + implementation does nothing. + + :param profile: Benchmark profile configuration. + :param aggregator_update: Current benchmark metrics and statistics. + :param benchmark: Completed benchmark if available. + :param strategy: Current scheduling strategy. + :param scheduler_state: Current scheduler execution state. 
+ """ + + +class BenchmarkerProgressGroup(BenchmarkerProgress[BenchmarkT]): + """ + Composite progress handler that manages multiple progress instances. + + Distributes progress events to all contained progress instances, enabling + parallel progress tracking through multiple channels (e.g., console display + and file logging). + + :param instances: Collection of progress handlers to manage. + :param enabled: Whether the group is active. + """ + + def __init__( + self, + instances: ( + Iterable[BenchmarkerProgress[BenchmarkT]] + | list[BenchmarkerProgress[BenchmarkT]] + ), + enabled: bool = True, + ): + """ + Initialize progress group with handler instances. + + :param instances: Progress handler instances to coordinate. + :param enabled: Whether to enable the progress group. + """ + self.instances: list[BenchmarkerProgress[BenchmarkT]] = list(instances) + super().__init__(enabled=enabled) @property - def total(self) -> Optional[float]: - if self.max_number is None and self.max_duration is None: - return None + def enabled(self) -> bool: + """Whether the progress group is currently enabled.""" + return self._enabled + + @enabled.setter + def enabled(self, value: bool): + """ + Set enabled state for group and all contained instances. + + :param value: New enabled state. + """ + self._enabled = value + for instance in self.instances: + instance.enabled = value - return 1000 + async def on_initialize(self, profile: Profile): + """ + Initialize all progress handler instances. + + :param profile: Benchmark profile configuration. + """ + await asyncio.gather( + *[child.on_initialize(profile) for child in self.instances] + ) + + async def on_benchmark_start(self, strategy: SchedulingStrategy): + """ + Notify all handlers of benchmark strategy start. + + :param strategy: Scheduling strategy being executed. + """ + await asyncio.gather( + *[child.on_benchmark_start(strategy) for child in self.instances] + ) + + async def on_benchmark_update( + self, aggregator_update: AggregatorState, scheduler_state: SchedulerState + ): + """ + Distribute benchmark updates to all handlers. + + :param aggregator_update: Current benchmark metrics and statistics. + :param scheduler_state: Current scheduler execution state. + """ + await asyncio.gather( + *[ + child.on_benchmark_update(aggregator_update, scheduler_state) + for child in self.instances + ] + ) + + async def on_benchmark_complete(self, benchmark: BenchmarkT): + """ + Notify all handlers of benchmark completion. + + :param benchmark: Completed benchmark results. + """ + await asyncio.gather( + *[child.on_benchmark_complete(benchmark) for child in self.instances] + ) + + async def on_finalize(self): + """Finalize all progress handler instances.""" + await asyncio.gather(*[child.on_finalize() for child in self.instances]) + + async def on_raw_update( + self, + profile: Profile, + aggregator_update: AggregatorState | None, + benchmark: BenchmarkT | None, + strategy: SchedulingStrategy, + scheduler_state: SchedulerState | None, + ): + """ + Distribute raw updates to all handlers. + + :param profile: Benchmark profile configuration. + :param aggregator_update: Current benchmark metrics and statistics. + :param benchmark: Completed benchmark if available. + :param strategy: Current scheduling strategy. + :param scheduler_state: Current scheduler execution state. 
+ """ + await asyncio.gather( + *[ + child.on_raw_update( + profile, + aggregator_update, + benchmark, + strategy, + scheduler_state, + ) + for child in self.instances + ] + ) + + +class GenerativeConsoleBenchmarkerProgress( + BenchmarkerProgress[GenerativeBenchmark], Live +): + """ + Console-based progress display for generative benchmarks. + + Provides real-time visual progress tracking using Rich library components, + displaying benchmark execution statistics, timing information, and progress + bars in a structured console interface. + """ + + def __init__(self, enabled: bool = True, display_scheduler_stats: bool = False): + """ + Initialize console progress display. + + :param enabled: Whether to enable progress tracking and display. + :param display_scheduler_stats: Whether to display scheduler statistics. + """ + BenchmarkerProgress.__init__(self, enabled=enabled) + Live.__init__( + self, + refresh_per_second=4, + auto_refresh=True, + redirect_stdout=True, + redirect_stderr=True, + ) + self.display_scheduler_stats: bool = display_scheduler_stats + self.run_progress: Progress = None + self.run_progress_task: TaskID = None + self.tasks_progress: _GenerativeProgressTasks = None + + async def on_initialize(self, profile: Profile): + """ + Initialize console display components and start rendering. + + :param profile: Benchmark profile configuration. + """ + self.tasks_progress = _GenerativeProgressTasks( + profile=profile, display_scheduler_stats=self.display_scheduler_stats + ) + self.run_progress = Progress( + TextColumn("Generating...", style=f"italic {Colors.progress}"), + BarColumn( + bar_width=None, + complete_style=Colors.progress, + finished_style=Colors.success, + ), + TextColumn( + "({task.fields[completed_benchmarks]}/{task.fields[total_benchmarks]})", + style=Colors.progress, + ), + TextColumn("["), + TimeElapsedColumn(), + TextColumn("<"), + TimeRemainingColumn(), + TextColumn("]"), + ) + self.run_progress_task = self.run_progress.add_task("") + self._sync_run_progress() + self.update( + Group( + Panel( + self.tasks_progress, + title="Benchmarks", + title_align="left", + expand=True, + ), + self.run_progress, + ) + ) + self.start() + + async def on_benchmark_start(self, strategy: SchedulingStrategy): + """ + Update display for new benchmark strategy start. + + :param strategy: Scheduling strategy being executed. + """ + self.tasks_progress.start_benchmark(strategy) + self._sync_run_progress() + + async def on_benchmark_update( + self, aggregator_update: AggregatorState | None, scheduler_state: SchedulerState + ): + """ + Update display with current benchmark progress. + + :param aggregator_update: Current benchmark metrics and statistics. + :param scheduler_state: Current scheduler execution state. + """ + self.tasks_progress.update_benchmark(aggregator_update, scheduler_state) + self._sync_run_progress() + + async def on_benchmark_complete(self, benchmark: GenerativeBenchmark): + """ + Update display for completed benchmark. + + :param benchmark: Completed benchmark results. 
+ """ + self.tasks_progress.complete_benchmark(benchmark) + self._sync_run_progress() + + async def on_finalize(self): + """Stop display rendering and cleanup resources.""" + self.tasks_progress.finalize() + self._sync_run_progress() + self.run_progress.stop_task(self.run_progress_task) + self.stop() + self.run_progress = None + self.run_progress_task = None + self.tasks_progress = None + + def _sync_run_progress(self): + """Synchronize overall progress display with task progress.""" + self.run_progress.update( + self.run_progress_task, + total=self.tasks_progress.steps_total, + completed=self.tasks_progress.steps_progress, + completed_benchmarks=self.tasks_progress.tasks_progress, + total_benchmarks=self.tasks_progress.tasks_total, + ) + + +# Scaling factor for progress calculations to provide granular progress updates +_PROGRESS_SCALE = 1000 + + +class _GenerativeProgressTasks(Progress): + def __init__(self, profile: Profile, display_scheduler_stats: bool): + self.profile: Profile = profile + self.display_scheduler_stats: bool = display_scheduler_stats + self.benchmark_task_states: list[_GenerativeProgressTaskState] = [] + self.current_index: int = -1 + + summary_text = "{task.fields[requests_summary]}\n{task.fields[tokens_summary]}" + if self.display_scheduler_stats: + summary_text += "\n{task.fields[scheduler_stats]}" + super().__init__( + TextColumn("[{task.fields[start_time]}]"), + SpinnerColumn(style=Colors.progress), + TaskProgressColumn(style=Colors.progress), + TextColumn("{task.description}"), + TextColumn("({task.fields[progress_status]})"), + TextColumn(" "), + TextColumn(summary_text), + ) + + for strategy_type in profile.strategy_types: + task_state = _GenerativeProgressTaskState( + strategy_type=strategy_type, + ) + task_id = self.add_task(**task_state.current) + task_state.task_id = task_id + self.benchmark_task_states.append(task_state) @property - def completed(self) -> int: - if self.ended: - return 1000 + def tasks_total(self) -> int: + return len(self.benchmark_task_states) - if self.max_number is None and self.max_duration is None: - return 0 + @property + def tasks_progress(self) -> int: + return self.current_index + 1 - number = self.requests_successful + self.requests_errored - number_percent = ( - number / float(self.max_number) * 1000 if self.max_number else -math.inf + @property + def steps_total(self) -> int: + return _PROGRESS_SCALE * len(self.benchmark_task_states) + + @property + def steps_progress(self) -> int: + progress_current_task = ( + self.benchmark_task_states[self.current_index].progress + if self.current_index < len(self.benchmark_task_states) + else 0 + ) + progress_total = self.current_index + (progress_current_task or 0) + + return progress_total * _PROGRESS_SCALE + + def start_benchmark(self, strategy: SchedulingStrategy): + self.current_index += 1 + if self.current_index >= len(self.benchmark_task_states): + # New task past initially estimated, append it to the end + task_state = _GenerativeProgressTaskState(strategy_type=strategy.type_) + task_id = self.add_task(**task_state.current) + task_state.task_id = task_id + self.benchmark_task_states.append(task_state) + + self.benchmark_task_states[self.current_index].start(strategy) + self.update( + self.benchmark_task_states[self.current_index].task_id, + start=True, + **self.benchmark_task_states[self.current_index].current, + ) + + def update_benchmark( + self, aggregator_update: AggregatorState, scheduler_state: SchedulerState + ): + self.benchmark_task_states[self.current_index].update( + 
aggregator_update, scheduler_state + ) + self.update( + self.benchmark_task_states[self.current_index].task_id, + **self.benchmark_task_states[self.current_index].current, ) - duration_percent = ( - (time.time() - self.start_time) / self.max_duration * 1000 - if self.max_duration and self.start_time - else -math.inf + + def complete_benchmark(self, benchmark: GenerativeBenchmark): + self.benchmark_task_states[self.current_index].complete(benchmark) + self.update( + self.benchmark_task_states[self.current_index].task_id, + **self.benchmark_task_states[self.current_index].current, ) - return min(int(max(number_percent, duration_percent)), 1000) + def finalize(self): + self.stop() + + +@dataclass +class _GenerativeProgressTaskState: + strategy_type: StrategyType + task_id: TaskID = None + strategy: SchedulingStrategy | None = None + benchmark_status: Literal[ + "pending", "in_warmup", "in_progress", "in_cooldown", "completed" + ] = "pending" + progress: float | None = None + start_time: float = -1.0 + successful_requests: int = 0 + cancelled_requests: int = 0 + errored_requests: int = 0 + request_concurrency: int = 0 + requests_per_second: float = 0 + request_latency: float = 0 + output_tokens: int = 0 + output_tokens_rate: float = 0 + prompt_tokens: int = 0 + total_tokens_rate: float = 0 + time_to_first_token: float = 0 + inter_token_latency: float = 0 + queued_time: float = 0 + request_targeted_start_delay: float = 0 + scheduler_overheads_time: float = 0 @property - def fields(self) -> dict[str, str]: - fields = { + def current(self) -> dict[str, Any]: + return { "start_time": self.formatted_start_time, + "description": str(self.strategy or self.strategy_type), "progress_status": self.formatted_progress_status, "requests_summary": self.formatted_requests_summary, + "tokens_summary": self.formatted_tokens_summary, + "scheduler_stats": self.formatted_scheduler_stats, + "completed": self.completed, + "total": self.total, } - if self.display_scheduler_stats: - fields["scheduler_stats"] = self.formatted_scheduler_stats + @property + def completed(self) -> float: + if self.benchmark_status == "pending": + return 0 + + if self.benchmark_status == "completed": + return _PROGRESS_SCALE - return fields + return self.progress * _PROGRESS_SCALE if self.progress is not None else None + + @property + def total(self) -> float: + return _PROGRESS_SCALE @property def formatted_start_time(self) -> str: - if self.start_time is None: + if self.start_time < 0.0: return "--:--:--" return datetime.fromtimestamp(self.start_time).strftime("%H:%M:%S") @property def formatted_progress_status(self) -> str: - if self.ended: - status = "complete" - color = Colors.SUCCESS - elif self.compiling: - status = "compiling" - color = Colors.PROGRESS - elif self.started and self.in_warmup: + if self.benchmark_status == "in_warmup": status = "warmup" - color = Colors.PROGRESS - elif self.started and self.in_cooldown: - status = "cooldown" - color = Colors.PROGRESS - elif self.started: + color = Colors.progress + elif self.benchmark_status == "in_progress": status = "running" - color = Colors.PROGRESS + color = Colors.progress + elif self.benchmark_status == "in_cooldown": + status = "cooldown" + color = Colors.progress + elif self.benchmark_status == "completed": + status = "complete" + color = Colors.success else: status = "pending" - color = Colors.INFO + color = Colors.info return f"[{color}]{status.ljust(8)}[/{color}]" @property def formatted_requests_summary(self) -> str: - if not self.started: + if self.benchmark_status == 
"pending": return " " return ( - f"[{Colors.INFO}]Req:[/{Colors.INFO}] " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.requests_rate, + f"[{Colors.info}]Req:[/{Colors.info}] " + + format_value_display( + value=self.requests_per_second, label="req/s", total_characters=12, digits_places=4, decimal_places=1, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( + + format_value_display( value=self.request_latency, label="Lat", units="s", @@ -166,32 +670,32 @@ def formatted_requests_summary(self) -> str: decimal_places=2, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.requests_processing, + + format_value_display( + value=self.request_concurrency, label="Conc", total_characters=12, digits_places=4, decimal_places=1, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.requests_successful, + + format_value_display( + value=self.successful_requests, label="Comp", total_characters=12, digits_places=5, decimal_places=0, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.requests_incomplete, + + format_value_display( + value=self.cancelled_requests, label="Inc", total_characters=12, digits_places=5, decimal_places=0, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.requests_errored, + + format_value_display( + value=self.errored_requests, label="Err", total_characters=12, digits_places=5, @@ -199,101 +703,14 @@ def formatted_requests_summary(self) -> str: ) ) - @property - def formatted_scheduler_stats(self) -> str: - if not self.started: - return " " - - return ( - f"[{Colors.INFO}]Sys:[/{Colors.INFO}] " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.worker_overheads_time_ms, - label="Work OH", - units="ms", - total_characters=18, - digits_places=3, - decimal_places=1, - ) - + ", " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.backend_overheads_time_ms, - label="Back OH", - units="ms", - total_characters=18, - digits_places=3, - decimal_places=1, - ) - + ", " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.requests_sleep_time_ms, - label="Req Sleep", - units="ms", - total_characters=18, - digits_places=5, - decimal_places=0, - ) - + ", " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.requests_targeted_start_time_delay_ms, - label="Start Del", - units="ms", - total_characters=18, - digits_places=5, - decimal_places=0, - ) - ) - - @staticmethod - def format_progress_display( - value: float, - label: str, - units: str = "", - total_characters: Optional[int] = None, - digits_places: Optional[int] = None, - decimal_places: Optional[int] = None, - ) -> str: - if decimal_places is None and digits_places is None: - formatted_number = f"{value:.0f}" - elif digits_places is None: - formatted_number = f"{value:.{decimal_places}f}" - elif decimal_places is None: - formatted_number = f"{value:>{digits_places}f}" - else: - formatted_number = f"{value:>{digits_places}.{decimal_places}f}" - - result = f"{formatted_number}{units} [{Colors.INFO}]{label}[/{Colors.INFO}]" - - if total_characters is not None: - total_characters += len(Colors.INFO) * 2 + 5 - - if len(result) < total_characters: - result = result.rjust(total_characters) - - return result - - -class GenerativeTextBenchmarkerTaskProgressState(BenchmarkerTaskProgressState): - output_tokens: float = 0 - prompt_tokens: float = 0 - output_tokens_rate: float = 0 - total_tokens_rate: float = 0 - tokens_ttft: 
float = 0 - tokens_itl: float = 0 - - @property - def fields(self) -> dict[str, str]: - fields = super().fields - fields["tokens_summary"] = self.formatted_tokens_summary - return fields - @property def formatted_tokens_summary(self) -> str: - if not self.started: + if self.benchmark_status == "pending": return " " return ( - f"[{Colors.INFO}]Tok:[/{Colors.INFO}] " - + BenchmarkerTaskProgressState.format_progress_display( + f"[{Colors.info}]Tok:[/{Colors.info}] " + + format_value_display( value=self.output_tokens_rate, label="gen/s", total_characters=12, @@ -301,7 +718,7 @@ def formatted_tokens_summary(self) -> str: decimal_places=1, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( + + format_value_display( value=self.total_tokens_rate, label="tot/s", total_characters=12, @@ -309,8 +726,8 @@ def formatted_tokens_summary(self) -> str: decimal_places=1, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.tokens_ttft, + + format_value_display( + value=self.time_to_first_token, label="TTFT", units="ms", total_characters=12, @@ -318,8 +735,8 @@ def formatted_tokens_summary(self) -> str: decimal_places=1, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( - value=self.tokens_itl, + + format_value_display( + value=self.inter_token_latency, label="ITL", units="ms", total_characters=12, @@ -327,7 +744,7 @@ def formatted_tokens_summary(self) -> str: decimal_places=1, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( + + format_value_display( value=self.prompt_tokens, label="Prompt", total_characters=12, @@ -335,7 +752,7 @@ def formatted_tokens_summary(self) -> str: decimal_places=0, ) + ", " - + BenchmarkerTaskProgressState.format_progress_display( + + format_value_display( value=self.output_tokens, label="Gen", total_characters=12, @@ -344,377 +761,216 @@ def formatted_tokens_summary(self) -> str: ) ) + @property + def formatted_scheduler_stats(self) -> str: + if self.benchmark_status == "pending": + return " " -BTPS = TypeVar("BTPS", bound=BenchmarkerTaskProgressState) - - -class BenchmarkerProgressDisplay(Generic[BTPS]): - def __init__(self, display_scheduler_stats: bool): - self.display_scheduler_stats = display_scheduler_stats - self.started = False - self.benchmarker_tasks_progress = Progress(*self.create_task_progress_columns()) - self.benchmarker_tasks_panel = Panel( - self.benchmarker_tasks_progress, - title="Benchmarks", - title_align="left", - expand=True, - ) - self.benchmarker_progress = Progress( - TextColumn("Generating...", style=f"italic {Colors.PROGRESS}"), - BarColumn( - bar_width=None, - complete_style=Colors.PROGRESS, - finished_style=Colors.SUCCESS, - ), - TextColumn( - "({task.fields[completed_benchmarks]}/{task.fields[total_benchmarks]})", - style=Colors.PROGRESS, - ), - TextColumn("["), - TimeElapsedColumn(), - TextColumn("<"), - TimeRemainingColumn(), - TextColumn("]"), - ) - self.benchmarker_live = Live( - Group( - self.benchmarker_tasks_panel, - self.benchmarker_progress, - ), - redirect_stdout=True, - redirect_stderr=True, - ) - self.active_task: Optional[TaskID] = None - self.benchmarker_tasks: list[BTPS] = [] - self.progress_task: Optional[TaskID] = None - - def update(self, result: BenchmarkerResult): - if result.type_ == "run_start": - if self.started: - raise RuntimeError("Progress display already started.") - - self.handle_start(result) - self.started = True - elif result.type_ == "run_complete": - if not self.started: - raise RuntimeError("Progress display not started.") - - 
self.handle_end(result) - self.started = False - else: - if not self.started: - raise RuntimeError("Progress display not started.") - - self.handle_update(result) - - def handle_start(self, result: BenchmarkerResult): - self.benchmarker_live.start() - - for index, strategy_type in enumerate(result.profile.strategy_types): - task_id = self.benchmarker_tasks_progress.add_task( - description=strategy_type, - start=False, - total=None, - completed=0, - visible=False, + return ( + f"[{Colors.info}]Sys:[/{Colors.info}] , " + + format_value_display( + value=self.request_targeted_start_delay, + label="Start Del", + units="ms", + total_characters=18, + digits_places=5, + decimal_places=0, ) - task_progress_state = self.create_task_progress_state( - task_id=task_id, - index=index, - strategy_type=strategy_type, - result=result, + + format_value_display( + value=self.scheduler_overheads_time, + label="Sched OH", + units="ms", + total_characters=18, + digits_places=3, + decimal_places=1, ) - self.benchmarker_tasks.append(task_progress_state) - self.benchmarker_tasks_progress.update( - task_id, - description=task_progress_state.description, - visible=True, - **task_progress_state.fields, # type: ignore[arg-type] + + ", " + + format_value_display( + value=self.queued_time, + label="Queued", + units="ms", + total_characters=18, + digits_places=5, + decimal_places=0, ) - - self.progress_task = self.benchmarker_progress.add_task( - "", - total=len(self.benchmarker_tasks) * 1000, - completed_benchmarks=0, - total_benchmarks=len(self.benchmarker_tasks), ) - def handle_update(self, result: BenchmarkerResult): - current_state: BTPS = self.benchmarker_tasks[result.current_index] - - if result.type_ == "scheduler_start": - self.handle_update_scheduler_start(current_state, result) - self.active_task = current_state.task_id - elif result.type_ == "scheduler_update": - self.handle_update_scheduler_update(current_state, result) - elif result.type_ == "scheduler_complete": - self.handle_update_scheduler_complete(current_state, result) - elif result.type_ == "benchmark_compiled": - self.handle_update_benchmark_compiled(current_state, result) - else: - raise ValueError(f"Unknown result type: {result.type_}") + def start(self, strategy: SchedulingStrategy): + self.strategy = strategy + self.strategy_type = strategy.type_ - if self.progress_task is None: - raise RuntimeError("Progress task not set.") - - self.benchmarker_tasks_progress.update( - current_state.task_id, - description=current_state.description, - completed=current_state.completed, - total=current_state.total, - **current_state.fields, # type: ignore[arg-type] - ) - self.benchmarker_progress.update( - self.progress_task, - completed=(result.current_index * 1000) + current_state.completed, - total=1000 * len(self.benchmarker_tasks), - completed_benchmarks=( - result.current_index + (1 if current_state.ended else 0) + def update( + self, aggregator_update: AggregatorState, scheduler_state: SchedulerState + ): + self.progress = ( + (1.0 - scheduler_state.remaining_fraction) + if scheduler_state.remaining_fraction is not None + else 0.0 + ) + status: Literal["in_warmup", "in_progress", "in_cooldown"] | None = ( + "in_progress" # Need to handle requests_in_* isn't in aggregator_update + ) + if aggregator_update.get("requests_in_warmup"): + status = "in_warmup" + elif aggregator_update.get("requests_in_cooldown"): + status = "in_cooldown" + self._update_processing_states( + benchmark_status=status, + start_time=scheduler_state.start_time, + 
successful_requests=scheduler_state.successful_requests, + cancelled_requests=scheduler_state.cancelled_requests, + errored_requests=scheduler_state.errored_requests, + ) + self._update_request_stats( + request_concurrency=aggregator_update.get_metric( + key="requests", type_="avg", prefix="completed" + ), + requests_per_second=aggregator_update.get_metric( + key="requests", + type_="rate", + prefix="completed", + ), + request_latency=aggregator_update.get_metric( + key="request_latency", type_="avg", prefix="completed" ), - total_benchmarks=len(self.benchmarker_tasks), ) - - if current_state.ended: - self.benchmarker_tasks_progress.stop_task(current_state.task_id) - self.active_task = None - - def handle_update_scheduler_start( - self, progress_state: BTPS, result: BenchmarkerResult - ): - if self.active_task is not None: - raise RuntimeError("Active task already set.") - - progress_state.strategy = result.current_strategy # type: ignore[assignment] - progress_state.started = True - current_aggregator: BenchmarkAggregator = result.current_aggregator # type: ignore[assignment] - progress_state.start_time = ( - current_aggregator.requests_stats.totals.total.start_time + self._update_token_stats( + output_tokens=aggregator_update.get_metric( + key="output_tokens", type_="avg", prefix="completed" + ), + output_tokens_rate=aggregator_update.get_metric( + key="output_tokens", type_="rate" + ), + prompt_tokens=aggregator_update.get_metric( + key="prompt_tokens", type_="avg", prefix="completed" + ), + total_tokens_rate=aggregator_update.get_metric( + key="total_tokens", type_="rate" + ), + time_to_first_token=( + aggregator_update.get_metric(key="time_to_first_token", type_="avg") + ), + inter_token_latency=( + aggregator_update.get_metric(key="inter_token_latency", type_="avg") + ), ) - progress_state.max_number = current_aggregator.args.max_number - progress_state.max_duration = current_aggregator.args.max_duration - - def handle_update_scheduler_update( - self, progress_state: BTPS, result: BenchmarkerResult - ): - if self.active_task is None: - raise RuntimeError("Active task not set.") - - if self.active_task != progress_state.task_id: - raise RuntimeError("Active task does not match current task.") + if aggregator_update.get("updated_scheduler_stats"): + self._update_system_stats( + request_targeted_start_delay=( + aggregator_update.get_metric( + key="request_targeted_start_delay", type_="avg", default=0.0 + ) + ), + queued_time=( + aggregator_update.get_metric( + key="queued_time", type_="avg", default=0.0 + ) + ), + scheduler_overheads_time=0.0, # Need to add up metrics here + ) - current_aggregator: BenchmarkAggregator = result.current_aggregator # type: ignore[assignment] - progress_state.in_warmup = current_aggregator.in_warmup - progress_state.in_cooldown = current_aggregator.in_cooldown - progress_state.requests_rate = ( - current_aggregator.requests_stats.totals.successful.rate - ) - progress_state.request_latency = ( - current_aggregator.requests_stats.request_time.mean - ) - progress_state.requests_processing = ( - current_aggregator.scheduler_stats.processing_requests.last - ) - progress_state.requests_successful = ( - current_aggregator.requests_stats.totals.successful.total - ) - progress_state.requests_incomplete = ( - current_aggregator.requests_stats.totals.incomplete.total - ) - progress_state.requests_errored = ( - current_aggregator.requests_stats.totals.errored.total - ) - progress_state.worker_overheads_time_ms = ( - 
current_aggregator.requests_stats.scheduled_time_delay.mean_ms - + current_aggregator.requests_stats.worker_start_delay.mean_ms - ) - progress_state.backend_overheads_time_ms = ( - current_aggregator.requests_stats.request_time_delay.mean_ms - ) - progress_state.requests_sleep_time_ms = ( - current_aggregator.requests_stats.scheduled_time_sleep.mean_ms - ) - progress_state.requests_targeted_start_time_delay_ms = ( - current_aggregator.requests_stats.request_start_time_targeted_delay.mean_ms + def complete(self, benchmark: GenerativeBenchmark): + self._update_processing_states( + benchmark_status="completed", + start_time=benchmark.start_time, + successful_requests=benchmark.request_totals.successful, + cancelled_requests=benchmark.request_totals.incomplete, + errored_requests=benchmark.request_totals.errored, + ) + self._update_request_stats( + request_concurrency=benchmark.metrics.request_concurrency.successful.mean, + requests_per_second=benchmark.metrics.requests_per_second.successful.mean, + request_latency=benchmark.metrics.request_latency.successful.mean, + ) + self._update_token_stats( + output_tokens=benchmark.metrics.output_token_count.successful.mean, + output_tokens_rate=benchmark.metrics.output_tokens_per_second.successful.mean, + prompt_tokens=benchmark.metrics.prompt_token_count.successful.mean, + total_tokens_rate=benchmark.metrics.tokens_per_second.successful.mean, + time_to_first_token=( + benchmark.metrics.time_to_first_token_ms.successful.mean + ), + inter_token_latency=( + benchmark.metrics.inter_token_latency_ms.successful.mean + ), + converted=True, ) - def handle_update_scheduler_complete( + def _update_processing_states( self, - progress_state: BTPS, - result: BenchmarkerResult, # noqa: ARG002 + benchmark_status: Literal[ + "pending", "in_warmup", "in_progress", "in_cooldown", "completed" + ], + start_time: float | None = None, + successful_requests: int | None = None, + cancelled_requests: int | None = None, + errored_requests: int | None = None, ): - if self.active_task is None: - raise RuntimeError("Active task not set.") - - if self.active_task != progress_state.task_id: - raise RuntimeError("Active task does not match current task.") - - progress_state.in_warmup = False - progress_state.in_cooldown = False - progress_state.compiling = True - - def handle_update_benchmark_compiled( - self, progress_state: BTPS, result: BenchmarkerResult - ): - if self.active_task is None: - raise RuntimeError("Active task not set.") - - if self.active_task != progress_state.task_id: - raise RuntimeError("Active task does not match current task.") - - current_benchmark: Benchmark = result.current_benchmark # type: ignore[assignment] - progress_state.compiling = False - progress_state.ended = True - progress_state.requests_rate = ( - current_benchmark.metrics.requests_per_second.successful.mean - ) - progress_state.requests_processing = ( - current_benchmark.metrics.request_concurrency.successful.mean - ) - - def handle_end(self, result: BenchmarkerResult): # noqa: ARG002 - if self.progress_task is None: - raise RuntimeError("Progress task not set.") - - self.benchmarker_progress.update( - self.progress_task, - completed=len(self.benchmarker_tasks) * 1000, - total=len(self.benchmarker_tasks) * 1000, - completed_benchmarks=len(self.benchmarker_tasks), - total_benchmarks=len(self.benchmarker_tasks), - ) - self.benchmarker_progress.stop_task(self.progress_task) - self.benchmarker_live.stop() - self.active_task = None - self.benchmarker_tasks = [] - self.progress_task = None - - def 
create_task_progress_columns(self) -> list[ProgressColumn]: - columns = [ - TextColumn("[{task.fields[start_time]}]"), - SpinnerColumn(style=Colors.PROGRESS), - TaskProgressColumn(style=Colors.PROGRESS), - TextColumn("{task.description}"), - TextColumn("({task.fields[progress_status]})"), - TextColumn(" "), - ] - - if not self.display_scheduler_stats: - columns += [ - TextColumn("{task.fields[requests_summary]}\n"), - ] - else: - columns += [ - TextColumn( - "{task.fields[requests_summary]}\n{task.fields[scheduler_stats]}\n" - ), - ] - - return columns - - def create_task_progress_state( + if benchmark_status is not None: + self.benchmark_status = benchmark_status + if start_time is not None: + self.start_time = start_time + if successful_requests is not None: + self.successful_requests = successful_requests + if cancelled_requests is not None: + self.cancelled_requests = cancelled_requests + if errored_requests is not None: + self.errored_requests = errored_requests + + def _update_request_stats( self, - task_id: TaskID, - index: int, # noqa: ARG002 - strategy_type: StrategyType, - result: BenchmarkerResult, # noqa: ARG002 - ) -> BTPS: - return BenchmarkerTaskProgressState( # type: ignore[return-value] - display_scheduler_stats=self.display_scheduler_stats, - task_id=task_id, - strategy=strategy_type, - ) - - -class GenerativeTextBenchmarkerProgressDisplay( - BenchmarkerProgressDisplay[GenerativeTextBenchmarkerTaskProgressState] -): - def handle_update_scheduler_update( - self, - progress_state: GenerativeTextBenchmarkerTaskProgressState, - result: BenchmarkerResult, + request_concurrency: int | None = None, + requests_per_second: float | None = None, + request_latency: float | None = None, ): - super().handle_update_scheduler_update(progress_state, result) - current_aggregator: GenerativeBenchmarkAggregator = result.current_aggregator # type: ignore[assignment] - progress_state.output_tokens = ( - current_aggregator.requests_stats.output_tokens.mean - ) - progress_state.prompt_tokens = ( - current_aggregator.requests_stats.prompt_tokens.mean - ) - progress_state.output_tokens_rate = ( - current_aggregator.requests_stats.output_tokens.rate - ) - progress_state.total_tokens_rate = ( - current_aggregator.requests_stats.total_tokens.rate - ) - progress_state.tokens_ttft = ( - current_aggregator.requests_stats.time_to_first_token.mean_ms - ) - progress_state.tokens_itl = ( - current_aggregator.requests_stats.inter_token_latency.mean_ms - ) - - def handle_update_benchmark_compiled( + if request_concurrency is not None: + self.request_concurrency = request_concurrency + if requests_per_second is not None: + self.requests_per_second = requests_per_second + if request_latency is not None: + self.request_latency = request_latency + + def _update_token_stats( self, - progress_state: GenerativeTextBenchmarkerTaskProgressState, - result: BenchmarkerResult, + output_tokens: int | None = None, + output_tokens_rate: float | None = None, + prompt_tokens: int | None = None, + total_tokens_rate: float | None = None, + time_to_first_token: float | None = None, + inter_token_latency: float | None = None, + converted: bool = False, ): - super().handle_update_benchmark_compiled(progress_state, result) - - current_benchmark: GenerativeBenchmark = result.current_benchmark # type: ignore[assignment] - progress_state.request_latency = ( - current_benchmark.metrics.request_latency.successful.mean - ) - progress_state.requests_successful = current_benchmark.request_totals.successful - progress_state.requests_errored 
= current_benchmark.request_totals.errored - progress_state.requests_incomplete = current_benchmark.request_totals.incomplete - progress_state.output_tokens = ( - current_benchmark.metrics.output_token_count.successful.mean - ) - progress_state.prompt_tokens = ( - current_benchmark.metrics.prompt_token_count.successful.mean - ) - progress_state.output_tokens_rate = ( - current_benchmark.metrics.output_tokens_per_second.successful.mean - ) - progress_state.total_tokens_rate = ( - current_benchmark.metrics.tokens_per_second.successful.mean - ) - progress_state.tokens_ttft = ( - current_benchmark.metrics.time_to_first_token_ms.successful.mean - ) - progress_state.tokens_itl = ( - current_benchmark.metrics.inter_token_latency_ms.successful.mean - ) + if output_tokens is not None: + self.output_tokens = output_tokens + if output_tokens_rate is not None: + self.output_tokens_rate = output_tokens_rate + if prompt_tokens is not None: + self.prompt_tokens = prompt_tokens + if total_tokens_rate is not None: + self.total_tokens_rate = total_tokens_rate + if time_to_first_token is not None: + self.time_to_first_token = time_to_first_token * ( + 1000 if not converted else 1 + ) + if inter_token_latency is not None: + self.inter_token_latency = inter_token_latency * ( + 1000 if not converted else 1 + ) - def create_task_progress_state( + def _update_system_stats( self, - task_id: TaskID, - index: int, # noqa: ARG002 - strategy_type: StrategyType, - result: BenchmarkerResult, # noqa: ARG002 - ) -> GenerativeTextBenchmarkerTaskProgressState: - return GenerativeTextBenchmarkerTaskProgressState( - display_scheduler_stats=self.display_scheduler_stats, - task_id=task_id, - strategy=strategy_type, - ) - - def create_task_progress_columns(self) -> list[ProgressColumn]: - columns = super().create_task_progress_columns() - columns = columns[:-1] # remove the last display info column - - if not self.display_scheduler_stats: - columns += [ - TextColumn( - "{task.fields[requests_summary]}\n{task.fields[tokens_summary]}", - ), - ] - else: - columns += [ - TextColumn( - "{task.fields[requests_summary]}\n{task.fields[tokens_summary]}\n{task.fields[scheduler_stats]}", - ), - ] - - return columns + request_targeted_start_delay: float | None = None, + queued_time: float | None = None, + scheduler_overheads_time: float | None = None, + converted: bool = False, + ): + if request_targeted_start_delay is not None: + self.request_targeted_start_delay = request_targeted_start_delay * ( + 1000 if not converted else 1 + ) + if queued_time is not None: + self.queued_time = queued_time * (1000 if not converted else 1) + if scheduler_overheads_time is not None: + self.scheduler_overheads_time = scheduler_overheads_time * ( + 1000 if not converted else 1 + ) diff --git a/src/guidellm/benchmark/scenario.py b/src/guidellm/benchmark/scenario.py index 042b25b1..15e3cd81 100644 --- a/src/guidellm/benchmark/scenario.py +++ b/src/guidellm/benchmark/scenario.py @@ -1,7 +1,9 @@ +from __future__ import annotations + from collections.abc import Iterable from functools import cache from pathlib import Path -from typing import Annotated, Any, Literal, Optional, TypeVar, Union +from typing import Annotated, Any, Literal, TypeVar from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict from pydantic import BeforeValidator, Field, NonNegativeInt, PositiveFloat, PositiveInt @@ -11,8 +13,8 @@ from guidellm.backend.backend import BackendType from guidellm.benchmark.profile import ProfileType -from guidellm.objects.pydantic import 
StandardBaseModel -from guidellm.scheduler.strategies import StrategyType +from guidellm.scheduler.strategy import StrategyType +from guidellm.utils import StandardBaseModel __ALL__ = ["Scenario", "GenerativeTextScenario", "get_builtin_scenarios"] @@ -25,7 +27,7 @@ def get_builtin_scenarios() -> list[str]: return [p.stem for p in SCENARIO_DIR.glob("*.json")] -def parse_float_list(value: Union[str, float, list[float]]) -> list[float]: +def parse_float_list(value: str | float | list[float]) -> list[float]: """ Parse a comma separated string to a list of float or convert single float list of one or pass float @@ -57,7 +59,7 @@ class Scenario(StandardBaseModel): target: str @classmethod - def from_builtin(cls: type[T], name: str, overrides: Optional[dict] = None) -> T: + def from_builtin(cls: type[T], name: str, overrides: dict | None = None) -> T: filename = SCENARIO_DIR / f"{name}.json" if not filename.is_file(): @@ -77,28 +79,28 @@ class Config: arbitrary_types_allowed = True backend_type: BackendType = "openai_http" - backend_args: Optional[dict[str, Any]] = None - model: Optional[str] = None - processor: Optional[Union[str, Path, PreTrainedTokenizerBase]] = None - processor_args: Optional[dict[str, Any]] = None - data: Union[ - str, - Path, - Iterable[Union[str, dict[str, Any]]], - Dataset, - DatasetDict, - IterableDataset, - IterableDatasetDict, - ] - data_args: Optional[dict[str, Any]] = None - data_sampler: Optional[Literal["random"]] = None - rate_type: Union[StrategyType, ProfileType] - rate: Annotated[ - Optional[list[PositiveFloat]], BeforeValidator(parse_float_list) - ] = None - max_seconds: Optional[PositiveFloat] = None - max_requests: Optional[PositiveInt] = None - warmup_percent: Annotated[Optional[float], Field(gt=0, le=1)] = None - cooldown_percent: Annotated[Optional[float], Field(gt=0, le=1)] = None - output_sampling: Optional[NonNegativeInt] = None + backend_args: dict[str, Any] | None = None + model: str | None = None + processor: str | Path | PreTrainedTokenizerBase | None = None + processor_args: dict[str, Any] | None = None + data: ( + str + | Path + | Iterable[str | dict[str, Any]] + | Dataset + | DatasetDict + | IterableDataset + | IterableDatasetDict + ) + data_args: dict[str, Any] | None = None + data_sampler: Literal["random"] | None = None + rate_type: StrategyType | ProfileType + rate: Annotated[list[PositiveFloat] | None, BeforeValidator(parse_float_list)] = ( + None + ) + max_seconds: PositiveFloat | None = None + max_requests: PositiveInt | None = None + warmup_percent: Annotated[float | None, Field(gt=0, le=1)] = None + cooldown_percent: Annotated[float | None, Field(gt=0, le=1)] = None + output_sampling: NonNegativeInt | None = None random_seed: int = 42
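
Note on the new progress accounting: _GenerativeProgressTaskState.completed and .total map the scheduler's remaining_fraction onto a fixed number of progress units (_PROGRESS_SCALE, defined outside this hunk; the removed display code hard-coded 1000 for the same purpose). A minimal standalone sketch of that mapping, assuming a scale of 1000:

    from __future__ import annotations

    # Sketch only: mirrors the completed/total math in _GenerativeProgressTaskState.
    # _PROGRESS_SCALE is assumed to be 1000, the value the removed code hard-coded.
    _PROGRESS_SCALE = 1000

    def completed_units(remaining_fraction: float | None, status: str) -> float:
        # update() stores progress = 1 - remaining_fraction (0.0 when unknown);
        # completed pins "pending" to 0 and "completed" to the full scale.
        if status == "pending":
            return 0.0
        if status == "completed":
            return float(_PROGRESS_SCALE)
        progress = 1.0 - remaining_fraction if remaining_fraction is not None else 0.0
        return progress * _PROGRESS_SCALE

    # e.g. 25% of the run remaining -> 750 of 1000 units on the task's progress bar
    assert completed_units(0.25, "in_progress") == 750.0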
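
Note on the scenario changes: rate stays flexible at the config boundary because BeforeValidator(parse_float_list) runs before Pydantic validates the field, so a comma-separated string, a bare number, or a list all normalize to list[float]. A hedged usage sketch, assuming parse_float_list accepts comma-separated strings as its docstring describes; the target URL, data path, and rate_type literal below are placeholders, not values taken from this diff:

    from guidellm.benchmark.scenario import GenerativeTextScenario

    # Placeholder values for illustration; only the rate handling is the point here.
    scenario = GenerativeTextScenario(
        target="http://localhost:8000",   # placeholder endpoint
        data="prompts.jsonl",             # placeholder dataset path
        rate_type="constant",             # assumed StrategyType literal; swap in any valid value
        rate="0.5,1,2",                   # normalized by parse_float_list
    )
    assert scenario.rate == [0.5, 1.0, 2.0]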