diff --git a/.gitignore b/.gitignore index e0a29972..80e7a1c0 100644 --- a/.gitignore +++ b/.gitignore @@ -179,6 +179,8 @@ cython_debug/ # Project specific files *.json *.yaml +/bin +uv.lock # But not scenarios !src/guidellm/benchmark/scenarios/*.json diff --git a/README.md b/README.md index 9312c55f..040c6aa4 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted. +- `--max-error`: The maximum error rate after which a benchmark will stop. Can be either a rate (i.e. 0 < rate < 1) or a constant number. If a rate is given, rate_type is 'constant', and 'max_seconds' is set, then the threshold is calculated against the total expected request count, i.e. rate * duration. If a rate is given and the number of requests is not pre-determined, then a context window of the most recent requests is checked. The context window size is configurable under GUIDELLM\_\_ERROR_CHECK_WINDOW_SIZE. If a number above 1 is given, then the total number of errors is counted and checked against the threshold. + - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results. - `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results. diff --git a/pyproject.toml b/pyproject.toml index 0b1014cb..abc40eb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,7 @@ dev = [ "pytest-cov~=5.0.0", "pytest-mock~=3.14.0", "pytest-rerunfailures~=14.0", + "pytest-timeout~=2.4.0", "respx~=0.22.0", # code quality diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index ac0872c3..57023936 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -178,12 +178,28 @@ def benchmark(): "If None, will run until max_seconds or the data is exhausted." ), ) +@click.option( + "--max-error", + type=float, + help=( + "The maximum error after which a benchmark will stop. " + "Can either be a rate (i.e. 0 < rate < 1) or a constant number. " + "If a rate is given and rate_type is 'constant' and 'max_seconds' exists, " + "then the threshold will be calculated as part of the total expected " + "request count, i.e. rate * duration. If a rate is given and the number " + "of requests is not pre-determined, then a context window " + "of the last requests will be looked at. The context window size " + "is configurable under GUIDELLM__ERROR_CHECK_WINDOW_SIZE. " + "If a number above 1 is given, then we just count the total " + "number of errors and check whether it's above the threshold." + ), +) @click.option( "--warmup-percent", type=float, default=GenerativeTextScenario.get_default("warmup_percent"), help=( - "The percent of the benchmark (based on max-seconds, max-requets, " + "The percent of the benchmark (based on max-seconds, max-requests, " "or lenth of dataset) to run as a warmup and not include in the final results. " "Defaults to None." ), @@ -193,7 +209,7 @@ def benchmark(): type=float, default=GenerativeTextScenario.get_default("cooldown_percent"), help=( - "The percent of the benchmark (based on max-seconds, max-requets, or lenth " + "The percent of the benchmark (based on max-seconds, max-requests, or length " "of dataset) to run as a cooldown and not include in the final results. " "Defaults to None."
), @@ -259,6 +275,7 @@ def run( rate, max_seconds, max_requests, + max_error, warmup_percent, cooldown_percent, disable_progress, @@ -286,6 +303,7 @@ def run( rate=rate, max_seconds=max_seconds, max_requests=max_requests, + max_error=max_error, warmup_percent=warmup_percent, cooldown_percent=cooldown_percent, output_sampling=output_sampling, diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index 6e24dfc5..04edfd46 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -103,7 +103,7 @@ def __init__( raise ValueError("Target URL must be provided for OpenAI HTTP backend.") if self._target.endswith("/v1") or self._target.endswith("/v1/"): - # backwards compatability, strip v1 off + # backwards compatibility, strip v1 off self._target = self._target[:-3] if self._target.endswith("/"): diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index af7f1a13..aca727f0 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -1,19 +1,13 @@ import time from abc import ABC, abstractmethod from pathlib import Path -from typing import ( - Any, - Generic, - Literal, - Optional, - TypeVar, - Union, -) +from typing import Any, Generic, Literal, Optional, TypeVar, Union, get_args from pydantic import Field from guidellm.backend import ResponseSummary from guidellm.benchmark.benchmark import ( + REASON_STATUS_MAPPING, BenchmarkArgs, BenchmarkRunStats, BenchmarkT, @@ -40,6 +34,7 @@ SchedulerRequestResult, WorkerDescription, ) +from guidellm.scheduler.result import TerminationReason from guidellm.utils import check_load_processor __all__ = [ @@ -305,6 +300,24 @@ class BenchmarkAggregator( total=None, ), ) + current_window: int = Field( + description=( + "The current accumulated window size for error checking. 
" + "This is a number between 0 and the value of " + "GUIDELLM__ERROR_CHECK_WINDOW_SIZE" + ), + default=0, + ) + errors_in_window: int = Field( + description=("The amount of errored requests in the current window."), + default=0, + ) + termination_reason: TerminationReason = Field( + description=( + f"The benchmark termination reason, one of: {get_args(TerminationReason)}" + ), + default="interrupted", + ) def add_result( self, @@ -600,6 +613,8 @@ def compile(self) -> GenerativeBenchmark: """ successful, incomplete, errored = self._compile_results() + error_rate, window_error_rate = self._calculate_error_rate() + return GenerativeBenchmark.from_stats( run_id=self.run_id, successful=successful, @@ -625,12 +640,28 @@ def compile(self) -> GenerativeBenchmark: request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean, request_time_delay_avg=self.requests_stats.request_time_delay.mean, request_time_avg=self.requests_stats.request_time.mean, + error_rate=error_rate, + window_error_rate=window_error_rate, + status=REASON_STATUS_MAPPING[self.termination_reason], + termination_reason=self.termination_reason, ), worker=self.worker_description, requests_loader=self.request_loader_description, extras=self.extras, ) + def _calculate_error_rate(self) -> tuple[float, float]: + total_successful = self.requests_stats.totals.successful.total + total_errored = self.requests_stats.totals.errored.total + total_finished = total_errored + total_successful + error_rate = 0.0 if total_finished == 0 else (total_errored / total_finished) + window_error_rate = ( + 0.0 + if self.current_window == 0 + else self.errors_in_window / self.current_window + ) + return error_rate, window_error_rate + def _compile_results( self, ) -> tuple[ diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 1e2a5f4b..8574a97e 100644 --- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -1,6 +1,6 @@ import random import uuid -from typing import Any, Literal, Optional, TypeVar, Union +from typing import Any, Literal, Optional, TypeVar, Union, get_args from pydantic import Field, computed_field @@ -32,6 +32,7 @@ ThroughputStrategy, WorkerDescription, ) +from guidellm.scheduler.result import TerminationReason __all__ = [ "Benchmark", @@ -46,6 +47,14 @@ "StatusBreakdown", ] +BenchmarkStatus = Literal["success", "error", "interrupted"] +REASON_STATUS_MAPPING: dict[TerminationReason, BenchmarkStatus] = { + "interrupted": "interrupted", + "max_error_reached": "error", + "max_seconds_reached": "success", + "max_requests_reached": "success", +} + class BenchmarkArgs(StandardBaseModel): """ @@ -90,6 +99,9 @@ class BenchmarkArgs(StandardBaseModel): max_duration: Optional[float] = Field( description="The maximum duration in seconds to run this benchmark, if any." ) + max_error: Optional[float] = Field( + description="Maximum error rate or const after which a benchmark will stop." + ) warmup_number: Optional[int] = Field( description=( "The number of requests to run for the warmup phase of this benchmark, " @@ -213,6 +225,34 @@ class BenchmarkRunStats(StandardBaseModel): "it was completed." ) ) + error_rate: float = Field( + description=( + "The number of total errored requests divided by the number " + "of total successful and errored requests at the end of benchmark. 
" + ) + ) + window_error_rate: float = Field( + description=( + "The number of errored requests within the error checking window" + "divided by the window size at the end of benchmark. " + "If the window_error_rate is above the max_error " + "the termination_reason should be 'max_error_reached'. " + "You may configure the error checking window size by setting " + "the environment variable GUIDELLM__ERROR_CHECK_WINDOW_SIZE." + ) + ) + status: BenchmarkStatus = Field( + description=( + f"The status of the benchmark output, " + f"one of the following options: {get_args(BenchmarkStatus)}." + ) + ) + termination_reason: TerminationReason = Field( + description=( + "The reason for the benchmark termination, " + f"one of the following options: {get_args(TerminationReason)}." + ) + ) class BenchmarkMetrics(StandardBaseModel): diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index 11b6d245..3ab25586 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -74,6 +74,12 @@ class BenchmarkerStrategyLimits(StandardBaseModel): description="Maximum duration (in seconds) to process requests per strategy.", ge=0, ) + max_error_per_strategy: Optional[float] = Field( + description="Maximum error after which a " + "benchmark will stop," + " either rate or fixed number", + ge=0, + ) warmup_percent_per_strategy: Optional[float] = Field( description="Percentage of requests to use for warmup.", ge=0, @@ -99,6 +105,10 @@ def max_number(self) -> Optional[int]: def max_duration(self) -> Optional[float]: return self.max_duration_per_strategy + @property + def max_error(self) -> Optional[float]: + return self.max_error_per_strategy + @property def warmup_number(self) -> Optional[int]: if self.warmup_percent_per_strategy is None or self.max_number is None: @@ -148,6 +158,7 @@ async def run( profile: Profile, max_number_per_strategy: Optional[int], max_duration_per_strategy: Optional[float], + max_error_per_strategy: Optional[float], warmup_percent_per_strategy: Optional[float], cooldown_percent_per_strategy: Optional[float], ) -> AsyncGenerator[ @@ -162,6 +173,7 @@ async def run( requests_loader_size=requests_loader_size, max_number_per_strategy=max_number_per_strategy, max_duration_per_strategy=max_duration_per_strategy, + max_error_per_strategy=max_error_per_strategy, warmup_percent_per_strategy=warmup_percent_per_strategy, cooldown_percent_per_strategy=cooldown_percent_per_strategy, ) @@ -196,6 +208,7 @@ async def run( scheduling_strategy=scheduling_strategy, max_number=max_number_per_strategy, max_duration=max_duration_per_strategy, + max_error=max_error_per_strategy, ): if result.type_ == "run_start": yield BenchmarkerResult( @@ -210,6 +223,9 @@ async def run( current_result=None, ) elif result.type_ == "run_complete": + aggregator.termination_reason = result.run_info.termination_reason + aggregator.current_window = result.run_info.current_window + aggregator.errors_in_window = result.run_info.errors_in_window yield BenchmarkerResult( type_="scheduler_complete", start_time=start_time, @@ -321,6 +337,7 @@ def create_benchmark_aggregator( strategy=strategy, max_number=limits.max_number, max_duration=limits.max_duration, + max_error=limits.max_error, warmup_number=limits.warmup_number, warmup_duration=limits.warmup_duration, cooldown_number=limits.cooldown_number, diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 2ef85c3e..c7cc8e3d 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ 
b/src/guidellm/benchmark/entrypoints.py @@ -53,6 +53,7 @@ async def benchmark_generative_text( rate: Optional[Union[float, list[float]]], max_seconds: Optional[float], max_requests: Optional[int], + max_error: Optional[float], warmup_percent: Optional[float], cooldown_percent: Optional[float], output_path: Optional[Union[str, Path]], @@ -119,6 +120,7 @@ async def benchmark_generative_text( profile=profile, max_number_per_strategy=max_requests, max_duration_per_strategy=max_seconds, + max_error_per_strategy=max_error, warmup_percent_per_strategy=warmup_percent, cooldown_percent_per_strategy=cooldown_percent, ): diff --git a/src/guidellm/benchmark/output.py b/src/guidellm/benchmark/output.py index 8a113f72..29ab53fb 100644 --- a/src/guidellm/benchmark/output.py +++ b/src/guidellm/benchmark/output.py @@ -452,6 +452,7 @@ def benchmarks_args_str(self) -> str: { "max_number": args.max_number, "max_duration": args.max_duration, + "max_error": args.max_error, "warmup_number": args.warmup_number, "warmup_duration": args.warmup_duration, "cooldown_number": args.cooldown_number, diff --git a/src/guidellm/benchmark/scenario.py b/src/guidellm/benchmark/scenario.py index af43e426..e84250b8 100644 --- a/src/guidellm/benchmark/scenario.py +++ b/src/guidellm/benchmark/scenario.py @@ -98,6 +98,7 @@ class Config: ] = None max_seconds: Optional[PositiveFloat] = None max_requests: Optional[PositiveInt] = None + max_error: Optional[PositiveFloat] = None warmup_percent: Annotated[Optional[float], Field(gt=0, le=1)] = None cooldown_percent: Annotated[Optional[float], Field(gt=0, le=1)] = None output_sampling: Optional[NonNegativeInt] = None diff --git a/src/guidellm/config.py b/src/guidellm/config.py index 3b426bd8..6c4c84f7 100644 --- a/src/guidellm/config.py +++ b/src/guidellm/config.py @@ -121,6 +121,8 @@ class Settings(BaseSettings): default_async_loop_sleep: float = 10e-5 logging: LoggingSettings = LoggingSettings() default_sweep_number: int = 10 + shutdown_poll_interval_seconds: float = 1 + error_check_window_size: int = 30 # HTTP settings request_follow_redirects: bool = True diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 0f12687f..14d6587a 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -16,6 +16,11 @@ ] +TerminationReason = Literal[ + "interrupted", "max_error_reached", "max_seconds_reached", "max_requests_reached" +] + + class SchedulerRunInfo(StandardBaseModel): """ Information about the current run of the scheduler. @@ -46,12 +51,21 @@ class SchedulerRunInfo(StandardBaseModel): end_number: float processes: int strategy: SchedulingStrategy + max_error: Optional[float] = None + current_window: int = 0 + errors_in_window: int = 0 created_requests: int = 0 queued_requests: int = 0 scheduled_requests: int = 0 processing_requests: int = 0 completed_requests: int = 0 + errored_requests: int = 0 + + # The default is "interrupted" to be fail-safe: if + # the `termination_reason` logic is not reached for + # any reason, we assume the run was interrupted.
+ termination_reason: TerminationReason = "interrupted" class SchedulerRequestInfo(StandardBaseModel): diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 06203827..4963e151 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -5,6 +5,7 @@ import time from collections.abc import AsyncGenerator, Iterable, Iterator from concurrent.futures import ProcessPoolExecutor +from multiprocessing.synchronize import Event as MultiprocessingEvent from typing import ( Any, Generic, @@ -64,12 +65,14 @@ def __init__( self.worker = worker self.request_loader = request_loader + self.error_rate: Optional[float] = None async def run( self, scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, + max_error: Optional[float] = None, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -98,20 +101,17 @@ async def run( :param max_duration: The maximum duration for the scheduling run. If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. + :param max_error: The maximum error rate or const + after which the scheduler shuts down. + Only applicable in benchmarks with finite deterministic number of requests. + If None or not applicable then scheduler will continue regardless of errors. :return: An asynchronous generator that yields SchedulerResult objects. Each SchedulerResult object contains information about the request, the response, and the run information. """ - if scheduling_strategy is None or not isinstance( - scheduling_strategy, SchedulingStrategy - ): - raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") - - if max_number is not None and max_number < 1: - raise ValueError(f"Invalid max_number: {max_number}") - - if max_duration is not None and max_duration < 0: - raise ValueError(f"Invalid max_duration: {max_duration}") + self._validate_scheduler_params( + scheduling_strategy, max_number, max_duration, max_error + ) with ( multiprocessing.Manager() as manager, @@ -120,11 +120,17 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue = await self._start_processes( - manager, executor, scheduling_strategy - ) + ( + futures, + requests_queue, + responses_queue, + shutdown_event, + ) = await self._start_processes(manager, executor, scheduling_strategy) + if shutdown_event.is_set(): + raise RuntimeError("shutdown_event is set before starting scheduling") + run_info, requests_iter, times_iter = self._run_setup( - futures, scheduling_strategy, max_number, max_duration + futures, scheduling_strategy, max_number, max_duration, max_error ) yield SchedulerResult( type_="run_start", @@ -132,20 +138,15 @@ async def run( ) try: - while True: + while not self._should_stop( + run_info=run_info, + requests_iter=requests_iter, + ): # check errors and raise them for future in futures: if future.done() and (err := future.exception()) is not None: raise err - if ( - requests_iter is None - and run_info.completed_requests >= run_info.created_requests - ): - # we've exhausted all requests we've wanted to run - # and yielded all responses - break - requests_iter = self._add_requests( requests_iter, times_iter, @@ -166,12 +167,89 @@ async def run( except Exception as err: raise RuntimeError(f"Scheduler run failed: {err}") from err + shutdown_event.set() yield SchedulerResult( type_="run_complete", 
run_info=run_info, ) - await self._stop_processes(futures, requests_queue) + logger.debug("Waiting for futures to shut down") + await asyncio.gather(*futures) + + def _should_stop( + self, run_info: SchedulerRunInfo, requests_iter: Optional[Iterator[Any]] + ) -> bool: + # we've exhausted all requests we've wanted to run + # and yielded all responses + is_complete = ( + requests_iter is None + and run_info.completed_requests >= run_info.created_requests + ) + + max_error_reached = self._handle_max_error(run_info) + + return is_complete or max_error_reached + + def _handle_max_error( + self, + run_info: SchedulerRunInfo, + ) -> bool: + max_error_reached = self._is_max_error_reached(run_info) + + if max_error_reached: + run_info.termination_reason = "max_error_reached" + logger.info( + f"Max error rate of " + f"({run_info.max_error}) " + f"reached, sending shutdown signal" + ) + return max_error_reached + + def _validate_scheduler_params( + self, + scheduling_strategy: SchedulingStrategy, + max_number: Optional[int], + max_duration: Optional[float], + max_error: Optional[float], + ) -> None: + if scheduling_strategy is None or not isinstance( + scheduling_strategy, SchedulingStrategy + ): + raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") + if max_number is not None and max_number < 1: + raise ValueError(f"Invalid max_number: {max_number}") + if max_duration is not None and max_duration < 0: + raise ValueError(f"Invalid max_duration: {max_duration}") + if max_error is not None and (max_error < 0): + raise ValueError(f"Invalid max_error: {max_error}") + + def _is_max_error_reached(self, run_info: SchedulerRunInfo) -> bool: + if settings.error_check_window_size > run_info.current_window: + return False + max_error = run_info.max_error + if max_error is None: + max_error = math.inf + + if max_error >= 1: + # Absolute error count, i.e. not a ratio + logger.debug( + f"Window error count: " + f"{run_info.errors_in_window} / {max_error} (max error)" + ) + max_error_reached = max_error < run_info.errors_in_window + else: + window_error_ratio = run_info.errors_in_window / run_info.current_window + logger.debug( + f"Window error rate: {window_error_ratio} " + f"i.e. errors_in_window / current_window" + ) + max_error_reached = max_error < window_error_ratio + + if not max_error_reached: + run_info.current_window = 0 + run_info.errors_in_window = 0 + + return max_error_reached async def _start_processes( self, @@ -182,8 +260,10 @@ list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, + MultiprocessingEvent, ]: await self.worker.prepare_multiprocessing() + shutdown_event = manager.Event() requests_queue = manager.Queue( maxsize=scheduling_strategy.queued_requests_limit ) @@ -212,25 +292,18 @@ futures = [] loop = asyncio.get_event_loop() for id_, requests_limit in zip(process_ids, process_requests_limits): - if scheduling_strategy.processing_mode == "sync": + if scheduling_strategy.processing_mode in ["sync", "async"]: futures.append( loop.run_in_executor( executor, - self.worker.process_loop_synchronous, + self.worker.run_process, + scheduling_strategy.processing_mode, requests_queue, responses_queue, + shutdown_event, + settings.shutdown_poll_interval_seconds, id_, - ) - ) - elif scheduling_strategy.processing_mode == "async": - futures.append( - loop.run_in_executor( - executor, - self.worker.process_loop_asynchronous, - requests_queue, - responses_queue, requests_limit, - id_, ) ) else: @@ -241,7 +314,7 @@ async def
_start_processes( await asyncio.sleep(0.1) # give time for processes to start - return futures, requests_queue, responses_queue + return futures, requests_queue, responses_queue, shutdown_event def _run_setup( self, @@ -249,20 +322,13 @@ scheduling_strategy: SchedulingStrategy, max_number: Optional[int], max_duration: Optional[float], + max_error: Optional[float], ) -> tuple[SchedulerRunInfo, Iterator[Any], Iterator[float]]: requests_iter = iter(self.request_loader) start_time = time.time() times_iter = iter(scheduling_strategy.request_times()) end_time = time.time() + (max_duration or math.inf) - end_number = max_number or math.inf - - try: - # update end number if the request loader is finite and less than max - iter_length = len(self.request_loader) # type: ignore[arg-type] - if 0 < iter_length < end_number: - end_number = iter_length - except Exception: # noqa: BLE001, S110 - pass + end_number = self._determine_total_requests_count(max_number) if end_number == math.inf and end_time is None: logger.warning( @@ -276,10 +342,25 @@ end_number=end_number, processes=len(processes), strategy=scheduling_strategy, + max_error=max_error, ) return info, requests_iter, times_iter + def _determine_total_requests_count( + self, + max_number: Optional[int], + ) -> Union[int, float]: + end_number = max_number or math.inf + try: + # update end_number if the request_loader is finite and less than max_number + iter_length = len(self.request_loader) # type: ignore[arg-type] + if 0 < iter_length < end_number: + end_number = iter_length + except Exception: # noqa: BLE001, S110 + pass + return end_number + def _add_requests( self, requests_iter: Optional[Iterator[Any]], @@ -296,11 +377,20 @@ and added_count < settings.max_add_requests_per_loop ): if run_info.created_requests >= run_info.end_number: + # When `--max-seconds` is set and the dataset is finite, the + # benchmark will almost always end slightly before the + # actual `end_time`. + # Since the units are seconds, rounding up by 1s is reasonable.
+ if time.time() >= run_info.end_time - 1: + run_info.termination_reason = "max_seconds_reached" + else: + run_info.termination_reason = "max_requests_reached" raise StopIteration if ( request_time := next(times_iter) ) >= run_info.end_time or time.time() >= run_info.end_time: + run_info.termination_reason = "max_seconds_reached" raise StopIteration request = next(requests_iter) @@ -361,6 +451,11 @@ def _check_result_ready( if process_response.type_ == "request_complete": run_info.processing_requests -= 1 run_info.completed_requests += 1 + run_info.current_window += 1 + + if process_response.info.errored: + run_info.errored_requests += 1 + run_info.errors_in_window += 1 return SchedulerRequestResult( type_="request_complete", @@ -370,13 +465,3 @@ def _check_result_ready( response=process_response.response, ) raise ValueError(f"Invalid process response type: {process_response}") - - async def _stop_processes( - self, - futures: list[asyncio.Future], - requests_queue: multiprocessing.Queue, - ): - for _ in futures: - requests_queue.put(None) - - await asyncio.gather(*futures) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index a53b14c2..c71fbc81 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -1,11 +1,14 @@ import asyncio import math -import multiprocessing import multiprocessing.queues +import queue +import threading import time +import typing from abc import ABC, abstractmethod from collections.abc import AsyncGenerator from dataclasses import dataclass +from multiprocessing.synchronize import Event as MultiprocessingEvent from typing import ( Any, Generic, @@ -40,6 +43,10 @@ ] +class ShutdownSignalReceivedError(Exception): + pass + + @dataclass class WorkerProcessRequest(Generic[RequestT]): request: RequestT @@ -121,9 +128,32 @@ async def resolve( ... 
async def get_request( - self, requests_queue: multiprocessing.Queue - ) -> Optional[WorkerProcessRequest[RequestT]]: - return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] + self, + requests_queue: multiprocessing.Queue, + process_id: int, + shutdown_poll_interval_seconds: float, + ) -> WorkerProcessRequest[RequestT]: + shutdown_event = threading.Event() + + # We need to check shutdown_event intermittently because + # if we simply use asyncio.to_thread(requests_queue.get), + # task cancellation doesn't propagate, since the + # asyncio.to_thread call is blocking + def _get_queue_intermittently(): + while not shutdown_event.is_set(): + try: + request = requests_queue.get(timeout=shutdown_poll_interval_seconds) + logger.debug(f"Got request in future {process_id}") + return request + except queue.Empty: + logger.trace(f"Queue was empty in future {process_id}") + logger.info(f"Shutdown signal received in future {process_id}") + return None + + try: + return await asyncio.to_thread(_get_queue_intermittently) + finally: + shutdown_event.set() async def send_result( self, @@ -149,25 +179,33 @@ async def resolve_scheduler_request( scheduled_time=time.time(), process_id=process_id, ) - result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult( - type_="request_scheduled", - request=request, - response=None, - info=info, + asyncio.create_task( + self.send_result( + results_queue, + WorkerProcessResult( + type_="request_scheduled", + request=request, + response=None, + info=info, + ), + ) ) - asyncio.create_task(self.send_result(results_queue, result)) if (wait_time := start_time - time.time()) > 0: await asyncio.sleep(wait_time) info.worker_start = time.time() - result = WorkerProcessResult( - type_="request_start", - request=request, - response=None, - info=info, + asyncio.create_task( + self.send_result( + results_queue, + WorkerProcessResult( + type_="request_start", + request=request, + response=None, + info=info, + ), + ) ) - asyncio.create_task(self.send_result(results_queue, result)) status, response = await self.resolve(request, timeout_time) info.worker_end = time.time() @@ -185,27 +223,75 @@ ) asyncio.create_task(self.send_result(results_queue, result)) - def process_loop_synchronous( + def run_process( self, + type_: Literal["sync", "async"], requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, + shutdown_event: MultiprocessingEvent, + shutdown_poll_interval_seconds: float, process_id: int, + max_concurrency: int, ): async def _process_runner(): - while ( - process_request := await self.get_request(requests_queue) - ) is not None: - dequeued_time = time.time() + if type_ == "sync": + loop_task = asyncio.create_task( + self._process_synchronous_requests_loop( + requests_queue=requests_queue, + results_queue=results_queue, + process_id=process_id, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, + ), + name="request_loop_processor_task", + ) + elif type_ == "async": + loop_task = asyncio.create_task( + self._process_asynchronous_requests_loop( + requests_queue=requests_queue, + results_queue=results_queue, + max_concurrency=max_concurrency, + process_id=process_id, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, + ), + name="request_loop_processor_task", + ) + else: + raise ValueError(f"Invalid process type: {type_}") - await self.resolve_scheduler_request( - request=process_request.request, - queued_time=process_request.queued_time, -
dequeued_time=dequeued_time, - start_time=process_request.start_time, - timeout_time=process_request.timeout_time, - results_queue=results_queue, + shutdown_task = asyncio.create_task( + self._wait_for_shutdown( + shutdown_event=shutdown_event, + shutdown_poll_interval=shutdown_poll_interval_seconds, process_id=process_id, + ), + name="shutdown_task", + ) + + done, pending = await asyncio.wait( + [ + loop_task, + shutdown_task, + ], + return_when=asyncio.FIRST_EXCEPTION, + ) + logger.info( + f"First task finished, done: {[r.get_name() for r in done]}" + ) + + for task in pending: + logger.debug( + f"Cancelling task {task.get_name()} || Process {process_id}" ) + task.cancel() + try: # noqa: SIM105 + await task + except asyncio.CancelledError: + pass + + for task in done: + task_exception = typing.cast("Exception", task.exception()) + if not isinstance(task_exception, ShutdownSignalReceivedError): + raise task_exception try: asyncio.run(_process_runner()) except Exception as exc: # noqa: BLE001 logger.error( f"Error in worker process {process_id}: {exc}", exc_info=True, stack_info=True, ) + finally: + shutdown_event.set() # ensure shutdown event is set to stop other processes + + async def _wait_for_shutdown( + self, + shutdown_event: MultiprocessingEvent, + shutdown_poll_interval: float, + process_id: int, + ): + while not shutdown_event.is_set(): # noqa: ASYNC110 + await asyncio.sleep(shutdown_poll_interval) + + raise ShutdownSignalReceivedError( + f"Shutdown event set for process {process_id}, cancelling process loop." + ) + + async def _process_synchronous_requests_loop( + self, + requests_queue: multiprocessing.Queue, + results_queue: multiprocessing.Queue, + process_id: int, + shutdown_poll_interval_seconds: float, + ): + while True: + process_request = await self.get_request( + requests_queue=requests_queue, + process_id=process_id, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, + ) - def process_loop_asynchronous( + dequeued_time = time.time() + + await self.resolve_scheduler_request( + request=process_request.request, + queued_time=process_request.queued_time, + dequeued_time=dequeued_time, + start_time=process_request.start_time, + timeout_time=process_request.timeout_time, + results_queue=results_queue, + process_id=process_id, + ) + + async def _process_asynchronous_requests_loop( self, requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_poll_interval_seconds: float, ): - async def _process_runner(): - pending = asyncio.Semaphore(max_concurrency) + pending = asyncio.Semaphore(max_concurrency) - if pending.locked(): - raise ValueError("Async worker called with max_concurrency < 1") + if pending.locked(): + raise ValueError("Async worker called with max_concurrency < 1") - while ( - process_request := await self.get_request(requests_queue) - ) is not None: - dequeued_time = time.time() + while True: + process_request = await self.get_request( + requests_queue=requests_queue, + process_id=process_id, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, + ) - await pending.acquire() + dequeued_time = time.time() + logger.debug( + f"Dequeued Process ID {process_id} || " + f"Timestamp {dequeued_time} || " + f"Semaphore {pending._value}/{max_concurrency}" # noqa: SLF001 + ) - def _task_done(_: asyncio.Task): - nonlocal pending - pending.release() + await pending.acquire() + lock_acquired_at = time.time() + logger.debug( + f"Lock acquired Process ID {process_id} ||" + f" Timestamp {lock_acquired_at} ||" + f" Semaphore
{pending._value}/{max_concurrency}" # noqa: SLF001 + ) - task = asyncio.create_task( - self.resolve_scheduler_request( - request=process_request.request, - queued_time=process_request.queued_time, - dequeued_time=dequeued_time, - start_time=process_request.start_time, - timeout_time=process_request.timeout_time, - results_queue=results_queue, - process_id=process_id, - ) - ) - task.add_done_callback(_task_done) - await asyncio.sleep(0) # enable start task immediately + def _task_done(_: asyncio.Task): + nonlocal pending + pending.release() - try: - asyncio.run(_process_runner()) - except Exception as exc: # noqa: BLE001 - logger.error( - f"Error in worker process {process_id}: {exc}", - exc_info=True, - stack_info=True, + task = asyncio.create_task( + self.resolve_scheduler_request( + request=process_request.request, + queued_time=process_request.queued_time, + dequeued_time=dequeued_time, + start_time=process_request.start_time, + timeout_time=process_request.timeout_time, + results_queue=results_queue, + process_id=process_id, + ) ) + task.add_done_callback(_task_done) + await asyncio.sleep(0) # enable start task immediately class GenerativeRequestsWorkerDescription(WorkerDescription): @@ -309,32 +442,25 @@ async def prepare_multiprocessing(self): """ await self.backend.prepare_multiprocessing() - def process_loop_synchronous( + def run_process( self, + type_: Literal["sync", "async"], requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, + shutdown_event: MultiprocessingEvent, + shutdown_poll_interval_seconds: float, process_id: int, - ): - asyncio.run(self.backend.validate()) - super().process_loop_synchronous( - requests_queue=requests_queue, - results_queue=results_queue, - process_id=process_id, - ) - - def process_loop_asynchronous( - self, - requests_queue: multiprocessing.Queue, - results_queue: multiprocessing.Queue, max_concurrency: int, - process_id: int, ): asyncio.run(self.backend.validate()) - super().process_loop_asynchronous( + super().run_process( + type_=type_, requests_queue=requests_queue, results_queue=results_queue, - max_concurrency=max_concurrency, + shutdown_event=shutdown_event, + shutdown_poll_interval_seconds=shutdown_poll_interval_seconds, process_id=process_id, + max_concurrency=max_concurrency, ) async def resolve( @@ -375,7 +501,7 @@ async def resolve( request_func, request_kwargs = self._create_request_func_kwargs(request) async def _runner(): - # wrap function so we can enforce timeout and + # wrap function so that we can enforce timeout and # still return the latest state from the backend async for resp in request_func(**request_kwargs): # type: ignore[operator] nonlocal response diff --git a/tests/e2e/README.md b/tests/e2e/README.md new file mode 100644 index 00000000..cdb0e741 --- /dev/null +++ b/tests/e2e/README.md @@ -0,0 +1,7 @@ +# E2E tests + +The E2E tests in GuideLLM use the [vLLM simulator by llm-d](https://llm-d.ai/docs/architecture/Components/inf-simulator), to run them run the following command: + +```shell +docker build . 
-f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./ ``` diff --git a/tests/e2e/test_max_error_benchmark.py b/tests/e2e/test_max_error_benchmark.py new file mode 100644 index 00000000..7c377120 --- /dev/null +++ b/tests/e2e/test_max_error_benchmark.py @@ -0,0 +1,92 @@ +# test_max_error_benchmark.py + +import json +import subprocess +import time +from pathlib import Path + +import pytest +from loguru import logger + +from tests.e2e.vllm_sim_server import VllmSimServer + + +@pytest.fixture(scope="module") +def server(): + """ + Pytest fixture to start and stop the server for the entire module + using the VllmSimServer class. + """ + server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b", mode="echo") + try: + server.start() + yield server # Yield the server for tests to use + finally: + server.stop() # Teardown: Stop the server after tests are done + + +@pytest.mark.timeout(30) +def test_max_error_benchmark(server: VllmSimServer): + """ + Verify the benchmark stops early with 'max_error_reached' when the server goes down mid-run. + """ + report_path = Path("tests/e2e/max_error_benchmarks.json") + rate = 10 + max_error_rate = 0.1 + command = f"""guidellm benchmark \ + --target "{server.get_url()}" \ + --rate-type constant \ + --rate {rate} \ + --max-seconds 60 \ + --max-error {max_error_rate} \ + --data "prompt_tokens=256,output_tokens=128" \ + --output-path {report_path} + """ + logger.info(f"Client command: {command}") + process = subprocess.Popen( # noqa: S603 + ["/bin/bash", "-c", command], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + logger.info("Waiting for client to start...") + time.sleep(10) + server.stop() + + try: + logger.info("Fetching client output") + stdout, stderr = process.communicate() + logger.debug(f"Client stdout:\n{stdout}") + logger.debug(f"Client stderr:\n{stderr}") + + assert report_path.exists() + with report_path.open("r") as f: + report = json.load(f) + + assert "benchmarks" in report + benchmarks = report["benchmarks"] + assert len(benchmarks) > 0 + benchmark = benchmarks[0] + assert "run_stats" in benchmark + run_stats = benchmark["run_stats"] + assert "status" in run_stats + status = run_stats["status"] + assert status == "error" + assert "termination_reason" in run_stats + termination_reason = run_stats["termination_reason"] + assert termination_reason == "max_error_reached" + assert "window_error_rate" in run_stats + window_error_rate = run_stats["window_error_rate"] + assert window_error_rate > max_error_rate + finally: + process.terminate() # Send SIGTERM + try: + process.wait(timeout=5) # Wait for the process to terminate + logger.info("Client stopped successfully.") + except subprocess.TimeoutExpired: + logger.warning("Client did not terminate gracefully, killing it...") + process.kill() # Send SIGKILL if it doesn't terminate + process.wait() + + if report_path.exists(): + report_path.unlink() diff --git a/tests/e2e/test_placeholder.py b/tests/e2e/test_placeholder.py deleted file mode 100644 index 0d35031c..00000000 --- a/tests/e2e/test_placeholder.py +++ /dev/null @@ -1,6 +0,0 @@ -import pytest - - -@pytest.mark.smoke -def test_placeholder(): - assert True diff --git a/tests/e2e/test_successful_benchmark.py b/tests/e2e/test_successful_benchmark.py new file mode 100644 index 00000000..c8521030 --- /dev/null +++ b/tests/e2e/test_successful_benchmark.py @@ -0,0 +1,118 @@ +# test_successful_benchmark.py + +import json +import os +from pathlib import Path + +import pytest +from loguru import logger + +from tests.e2e.vllm_sim_server import VllmSimServer + +
+@pytest.fixture(scope="module") +def server(): + """ + Pytest fixture to start and stop the server for the entire module + using the TestServer class. + """ + server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b", mode="echo") + try: + server.start() + yield server # Yield the URL for tests to use + finally: + server.stop() # Teardown: Stop the server after tests are done + + +@pytest.mark.timeout(30) +def test_max_seconds_benchmark(server: VllmSimServer): + """ + Another example test interacting with the server. + """ + report_path = Path("tests/e2e/max_duration_benchmarks.json") + rate = 10 + command = f""" +guidellm benchmark \ + --target "{server.get_url()}" \ + --rate-type constant \ + --rate {rate} \ + --max-seconds 1 \ + --data "prompt_tokens=256,output_tokens=128" \ + --output-path {report_path} + """ + + logger.info(f"Client command: {command}") + os.system(command) # noqa: S605 + + assert report_path.exists() + with report_path.open("r") as f: + report = json.load(f) + + assert "benchmarks" in report + benchmarks = report["benchmarks"] + assert len(benchmarks) > 0 + benchmark = benchmarks[0] + assert "requests" in benchmark + requests = benchmark["requests"] + assert "successful" in requests + successful = requests["successful"] + assert len(successful) > rate + + assert "run_stats" in benchmark + run_stats = benchmark["run_stats"] + assert "status" in run_stats + status = run_stats["status"] + assert status == "success" + assert "termination_reason" in run_stats + termination_reason = run_stats["termination_reason"] + assert termination_reason == "max_seconds_reached" + + if report_path.exists(): + report_path.unlink() + + +@pytest.mark.timeout(30) +def test_max_requests_benchmark(server: VllmSimServer): + """ + Another example test interacting with the server. 
+ """ + report_path = Path("tests/e2e/max_number_benchmarks.json") + rate = 10 + command = f""" +guidellm benchmark \ + --target "{server.get_url()}" \ + --rate-type constant \ + --rate {rate} \ + --max-requests {rate} \ + --data "prompt_tokens=256,output_tokens=128" \ + --output-path {report_path} + """ + + logger.info(f"Client command: {command}") + os.system(command) # noqa: S605 + + assert report_path.exists() + with report_path.open("r") as f: + report = json.load(f) + + assert "benchmarks" in report + benchmarks = report["benchmarks"] + assert len(benchmarks) > 0 + benchmark = benchmarks[0] + assert "requests" in benchmark + requests = benchmark["requests"] + assert "successful" in requests + successful = requests["successful"] + assert len(successful) == rate + + assert "run_stats" in benchmark + run_stats = benchmark["run_stats"] + assert "status" in run_stats + status = run_stats["status"] + assert status == "success" + assert "termination_reason" in run_stats + termination_reason = run_stats["termination_reason"] + assert termination_reason == "max_requests_reached" + + if report_path.exists(): + report_path.unlink() diff --git a/tests/e2e/vllm-sim.Dockerfile b/tests/e2e/vllm-sim.Dockerfile new file mode 100644 index 00000000..1606e46c --- /dev/null +++ b/tests/e2e/vllm-sim.Dockerfile @@ -0,0 +1,12 @@ +FROM golang AS base + +WORKDIR /app + +RUN git clone https://github.com/llm-d/llm-d-inference-sim.git && \ + cd llm-d-inference-sim && \ + make build + +WORKDIR /app/llm-d-inference-sim + +FROM scratch +COPY --from=base /app/llm-d-inference-sim/bin/llm-d-inference-sim /bin/llm-d-inference-sim diff --git a/tests/e2e/vllm_sim_server.py b/tests/e2e/vllm_sim_server.py new file mode 100644 index 00000000..4910abf5 --- /dev/null +++ b/tests/e2e/vllm_sim_server.py @@ -0,0 +1,138 @@ +import subprocess +import time +from pathlib import Path +from typing import Optional + +import pytest +import requests +from loguru import logger + + +class VllmSimServer: + """ + [vLLM simulator](https://llm-d.ai/docs/architecture/Components/inf-simulator) + A vLLM simulator wrapper for pytest. + """ + + def __init__( + self, + port: int, + model: str, + lora: Optional[list[str]] = None, + mode: Optional[str] = None, + echo: Optional[bool] = None, + random: Optional[bool] = None, + time_to_first_token: Optional[float] = None, + inter_token_latency: Optional[float] = None, + max_loras: Optional[int] = None, + max_cpu_loras: Optional[int] = None, + max_running_requests: Optional[int] = None, + ): + self.port = port + self.model = model + self.lora = lora + self.mode = mode + self.echo = echo + self.random = random + self.time_to_first_token = time_to_first_token + self.inter_token_latency = inter_token_latency + self.max_loras = max_loras + self.max_cpu_loras = max_cpu_loras + self.max_running_requests = max_running_requests + self.server_url = f"http://127.0.0.1:{self.port}" + self.health_url = f"{self.server_url}/health" + self.app_script = "./bin/llm-d-inference-sim" + self.process: Optional[subprocess.Popen] = None + if not Path(self.app_script).exists(): + message = ( + "The vLLM simulator binary is required for E2E tests, but is missing.\n" + "To build it and enable E2E tests, please run:\n" + "docker build . 
-f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./" + ) + logger.warning(message) + pytest.skip("vLLM simlator binary missing", allow_module_level=True) + + def get_cli_parameters(self) -> list[str]: + parameters = ["--port", f"{self.port}", "--model", self.model] + if self.lora is not None: + parameters.extend(["--lora", ",".join(self.lora)]) + if self.mode is not None: + parameters.extend(["--mode", self.mode]) + if self.echo is not None: + parameters.extend(["--echo"]) + if self.random is not None: + parameters.extend(["--random"]) + if self.time_to_first_token is not None: + parameters.extend(["--time-to-first-token", f"{self.time_to_first_token}"]) + if self.inter_token_latency is not None: + parameters.extend(["--inter-token-latency", f"{self.inter_token_latency}"]) + if self.max_loras is not None: + parameters.extend(["--max-loras", f"{self.max_loras}"]) + if self.max_cpu_loras is not None: + parameters.extend(["--max-cpu-loras", f"{self.max_cpu_loras}"]) + if self.max_running_requests is not None: + parameters.extend( + ["--max-running-requests", f"{self.max_running_requests}"] + ) + return parameters + + def start(self): + """ + Starts the server process and waits for it to become healthy. + """ + + logger.info(f"Starting server on {self.server_url} using {self.app_script}...") + cli_parameters = self.get_cli_parameters() + command = " ".join([self.app_script, *cli_parameters]) + logger.info(f"Server command: {command}") + self.process = subprocess.Popen( # noqa: S603 + [self.app_script, *cli_parameters], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, # Decode stdout/stderr as text + ) + + # Wait for the server to start and become healthy + max_retries = 20 + retry_delay_sec = 0.5 + for i in range(max_retries): + try: + response = requests.get(self.health_url, timeout=1) + if response.status_code == 200: + logger.info(f"Server started successfully at {self.server_url}") + return + else: + logger.warning(f"Got response with status: {response.status_code}") + logger.warning(response.json()) + except requests.ConnectionError: + logger.warning(f"Waiting for server... (attempt {i + 1}/{max_retries})") + time.sleep(retry_delay_sec) + # If the loop completes without breaking, the server didn't start + stdout, stderr = self.process.communicate() + logger.error(f"Server failed to start after {max_retries} retries.") + logger.error(f"Server stdout:\n{stdout}") + logger.error(f"Server stderr:\n{stderr}") + self.stop() # Attempt to clean up + pytest.fail("Server did not start within the expected time.") + + def stop(self): + """ + Stops the server process. + """ + if self.process: + logger.info(f"Stopping server on {self.server_url}...") + self.process.terminate() # Send SIGTERM + try: + self.process.wait(timeout=1) # Wait for the process to terminate + logger.info("Server stopped successfully.") + except subprocess.TimeoutExpired: + logger.warning("Server did not terminate gracefully, killing it...") + self.process.kill() # Send SIGKILL if it doesn't terminate + self.process.wait() + self.process = None # Clear the process reference + + def get_url(self): + """ + Returns the base URL of the running server. 
+ """ + return self.server_url diff --git a/tests/unit/benchmark/test_output.py b/tests/unit/benchmark/test_output.py index 9076834b..de32b44b 100644 --- a/tests/unit/benchmark/test_output.py +++ b/tests/unit/benchmark/test_output.py @@ -113,7 +113,7 @@ def test_console_benchmarks_args_str(): mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_args_str == ( - "max_number=None, max_duration=10.0, warmup_number=None, " + "max_number=None, max_duration=10.0, max_error=0.05, warmup_number=None, " "warmup_duration=None, cooldown_number=None, cooldown_duration=None" ) diff --git a/tests/unit/entrypoints/assets/benchmarks_stripped.json b/tests/unit/entrypoints/assets/benchmarks_stripped.json index a95d2880..57c339f9 100644 --- a/tests/unit/entrypoints/assets/benchmarks_stripped.json +++ b/tests/unit/entrypoints/assets/benchmarks_stripped.json @@ -1 +1 @@ -{"benchmarks": [{"type_": "generative_benchmark", "id_": "97ece514-8717-412f-9dba-2b42bcd9866f", "run_id": "93e36b31-b454-471d-ba62-6b2671585485", "args": {"profile": {"type_": "sweep", "completed_strategies": 10, "measured_rates": [1.5481806532737452], "measured_concurrencies": [0.9977627456483604], "max_concurrency": null, "strategy_type": "constant", "rate": -1.0, "initial_burst": true, "random_seed": 42, "sweep_size": 10, "rate_type": "constant", "strategy_types": ["synchronous", "throughput", "constant", "constant", "constant", "constant", "constant", "constant", "constant", "constant"]}, "strategy_index": 0, "strategy": {"type_": "synchronous"}, "max_number": null, "max_duration": 30.0, "warmup_number": null, "warmup_duration": null, "cooldown_number": null, "cooldown_duration": null}, "run_stats": {"start_time": 1749157168.054225, "end_time": 1749157198.213826, "requests_made": {"successful": 1, "errored": 0, "incomplete": 0, "total": 1}, "queued_time_avg": 0.631589580089488, "scheduled_time_delay_avg": 3.784260851271609e-06, "scheduled_time_sleep_avg": 0.0, "worker_start_delay_avg": 2.8021792148021943e-05, "worker_time_avg": 0.6373953819274902, "worker_start_time_targeted_delay_avg": 0.6319031715393066, "request_start_time_delay_avg": 0.316034068452551, "request_start_time_targeted_delay_avg": 0.6319856542222043, "request_time_delay_avg": 0.00029866238857837433, "request_time_avg": 0.6370967195389119}, "worker": {"type_": "generative_requests_worker", "backend_type": "openai_http", "backend_target": "example_target", "backend_model": "example_model", "backend_info": {"max_output_tokens": 16384, "timeout": 300, "http2": true, "authorization": false, "organization": null, "project": null, "text_completions_path": "/v1/completions", "chat_completions_path": "/v1/chat/completions"}}, "request_loader": {"type_": "generative_request_loader", "data": "prompt_tokens=256,output_tokens=128", "data_args": null, "processor": "example_processor", "processor_args": null}, "extras": {}, "metrics": {"requests_per_second": {"successful": {"mean": 1.5481806532737452, "median": 1.5530116578512305, "mode": 1.555484186315253, "variance": 0.0003352629331303757, "std_dev": 0.01831018659463567, "min": 1.4509899157628907, "max": 1.5597664461806156, "count": 45, "total_sum": 69.6707872953874, "percentiles": {"p001": 1.4509899157628907, "p01": 1.4509899157628907, "p05": 1.5190957942495127, "p10": 1.5377883923356668, "p25": 1.5483918601985445, "p75": 1.5567531615313124, "p90": 1.5583715343236735, "p95": 1.5590938878953722, "p99": 1.5597664461806156, "p999": 1.5597664461806156}, 
"cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 1.5668128271815418, "median": 1.5530312090734288, "mode": 1.555484186315253, "variance": 0.036536424510388923, "std_dev": 0.19114503527528232, "min": 1.4509899157628907, "max": 3.509921881864626, "count": 46, "total_sum": 73.18070917725203, "percentiles": {"p001": 1.4509899157628907, "p01": 1.4509899157628907, "p05": 1.5190957942495127, "p10": 1.5377883923356668, "p25": 1.5483918601985445, "p75": 1.5567531615313124, "p90": 1.5583715343236735, "p95": 1.5591048992639953, "p99": 1.5597664461806156, "p999": 3.509921881864626}, "cumulative_distribution_function": null}}, "request_concurrency": {"successful": {"mean": 0.9977627456483604, "median": 1.0, "mode": 1.0, "variance": 0.002232249044605607, "std_dev": 0.047246682895263736, "min": 0.0, "max": 1.0, "count": 2, "total_sum": 1.0, "percentiles": {"p001": 0.0, "p01": 1.0, "p05": 1.0, "p10": 1.0, "p25": 1.0, "p75": 1.0, "p90": 1.0, "p95": 1.0, "p99": 1.0, "p999": 1.0}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 1.0, "median": 1.0, "mode": 1.0, "variance": 0.0, "std_dev": 0.0, "min": 1.0, "max": 1.0, "count": 1, "total_sum": 1.0, "percentiles": {"p001": 1.0, "p01": 1.0, "p05": 1.0, "p10": 1.0, "p25": 1.0, "p75": 1.0, "p90": 1.0, "p95": 1.0, "p99": 1.0, "p999": 1.0}, "cumulative_distribution_function": null}, "total": {"mean": 0.9977433642674269, "median": 1.0, "mode": 1.0, "variance": 0.002251543327743578, "std_dev": 0.047450430216633206, "min": 0.0, "max": 1.0, "count": 2, "total_sum": 1.0, "percentiles": {"p001": 0.0, "p01": 1.0, "p05": 1.0, "p10": 1.0, "p25": 1.0, "p75": 1.0, "p90": 1.0, "p95": 1.0, "p99": 1.0, "p999": 1.0}, "cumulative_distribution_function": null}}, "request_latency": {"successful": {"mean": 0.6444743664368339, "median": 0.6424565315246582, "mode": 0.6395885944366455, "variance": 6.414585873782315e-05, "std_dev": 0.008009110982988258, "min": 0.6395885944366455, "max": 0.6891846656799316, "count": 46, "total_sum": 29.64582085609436, "percentiles": {"p001": 0.6395885944366455, "p01": 0.6395885944366455, "p05": 0.6399857997894287, "p10": 0.6403069496154785, "p25": 0.6409540176391602, "p75": 0.644390344619751, "p90": 0.6488735675811768, "p95": 0.656728982925415, "p99": 0.6891846656799316, "p999": 0.6891846656799316}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, 
"p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.2836878299713135, "median": 0.2836878299713135, "mode": 0.2836878299713135, "variance": 0.0, "std_dev": 0.0, "min": 0.2836878299713135, "max": 0.2836878299713135, "count": 1, "total_sum": 0.2836878299713135, "percentiles": {"p001": 0.2836878299713135, "p01": 0.2836878299713135, "p05": 0.2836878299713135, "p10": 0.2836878299713135, "p25": 0.2836878299713135, "p75": 0.2836878299713135, "p90": 0.2836878299713135, "p95": 0.2836878299713135, "p99": 0.2836878299713135, "p999": 0.2836878299713135}, "cumulative_distribution_function": null}, "total": {"mean": 0.6367980571503334, "median": 0.642310380935669, "mode": 0.2836878299713135, "variance": 0.0027733643692853522, "std_dev": 0.05266274175624881, "min": 0.2836878299713135, "max": 0.6891846656799316, "count": 47, "total_sum": 29.929508686065674, "percentiles": {"p001": 0.2836878299713135, "p01": 0.2836878299713135, "p05": 0.6398613452911377, "p10": 0.6402454376220703, "p25": 0.640899658203125, "p75": 0.644390344619751, "p90": 0.6488735675811768, "p95": 0.656728982925415, "p99": 0.6891846656799316, "p999": 0.6891846656799316}, "cumulative_distribution_function": null}}, "prompt_token_count": {"successful": {"mean": 257.1086956521739, "median": 257.0, "mode": 257.0, "variance": 0.14035916824196598, "std_dev": 0.37464538999161057, "min": 257.0, "max": 259.0, "count": 46, "total_sum": 11827.0, "percentiles": {"p001": 257.0, "p01": 257.0, "p05": 257.0, "p10": 257.0, "p25": 257.0, "p75": 257.0, "p90": 257.0, "p95": 258.0, "p99": 259.0, "p999": 259.0}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 256.0, "median": 256.0, "mode": 256.0, "variance": 0.0, "std_dev": 0.0, "min": 256.0, "max": 256.0, "count": 1, "total_sum": 256.0, "percentiles": {"p001": 256.0, "p01": 256.0, "p05": 256.0, "p10": 256.0, "p25": 256.0, "p75": 256.0, "p90": 256.0, "p95": 256.0, "p99": 256.0, "p999": 256.0}, "cumulative_distribution_function": null}, "total": {"mean": 257.0851063829787, "median": 257.0, "mode": 256.0, "variance": 0.16296966953372566, "std_dev": 0.40369502044702715, "min": 256.0, "max": 259.0, "count": 47, "total_sum": 12083.0, "percentiles": {"p001": 256.0, "p01": 256.0, "p05": 257.0, "p10": 257.0, "p25": 257.0, "p75": 257.0, "p90": 257.0, "p95": 258.0, "p99": 259.0, "p999": 259.0}, "cumulative_distribution_function": null}}, "output_token_count": {"successful": {"mean": 127.99999999999999, "median": 128.0, "mode": 128.0, "variance": 2.01948391736579e-28, "std_dev": 1.4210854715202002e-14, "min": 128.0, "max": 128.0, "count": 46, "total_sum": 5888.0, "percentiles": {"p001": 128.0, "p01": 128.0, "p05": 128.0, "p10": 128.0, "p25": 128.0, "p75": 128.0, "p90": 128.0, "p95": 128.0, "p99": 128.0, "p999": 128.0}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, 
"variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 126.44680851063832, "median": 128.0, "mode": 55.0, "variance": 110.97057492077867, "std_dev": 10.534257207832866, "min": 55.0, "max": 128.0, "count": 47, "total_sum": 5943.0, "percentiles": {"p001": 55.0, "p01": 55.0, "p05": 128.0, "p10": 128.0, "p25": 128.0, "p75": 128.0, "p90": 128.0, "p95": 128.0, "p99": 128.0, "p999": 128.0}, "cumulative_distribution_function": null}}, "time_to_first_token_ms": {"successful": {"mean": 16.792535781860348, "median": 16.38054847717285, "mode": 15.790939331054688, "variance": 1.2776652847210441, "std_dev": 1.1303385708366516, "min": 15.790939331054688, "max": 21.281957626342773, "count": 46, "total_sum": 772.4566459655762, "percentiles": {"p001": 15.790939331054688, "p01": 15.790939331054688, "p05": 15.971660614013672, "p10": 16.034841537475586, "p25": 16.111373901367188, "p75": 16.840696334838867, "p90": 18.505334854125977, "p95": 19.00935173034668, "p99": 21.281957626342773, "p999": 21.281957626342773}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 16.777170465347616, "median": 16.371726989746094, "mode": 15.790939331054688, "variance": 1.2613411927317046, "std_dev": 1.1230944718641014, "min": 15.790939331054688, "max": 21.281957626342773, "count": 47, "total_sum": 788.5270118713379, "percentiles": {"p001": 15.790939331054688, "p01": 15.790939331054688, "p05": 15.971660614013672, "p10": 16.034841537475586, "p25": 16.100645065307617, "p75": 16.840696334838867, "p90": 18.505334854125977, "p95": 19.00935173034668, "p99": 21.281957626342773, "p999": 21.281957626342773}, "cumulative_distribution_function": null}}, "time_per_output_token_ms": {"successful": {"mean": 4.90300272307966, "median": 4.885653033852577, "mode": 4.870360717177391, "variance": 0.003163643010108571, "std_dev": 0.05624627107736628, "min": 4.870360717177391, "max": 5.217265337705612, "count": 46, "total_sum": 225.5381252616644, "percentiles": {"p001": 4.870360717177391, "p01": 4.870360717177391, "p05": 4.8728808760643005, "p10": 4.873953759670258, "p25": 4.876237362623215, "p75": 4.904214292764664, "p90": 4.934689030051231, "p95": 4.993332549929619, "p99": 5.217265337705612, "p999": 5.217265337705612}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, 
"max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 4.9022222114856975, "median": 4.882922396063805, "mode": 4.870360717177391, "variance": 0.003199582258516055, "std_dev": 0.05656485002646127, "min": 4.81866489757191, "max": 5.217265337705612, "count": 47, "total_sum": 230.3567901592363, "percentiles": {"p001": 4.81866489757191, "p01": 4.870360717177391, "p05": 4.872731864452362, "p10": 4.873953759670258, "p25": 4.876237362623215, "p75": 4.904214292764664, "p90": 4.934689030051231, "p95": 4.993332549929619, "p99": 5.217265337705612, "p999": 5.217265337705612}, "cumulative_distribution_function": null}}, "inter_token_latency_ms": {"successful": {"mean": 4.941609043733832, "median": 4.9241227427805505, "mode": 4.90871001416304, "variance": 0.003213660306132974, "std_dev": 0.056689155101597465, "min": 4.90871001416304, "max": 5.258346167136365, "count": 46, "total_sum": 227.31401601175622, "percentiles": {"p001": 4.90871001416304, "p01": 4.90871001416304, "p05": 4.911250016820713, "p10": 4.9123313483290785, "p25": 4.91463293240765, "p75": 4.9428301533376136, "p90": 4.973544849185493, "p95": 5.032650129062923, "p99": 5.258346167136365, "p999": 5.258346167136365}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 4.9413003057767115, "median": 4.921370603906826, "mode": 4.90871001416304, "variance": 0.003194539306669541, "std_dev": 0.056520255720135776, "min": 4.9078994327121315, "max": 5.258346167136365, "count": 47, "total_sum": 232.22191544446835, "percentiles": {"p001": 4.9078994327121315, "p01": 4.90871001416304, "p05": 4.911099831888995, "p10": 4.9123313483290785, "p25": 4.91463293240765, "p75": 4.9428301533376136, "p90": 4.973544849185493, "p95": 5.032650129062923, "p99": 5.258346167136365, "p999": 5.258346167136365}, "cumulative_distribution_function": null}}, "output_tokens_per_second": {"successful": {"mean": 198.13346751788123, "median": 203.04516628745705, "mode": 203.5378269520066, "variance": 613.9948900522365, "std_dev": 24.778920276158857, "min": 0.0, "max": 203.69598368219124, "count": 122, "total_sum": 17849.590625912137, "percentiles": {"p001": 46.71289356157213, "p01": 55.502236337170835, "p05": 190.14888022486173, "p10": 200.69400449782287, "p25": 202.23259402121505, "p75": 203.42923658938793, "p90": 203.5378269520066, "p95": 203.58722454130668, "p99": 203.6860916860917, "p999": 203.69598368219124}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, 
"cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 198.08514508750469, "median": 203.04516628745705, "mode": 203.5378269520066, "variance": 619.6237334717947, "std_dev": 24.89224243558211, "min": 0.0, "max": 203.69598368219124, "count": 125, "total_sum": 18310.99071823841, "percentiles": {"p001": 46.71289356157213, "p01": 55.502236337170835, "p05": 190.14888022486173, "p10": 200.69400449782287, "p25": 202.23259402121505, "p75": 203.4193704835346, "p90": 203.5378269520066, "p95": 203.58722454130668, "p99": 203.6860916860917, "p999": 203.69598368219124}, "cumulative_distribution_function": null}}, "tokens_per_second": {"successful": {"mean": 992.6867036588937, "median": 614.3700014647723, "mode": 615.2712336805046, "variance": 62014350.40386989, "std_dev": 7874.919072845758, "min": 0.0, "max": 159300.81436773148, "count": 139, "total_sum": 5852579.912913391, "percentiles": {"p001": 46.71289356157213, "p01": 55.502236337170835, "p05": 574.9559972583961, "p10": 606.8148148148148, "p25": 611.5928842228055, "p75": 615.0907757735738, "p90": 615.4517975055026, "p95": 615.542119166422, "p99": 617.5359246171967, "p999": 157985.65557672578}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 1002.1268169766876, "median": 614.3700014647723, "mode": 615.2712336805046, "variance": 63939736.95341249, "std_dev": 7996.232672541019, "min": 0.0, "max": 296531.848660591, "count": 143, "total_sum": 6151486.576325966, "percentiles": {"p001": 46.71289356157213, "p01": 55.502236337170835, "p05": 574.9559972583961, "p10": 606.8148148148148, "p25": 611.5928842228055, "p75": 615.0907757735738, "p90": 615.4517975055026, "p95": 615.542119166422, "p99": 1158.3275338304336, "p999": 158008.81383758428}, "cumulative_distribution_function": null}}}, "start_time": 1749157168.1827004, "end_time": 1749157198.1799018, "request_totals": {"successful": 46, "errored": 0, "incomplete": 1, "total": 47}, "request_samples": null, "requests": {"successful": [{"type_": "generative_text_response", "request_id": "73054dd1-486f-4894-a861-075750b82453", "request_type": "text_completions", "scheduler_info": {"requested": true, "completed": true, "errored": false, "canceled": false, "targeted_start_time": 1749157168.179883, "queued_time": 1749157168.1811602, "dequeued_time": 1749157168.1818697, "scheduled_time": 1749157168.181895, "worker_start": 1749157168.1820004, "request_start": 1749157168.1827004, "request_end": 1749157168.871885, "worker_end": 1749157168.8723884, "process_id": 0}, "prompt": "such a sacrifice to her advantage as years of gratitude cannot enough 
acknowledge. By this time she is actually with them! If such goodness does not make her miserable now, she will never deserve to be happy! What a meeting for her, when she first sees my aunt! We must endeavour to forget all that has passed on either side, said Jane I hope and trust they will yet be happy. His consenting to marry her is a proof, I will believe, that he is come to a right way of thinking. Their mutual affection will steady them; and I flatter myself they will settle so quietly, and live in so rational a manner, as may in time make their past imprudence forgotten. Their conduct has been such, replied Elizabeth, as neither you, nor I, nor anybody, can ever forget. It is useless to talk of it. It now occurred to the girls that their mother was in all likelihood perfectly ignorant of what had happened. They went to the library, therefore, and asked their father whether he would not wish them to make it known to her. He was writing, and, without raising his head, coolly replied, Just as you please. May we take my uncle s letter to read to her? Take whatever you like, and get away", "output": ", said Jane. The letter was read, and the girls retired to their own apartments. Elizabeth was the first to return. She found her mother seated in the drawing-room, and looking very pale. She was dressed in a loose white gown, and her hair was disordered. She rose as they entered, and clasped them both in her arms, and then, without saying a word, took her seat on the sofa, and began to weep. Elizabeth and Jane stood by her side, and listened to the sobs which issued from her heart. She had no words to express her gratitude, and, in a few minutes,", "prompt_tokens": 257, "output_tokens": 128, "start_time": 1749157168.1827004, "end_time": 1749157168.871885, "first_token_time": 1749157168.2039824, "last_token_time": 1749157168.8717923, "request_latency": 0.6891846656799316, "time_to_first_token_ms": 21.281957626342773, "time_per_output_token_ms": 5.217265337705612, "inter_token_latency_ms": 5.258346167136365, "tokens_per_second": 558.631117568713, "output_tokens_per_second": 185.72670921765}], "errored": [], "incomplete": [], "total": null}, "duration": 29.997201442718506}]} \ No newline at end of file +{"benchmarks": [{"type_": "generative_benchmark", "id_": "97ece514-8717-412f-9dba-2b42bcd9866f", "run_id": "93e36b31-b454-471d-ba62-6b2671585485", "args": {"profile": {"type_": "sweep", "completed_strategies": 10, "measured_rates": [1.5481806532737452], "measured_concurrencies": [0.9977627456483604], "max_concurrency": null, "strategy_type": "constant", "rate": -1.0, "initial_burst": true, "random_seed": 42, "sweep_size": 10, "rate_type": "constant", "strategy_types": ["synchronous", "throughput", "constant", "constant", "constant", "constant", "constant", "constant", "constant", "constant"]}, "strategy_index": 0, "strategy": {"type_": "synchronous"}, "max_number": null, "max_duration": 30.0, "max_error": null, "warmup_number": null, "warmup_duration": null, "cooldown_number": null, "cooldown_duration": null}, "run_stats": {"start_time": 1749157168.054225, "end_time": 1749157198.213826, "requests_made": {"successful": 1, "errored": 0, "incomplete": 0, "total": 1}, "queued_time_avg": 0.631589580089488, "scheduled_time_delay_avg": 3.784260851271609e-06, "scheduled_time_sleep_avg": 0.0, "worker_start_delay_avg": 2.8021792148021943e-05, "worker_time_avg": 0.6373953819274902, "worker_start_time_targeted_delay_avg": 0.6319031715393066, "request_start_time_delay_avg": 0.316034068452551, 
"request_start_time_targeted_delay_avg": 0.6319856542222043, "request_time_delay_avg": 0.00029866238857837433, "request_time_avg": 0.6370967195389119, "error_rate": 0.0, "window_error_rate": 0.0, "status": "success", "termination_reason": "max_seconds_reached"}, "worker": {"type_": "generative_requests_worker", "backend_type": "openai_http", "backend_target": "example_target", "backend_model": "example_model", "backend_info": {"max_output_tokens": 16384, "timeout": 300, "http2": true, "authorization": false, "organization": null, "project": null, "text_completions_path": "/v1/completions", "chat_completions_path": "/v1/chat/completions"}}, "request_loader": {"type_": "generative_request_loader", "data": "prompt_tokens=256,output_tokens=128", "data_args": null, "processor": "example_processor", "processor_args": null}, "extras": {}, "metrics": {"requests_per_second": {"successful": {"mean": 1.5481806532737452, "median": 1.5530116578512305, "mode": 1.555484186315253, "variance": 0.0003352629331303757, "std_dev": 0.01831018659463567, "min": 1.4509899157628907, "max": 1.5597664461806156, "count": 45, "total_sum": 69.6707872953874, "percentiles": {"p001": 1.4509899157628907, "p01": 1.4509899157628907, "p05": 1.5190957942495127, "p10": 1.5377883923356668, "p25": 1.5483918601985445, "p75": 1.5567531615313124, "p90": 1.5583715343236735, "p95": 1.5590938878953722, "p99": 1.5597664461806156, "p999": 1.5597664461806156}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 1.5668128271815418, "median": 1.5530312090734288, "mode": 1.555484186315253, "variance": 0.036536424510388923, "std_dev": 0.19114503527528232, "min": 1.4509899157628907, "max": 3.509921881864626, "count": 46, "total_sum": 73.18070917725203, "percentiles": {"p001": 1.4509899157628907, "p01": 1.4509899157628907, "p05": 1.5190957942495127, "p10": 1.5377883923356668, "p25": 1.5483918601985445, "p75": 1.5567531615313124, "p90": 1.5583715343236735, "p95": 1.5591048992639953, "p99": 1.5597664461806156, "p999": 3.509921881864626}, "cumulative_distribution_function": null}}, "request_concurrency": {"successful": {"mean": 0.9977627456483604, "median": 1.0, "mode": 1.0, "variance": 0.002232249044605607, "std_dev": 0.047246682895263736, "min": 0.0, "max": 1.0, "count": 2, "total_sum": 1.0, "percentiles": {"p001": 0.0, "p01": 1.0, "p05": 1.0, "p10": 1.0, "p25": 1.0, "p75": 1.0, "p90": 1.0, "p95": 1.0, "p99": 1.0, "p999": 1.0}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 1.0, "median": 1.0, "mode": 1.0, "variance": 0.0, "std_dev": 0.0, "min": 1.0, "max": 1.0, 
"count": 1, "total_sum": 1.0, "percentiles": {"p001": 1.0, "p01": 1.0, "p05": 1.0, "p10": 1.0, "p25": 1.0, "p75": 1.0, "p90": 1.0, "p95": 1.0, "p99": 1.0, "p999": 1.0}, "cumulative_distribution_function": null}, "total": {"mean": 0.9977433642674269, "median": 1.0, "mode": 1.0, "variance": 0.002251543327743578, "std_dev": 0.047450430216633206, "min": 0.0, "max": 1.0, "count": 2, "total_sum": 1.0, "percentiles": {"p001": 0.0, "p01": 1.0, "p05": 1.0, "p10": 1.0, "p25": 1.0, "p75": 1.0, "p90": 1.0, "p95": 1.0, "p99": 1.0, "p999": 1.0}, "cumulative_distribution_function": null}}, "request_latency": {"successful": {"mean": 0.6444743664368339, "median": 0.6424565315246582, "mode": 0.6395885944366455, "variance": 6.414585873782315e-05, "std_dev": 0.008009110982988258, "min": 0.6395885944366455, "max": 0.6891846656799316, "count": 46, "total_sum": 29.64582085609436, "percentiles": {"p001": 0.6395885944366455, "p01": 0.6395885944366455, "p05": 0.6399857997894287, "p10": 0.6403069496154785, "p25": 0.6409540176391602, "p75": 0.644390344619751, "p90": 0.6488735675811768, "p95": 0.656728982925415, "p99": 0.6891846656799316, "p999": 0.6891846656799316}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.2836878299713135, "median": 0.2836878299713135, "mode": 0.2836878299713135, "variance": 0.0, "std_dev": 0.0, "min": 0.2836878299713135, "max": 0.2836878299713135, "count": 1, "total_sum": 0.2836878299713135, "percentiles": {"p001": 0.2836878299713135, "p01": 0.2836878299713135, "p05": 0.2836878299713135, "p10": 0.2836878299713135, "p25": 0.2836878299713135, "p75": 0.2836878299713135, "p90": 0.2836878299713135, "p95": 0.2836878299713135, "p99": 0.2836878299713135, "p999": 0.2836878299713135}, "cumulative_distribution_function": null}, "total": {"mean": 0.6367980571503334, "median": 0.642310380935669, "mode": 0.2836878299713135, "variance": 0.0027733643692853522, "std_dev": 0.05266274175624881, "min": 0.2836878299713135, "max": 0.6891846656799316, "count": 47, "total_sum": 29.929508686065674, "percentiles": {"p001": 0.2836878299713135, "p01": 0.2836878299713135, "p05": 0.6398613452911377, "p10": 0.6402454376220703, "p25": 0.640899658203125, "p75": 0.644390344619751, "p90": 0.6488735675811768, "p95": 0.656728982925415, "p99": 0.6891846656799316, "p999": 0.6891846656799316}, "cumulative_distribution_function": null}}, "prompt_token_count": {"successful": {"mean": 257.1086956521739, "median": 257.0, "mode": 257.0, "variance": 0.14035916824196598, "std_dev": 0.37464538999161057, "min": 257.0, "max": 259.0, "count": 46, "total_sum": 11827.0, "percentiles": {"p001": 257.0, "p01": 257.0, "p05": 257.0, "p10": 257.0, "p25": 257.0, "p75": 257.0, "p90": 257.0, "p95": 258.0, "p99": 259.0, "p999": 259.0}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 256.0, "median": 256.0, "mode": 256.0, "variance": 0.0, "std_dev": 0.0, "min": 256.0, "max": 256.0, 
"count": 1, "total_sum": 256.0, "percentiles": {"p001": 256.0, "p01": 256.0, "p05": 256.0, "p10": 256.0, "p25": 256.0, "p75": 256.0, "p90": 256.0, "p95": 256.0, "p99": 256.0, "p999": 256.0}, "cumulative_distribution_function": null}, "total": {"mean": 257.0851063829787, "median": 257.0, "mode": 256.0, "variance": 0.16296966953372566, "std_dev": 0.40369502044702715, "min": 256.0, "max": 259.0, "count": 47, "total_sum": 12083.0, "percentiles": {"p001": 256.0, "p01": 256.0, "p05": 257.0, "p10": 257.0, "p25": 257.0, "p75": 257.0, "p90": 257.0, "p95": 258.0, "p99": 259.0, "p999": 259.0}, "cumulative_distribution_function": null}}, "output_token_count": {"successful": {"mean": 127.99999999999999, "median": 128.0, "mode": 128.0, "variance": 2.01948391736579e-28, "std_dev": 1.4210854715202002e-14, "min": 128.0, "max": 128.0, "count": 46, "total_sum": 5888.0, "percentiles": {"p001": 128.0, "p01": 128.0, "p05": 128.0, "p10": 128.0, "p25": 128.0, "p75": 128.0, "p90": 128.0, "p95": 128.0, "p99": 128.0, "p999": 128.0}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 126.44680851063832, "median": 128.0, "mode": 55.0, "variance": 110.97057492077867, "std_dev": 10.534257207832866, "min": 55.0, "max": 128.0, "count": 47, "total_sum": 5943.0, "percentiles": {"p001": 55.0, "p01": 55.0, "p05": 128.0, "p10": 128.0, "p25": 128.0, "p75": 128.0, "p90": 128.0, "p95": 128.0, "p99": 128.0, "p999": 128.0}, "cumulative_distribution_function": null}}, "time_to_first_token_ms": {"successful": {"mean": 16.792535781860348, "median": 16.38054847717285, "mode": 15.790939331054688, "variance": 1.2776652847210441, "std_dev": 1.1303385708366516, "min": 15.790939331054688, "max": 21.281957626342773, "count": 46, "total_sum": 772.4566459655762, "percentiles": {"p001": 15.790939331054688, "p01": 15.790939331054688, "p05": 15.971660614013672, "p10": 16.034841537475586, "p25": 16.111373901367188, "p75": 16.840696334838867, "p90": 18.505334854125977, "p95": 19.00935173034668, "p99": 21.281957626342773, "p999": 21.281957626342773}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 16.777170465347616, "median": 16.371726989746094, "mode": 15.790939331054688, "variance": 1.2613411927317046, "std_dev": 1.1230944718641014, "min": 
15.790939331054688, "max": 21.281957626342773, "count": 47, "total_sum": 788.5270118713379, "percentiles": {"p001": 15.790939331054688, "p01": 15.790939331054688, "p05": 15.971660614013672, "p10": 16.034841537475586, "p25": 16.100645065307617, "p75": 16.840696334838867, "p90": 18.505334854125977, "p95": 19.00935173034668, "p99": 21.281957626342773, "p999": 21.281957626342773}, "cumulative_distribution_function": null}}, "time_per_output_token_ms": {"successful": {"mean": 4.90300272307966, "median": 4.885653033852577, "mode": 4.870360717177391, "variance": 0.003163643010108571, "std_dev": 0.05624627107736628, "min": 4.870360717177391, "max": 5.217265337705612, "count": 46, "total_sum": 225.5381252616644, "percentiles": {"p001": 4.870360717177391, "p01": 4.870360717177391, "p05": 4.8728808760643005, "p10": 4.873953759670258, "p25": 4.876237362623215, "p75": 4.904214292764664, "p90": 4.934689030051231, "p95": 4.993332549929619, "p99": 5.217265337705612, "p999": 5.217265337705612}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 4.9022222114856975, "median": 4.882922396063805, "mode": 4.870360717177391, "variance": 0.003199582258516055, "std_dev": 0.05656485002646127, "min": 4.81866489757191, "max": 5.217265337705612, "count": 47, "total_sum": 230.3567901592363, "percentiles": {"p001": 4.81866489757191, "p01": 4.870360717177391, "p05": 4.872731864452362, "p10": 4.873953759670258, "p25": 4.876237362623215, "p75": 4.904214292764664, "p90": 4.934689030051231, "p95": 4.993332549929619, "p99": 5.217265337705612, "p999": 5.217265337705612}, "cumulative_distribution_function": null}}, "inter_token_latency_ms": {"successful": {"mean": 4.941609043733832, "median": 4.9241227427805505, "mode": 4.90871001416304, "variance": 0.003213660306132974, "std_dev": 0.056689155101597465, "min": 4.90871001416304, "max": 5.258346167136365, "count": 46, "total_sum": 227.31401601175622, "percentiles": {"p001": 4.90871001416304, "p01": 4.90871001416304, "p05": 4.911250016820713, "p10": 4.9123313483290785, "p25": 4.91463293240765, "p75": 4.9428301533376136, "p90": 4.973544849185493, "p95": 5.032650129062923, "p99": 5.258346167136365, "p999": 5.258346167136365}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 
4.9413003057767115, "median": 4.921370603906826, "mode": 4.90871001416304, "variance": 0.003194539306669541, "std_dev": 0.056520255720135776, "min": 4.9078994327121315, "max": 5.258346167136365, "count": 47, "total_sum": 232.22191544446835, "percentiles": {"p001": 4.9078994327121315, "p01": 4.90871001416304, "p05": 4.911099831888995, "p10": 4.9123313483290785, "p25": 4.91463293240765, "p75": 4.9428301533376136, "p90": 4.973544849185493, "p95": 5.032650129062923, "p99": 5.258346167136365, "p999": 5.258346167136365}, "cumulative_distribution_function": null}}, "output_tokens_per_second": {"successful": {"mean": 198.13346751788123, "median": 203.04516628745705, "mode": 203.5378269520066, "variance": 613.9948900522365, "std_dev": 24.778920276158857, "min": 0.0, "max": 203.69598368219124, "count": 122, "total_sum": 17849.590625912137, "percentiles": {"p001": 46.71289356157213, "p01": 55.502236337170835, "p05": 190.14888022486173, "p10": 200.69400449782287, "p25": 202.23259402121505, "p75": 203.42923658938793, "p90": 203.5378269520066, "p95": 203.58722454130668, "p99": 203.6860916860917, "p999": 203.69598368219124}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 198.08514508750469, "median": 203.04516628745705, "mode": 203.5378269520066, "variance": 619.6237334717947, "std_dev": 24.89224243558211, "min": 0.0, "max": 203.69598368219124, "count": 125, "total_sum": 18310.99071823841, "percentiles": {"p001": 46.71289356157213, "p01": 55.502236337170835, "p05": 190.14888022486173, "p10": 200.69400449782287, "p25": 202.23259402121505, "p75": 203.4193704835346, "p90": 203.5378269520066, "p95": 203.58722454130668, "p99": 203.6860916860917, "p999": 203.69598368219124}, "cumulative_distribution_function": null}}, "tokens_per_second": {"successful": {"mean": 992.6867036588937, "median": 614.3700014647723, "mode": 615.2712336805046, "variance": 62014350.40386989, "std_dev": 7874.919072845758, "min": 0.0, "max": 159300.81436773148, "count": 139, "total_sum": 5852579.912913391, "percentiles": {"p001": 46.71289356157213, "p01": 55.502236337170835, "p05": 574.9559972583961, "p10": 606.8148148148148, "p25": 611.5928842228055, "p75": 615.0907757735738, "p90": 615.4517975055026, "p95": 615.542119166422, "p99": 617.5359246171967, "p999": 157985.65557672578}, "cumulative_distribution_function": null}, "errored": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, "p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "incomplete": {"mean": 0.0, "median": 0.0, "mode": 0.0, "variance": 0.0, "std_dev": 0.0, "min": 0.0, "max": 0.0, "count": 0, "total_sum": 0.0, "percentiles": {"p001": 0.0, "p01": 0.0, "p05": 0.0, "p10": 0.0, "p25": 0.0, "p75": 0.0, "p90": 0.0, "p95": 0.0, 
"p99": 0.0, "p999": 0.0}, "cumulative_distribution_function": null}, "total": {"mean": 1002.1268169766876, "median": 614.3700014647723, "mode": 615.2712336805046, "variance": 63939736.95341249, "std_dev": 7996.232672541019, "min": 0.0, "max": 296531.848660591, "count": 143, "total_sum": 6151486.576325966, "percentiles": {"p001": 46.71289356157213, "p01": 55.502236337170835, "p05": 574.9559972583961, "p10": 606.8148148148148, "p25": 611.5928842228055, "p75": 615.0907757735738, "p90": 615.4517975055026, "p95": 615.542119166422, "p99": 1158.3275338304336, "p999": 158008.81383758428}, "cumulative_distribution_function": null}}}, "start_time": 1749157168.1827004, "end_time": 1749157198.1799018, "request_totals": {"successful": 46, "errored": 0, "incomplete": 1, "total": 47}, "request_samples": null, "requests": {"successful": [{"type_": "generative_text_response", "request_id": "73054dd1-486f-4894-a861-075750b82453", "request_type": "text_completions", "scheduler_info": {"requested": true, "completed": true, "errored": false, "canceled": false, "targeted_start_time": 1749157168.179883, "queued_time": 1749157168.1811602, "dequeued_time": 1749157168.1818697, "scheduled_time": 1749157168.181895, "worker_start": 1749157168.1820004, "request_start": 1749157168.1827004, "request_end": 1749157168.871885, "worker_end": 1749157168.8723884, "process_id": 0}, "prompt": "such a sacrifice to her advantage as years of gratitude cannot enough acknowledge. By this time she is actually with them! If such goodness does not make her miserable now, she will never deserve to be happy! What a meeting for her, when she first sees my aunt! We must endeavour to forget all that has passed on either side, said Jane I hope and trust they will yet be happy. His consenting to marry her is a proof, I will believe, that he is come to a right way of thinking. Their mutual affection will steady them; and I flatter myself they will settle so quietly, and live in so rational a manner, as may in time make their past imprudence forgotten. Their conduct has been such, replied Elizabeth, as neither you, nor I, nor anybody, can ever forget. It is useless to talk of it. It now occurred to the girls that their mother was in all likelihood perfectly ignorant of what had happened. They went to the library, therefore, and asked their father whether he would not wish them to make it known to her. He was writing, and, without raising his head, coolly replied, Just as you please. May we take my uncle s letter to read to her? Take whatever you like, and get away", "output": ", said Jane. The letter was read, and the girls retired to their own apartments. Elizabeth was the first to return. She found her mother seated in the drawing-room, and looking very pale. She was dressed in a loose white gown, and her hair was disordered. She rose as they entered, and clasped them both in her arms, and then, without saying a word, took her seat on the sofa, and began to weep. Elizabeth and Jane stood by her side, and listened to the sobs which issued from her heart. 
She had no words to express her gratitude, and, in a few minutes,", "prompt_tokens": 257, "output_tokens": 128, "start_time": 1749157168.1827004, "end_time": 1749157168.871885, "first_token_time": 1749157168.2039824, "last_token_time": 1749157168.8717923, "request_latency": 0.6891846656799316, "time_to_first_token_ms": 21.281957626342773, "time_per_output_token_ms": 5.217265337705612, "inter_token_latency_ms": 5.258346167136365, "tokens_per_second": 558.631117568713, "output_tokens_per_second": 185.72670921765}], "errored": [], "incomplete": [], "total": null}, "duration": 29.997201442718506}]}
\ No newline at end of file
diff --git a/tests/unit/entrypoints/assets/benchmarks_stripped.yaml b/tests/unit/entrypoints/assets/benchmarks_stripped.yaml
index 1d39e62d..260513b4 100644
--- a/tests/unit/entrypoints/assets/benchmarks_stripped.yaml
+++ b/tests/unit/entrypoints/assets/benchmarks_stripped.yaml
@@ -25,6 +25,7 @@ benchmarks:
       type_: synchronous
     max_number:
     max_duration: 30
+    max_error:
     warmup_number:
     warmup_duration:
     cooldown_number:
@@ -47,6 +48,10 @@ benchmarks:
     request_start_time_targeted_delay_avg: 0.6319856542222043
     request_time_delay_avg: 0.00029866238857837433
     request_time_avg: 0.6370967195389119
+    error_rate: 0.0
+    window_error_rate: 0.0
+    status: success
+    termination_reason: max_seconds_reached
   worker:
     type_: generative_requests_worker
     backend_type: openai_http
diff --git a/tests/unit/entrypoints/assets/benchmarks_stripped_output.txt b/tests/unit/entrypoints/assets/benchmarks_stripped_output.txt
index 170d1e6a..3966e373 100644
--- a/tests/unit/entrypoints/assets/benchmarks_stripped_output.txt
+++ b/tests/unit/entrypoints/assets/benchmarks_stripped_output.txt
@@ -5,7 +5,7 @@ Benchmarks Metadata:
     Duration:30.2 seconds
     Profile:type=sweep, strategies=['synchronous', 'throughput', 'constant', 'constant', 'constant', 'constant', 'constant', 'constant', 'constant', 'constant'], max_concurrency=None
-    Args:max_number=None, max_duration=30.0, warmup_number=None, warmup_duration=None, cooldown_number=None, cooldown_duration=None
+    Args:max_number=None, max_duration=30.0, max_error=None, warmup_number=None, warmup_duration=None, cooldown_number=None, cooldown_duration=None
     Worker:type_='generative_requests_worker' backend_type='openai_http' backend_target='example_target' backend_model='example_model' backend_info={'max_output_tokens': 16384, 'timeout': 300, 'http2': True, 'authorization': False, 'organization': None, 'project': None, 'text_completions_path': '/v1/completions', 'chat_completions_path': '/v1/chat/completions'}
@@ -18,7 +18,7 @@ Benchmarks Info:
 Metadata |||| Requests Made ||| Prompt Tok/Req ||| Output Tok/Req ||| Prompt Tok Total||| Output Tok Total ||
   Benchmark| Start Time| End Time| Duration (s)| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err| Comp| Inc| Err
 -----------|-----------|---------|-------------|------|-----|-----|------|------|----|-------|-----|-----|-------|-----|-----|-------|------|------
-synchronous| 16:59:28| 16:59:58| 30.0| 46| 1| 0| 257.1| 256.0| 0.0| 128.0| 0.0| 0.0| 11827| 256| 0| 5888| 0| 0
+synchronous| 20:59:28| 20:59:58| 30.0| 46| 1| 0| 257.1| 256.0| 0.0| 128.0| 0.0| 0.0| 11827| 256| 0| 5888| 0| 0
 ===================================================================================================================================================
diff --git a/tests/unit/mock_benchmark.py b/tests/unit/mock_benchmark.py
index 81364fa1..cbe06f8d 100644
--- a/tests/unit/mock_benchmark.py
+++ b/tests/unit/mock_benchmark.py
@@ -221,6 +221,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark:
         strategy=SynchronousStrategy(),
         max_number=None,
         max_duration=10.0,
+        max_error=0.05,
         warmup_number=None,
         warmup_duration=None,
         cooldown_number=None,
@@ -245,6 +246,10 @@ def mock_generative_benchmark() -> GenerativeBenchmark:
         request_start_time_targeted_delay_avg=1.2827096836907523,
         request_time_delay_avg=0.0004316908972603934,
         request_time_avg=1.426228676523481,
+        error_rate=0.345346,
+        window_error_rate=0.345346,
+        status="success",
+        termination_reason="max_seconds_reached",
     ),
     worker=GenerativeRequestsWorkerDescription(
         backend_type="openai_http",
diff --git a/tests/unit/preprocess/test_dataset.py b/tests/unit/preprocess/test_dataset.py
index 2a2a8293..b16debeb 100644
--- a/tests/unit/preprocess/test_dataset.py
+++ b/tests/unit/preprocess/test_dataset.py
@@ -133,8 +133,6 @@ def test_process_dataset_non_empty(
     mock_save_to_file,
     tokenizer_mock,
 ):
-    from guidellm.preprocess.dataset import process_dataset
-
     mock_dataset = [{"prompt": "Hello"}, {"prompt": "How are you?"}]
     mock_load_dataset.return_value = (mock_dataset, {"prompt_column": "prompt"})
     mock_check_processor.return_value = tokenizer_mock