
Commit e2b3691

feat: most PR fixes done with failed E2E
1 parent 06292d4 commit e2b3691

File tree

14 files changed, +174 -202 lines changed


README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -147,7 +147,7 @@ The `guidellm benchmark` command is used to run benchmarks against a generative
 
 - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.
 
-- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only for finite deterministic scenarios i.e `rate_type` is `constant` and `--max-seconds` exists OR `--max-requests` exists OR the dataset is finite. If `--max-error-rate` is `None` or not applicable, benchmarks will continue regardless of error rate.
+- `--max-error`: The maximum error threshold after which a benchmark will stop, given either as a rate (0 < value < 1) or as an absolute count. If a rate is given and `rate_type` is `constant` and `--max-seconds` is set, the threshold is applied to the total expected request count (rate × duration). If a rate is given but the number of requests is not predetermined, the rate is checked over a window of the most recent requests; the window size is configurable via `GUIDELLM__ERROR_CHECK_WINDOW_SIZE`. If a value above 1 is given, the total number of errors is counted and compared against that threshold.
 
 - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.
 
```
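The behavior described by the new `--max-error` text can be condensed into a small sketch. This illustrates the documented semantics only; the helper name and signature are hypothetical and not part of guidellm.

```python
# Hypothetical helper mirroring the documented --max-error semantics.
def max_error_exceeded(max_error: float, errors: int, window_or_total: int) -> bool:
    """Return True when the benchmark should stop, per the README description."""
    if max_error > 1:
        # Values above 1 are treated as an absolute error count.
        return errors > max_error
    # Values between 0 and 1 are treated as an error rate, applied either to the
    # expected total request count (rate * duration) or to the window of recent
    # requests (GUIDELLM__ERROR_CHECK_WINDOW_SIZE), depending on the scenario.
    return window_or_total > 0 and (errors / window_or_total) > max_error
```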

src/guidellm/benchmark/aggregator.py

Lines changed: 24 additions & 10 deletions
```diff
@@ -300,6 +300,18 @@ class BenchmarkAggregator(
             total=None,
         ),
     )
+    current_window: int = Field(
+        description=(
+            "The current accumulated window size for error checking. "
+            "This is a number between 0 and the value of "
+            "GUIDELLM__ERROR_CHECK_WINDOW_SIZE"
+        ),
+        default=0,
+    )
+    errors_in_window: int = Field(
+        description=("The amount of errored requests in the current window."),
+        default=0,
+    )
     termination_reason: TerminationReason = Field(
         description=(
             f"The benchmark termination reason, one of: {get_args(TerminationReason)}"
@@ -445,9 +457,6 @@ def add_result(
 
         return True
 
-    def set_termination_reason(self, termination_reason: TerminationReason) -> None:
-        self.termination_reason = termination_reason
-
     @abstractmethod
     def compile(self) -> BenchmarkT:
         """
@@ -604,9 +613,7 @@ def compile(self) -> GenerativeBenchmark:
         """
         successful, incomplete, errored = self._compile_results()
 
-        error_rate = self._calculate_error_rate()
-
-        termination_reason = self.termination_reason
+        error_rate, window_error_rate = self._calculate_error_rate()
 
         return GenerativeBenchmark.from_stats(
             run_id=self.run_id,
@@ -634,19 +641,26 @@ def compile(self) -> GenerativeBenchmark:
                 request_time_delay_avg=self.requests_stats.request_time_delay.mean,
                 request_time_avg=self.requests_stats.request_time.mean,
                 error_rate=error_rate,
-                status=REASON_STATUS_MAPPING[termination_reason],
-                termination_reason=termination_reason,
+                window_error_rate=window_error_rate,
+                status=REASON_STATUS_MAPPING[self.termination_reason],
+                termination_reason=self.termination_reason,
             ),
             worker=self.worker_description,
            requests_loader=self.request_loader_description,
             extras=self.extras,
         )
 
-    def _calculate_error_rate(self) -> float:
+    def _calculate_error_rate(self) -> tuple[float, float]:
         total_successful = self.requests_stats.totals.successful.total
         total_errored = self.requests_stats.totals.errored.total
         total_finished = total_errored + total_successful
-        return total_errored / total_finished if total_finished > 0 else 0
+        error_rate = 0.0 if total_finished == 0 else (total_errored / total_finished)
+        window_error_rate = (
+            0.0
+            if self.current_window == 0
+            else self.errors_in_window / self.current_window
+        )
+        return error_rate, window_error_rate
 
     def _compile_results(
         self,
```
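As a quick illustration of the two values `_calculate_error_rate` now returns, here is the standalone arithmetic with made-up counts (this does not invoke the aggregator itself):

```python
# Overall rate uses all finished requests; window rate uses only the current
# error-checking window tracked by current_window / errors_in_window.
total_successful, total_errored = 95, 5
errors_in_window, current_window = 2, 30

error_rate = total_errored / (total_successful + total_errored)  # 0.05
window_error_rate = errors_in_window / current_window            # ~0.067
```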

src/guidellm/benchmark/benchmark.py

Lines changed: 12 additions & 5 deletions
```diff
@@ -227,11 +227,18 @@ class BenchmarkRunStats(StandardBaseModel):
     )
     error_rate: float = Field(
         description=(
-            "The number of errored requests divided by the number "
-            "of successful and errored requests. "
-            "This can be higher than max_error "
-            "(if applicable) cause it does not take into "
-            "account incomplete requests."
+            "The number of total errored requests divided by the number "
+            "of total successful and errored requests at the end of benchmark. "
+        )
+    )
+    window_error_rate: float = Field(
+        description=(
+            "The number of errored requests within the error checking window "
+            "divided by the window size at the end of benchmark. "
+            "If the window_error_rate is above the max_error "
+            "the termination_reason should be 'max_error_reached'. "
+            "You may configure the error checking window size by setting "
+            "the environment variable GUIDELLM__ERROR_CHECK_WINDOW_SIZE."
         )
     )
     status: BenchmarkStatus = Field(
```
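The invariant spelled out in the `window_error_rate` description can be restated as a small check. This is a sketch of the documented relationship only, not code from the repository:

```python
def expect_max_error_termination(window_error_rate: float, max_error: float) -> bool:
    # Per the field description: a window error rate above max_error should
    # surface as termination_reason == "max_error_reached".
    return window_error_rate > max_error

assert expect_max_error_termination(window_error_rate=0.25, max_error=0.1)
```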

src/guidellm/benchmark/benchmarker.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -223,9 +223,9 @@ async def run(
                     current_result=None,
                 )
             elif result.type_ == "run_complete":
-                aggregator.set_termination_reason(
-                    result.run_info.termination_reason
-                )
+                aggregator.termination_reason = result.run_info.termination_reason
+                aggregator.current_window = result.run_info.current_window
+                aggregator.errors_in_window = result.run_info.errors_in_window
                 yield BenchmarkerResult(
                     type_="scheduler_complete",
                     start_time=start_time,
```

src/guidellm/config.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -114,7 +114,7 @@ class Settings(BaseSettings):
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
     shutdown_poll_interval_seconds: float = 1
-    error_check_window_size: int = 10
+    error_check_window_size: int = 30
 
     # HTTP settings
     request_follow_redirects: bool = True
```
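With the default window size raised from 10 to 30 here, a run can still choose its own value through the environment. A minimal sketch, assuming the settings object reads `GUIDELLM__`-prefixed variables as the field descriptions elsewhere in this commit suggest:

```python
import os

from guidellm.config import Settings

# Override the new default of 30 before constructing the settings object.
os.environ["GUIDELLM__ERROR_CHECK_WINDOW_SIZE"] = "50"

settings = Settings()
print(settings.error_check_window_size)  # expected: 50
```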

src/guidellm/request/__init__.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -1,7 +1,6 @@
 from .loader import (
     GenerativeRequestLoader,
     GenerativeRequestLoaderDescription,
-    GetInfiniteDatasetLengthError,
     RequestLoader,
     RequestLoaderDescription,
 )
@@ -11,7 +10,6 @@
     "GenerationRequest",
     "GenerativeRequestLoader",
     "GenerativeRequestLoaderDescription",
-    "GetInfiniteDatasetLengthError",
     "RequestLoader",
     "RequestLoaderDescription",
 ]
```

src/guidellm/request/loader.py

Lines changed: 1 addition & 10 deletions
```diff
@@ -19,16 +19,11 @@
 __all__ = [
     "GenerativeRequestLoader",
     "GenerativeRequestLoaderDescription",
-    "GetInfiniteDatasetLengthError",
     "RequestLoader",
     "RequestLoaderDescription",
 ]
 
 
-class GetInfiniteDatasetLengthError(Exception):
-    pass
-
-
 class RequestLoaderDescription(StandardBaseModel):
     type_: Literal["request_loader"] = "request_loader"
 
@@ -125,11 +120,7 @@ def __len__(self) -> int:
         if self.iter_type == "finite":
             return self.num_unique_items()
 
-        if self.iter_type != "infinite":
-            raise ValueError(f"Invalid iter_type {self.iter_type}")
-        raise GetInfiniteDatasetLengthError(
-            f"Dataset {self.data} is infinite and thus unable to determine length"
-        )
+        raise ValueError(f"Unable to determine length of dataset: {self.data}")
 
     @property
     def description(self) -> GenerativeRequestLoaderDescription:
```
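With `GetInfiniteDatasetLengthError` removed, callers that need a length can simply fall back on the plain `ValueError` now raised by `__len__`. A minimal sketch; the `safe_len` helper is illustrative and not part of guidellm:

```python
from typing import Optional

def safe_len(loader) -> Optional[int]:
    """Return the loader's length, or None when it cannot be determined."""
    try:
        return len(loader)
    except ValueError:
        # Raised for infinite or otherwise unmeasurable datasets after this change.
        return None
```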

src/guidellm/scheduler/result.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -1,4 +1,3 @@
-from collections import deque
 from typing import (
     Generic,
     Literal,
@@ -17,7 +16,6 @@
 ]
 
 
-RequestStatus = Literal["success", "error"]
 TerminationReason = Literal[
     "interrupted", "max_error_reached", "max_seconds_reached", "max_requests_reached"
 ]
@@ -53,8 +51,9 @@ class SchedulerRunInfo(StandardBaseModel):
     end_number: float
     processes: int
     strategy: SchedulingStrategy
-    last_requests_statuses: deque[RequestStatus]
     max_error: Optional[float] = None
+    current_window: int = 0
+    errors_in_window: int = 0
 
     created_requests: int = 0
     queued_requests: int = 0
```
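How the scheduler keeps `current_window` and `errors_in_window` up to date is not shown in this hunk; a plausible accumulation loop might look like the sketch below. The function name, parameter, and reset policy are assumptions; only the two counters come from the diff.

```python
def record_finished_request(run_info, errored: bool, window_size: int = 30) -> None:
    """Accumulate error-window counters on a SchedulerRunInfo-like object."""
    if run_info.current_window >= window_size:
        # Start a fresh window once the configured size
        # (GUIDELLM__ERROR_CHECK_WINDOW_SIZE) has been filled.
        run_info.current_window = 0
        run_info.errors_in_window = 0
    run_info.current_window += 1
    if errored:
        run_info.errors_in_window += 1
```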
