
Commit e2b3691

feat: most PR fixes done with failed E2E
1 parent 06292d4 commit e2b3691

File tree

14 files changed, +174 -202 lines changed


README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -147,7 +147,7 @@ The `guidellm benchmark` command is used to run benchmarks against a generative
 
 - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.
 
-- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only for finite deterministic scenarios i.e `rate_type` is `constant` and `--max-seconds` exists OR `--max-requests` exists OR the dataset is finite. If `--max-error-rate` is `None` or not applicable, benchmarks will continue regardless of error rate.
+- `--max-error`: The maximum error threshold after which a benchmark will stop, given either as a rate (0 < value < 1) or as an absolute count. If a rate is given and `rate_type` is `constant` and `--max-seconds` is set, the threshold is applied to the total expected request count (rate × duration). If a rate is given but the number of requests is not predetermined, the rate is checked over a window of the most recent requests; the window size is configurable via `GUIDELLM__ERROR_CHECK_WINDOW_SIZE`. If a value above 1 is given, the total number of errors is counted and compared against that threshold.
 
 - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.
 
```
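The behavior described by the new `--max-error` text can be condensed into a small sketch. This illustrates the documented semantics only; the helper name and signature are hypothetical and not part of guidellm.

```python
# Hypothetical helper mirroring the documented --max-error semantics.
def max_error_exceeded(max_error: float, errors: int, window_or_total: int) -> bool:
    """Return True when the benchmark should stop, per the README description."""
    if max_error > 1:
        # Values above 1 are treated as an absolute error count.
        return errors > max_error
    # Values between 0 and 1 are treated as an error rate, applied either to the
    # expected total request count (rate * duration) or to the window of recent
    # requests (GUIDELLM__ERROR_CHECK_WINDOW_SIZE), depending on the scenario.
    return window_or_total > 0 and (errors / window_or_total) > max_error
```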

src/guidellm/benchmark/aggregator.py

Lines changed: 24 additions & 10 deletions
```diff
@@ -300,6 +300,18 @@ class BenchmarkAggregator(
             total=None,
         ),
     )
+    current_window: int = Field(
+        description=(
+            "The current accumulated window size for error checking. "
+            "This is a number between 0 and the value of "
+            "GUIDELLM__ERROR_CHECK_WINDOW_SIZE"
+        ),
+        default=0,
+    )
+    errors_in_window: int = Field(
+        description=("The amount of errored requests in the current window."),
+        default=0,
+    )
     termination_reason: TerminationReason = Field(
         description=(
             f"The benchmark termination reason, one of: {get_args(TerminationReason)}"
@@ -445,9 +457,6 @@ def add_result(
 
         return True
 
-    def set_termination_reason(self, termination_reason: TerminationReason) -> None:
-        self.termination_reason = termination_reason
-
     @abstractmethod
     def compile(self) -> BenchmarkT:
         """
@@ -604,9 +613,7 @@ def compile(self) -> GenerativeBenchmark:
         """
         successful, incomplete, errored = self._compile_results()
 
-        error_rate = self._calculate_error_rate()
-
-        termination_reason = self.termination_reason
+        error_rate, window_error_rate = self._calculate_error_rate()
 
         return GenerativeBenchmark.from_stats(
             run_id=self.run_id,
@@ -634,19 +641,26 @@ def compile(self) -> GenerativeBenchmark:
                 request_time_delay_avg=self.requests_stats.request_time_delay.mean,
                 request_time_avg=self.requests_stats.request_time.mean,
                 error_rate=error_rate,
-                status=REASON_STATUS_MAPPING[termination_reason],
-                termination_reason=termination_reason,
+                window_error_rate=window_error_rate,
+                status=REASON_STATUS_MAPPING[self.termination_reason],
+                termination_reason=self.termination_reason,
             ),
             worker=self.worker_description,
            requests_loader=self.request_loader_description,
             extras=self.extras,
         )
 
-    def _calculate_error_rate(self) -> float:
+    def _calculate_error_rate(self) -> tuple[float, float]:
         total_successful = self.requests_stats.totals.successful.total
         total_errored = self.requests_stats.totals.errored.total
         total_finished = total_errored + total_successful
-        return total_errored / total_finished if total_finished > 0 else 0
+        error_rate = 0.0 if total_finished == 0 else (total_errored / total_finished)
+        window_error_rate = (
+            0.0
+            if self.current_window == 0
+            else self.errors_in_window / self.current_window
+        )
+        return error_rate, window_error_rate
 
     def _compile_results(
         self,
```
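As a quick illustration of the two values `_calculate_error_rate` now returns, here is the standalone arithmetic with made-up counts (this does not invoke the aggregator itself):

```python
# Overall rate uses all finished requests; window rate uses only the current
# error-checking window tracked by current_window / errors_in_window.
total_successful, total_errored = 95, 5
errors_in_window, current_window = 2, 30

error_rate = total_errored / (total_successful + total_errored)  # 0.05
window_error_rate = errors_in_window / current_window            # ~0.067
```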

src/guidellm/benchmark/benchmark.py

Lines changed: 12 additions & 5 deletions
```diff
@@ -227,11 +227,18 @@ class BenchmarkRunStats(StandardBaseModel):
     )
     error_rate: float = Field(
         description=(
-            "The number of errored requests divided by the number "
-            "of successful and errored requests. "
-            "This can be higher than max_error "
-            "(if applicable) cause it does not take into "
-            "account incomplete requests."
+            "The number of total errored requests divided by the number "
+            "of total successful and errored requests at the end of benchmark. "
+        )
+    )
+    window_error_rate: float = Field(
+        description=(
+            "The number of errored requests within the error checking window "
+            "divided by the window size at the end of benchmark. "
+            "If the window_error_rate is above the max_error "
+            "the termination_reason should be 'max_error_reached'. "
+            "You may configure the error checking window size by setting "
+            "the environment variable GUIDELLM__ERROR_CHECK_WINDOW_SIZE."
         )
     )
     status: BenchmarkStatus = Field(
```
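The invariant spelled out in the `window_error_rate` description can be restated as a small check. This is a sketch of the documented relationship only, not code from the repository:

```python
def expect_max_error_termination(window_error_rate: float, max_error: float) -> bool:
    # Per the field description: a window error rate above max_error should
    # surface as termination_reason == "max_error_reached".
    return window_error_rate > max_error

assert expect_max_error_termination(window_error_rate=0.25, max_error=0.1)
```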

src/guidellm/benchmark/benchmarker.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -223,9 +223,9 @@ async def run(
                     current_result=None,
                 )
             elif result.type_ == "run_complete":
-                aggregator.set_termination_reason(
-                    result.run_info.termination_reason
-                )
+                aggregator.termination_reason = result.run_info.termination_reason
+                aggregator.current_window = result.run_info.current_window
+                aggregator.errors_in_window = result.run_info.errors_in_window
                 yield BenchmarkerResult(
                     type_="scheduler_complete",
                     start_time=start_time,
```

src/guidellm/config.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -114,7 +114,7 @@ class Settings(BaseSettings):
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
     shutdown_poll_interval_seconds: float = 1
-    error_check_window_size: int = 10
+    error_check_window_size: int = 30
 
     # HTTP settings
     request_follow_redirects: bool = True
```
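With the default window size raised from 10 to 30 here, a run can still choose its own value through the environment. A minimal sketch, assuming the settings object reads `GUIDELLM__`-prefixed variables as the field descriptions elsewhere in this commit suggest:

```python
import os

from guidellm.config import Settings

# Override the new default of 30 before constructing the settings object.
os.environ["GUIDELLM__ERROR_CHECK_WINDOW_SIZE"] = "50"

settings = Settings()
print(settings.error_check_window_size)  # expected: 50
```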

src/guidellm/request/__init__.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -1,7 +1,6 @@
 from .loader import (
     GenerativeRequestLoader,
     GenerativeRequestLoaderDescription,
-    GetInfiniteDatasetLengthError,
     RequestLoader,
     RequestLoaderDescription,
 )
@@ -11,7 +10,6 @@
     "GenerationRequest",
     "GenerativeRequestLoader",
     "GenerativeRequestLoaderDescription",
-    "GetInfiniteDatasetLengthError",
     "RequestLoader",
     "RequestLoaderDescription",
 ]
```

src/guidellm/request/loader.py

Lines changed: 1 addition & 10 deletions
```diff
@@ -19,16 +19,11 @@
 __all__ = [
     "GenerativeRequestLoader",
     "GenerativeRequestLoaderDescription",
-    "GetInfiniteDatasetLengthError",
     "RequestLoader",
     "RequestLoaderDescription",
 ]
 
 
-class GetInfiniteDatasetLengthError(Exception):
-    pass
-
-
 class RequestLoaderDescription(StandardBaseModel):
     type_: Literal["request_loader"] = "request_loader"
 
@@ -125,11 +120,7 @@ def __len__(self) -> int:
         if self.iter_type == "finite":
             return self.num_unique_items()
 
-        if self.iter_type != "infinite":
-            raise ValueError(f"Invalid iter_type {self.iter_type}")
-        raise GetInfiniteDatasetLengthError(
-            f"Dataset {self.data} is infinite and thus unable to determine length"
-        )
+        raise ValueError(f"Unable to determine length of dataset: {self.data}")
 
     @property
     def description(self) -> GenerativeRequestLoaderDescription:
```
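With `GetInfiniteDatasetLengthError` removed, callers that need a length can simply fall back on the plain `ValueError` now raised by `__len__`. A minimal sketch; the `safe_len` helper is illustrative and not part of guidellm:

```python
from typing import Optional

def safe_len(loader) -> Optional[int]:
    """Return the loader's length, or None when it cannot be determined."""
    try:
        return len(loader)
    except ValueError:
        # Raised for infinite or otherwise unmeasurable datasets after this change.
        return None
```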

src/guidellm/scheduler/result.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -1,4 +1,3 @@
-from collections import deque
 from typing import (
     Generic,
     Literal,
@@ -17,7 +16,6 @@
 ]
 
 
-RequestStatus = Literal["success", "error"]
 TerminationReason = Literal[
     "interrupted", "max_error_reached", "max_seconds_reached", "max_requests_reached"
 ]
@@ -53,8 +51,9 @@ class SchedulerRunInfo(StandardBaseModel):
     end_number: float
     processes: int
     strategy: SchedulingStrategy
-    last_requests_statuses: deque[RequestStatus]
     max_error: Optional[float] = None
+    current_window: int = 0
+    errors_in_window: int = 0
 
     created_requests: int = 0
     queued_requests: int = 0
```
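How the scheduler keeps `current_window` and `errors_in_window` up to date is not shown in this hunk; a plausible accumulation loop might look like the sketch below. The function name, parameter, and reset policy are assumptions; only the two counters come from the diff.

```python
def record_finished_request(run_info, errored: bool, window_size: int = 30) -> None:
    """Accumulate error-window counters on a SchedulerRunInfo-like object."""
    if run_info.current_window >= window_size:
        # Start a fresh window once the configured size
        # (GUIDELLM__ERROR_CHECK_WINDOW_SIZE) has been filled.
        run_info.current_window = 0
        run_info.errors_in_window = 0
    run_info.current_window += 1
    if errored:
        run_info.errors_in_window += 1
```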
