Commit 65869d7 (1 parent: 12835e0)

initial groundwork for new benchmark package and classes along with required changes, migrations, and fixes

37 files changed: +2296 −2658 lines

src/guidellm/backend/backend.py
Lines changed: 8 additions & 0 deletions

@@ -102,6 +102,14 @@ def model(self) -> Optional[str]:
         """
         ...
 
+    @property
+    @abstractmethod
+    def info(self) -> Dict[str, Any]:
+        """
+        :return: The information about the backend.
+        """
+        ...
+
     def validate(self):
         """
         Handle final setup and validate the backend is ready for use.
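
Every registered backend now inherits this contract. Below is a minimal sketch of the obligation a concrete subclass picks up; the class is hypothetical and elides Backend's other abstract members:

from typing import Any, Dict, Optional


class MyBackend(Backend):  # hypothetical subclass for illustration
    @property
    def model(self) -> Optional[str]:
        return None  # illustrative; a real backend reports its model

    @property
    def info(self) -> Dict[str, Any]:
        # Surface static configuration for reporting; the base class
        # only requires that this returns a Dict[str, Any].
        return {"endpoint": "http://localhost:9000"}  # illustrative value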

src/guidellm/backend/openai.py
Lines changed: 43 additions & 5 deletions

@@ -19,6 +19,10 @@
 __all__ = ["OpenAIHTTPBackend"]
 
 
+TEXT_COMPLETIONS_PATH = "/v1/completions"
+CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
+
+
 @Backend.register("openai_http")
 class OpenAIHTTPBackend(Backend):
     """
@@ -61,6 +65,17 @@ def __init__(
     ):
         super().__init__(type_="openai_http")
         self._target = target or settings.openai.base_url
+
+        if not self._target:
+            raise ValueError("Target URL must be provided for OpenAI HTTP backend.")
+
+        if self._target.endswith("/v1") or self._target.endswith("/v1/"):
+            # backwards compatibility, strip v1 off
+            self._target = self._target[:-3]
+
+        if self._target.endswith("/"):
+            self._target = self._target[:-1]
+
         self._model = model
 
         api_key = api_key or settings.openai.api_key
@@ -94,6 +109,22 @@ def model(self) -> Optional[str]:
         """
         return self._model
 
+    @property
+    def info(self) -> Dict[str, Any]:
+        """
+        :return: The information about the backend.
+        """
+        return {
+            "max_output_tokens": self.max_output_tokens,
+            "timeout": self.timeout,
+            "http2": self.http2,
+            "authorization": bool(self.authorization),
+            "organization": self.organization,
+            "project": self.project,
+            "text_completions_path": TEXT_COMPLETIONS_PATH,
+            "chat_completions_path": CHAT_COMPLETIONS_PATH,
+        }
+
     def check_setup(self):
         """
         Check if the backend is setup correctly and can be used for requests.
@@ -379,12 +410,10 @@ async def _iterative_completions_request(
         headers: Dict,
         payload: Dict,
     ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
-        target = f"{self.target}/v1/"
-
         if type_ == "text":
-            target += "completions"
+            target = f"{self.target}{TEXT_COMPLETIONS_PATH}"
         elif type_ == "chat":
-            target += "chat/completions"
+            target = f"{self.target}{CHAT_COMPLETIONS_PATH}"
         else:
             raise ValueError(f"Unsupported type: {type_}")
 
@@ -407,6 +436,8 @@ async def _iterative_completions_request(
         iter_count = 0
         start_time = time.time()
         iter_time = start_time
+        first_iter_time: Optional[float] = None
+        last_iter_time: Optional[float] = None
 
         yield StreamingTextResponse(
             type_="start",
@@ -440,14 +471,19 @@ async def _iterative_completions_request(
 
                 data = json.loads(line.strip()[len("data: ") :])
                 if delta := self._extract_completions_delta_content(type_, data):
+                    if first_iter_time is None:
+                        first_iter_time = iter_time
+                    last_iter_time = iter_time
+
                     iter_count += 1
                     response_value += delta
 
                 yield StreamingTextResponse(
                     type_="iter",
                     value=response_value,
-                    start_time=start_time,
                     iter_count=iter_count,
+                    start_time=start_time,
+                    first_iter_time=first_iter_time,
                     delta=delta,
                     time=iter_time,
                     request_id=request_id,
@@ -477,6 +513,8 @@ async def _iterative_completions_request(
             ),
             start_time=start_time,
             end_time=iter_time,
+            first_iter_time=first_iter_time,
+            last_iter_time=last_iter_time,
             iterations=iter_count,
             request_prompt_tokens=request_prompt_tokens,
             request_output_tokens=request_output_tokens,
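
Taken together, the constructor changes normalize the common target spellings to the same base URL, and request paths are now assembled from the module-level constants. A quick sketch of the resulting behavior (illustrative values):

# Each of these normalizes to target == "http://localhost:8000"; the old
# trailing "/v1" form keeps working for backwards compatibility.
backend = OpenAIHTTPBackend(target="http://localhost:8000/v1/")
backend = OpenAIHTTPBackend(target="http://localhost:8000/v1")
backend = OpenAIHTTPBackend(target="http://localhost:8000")

# Request URLs are built as f"{self.target}{PATH}":
#   http://localhost:8000/v1/completions       for text completions
#   http://localhost:8000/v1/chat/completions  for chat completions

# The new info property also surfaces the paths for reporting:
backend.info["text_completions_path"]  # "/v1/completions"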

src/guidellm/backend/response.py
Lines changed: 13 additions & 3 deletions

@@ -67,11 +67,19 @@ class ResponseSummary(BaseModel):
 
     :param value: The final value returned from the request.
     :param request_args: The arguments used to make the request.
+    :param iterations: The number of iterations in the request.
     :param start_time: The time the request started.
     :param end_time: The time the request ended.
-    :param iterations: The number of iterations in the request.
-    :param prompt_tokens: The number of tokens in the prompt, if any usage was returned.
-    :param output_tokens: The number of tokens in the output, if any usage was returned.
+    :param first_iter_time: The time the first iteration was received.
+    :param last_iter_time: The time the last iteration was received.
+    :param request_prompt_tokens: The number of tokens measured in the prompt
+        for the request, if any.
+    :param request_output_tokens: The number of tokens enforced for the output
+        for the request, if any.
+    :param response_prompt_tokens: The number of tokens measured in the prompt
+        for the response, if any.
+    :param response_output_tokens: The number of tokens measured in the output
+        for the response, if any.
     :param request_id: The unique identifier for the request, if any.
     :param error: The error message, if any, returned from making the request.
     """
@@ -81,6 +89,8 @@ class ResponseSummary(BaseModel):
     iterations: int = 0
     start_time: Optional[float]
     end_time: Optional[float]
+    first_iter_time: Optional[float]
+    last_iter_time: Optional[float]
     request_prompt_tokens: Optional[int] = None
    request_output_tokens: Optional[int] = None
     response_prompt_tokens: Optional[int] = None
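
The two new timestamps make per-request streaming latencies derivable from a summary alone. A sketch of one plausible consumer follows; the helper is not part of the commit, and treating inter-token latency as the mean gap across iterations is this sketch's assumption:

from typing import Dict

from guidellm.backend import ResponseSummary


def derive_latency_metrics(summary: ResponseSummary) -> Dict[str, float]:
    """Illustrative helper: compute streaming latencies from a summary."""
    metrics: Dict[str, float] = {}
    if summary.start_time is not None and summary.first_iter_time is not None:
        # time to first token: first streamed chunk minus request start
        metrics["time_to_first_token"] = summary.first_iter_time - summary.start_time
    if (
        summary.first_iter_time is not None
        and summary.last_iter_time is not None
        and summary.iterations > 1
    ):
        # mean inter-token latency across the streamed iterations
        metrics["inter_token_latency"] = (
            summary.last_iter_time - summary.first_iter_time
        ) / (summary.iterations - 1)
    return metrics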

src/guidellm/benchmark/__init__.py
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+from .aggregator import AGG, BenchmarkAggregator, GenerativeBenchmarkAggregator
+from .benchmark import BENCH, Benchmark, GenerativeBenchmark
+from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker
+from .profile import (
+    AsyncProfile,
+    ConcurrentProfile,
+    Profile,
+    ProfileType,
+    SweepProfile,
+    SynchronousProfile,
+    ThroughputProfile,
+    create_profile,
+)
+
+__all__ = [
+    "AGG",
+    "BENCH",
+    "Benchmark",
+    "BenchmarkAggregator",
+    "GenerativeBenchmark",
+    "GenerativeBenchmarkAggregator",
+    "Benchmarker",
+    "BenchmarkerResult",
+    "GenerativeBenchmarker",
+    "AsyncProfile",
+    "ConcurrentProfile",
+    "Profile",
+    "ProfileType",
+    "SweepProfile",
+    "SynchronousProfile",
+    "ThroughputProfile",
+    "create_profile",
+]
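
With the package root re-exporting the public names, downstream code can import the benchmark API from one place, e.g. (illustrative):

from guidellm.benchmark import GenerativeBenchmarkAggregator, create_profile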
src/guidellm/benchmark/aggregator.py
Lines changed: 136 additions & 0 deletions

@@ -0,0 +1,136 @@
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from typing import DefaultDict, Generic, List, TypeVar
+
+from pydantic import Field
+
+from guidellm.backend import ResponseSummary
+from guidellm.benchmark.benchmark import BENCH, Benchmark, GenerativeBenchmark
+from guidellm.objects import Serializable
+from guidellm.request import GenerationRequest
+from guidellm.scheduler import (
+    REQ,
+    RES,
+    SchedulerResult,
+)
+
+__all__ = [
+    "AGG",
+    "BenchmarkAggregator",
+    "GenerativeBenchmarkAggregator",
+]
+
+
+class BenchmarkAggregator(Generic[BENCH, REQ, RES], ABC, Serializable):
+    created_requests: int = 0
+    queued_requests: int = 0
+    scheduled_requests: int = 0
+    processing_requests: int = 0
+    completed_requests: int = 0
+    successful_requests: int = 0
+    errored_requests: int = 0
+
+    queued_time: float = 0.0
+    scheduled_time: float = 0.0
+    worker_time: float = 0.0
+    targeted_worker_start_delay: float = 0.0
+    process_idle_time: DefaultDict[int, float] = defaultdict(float)
+    process_idle_time_scratch: DefaultDict[int, float] = defaultdict(float)
+
+    def add_base_result(
+        self, result: SchedulerResult[REQ, RES], is_error: bool = False
+    ):
+        self.created_requests = result.run_info.created_requests
+        self.queued_requests = result.run_info.queued_requests
+        self.scheduled_requests = result.run_info.scheduled_requests
+        self.processing_requests = result.run_info.processing_requests
+        self.completed_requests = result.run_info.completed_requests
+
+        if result.type_ != "request_complete":
+            return
+
+        if is_error:
+            self.errored_requests += 1
+        else:
+            self.successful_requests += 1
+
+        self.queued_time += (
+            result.request_info.scheduled_time - result.request_info.queued_time
+        )
+        self.scheduled_time += (
+            result.request_info.worker_start - result.request_info.scheduled_time
+        )
+
+        self.worker_time += (
+            result.request_info.worker_end - result.request_info.worker_start
+        )
+        self.targeted_worker_start_delay += (
+            result.request_info.worker_start - result.request_info.targeted_start_time
+        )
+
+        first_process_request = (
+            result.request_info.process_id not in self.process_idle_time_scratch
+        )
+        if not first_process_request:
+            self.process_idle_time_scratch[result.request_info.process_id] -= (
+                result.request_info.worker_start
+            )
+            self.process_idle_time[result.request_info.process_id] = (
+                self.process_idle_time_scratch[result.request_info.process_id]
+            )
+        self.process_idle_time_scratch[result.request_info.process_id] += (
+            result.request_info.worker_end
+        )
+
+    def add_result(self, result: SchedulerResult[REQ, RES]):
+        self.add_base_result(result)
+
+    @abstractmethod
+    def compile(self) -> Benchmark[BENCH]: ...
+
+
+AGG = TypeVar("AGG", bound=BenchmarkAggregator)
+
+
+class GenerativeBenchmarkAggregator(
+    BenchmarkAggregator[GenerativeBenchmark, GenerationRequest, ResponseSummary]
+):
+    results: List[SchedulerResult[GenerationRequest, ResponseSummary]] = Field(
+        default_factory=list,
+        description="The list of results for the benchmark.",
+    )
+
+    request_time_total: float = 0.0
+    targeted_request_delay_total: float = 0.0
+    time_to_first_token_total: float = 0.0
+    inter_token_latency_total: float = 0.0
+    prompt_tokens_total: int = 0
+    output_tokens_total: int = 0
+
+    def add_result(self, result: SchedulerResult[GenerationRequest, ResponseSummary]):
+        is_error = bool(result.response.error)
+        self.add_base_result(result, is_error=is_error)
+
+        if result.type_ != "request_complete":
+            return
+
+        self.results.append(result)
+
+        if not is_error:
+            self.request_time_total += (result.response.end_time or 0.0) - (
+                result.response.start_time or 0.0
+            )
+            self.targeted_request_delay_total += (result.response.start_time or 0.0) - (
+                result.request_info.targeted_start_time or 0.0
+            )
+            self.time_to_first_token_total += (
+                result.response.first_iter_time or 0.0
+            ) - (result.response.start_time or 0.0)
+            self.inter_token_latency_total += (
+                result.response.last_iter_time or 0.0
+            ) - (result.response.first_iter_time or 0.0)
+            self.prompt_tokens_total += result.response.prompt_tokens or 0
+            self.output_tokens_total += result.response.output_tokens or 0
+
+    def compile(self) -> GenerativeBenchmark:
+        pass  # TODO
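
compile() is left as a TODO in this commit, but the totals the aggregator accumulates, divided by successful_requests, already hint at the averages a compiled GenerativeBenchmark could report. A hedged sketch of that arithmetic (the helper and its output keys are illustrative, not the commit's API):

from typing import Dict

from guidellm.benchmark import GenerativeBenchmarkAggregator


def mean_generative_metrics(agg: GenerativeBenchmarkAggregator) -> Dict[str, float]:
    """Illustrative only: average the accumulated totals per successful request."""
    count = agg.successful_requests
    if count == 0:
        return {}
    return {
        "avg_request_time": agg.request_time_total / count,
        "avg_time_to_first_token": agg.time_to_first_token_total / count,
        "avg_inter_token_latency": agg.inter_token_latency_total / count,
        "avg_prompt_tokens": agg.prompt_tokens_total / count,
        "avg_output_tokens": agg.output_tokens_total / count,
    }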
