Commit 5b07e34

Fix unit tests for refactor and schemas package
Signed-off-by: Mark Kurtz <[email protected]>
1 parent a40508e commit 5b07e34

20 files changed: +4310 -836 lines

src/guidellm/backends/response_handlers.py

Lines changed: 27 additions & 0 deletions
@@ -72,6 +72,33 @@ class GenerationResponseHandlerFactory(RegistryMixin[type[GenerationResponseHand
     responses from different generation services.
     """
 
+    @classmethod
+    def create(
+        cls,
+        request_type: str,
+        handler_overrides: dict[str, type[GenerationResponseHandler]] | None = None,
+    ) -> GenerationResponseHandler:
+        """
+        Create a response handler class for the given request type.
+
+        :param request_type: The type of generation request (e.g., "text_completions")
+        :param handler_overrides: Optional mapping of request types to handler classes
+            to override the default registry by checking first and then falling back
+            to the registered handlers.
+        :return: The corresponding instantiated GenerationResponseHandler
+        :raises ValueError: When no handler is registered for the request type
+        """
+        if handler_overrides and request_type in handler_overrides:
+            return handler_overrides[request_type]()
+
+        handler_cls = cls.get_registered_object(request_type)
+        if not handler_cls:
+            raise ValueError(
+                f"No response handler registered for type '{request_type}'."
+            )
+
+        return handler_cls()
+
 
 @GenerationResponseHandlerFactory.register("text_completions")
 class TextCompletionsResponseHandler(GenerationResponseHandler):
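
A quick usage sketch of the new factory method, based only on what the diff above shows (the CustomTextHandler subclass is hypothetical, and handlers are assumed to be constructible with no arguments, exactly as create() instantiates them):

    from guidellm.backends.response_handlers import (
        GenerationResponseHandlerFactory,
        TextCompletionsResponseHandler,
    )

    # Registry path: resolve the handler registered for the request type.
    handler = GenerationResponseHandlerFactory.create("text_completions")
    assert isinstance(handler, TextCompletionsResponseHandler)

    # Override path: the caller-supplied mapping is checked before the registry.
    class CustomTextHandler(TextCompletionsResponseHandler):
        """Hypothetical override used only for illustration."""

    handler = GenerationResponseHandlerFactory.create(
        "text_completions",
        handler_overrides={"text_completions": CustomTextHandler},
    )
    assert isinstance(handler, CustomTextHandler)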

src/guidellm/schemas/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -10,11 +10,17 @@
 from __future__ import annotations
 
 from .base import (
+    BaseModelT,
+    ErroredT,
+    IncompleteT,
     PydanticClassRegistryMixin,
+    RegisterClassT,
     ReloadableBaseModel,
     StandardBaseDict,
     StandardBaseModel,
     StatusBreakdown,
+    SuccessfulT,
+    TotalT,
 )
 from .info import RequestInfo, RequestTimings
 from .request import (
@@ -33,21 +39,27 @@
 )
 
 __all__ = [
+    "BaseModelT",
     "DistributionSummary",
+    "ErroredT",
     "FunctionObjT",
     "GenerationRequest",
     "GenerationRequestArguments",
     "GenerationResponse",
     "GenerativeRequestStats",
     "GenerativeRequestType",
+    "IncompleteT",
     "Percentiles",
     "PydanticClassRegistryMixin",
+    "RegisterClassT",
     "ReloadableBaseModel",
     "RequestInfo",
     "RequestTimings",
     "StandardBaseDict",
     "StandardBaseModel",
     "StatusBreakdown",
     "StatusDistributionSummary",
+    "SuccessfulT",
+    "TotalT",
     "UsageMetrics",
 ]
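
With this change the typing helpers from schemas.base (BaseModelT, ErroredT, IncompleteT, RegisterClassT, SuccessfulT, TotalT) are re-exported from the package root. A minimal sketch of what that enables, assuming BaseModelT is a TypeVar bound to a Pydantic model type as its name suggests (the clone_model helper is illustrative, not part of the commit):

    from guidellm.schemas import BaseModelT

    # Generic helper: the TypeVar keeps the concrete model type in the signature,
    # so callers get back the same type they passed in.
    def clone_model(model: BaseModelT) -> BaseModelT:
        return type(model).model_validate(model.model_dump())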

src/guidellm/schemas/base.py

Lines changed: 6 additions & 0 deletions
@@ -19,11 +19,17 @@
 from guidellm.utils.registry import RegistryMixin
 
 __all__ = [
+    "BaseModelT",
+    "ErroredT",
+    "IncompleteT",
     "PydanticClassRegistryMixin",
+    "RegisterClassT",
     "ReloadableBaseModel",
     "StandardBaseDict",
     "StandardBaseModel",
     "StatusBreakdown",
+    "SuccessfulT",
+    "TotalT",
 ]
 
 

src/guidellm/schemas/request.py

Lines changed: 4 additions & 3 deletions
@@ -73,7 +73,7 @@ def model_combine(
         Merge additional request arguments into the current instance.
 
         Combines method and stream fields by overwriting, while merging collection
-        fields like headers, params, json_body, and files by extending existing values.
+        fields like headers, params, body, and files by extending existing values.
 
         :param additional: Additional arguments to merge with current instance
         :return: Updated instance with merged arguments
@@ -88,9 +88,10 @@ def model_combine(
            if (val := additional_dict.get(overwrite)) is not None:
                setattr(self, overwrite, val)
 
-        for combine in ("headers", "params", "json_body", "files"):
+        for combine in ("headers", "params", "body", "files"):
             if (val := additional_dict.get(combine)) is not None:
-                setattr(self, combine, {**getattr(self, combine, {}), **val})
+                current = getattr(self, combine, None) or {}
+                setattr(self, combine, {**current, **val})
 
         return self
 
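
The new `getattr(self, combine, None) or {}` guard matters because these collection fields can exist on the model with a value of None; the old `getattr(self, combine, {})` only covered the missing-attribute case, so merging would hit `{**None, **val}` and fail. A minimal standalone sketch of the difference (SimpleArgs is a hypothetical stand-in for the request-arguments model):

    from __future__ import annotations


    class SimpleArgs:
        """Hypothetical stand-in: the attribute exists but defaults to None."""

        def __init__(self) -> None:
            self.headers: dict[str, str] | None = None


    args = SimpleArgs()
    incoming = {"Authorization": "Bearer token"}

    # Old behavior: getattr finds the attribute (value None), so the {} default
    # is never used and the unpacking raises TypeError.
    try:
        merged = {**getattr(args, "headers", {}), **incoming}
    except TypeError as err:
        print(f"old merge fails: {err}")

    # New behavior: normalize None to an empty dict before merging.
    current = getattr(args, "headers", None) or {}
    merged = {**current, **incoming}
    print(merged)  # {'Authorization': 'Bearer token'}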

src/guidellm/schemas/request_stats.py

Lines changed: 40 additions & 27 deletions
@@ -69,7 +69,11 @@ def request_start_time(self) -> float | None:
         """
         :return: Timestamp when the request started, or None if unavailable
         """
-        return self.info.timings.request_start or self.info.timings.resolve_start
+        return (
+            self.info.timings.request_start
+            if self.info.timings.request_start is not None
+            else self.info.timings.resolve_start
+        )
 
     @computed_field  # type: ignore[misc]
     @property
@@ -80,7 +84,11 @@ def request_end_time(self) -> float:
         if self.info.timings.resolve_end is None:
             raise ValueError("resolve_end timings should be set but is None.")
 
-        return self.info.timings.request_end or self.info.timings.resolve_end
+        return (
+            self.info.timings.request_end
+            if self.info.timings.request_end is not None
+            else self.info.timings.resolve_end
+        )
 
     @computed_field  # type: ignore[misc]
     @property
@@ -90,9 +98,9 @@ def request_latency(self) -> float | None:
 
         :return: Duration from request start to completion, or None if unavailable
         """
-        if not (start := self.info.timings.request_start) or not (
-            end := self.info.timings.request_end
-        ):
+        start = self.info.timings.request_start
+        end = self.info.timings.request_end
+        if start is None or end is None:
             return None
 
         return end - start
@@ -142,9 +150,9 @@ def time_to_first_token_ms(self) -> float | None:
         """
         :return: Time to first token generation in milliseconds, or None if unavailable
         """
-        if not (first_token := self.first_token_iteration) or not (
-            start := self.info.timings.request_start
-        ):
+        first_token = self.first_token_iteration
+        start = self.info.timings.request_start
+        if first_token is None or start is None:
             return None
 
         return 1000 * (first_token - start)
@@ -158,9 +166,10 @@ def time_per_output_token_ms(self) -> float | None:
         :return: Average milliseconds per output token, or None if unavailable
         """
         if (
-            not (start := self.info.timings.request_start)
-            or not (last_token := self.last_token_iteration)
-            or not (output_tokens := self.output_tokens)
+            (start := self.info.timings.request_start) is None
+            or (last_token := self.last_token_iteration) is None
+            or (output_tokens := self.output_tokens) is None
+            or output_tokens == 0
         ):
             return None
 
@@ -174,10 +183,13 @@ def inter_token_latency_ms(self) -> float | None:
 
         :return: Average milliseconds between token generations, or None if unavailable
         """
+        first_token = self.first_token_iteration
+        last_token = self.last_token_iteration
+        output_tokens = self.output_tokens
         if (
-            not (first_token := self.first_token_iteration)
-            or not (last_token := self.last_token_iteration)
-            or not (output_tokens := self.output_tokens)
+            first_token is None
+            or last_token is None
+            or output_tokens is None
             or output_tokens <= 1
         ):
             return None
@@ -257,29 +269,26 @@ def token_iterations(self) -> int:
         return self.info.timings.token_iterations
 
     @property
-    def prompt_tokens_timing(self) -> tuple[float, float] | None:
+    def prompt_tokens_timing(self) -> tuple[float, float]:
         """
-        :return: Tuple of (timestamp, token_count) for prompt processing, or None
-            if unavailable
+        :return: Tuple of (timestamp, token_count) for prompt processing
+        :raises ValueError: If resolve_end timings are not set
         """
-        if self.request_end_time is None:
-            # no end time, can't compute
-            return None
-
         return (
-            self.first_token_iteration or self.request_end_time,
+            (
+                self.first_token_iteration
+                if self.first_token_iteration is not None
+                else self.request_end_time
+            ),
             self.prompt_tokens or 0.0,
         )
 
     @property
     def output_tokens_timings(self) -> list[tuple[float, float]]:
         """
         :return: List of (timestamp, token_count) tuples for output token generations
+        :raises ValueError: If resolve_end timings are not set
         """
-        if self.request_end_time is None:
-            # no end time, can't compute
-            return []
-
         if (
             self.first_token_iteration is None
             or self.last_token_iteration is None
@@ -288,7 +297,11 @@ def output_tokens_timings(self) -> list[tuple[float, float]]:
             # No iteration data, return single timing at end with all tokens
             return [
                 (
-                    self.last_token_iteration or self.request_end_time,
+                    (
+                        self.last_token_iteration
+                        if self.last_token_iteration is not None
+                        else self.request_end_time
+                    ),
                     self.output_tokens or 0.0,
                 )
             ]
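
Most of the edits in this file replace truthiness checks (`or` / `not`) with explicit `is None` comparisons, because values such as a relative timestamp of 0.0 or an output-token count of 0 are falsy yet still meaningful (note the new explicit `or output_tokens == 0` guard above). A standalone illustration of the difference, using plain floats rather than the real timings model:

    request_start = 0.0    # a legitimate recorded value that happens to be falsy
    resolve_start = 12.5   # fallback value

    # Truthiness fallback (old style) silently discards the recorded value.
    old_value = request_start or resolve_start
    print(old_value)  # 12.5

    # Explicit None check (new style) falls back only when the value is missing.
    new_value = request_start if request_start is not None else resolve_start
    print(new_value)  # 0.0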

src/guidellm/schemas/statistics.py

Lines changed: 23 additions & 7 deletions
@@ -210,13 +210,7 @@ def from_pdf(
         count = len(pdf)
 
         total_sum = mean * count
-
-        if include_pdf is False:
-            sampled_pdf = None
-        elif include_pdf is True:
-            sampled_pdf = pdf.tolist()
-        else:
-            sampled_pdf = []
+        sampled_pdf = cls._sample_pdf(pdf, include_pdf)
 
         return DistributionSummary(
             mean=mean,
@@ -232,6 +226,28 @@ def from_pdf(
             pdf=sampled_pdf,
         )
 
+    @classmethod
+    def _sample_pdf(
+        cls, pdf: np.ndarray, include_pdf: bool | int
+    ) -> list[tuple[float, float]] | None:
+        """
+        Sample PDF based on include_pdf parameter.
+
+        :param pdf: PDF array to sample
+        :param include_pdf: False for None, True for full, int for sampled size
+        :return: Sampled PDF as list of tuples or None
+        """
+        if include_pdf is False:
+            return None
+        if include_pdf is True:
+            return pdf.tolist()
+        if isinstance(include_pdf, int) and include_pdf > 0:
+            if len(pdf) <= include_pdf:
+                return pdf.tolist()
+            sample_indices = np.linspace(0, len(pdf) - 1, include_pdf, dtype=int)
+            return pdf[sample_indices].tolist()
+        return []
+
     @classmethod
     def from_values(
         cls,
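
The new `_sample_pdf` helper adds an integer mode: a positive `include_pdf` smaller than the PDF length keeps that many evenly spaced entries, selected with `np.linspace` over the index range. A standalone sketch of that sampling step with synthetic data (not the real PDF array produced by `from_pdf`):

    import numpy as np

    # Synthetic (value, probability) rows standing in for the computed PDF.
    pdf = np.column_stack([np.arange(10, dtype=float), np.full(10, 0.1)])
    include_pdf = 4  # keep at most 4 evenly spaced rows

    if len(pdf) <= include_pdf:
        sampled = pdf.tolist()
    else:
        # Evenly spaced indices from the first row to the last, inclusive.
        sample_indices = np.linspace(0, len(pdf) - 1, include_pdf, dtype=int)
        print(sample_indices)  # [0 3 6 9]
        sampled = pdf[sample_indices].tolist()

    print(len(sampled))  # 4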
