Merged
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -20,7 +20,7 @@ name = "guidellm"
version = "0.1.0"
description = "Guidance platform for deploying and managing large language models."
readme = { file = "README.md", content-type = "text/markdown" }
=3.8">
-requires-python = ">=3.8.0,<4.0"
+requires-python = ">=3.9.0,<4.0"
license = { file = "LICENSE" }
authors = [ { name = "Neuralmagic, Inc." } ]
urls = { homepage = "https://github.com/neuralmagic/guidellm" }
@@ -93,7 +93,7 @@ profile = "black"

[tool.mypy]
files = ["src/guidellm", "tests"]
-python_version = '3.8'
+python_version = '3.9'
warn_redundant_casts = true
warn_unused_ignores = false
show_error_codes = true
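
Note: this version bump is what enables the typing changes in the rest of this PR. PEP 585 built-in generics (`list[str]`, `dict[str, Any]`, `type[...]`) are only subscriptable at runtime on Python 3.9+; on 3.8 they raise `TypeError` at definition time. A minimal illustration, not code from this repo:

```python
# Runs on Python 3.9+; on 3.8 the annotations below fail at definition
# time with "TypeError: 'type' object is not subscriptable".
def word_counts(words: list[str]) -> dict[str, int]:
    counts: dict[str, int] = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return counts


print(word_counts(["a", "b", "a"]))  # {'a': 2, 'b': 1}
```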
15 changes: 8 additions & 7 deletions src/guidellm/backend/backend.py
@@ -1,6 +1,7 @@
from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator
from pathlib import Path
-from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Type, Union
+from typing import Any, Literal, Optional, Union

from loguru import logger
from PIL import Image
@@ -28,7 +29,7 @@ class Backend(ABC):
:param type_: The type of the backend.
"""

-_registry: Dict[BackendType, "Type[Backend]"] = {}
+_registry: dict[BackendType, "type[Backend]"] = {}

@classmethod
def register(cls, backend_type: BackendType):
@@ -46,7 +47,7 @@ def register(cls, backend_type: BackendType):
if not issubclass(cls, Backend):
raise TypeError("Only subclasses of Backend can be registered")

-def inner_wrapper(wrapped_class: Type["Backend"]):
+def inner_wrapper(wrapped_class: type["Backend"]):
cls._registry[backend_type] = wrapped_class
logger.info("Registered backend type: {}", backend_type)
return wrapped_class
@@ -103,7 +104,7 @@ def model(self) -> Optional[str]:

@property
@abstractmethod
-def info(self) -> Dict[str, Any]:
+def info(self) -> dict[str, Any]:
"""
:return: The information about the backend.
"""
@@ -146,7 +147,7 @@ async def prepare_multiprocessing(self):
...

@abstractmethod
-async def available_models(self) -> List[str]:
+async def available_models(self) -> list[str]:
"""
Get the list of available models for the backend.

@@ -158,7 +159,7 @@ async def available_models(self) -> list[str]:
@abstractmethod
async def text_completions(
self,
-prompt: Union[str, List[str]],
+prompt: Union[str, list[str]],
request_id: Optional[str] = None,
prompt_token_count: Optional[int] = None,
output_token_count: Optional[int] = None,
@@ -190,7 +191,7 @@ async def chat_completions(
self,
content: Union[
str,
-List[Union[str, Dict[str, Union[str, Dict[str, str]]], Path, Image.Image]],
+list[Union[str, dict[str, Union[str, dict[str, str]]], Path, Image.Image]],
Any,
],
request_id: Optional[str] = None,
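
For context on the `type["Backend"]` annotations above: `Backend.register` implements a decorator-based registry. A minimal, self-contained sketch of the pattern (class and argument names below are simplified stand-ins, not guidellm's actual API):

```python
from typing import Optional


class MiniBackend:
    # Maps a backend-type key to the registered subclass.
    _registry: dict[str, type["MiniBackend"]] = {}

    @classmethod
    def register(cls, backend_type: str):
        # Decorator factory: records the decorated subclass under the key.
        def inner_wrapper(wrapped_class: type["MiniBackend"]):
            cls._registry[backend_type] = wrapped_class
            return wrapped_class

        return inner_wrapper

    @classmethod
    def create(cls, backend_type: str, **kwargs) -> "MiniBackend":
        # Look up the registered subclass and instantiate it.
        if backend_type not in cls._registry:
            raise ValueError(f"Unknown backend type: {backend_type}")
        return cls._registry[backend_type](**kwargs)


@MiniBackend.register("openai_http")
class MiniOpenAIBackend(MiniBackend):
    def __init__(self, target: Optional[str] = None):
        self.target = target


backend = MiniBackend.create("openai_http", target="http://localhost:8000")
print(type(backend).__name__)  # MiniOpenAIBackend
```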
33 changes: 17 additions & 16 deletions src/guidellm/backend/openai.py
@@ -1,8 +1,9 @@
import base64
import json
import time
+from collections.abc import AsyncGenerator
from pathlib import Path
-from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Union
+from typing import Any, Literal, Optional, Union

import httpx
from loguru import logger
@@ -111,7 +112,7 @@ def model(self) -> Optional[str]:
return self._model

@property
-def info(self) -> Dict[str, Any]:
+def info(self) -> dict[str, Any]:
"""
:return: The information about the backend.
"""
@@ -157,7 +158,7 @@ async def prepare_multiprocessing(self):
await self._async_client.aclose()
self._async_client = None

-async def available_models(self) -> List[str]:
+async def available_models(self) -> list[str]:
"""
Get the available models for the target server using the OpenAI models endpoint:
/v1/models
@@ -176,7 +177,7 @@ async def available_models(self) -> list[str]:

async def text_completions( # type: ignore[override]
self,
-prompt: Union[str, List[str]],
+prompt: Union[str, list[str]],
request_id: Optional[str] = None,
prompt_token_count: Optional[int] = None,
output_token_count: Optional[int] = None,
@@ -232,7 +233,7 @@ async def chat_completions(  # type: ignore[override]
self,
content: Union[
str,
-List[Union[str, Dict[str, Union[str, Dict[str, str]]], Path, Image.Image]],
+list[Union[str, dict[str, Union[str, dict[str, str]]], Path, Image.Image]],
Any,
],
request_id: Optional[str] = None,
@@ -318,7 +319,7 @@ def _get_async_client(self) -> httpx.AsyncClient:

return client

-def _headers(self) -> Dict[str, str]:
+def _headers(self) -> dict[str, str]:
headers = {
"Content-Type": "application/json",
}
@@ -335,8 +336,8 @@ def _headers(self) -> dict[str, str]:
return headers

def _completions_payload(
-self, orig_kwargs: Optional[Dict], max_output_tokens: Optional[int], **kwargs
-) -> Dict:
+self, orig_kwargs: Optional[dict], max_output_tokens: Optional[int], **kwargs
+) -> dict:
payload = orig_kwargs or {}
payload.update(kwargs)
payload["model"] = self.model
@@ -366,10 +367,10 @@ def _create_chat_messages(
def _create_chat_messages(
content: Union[
str,
-List[Union[str, Dict[str, Union[str, Dict[str, str]]], Path, Image.Image]],
+list[Union[str, dict[str, Union[str, dict[str, str]]], Path, Image.Image]],
Any,
],
-) -> List[Dict]:
+) -> list[dict]:
if isinstance(content, str):
return [
{
@@ -382,7 +383,7 @@ def _create_chat_messages(
resolved_content = []

for item in content:
-if isinstance(item, Dict):
+if isinstance(item, dict):
resolved_content.append(item)
elif isinstance(item, str):
resolved_content.append({"type": "text", "text": item})
@@ -430,8 +431,8 @@ async def _iterative_completions_request(
request_id: Optional[str],
request_prompt_tokens: Optional[int],
request_output_tokens: Optional[int],
-headers: Dict,
-payload: Dict,
+headers: dict,
+payload: dict,
) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
if type_ == "text_completions":
target = f"{self.target}{TEXT_COMPLETIONS_PATH}"
@@ -551,7 +552,7 @@ async def _iterative_completions_request(

@staticmethod
def _extract_completions_delta_content(
type_: Literal["text_completions", "chat_completions"], data: Dict
type_: Literal["text_completions", "chat_completions"], data: dict
) -> Optional[str]:
if "choices" not in data or not data["choices"]:
return None
@@ -566,8 +567,8 @@ def _extract_completions_usage(

@staticmethod
def _extract_completions_usage(
-data: Dict,
-) -> Optional[Dict[Literal["prompt", "output"], int]]:
+data: dict,
+) -> Optional[dict[Literal["prompt", "output"], int]]:
if "usage" not in data or not data["usage"]:
return None

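
Beyond the annotation mechanics, `_extract_completions_delta_content` is the logic this file's diff re-annotates with plain `dict`. A hedged sketch of that kind of extraction, using the public OpenAI streaming shapes (`choices[0].text` for /v1/completions chunks, `choices[0].delta.content` for /v1/chat/completions chunks) rather than guidellm's exact internals:

```python
from typing import Literal, Optional


def extract_delta(
    type_: Literal["text_completions", "chat_completions"], data: dict
) -> Optional[str]:
    # No choices in the chunk means no incremental text to report.
    if "choices" not in data or not data["choices"]:
        return None
    if type_ == "text_completions":
        # Legacy completions stream: text arrives directly on the choice.
        return data["choices"][0].get("text")
    # Chat completions stream: text arrives under the "delta" object.
    return data["choices"][0].get("delta", {}).get("content")


print(extract_delta("text_completions", {"choices": [{"text": "Hello"}]}))
print(extract_delta("chat_completions", {"choices": [{"delta": {"content": "Hi"}}]}))
```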
6 changes: 3 additions & 3 deletions src/guidellm/backend/response.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Literal, Optional
+from typing import Any, Literal, Optional

from pydantic import computed_field

@@ -55,8 +55,8 @@ class RequestArgs(StandardBaseModel):
"""

target: str
-headers: Dict[str, str]
-payload: Dict[str, Any]
+headers: dict[str, str]
+payload: dict[str, Any]
timeout: Optional[float] = None
http2: Optional[bool] = None

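
Worth noting why the runtime floor matters for these models in particular: pydantic resolves annotations at runtime to build validators, so `dict[str, str]` on a model field genuinely requires Python 3.9+ (a `from __future__ import annotations` shim would not help, since pydantic still evaluates the annotation). A simplified stand-in for `RequestArgs`, assuming pydantic v2:

```python
from typing import Any, Optional

from pydantic import BaseModel


class RequestArgsSketch(BaseModel):
    # Built-in generics in field annotations are evaluated at class
    # creation time, hence the >=3.9 requirement.
    target: str
    headers: dict[str, str]
    payload: dict[str, Any]
    timeout: Optional[float] = None


args = RequestArgsSketch(target="http://localhost:8000", headers={}, payload={})
print(args.model_dump())
```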
27 changes: 12 additions & 15 deletions src/guidellm/benchmark/aggregator.py
@@ -3,12 +3,9 @@
from pathlib import Path
from typing import (
Any,
-Dict,
Generic,
-List,
Literal,
Optional,
-Tuple,
TypeVar,
Union,
)
@@ -258,7 +255,7 @@ class BenchmarkAggregator(
),
discriminator="type_",
)
-extras: Dict[str, Any] = Field(
+extras: dict[str, Any] = Field(
description=(
"Any additional information or metadata that was passed for this benchmark."
)
@@ -292,9 +289,9 @@ class BenchmarkAggregator(
default_factory=RequestsRunningStats,
)
results: StatusBreakdown[
-List[SchedulerRequestResult[RequestT, ResponseT]],
-List[SchedulerRequestResult[RequestT, ResponseT]],
-List[SchedulerRequestResult[RequestT, ResponseT]],
+list[SchedulerRequestResult[RequestT, ResponseT]],
+list[SchedulerRequestResult[RequestT, ResponseT]],
+list[SchedulerRequestResult[RequestT, ResponseT]],
None,
] = Field(
description=(
@@ -516,7 +513,7 @@ class GenerativeBenchmarkAggregator(
"avaiable that match the preferred source."
)
)
-processor_args: Optional[Dict[str, Any]] = Field(
+processor_args: Optional[dict[str, Any]] = Field(
description=(
"Additional arguments to pass to the tokenizer if it requires "
"any specific configuration for loading or processing."
@@ -636,12 +633,12 @@ def compile(self) -> GenerativeBenchmark:

def _compile_results(
self,
-) -> Tuple[
-List[GenerativeTextResponseStats],
-List[GenerativeTextErrorStats],
-List[GenerativeTextErrorStats],
+) -> tuple[
+list[GenerativeTextResponseStats],
+list[GenerativeTextErrorStats],
+list[GenerativeTextErrorStats],
]:
-successful: List[GenerativeTextResponseStats] = [
+successful: list[GenerativeTextResponseStats] = [
GenerativeTextResponseStats(
request_id=result.request.request_id,
request_type=result.request.request_type,
@@ -670,7 +667,7 @@ def _compile_results(
for result in self.results.successful
if result.request and result.response
]
-incomplete: List[GenerativeTextErrorStats] = [
+incomplete: list[GenerativeTextErrorStats] = [
GenerativeTextErrorStats(
error=result.response.error or "",
request_id=result.request.request_id,
@@ -700,7 +697,7 @@ def _compile_results(
for result in self.results.incomplete
if result.request and result.response
]
-error: List[GenerativeTextErrorStats] = [
+error: list[GenerativeTextErrorStats] = [
GenerativeTextErrorStats(
error=result.response.error or "",
request_id=result.request.request_id,
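
A minimal reproduction of the `tuple[list, ...]` return shape that `_compile_results` now declares, with plain strings standing in for the stats objects it actually builds:

```python
def split_by_status(
    results: list[tuple[str, str]],
) -> tuple[list[str], list[str], list[str]]:
    # Partition (request_id, status) pairs into three parallel buckets.
    successful: list[str] = []
    incomplete: list[str] = []
    errored: list[str] = []
    for request_id, status in results:
        if status == "successful":
            successful.append(request_id)
        elif status == "incomplete":
            incomplete.append(request_id)
        else:
            errored.append(request_id)
    return successful, incomplete, errored


ok, part, err = split_by_status([("r1", "successful"), ("r2", "error")])
print(ok, part, err)  # ['r1'] [] ['r2']
```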
20 changes: 10 additions & 10 deletions src/guidellm/benchmark/benchmark.py
@@ -1,6 +1,6 @@
import random
import uuid
-from typing import Any, Dict, List, Literal, Optional, TypeVar, Union
+from typing import Any, Literal, Optional, TypeVar, Union

from pydantic import Field, computed_field

@@ -273,7 +273,7 @@ class Benchmark(StandardBaseModel):
"requests for this benchmark."
),
)
-extras: Dict[str, Any] = Field(
+extras: dict[str, Any] = Field(
description=(
"Any additional information or metadata that was passed for this benchmark."
)
@@ -608,9 +608,9 @@ def duration(self) -> float:
default=None,
)
requests: StatusBreakdown[
-List[GenerativeTextResponseStats],
-List[GenerativeTextErrorStats],
-List[GenerativeTextErrorStats],
+list[GenerativeTextResponseStats],
+list[GenerativeTextErrorStats],
+list[GenerativeTextErrorStats],
None,
] = Field(
description=(
@@ -663,14 +663,14 @@ def set_sample_size(self, sample_size: Optional[int]) -> "GenerativeBenchmark":
@staticmethod
def from_stats(
run_id: str,
-successful: List[GenerativeTextResponseStats],
-incomplete: List[GenerativeTextErrorStats],
-errored: List[GenerativeTextErrorStats],
+successful: list[GenerativeTextResponseStats],
+incomplete: list[GenerativeTextErrorStats],
+errored: list[GenerativeTextErrorStats],
args: BenchmarkArgs,
run_stats: BenchmarkRunStats,
worker: GenerativeRequestsWorkerDescription,
requests_loader: GenerativeRequestLoaderDescription,
-extras: Optional[Dict[str, Any]],
+extras: Optional[dict[str, Any]],
) -> "GenerativeBenchmark":
"""
Create a GenerativeBenchmark instance from the given statistics and metadata.
@@ -696,7 +696,7 @@ def from_stats(
populated and calculated
"""
total = successful + incomplete + errored
-total_types: List[Literal["successful", "incomplete", "error"]] = [
+total_types: list[Literal["successful", "incomplete", "error"]] = [
*["successful"] * len(successful), # type: ignore[list-item]
*["incomplete"] * len(incomplete), # type: ignore[list-item]
*["error"] * len(errored), # type: ignore[list-item]
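
The re-annotated `total_types` covers a parallel-lists idiom: a status label is generated for each merged request via list multiplication and unpacking. A small self-contained reproduction (stand-in data, not the real stats objects):

```python
from typing import Literal

successful = ["s1", "s2"]
incomplete = ["i1"]
errored = ["e1"]

# Merge the requests, then build a label list that lines up index-for-index.
total = successful + incomplete + errored
total_types: list[Literal["successful", "incomplete", "error"]] = [
    *["successful"] * len(successful),
    *["incomplete"] * len(incomplete),
    *["error"] * len(errored),
]
print(list(zip(total, total_types)))
# [('s1', 'successful'), ('s2', 'successful'), ('i1', 'incomplete'), ('e1', 'error')]
```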
10 changes: 4 additions & 6 deletions src/guidellm/benchmark/benchmarker.py
@@ -1,13 +1,11 @@
import time
import uuid
from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator, Iterable
from pathlib import Path
from typing import (
Any,
-AsyncGenerator,
-Dict,
Generic,
-Iterable,
Literal,
Optional,
Union,
@@ -136,7 +134,7 @@ def __init__(
worker: RequestsWorker[RequestT, ResponseT],
request_loader: Iterable[RequestT],
requests_loader_description: RequestLoaderDescription,
-benchmark_save_extras: Optional[Dict[str, Any]] = None,
+benchmark_save_extras: Optional[dict[str, Any]] = None,
):
self.worker = worker
self.scheduler: Scheduler[RequestT, ResponseT] = Scheduler(
@@ -294,9 +292,9 @@ def __init__(
backend: Backend,
request_loader: Iterable[GenerationRequest],
request_loader_description: GenerativeRequestLoaderDescription,
-benchmark_save_extras: Optional[Dict[str, Any]] = None,
+benchmark_save_extras: Optional[dict[str, Any]] = None,
processor: Optional[Union[str, Path, PreTrainedTokenizerBase]] = None,
-processor_args: Optional[Dict[str, Any]] = None,
+processor_args: Optional[dict[str, Any]] = None,
):
super().__init__(
worker=GenerativeRequestsWorker(backend),
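
One more note on the import moves in this file (and in backend.py and openai.py): since Python 3.9, PEP 585 deprecates the `typing` aliases such as `typing.AsyncGenerator` and `typing.Iterable` in favor of their `collections.abc` counterparts, which are now subscriptable directly. A minimal usage sketch:

```python
import asyncio
from collections.abc import AsyncGenerator, Iterable


async def countdown(start: int) -> AsyncGenerator[int, None]:
    # Async generator annotated with the collections.abc class directly.
    for value in range(start, 0, -1):
        yield value


def total(values: Iterable[int]) -> int:
    return sum(values)


async def main() -> None:
    print([value async for value in countdown(3)])  # [3, 2, 1]
    print(total([1, 2, 3]))  # 6


asyncio.run(main())
```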