EmbeddedLLM
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 0 additions & 1 deletion
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
Lines changed: 17 additions & 16 deletions
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
Lines changed: 19 additions & 19 deletions
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
Lines changed: 4 additions & 4 deletions
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
Lines changed: 8 additions & 7 deletions
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
Lines changed: 5 additions & 5 deletions
diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py
Lines changed: 2 additions & 1 deletion
@@ -74,7 +74,6 @@ exclude = [
 # Python 3.8 typing. TODO: Remove these excludes after v1.0.0
 "vllm/adapter_commons/**/*.py" = ["UP006", "UP035"]
 "vllm/attention/**/*.py" = ["UP006", "UP035"]
-"vllm/compilation/**/*.py" = ["UP006", "UP035"]
 "vllm/core/**/*.py" = ["UP006", "UP035"]
 "vllm/device_allocator/**/*.py" = ["UP006", "UP035"]
 "vllm/distributed/**/*.py" = ["UP006", "UP035"]
 
@@ -5,8 +5,9 @@
 import os
 import pprint
 import time
+from collections.abc import Sequence
 from contextlib import ExitStack
-from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple
+from typing import Any, Callable, Optional
 from unittest.mock import patch
 
 import torch
@@ -56,7 +57,7 @@ class CompilerManager:
     """
 
     def __init__(self, compilation_config: CompilationConfig):
-        self.cache: Dict[Tuple[Optional[int], int, str], Any] = dict()
+        self.cache: dict[tuple[Optional[int], int, str], Any] = dict()
         self.is_cache_updated = False
         self.compilation_config = compilation_config
         self.compiler = make_compiler(compilation_config)
@@ -90,7 +91,7 @@ def save_to_file(self):
 
     def load(self,
              graph: fx.GraphModule,
-             example_inputs: List[Any],
+             example_inputs: list[Any],
              graph_index: int,
              runtime_shape: Optional[int] = None) -> Optional[Callable]:
         if (runtime_shape, graph_index, self.compiler.name) not in self.cache:
@@ -186,7 +187,7 @@ class SplitItem:
 
 
 def split_graph(graph: fx.GraphModule,
-                ops: List[str]) -> Tuple[fx.GraphModule, List[SplitItem]]:
+                ops: list[str]) -> tuple[fx.GraphModule, list[SplitItem]]:
     # split graph by ops
     subgraph_id = 0
     node_to_subgraph_id = {}
@@ -252,7 +253,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
     """
 
     def __init__(self, module: torch.fx.GraphModule,
-                 compile_submod_names: List[str], vllm_config: VllmConfig,
+                 compile_submod_names: list[str], vllm_config: VllmConfig,
                  graph_pool, vllm_backend: "VllmBackend"):
         super().__init__(module)
         from torch._guards import detect_fake_mode
@@ -274,8 +275,8 @@ def run(self, *args):
             return super().run(*fake_args)
 
     def call_module(self, target: torch.fx.node.Target,
-                    args: Tuple[torch.fx.node.Argument,
-                                ...], kwargs: Dict[str, Any]) -> Any:
+                    args: tuple[torch.fx.node.Argument,
+                                ...], kwargs: dict[str, Any]) -> Any:
         assert isinstance(target, str)
         output = super().call_module(target, args, kwargs)
 
@@ -326,12 +327,12 @@ class VllmBackend:
     graph: fx.GraphModule
     # the stiching graph module for all the piecewise graphs
     split_gm: fx.GraphModule
-    piecewise_graphs: List[SplitItem]
+    piecewise_graphs: list[SplitItem]
     returned_callable: Callable
     # Inductor passes to run on the graph pre-defunctionalization
     post_grad_passes: Sequence[Callable]
-    sym_tensor_indices: List[int]
-    input_buffers: List[torch.Tensor]
+    sym_tensor_indices: list[int]
+    input_buffers: list[torch.Tensor]
     compiler_manager: CompilerManager
 
     def __init__(
@@ -573,14 +574,14 @@ class ConcreteSizeEntry:
 
     # for cudagraph debugging, track the input addresses
     # during capture, and check if they are the same during replay
-    input_addresses: Optional[List[int]] = None
+    input_addresses: Optional[list[int]] = None
 
 
 class PiecewiseBackend:
 
     def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
                  graph_pool: Any, piecewise_compile_index: int,
-                 total_piecewise_compiles: int, sym_shape_indices: List[int],
+                 total_piecewise_compiles: int, sym_shape_indices: list[int],
                  compiled_graph_for_general_shape: Callable,
                  vllm_backend: VllmBackend):
         """
@@ -608,9 +609,9 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
         self.is_last_graph = (
             piecewise_compile_index == total_piecewise_compiles - 1)
 
-        self.compile_sizes: Set[int] = set(
+        self.compile_sizes: set[int] = set(
             self.compilation_config.compile_sizes)
-        self.cudagraph_capture_sizes: Set[int] = set(
+        self.cudagraph_capture_sizes: set[int] = set(
             self.compilation_config.cudagraph_capture_sizes
         ) if self.compilation_config.use_cudagraph else set()
 
@@ -624,11 +625,11 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
 
         # the entries for different shapes that we need to either
         # compile or capture cudagraph
-        self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
+        self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {}
 
         # to_be_compiled_sizes tracks the remaining sizes to compile,
         # and updates during the compilation process, so we need to copy it
-        self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy()
+        self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy()
         for shape in self.compile_sizes.union(self.cudagraph_capture_sizes):
             self.concrete_size_entries[shape] = ConcreteSizeEntry(
                 runtime_shape=shape,
 
@@ -4,7 +4,7 @@
 import hashlib
 import os
 from contextlib import ExitStack
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Optional
 from unittest.mock import patch
 
 import torch
@@ -48,11 +48,11 @@ def compute_hash(self, vllm_config: VllmConfig) -> str:
     def compile(
         self,
         graph: fx.GraphModule,
-        example_inputs: List[Any],
-        compiler_config: Dict[str, Any],
+        example_inputs: list[Any],
+        compiler_config: dict[str, Any],
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
-    ) -> Tuple[Optional[Callable], Optional[Any]]:
+    ) -> tuple[Optional[Callable], Optional[Any]]:
         """
         Compile the graph with the given example inputs and compiler config,
         with a runtime shape. If the `runtime_shape` is None, it means
@@ -82,7 +82,7 @@ def compile(
     def load(self,
              handle: Any,
              graph: fx.GraphModule,
-             example_inputs: List[Any],
+             example_inputs: list[Any],
              graph_index: int,
              runtime_shape: Optional[int] = None) -> Callable:
         """
@@ -120,7 +120,7 @@ class AlwaysHitShapeEnv:
     """
 
     def __init__(self) -> None:
-        self.guards: List[Any] = []
+        self.guards: list[Any] = []
 
     def evaluate_guards_expression(self, *args, **kwargs):
         return True
@@ -132,8 +132,8 @@ def produce_guards_expression(self, *args, **kwargs):
         return ""
 
 
-def get_inductor_factors() -> List[Any]:
-    factors: List[Any] = []
+def get_inductor_factors() -> list[Any]:
+    factors: list[Any] = []
     # summarize system state
     from torch._inductor.codecache import CacheBase
     system_factors = CacheBase.get_system()
@@ -169,11 +169,11 @@ def initialize_cache(self, cache_dir: str, disable_cache: bool = False):
     def compile(
         self,
         graph: fx.GraphModule,
-        example_inputs: List[Any],
-        compiler_config: Dict[str, Any],
+        example_inputs: list[Any],
+        compiler_config: dict[str, Any],
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
-    ) -> Tuple[Optional[Callable], Optional[Any]]:
+    ) -> tuple[Optional[Callable], Optional[Any]]:
         current_config = {}
         if compiler_config is not None:
             current_config.update(compiler_config)
@@ -201,7 +201,7 @@ def compile(
     def load(self,
              handle: Any,
              graph: fx.GraphModule,
-             example_inputs: List[Any],
+             example_inputs: list[Any],
              graph_index: int,
              runtime_shape: Optional[int] = None) -> Callable:
         assert isinstance(handle, tuple)
@@ -256,11 +256,11 @@ def initialize_cache(self, cache_dir: str, disable_cache: bool = False):
     def compile(
         self,
         graph: fx.GraphModule,
-        example_inputs: List[Any],
-        compiler_config: Dict[str, Any],
+        example_inputs: list[Any],
+        compiler_config: dict[str, Any],
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
-    ) -> Tuple[Optional[Callable], Optional[Any]]:
+    ) -> tuple[Optional[Callable], Optional[Any]]:
         from torch._inductor.compile_fx import compile_fx
         current_config = {}
         if compiler_config is not None:
@@ -420,7 +420,7 @@ def _get_shape_env() -> AlwaysHitShapeEnv:
     def load(self,
              handle: Any,
              graph: fx.GraphModule,
-             example_inputs: List[Any],
+             example_inputs: list[Any],
              graph_index: int,
              runtime_shape: Optional[int] = None) -> Callable:
         assert isinstance(handle, tuple)
@@ -522,11 +522,11 @@ class EagerAdaptor(CompilerInterface):
     def compile(
         self,
         graph: fx.GraphModule,
-        example_inputs: List[Any],
-        compiler_config: Dict[str, Any],
+        example_inputs: list[Any],
+        compiler_config: dict[str, Any],
         runtime_shape: Optional[int] = None,
         key: Optional[str] = None,
-    ) -> Tuple[Optional[Callable], Optional[Any]]:
+    ) -> tuple[Optional[Callable], Optional[Any]]:
         # we don't need to compile the graph, just return the graph itself.
         # It does not support caching, return None for the handle.
         return graph, None
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import inspect
-from typing import Callable, Dict, List, Optional, TypeVar, Union, overload
+from typing import Callable, Optional, TypeVar, Union, overload
 from unittest.mock import patch
 
 import torch
@@ -25,7 +25,7 @@
 @overload
 def support_torch_compile(
     *,
-    dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]],
+    dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]],
 ) -> Callable[[_T], _T]:
     ...
 
@@ -38,7 +38,7 @@ def support_torch_compile(cls: _T) -> _T:
 def support_torch_compile(
     cls: Optional[_T] = None,
     *,
-    dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]] = None,
+    dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]] = None,
 ) -> Union[Callable[[_T], _T], _T]:
     """
     A decorator to add support for compiling the forward method of a class.
@@ -131,7 +131,7 @@ def cls_decorator_helper(cls: _T) -> _T:
 
 def _support_torch_compile(
     cls: _T,
-    dynamic_arg_dims: Dict[str, Union[int, List[int]]],
+    dynamic_arg_dims: dict[str, Union[int, list[int]]],
 ) -> _T:
     """
     A decorator to add support for compiling the forward method of a class.
 
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import operator
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from collections.abc import Iterable
+from typing import Optional, Union
 
 import torch
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
@@ -27,7 +28,7 @@ def __call__(self, graph: torch.fx.Graph):
         self.begin()
         self.dump_graph(graph, "before_fix_functionalization")
 
-        self.nodes_to_remove: List[torch.fx.Node] = []
+        self.nodes_to_remove: list[torch.fx.Node] = []
         count = 0
         for node in graph.nodes:
             if not is_func(node, auto_functionalized):
@@ -117,8 +118,8 @@ def _remove(self, node_or_nodes: Union[torch.fx.Node,
     def defunctionalize(self,
                         graph: torch.fx.Graph,
                         node: torch.fx.Node,
-                        mutated_args: Dict[int, Union[torch.fx.Node, str]],
-                        args: Optional[Tuple[Union[torch.fx.Node, str],
+                        mutated_args: dict[int, Union[torch.fx.Node, str]],
+                        args: Optional[tuple[Union[torch.fx.Node, str],
                                              ...]] = None):
         """
         De-functionalize a node by replacing it with a call to the original.
@@ -130,7 +131,7 @@ def defunctionalize(self,
         self._remove(node)
 
     def replace_users_with_mutated_args(self, node: torch.fx.Node,
-                                        mutated_args: Dict[int,
+                                        mutated_args: dict[int,
                                                            Union[torch.fx.Node,
                                                                  str]]):
         """
@@ -146,7 +147,7 @@ def replace_users_with_mutated_args(self, node: torch.fx.Node,
             user.replace_all_uses_with(arg)
             self._remove(user)
 
-    def getitem_users(self, node: torch.fx.Node) -> Dict[int, torch.fx.Node]:
+    def getitem_users(self, node: torch.fx.Node) -> dict[int, torch.fx.Node]:
         """
         Returns the operator.getitem users of the auto-functionalized node,
         indexed by the index they are getting.
@@ -161,7 +162,7 @@ def getitem_users(self, node: torch.fx.Node) -> Dict[int, torch.fx.Node]:
     def insert_defunctionalized(self,
                                 graph: torch.fx.Graph,
                                 node: torch.fx.Node,
-                                args: Optional[Tuple[Union[torch.fx.Node, str],
+                                args: Optional[tuple[Union[torch.fx.Node, str],
                                                      ...]] = None):
         """
         Insert a new defunctionalized node into the graph before node.
 
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
+from typing import Callable, NamedTuple, Optional
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -57,7 +57,7 @@ def __str__(self):
 kFp8DynamicTensorSym = QuantKey(FP8_DTYPE, False, True, True)
 kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, False, False, True)
 
-QUANT_OPS: Dict[QuantKey, OpOverload] = {
+QUANT_OPS: dict[QuantKey, OpOverload] = {
     kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default,  # noqa
     kFp8DynamicTensorSym:
     torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa
@@ -80,7 +80,7 @@ def __str__(self):
                 f"{'' if self.fused_add else 'out'} residual)")
 
 
-FUSED_OPS: Dict[FusedRMSQuantKey, OpOverload] = {
+FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = {
     FusedRMSQuantKey(kFp8StaticTensorSym, False):
     torch.ops._C.rms_norm_static_fp8_quant.default,  # noqa
     FusedRMSQuantKey(kFp8StaticTensorSym, True):
@@ -101,7 +101,7 @@ def __init__(self, match: pm.Match, quant_op, fused_op):
         self.QUANT_OP = quant_op  # in-place quant op
         self.FUSED_OP = fused_op  # in-place fused quant op
 
-    def insert_fused_node(self, fused_return_mapping: Dict[int, Tuple[fx.Node,
+    def insert_fused_node(self, fused_return_mapping: dict[int, tuple[fx.Node,
                                                                       int]],
                           **kwargs):
         """
@@ -548,7 +548,7 @@ def __init__(self, config: VllmConfig):
             "FusionPass singleton instance already exists"
         super().__init__(config)
 
-        self.matches: List[MultiOutputMatch] = []
+        self.matches: list[MultiOutputMatch] = []
         self.patterns: PatternMatcherPass = PatternMatcherPass(
             pass_name="fusion_pass")
 
 
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import operator
-from typing import Iterable, Optional
+from collections.abc import Iterable
+from typing import Optional
 
 from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized