 import functools
 import json
 import os
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Optional

 import torch

@@ -472,14 +472,14 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
                             num_tokens_post_padded: torch.Tensor,
                             mul_routed_weight: bool,
                             top_k: int,
-                            config: Dict[str, Any],
+                            config: dict[str, Any],
                             compute_type: tl.dtype,
                             use_fp8_w8a8: bool,
                             use_int8_w8a8: bool,
                             use_int8_w8a16: bool,
                             use_int4_w4a16: bool,
                             per_channel_quant: bool,
-                            block_shape: Optional[List[int]] = None) -> None:
+                            block_shape: Optional[list[int]] = None) -> None:
     assert topk_weights is not None or not mul_routed_weight
     assert topk_weights is None or topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
@@ -622,7 +622,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
 def get_config_file_name(E: int,
                          N: int,
                          dtype: Optional[str],
-                         block_shape: Optional[List[int]] = None) -> str:
+                         block_shape: Optional[list[int]] = None) -> str:
     device_name = current_platform.get_device_name().replace(" ", "_")
     dtype_selector = "" if not dtype else f",dtype={dtype}"
     block_shape_selector = ("" if not block_shape or not all(block_shape) else
@@ -638,7 +638,7 @@ def get_moe_configs(
     dtype: Optional[str],
     block_n: Optional[int] = None,
     block_k: Optional[int] = None,
-) -> Optional[Dict[int, Any]]:
+) -> Optional[dict[int, Any]]:
     """
     Return optimized configurations for the fused MoE kernel.

@@ -670,7 +670,7 @@ def get_moe_configs(
     return None


-def get_moe_wna16_block_config(config: Dict[str,
+def get_moe_wna16_block_config(config: dict[str,
                                             int], use_moe_wna16_cuda: bool,
                                num_valid_tokens: int, size_k: int, size_n: int,
                                num_experts: int, group_size: int,
@@ -742,8 +742,8 @@ def get_default_config(
     topk: int,
     dtype: Optional[str],
     is_marlin: bool,
-    block_shape: Optional[List[int]] = None,
-) -> Dict[str, int]:
+    block_shape: Optional[list[int]] = None,
+) -> dict[str, int]:
     if dtype == "fp8_w8a8" and block_shape is not None:
         # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
         # BLOCK_SIZE_K must be divisible by block_shape[1]
@@ -795,13 +795,13 @@ def get_default_config(


 def try_get_optimal_moe_config(
-    w1_shape: Tuple[int, ...],
-    w2_shape: Tuple[int, ...],
+    w1_shape: tuple[int, ...],
+    w2_shape: tuple[int, ...],
     top_k: int,
     dtype: Optional[str],
     M: int,
     is_marlin: bool = False,
-    block_shape: Optional[List[int]] = None,
+    block_shape: Optional[list[int]] = None,
 ):
     from vllm.model_executor.layers.fused_moe import get_config
     override_config = get_config()
@@ -855,7 +855,7 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")

@@ -895,7 +895,7 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
     e_score_correction_bias: Optional[torch.Tensor] = None
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:

     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")
@@ -982,7 +982,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
                           w2_zp: Optional[torch.Tensor] = None,
                           a1_scale: Optional[torch.Tensor] = None,
                           a2_scale: Optional[torch.Tensor] = None,
-                          block_shape: Optional[List[int]] = None) -> None:
+                          block_shape: Optional[list[int]] = None) -> None:
     fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
                        activation, apply_router_weight_on_input, use_fp8_w8a8,
                        use_int8_w8a8, use_int8_w8a16, use_int4_w4a16,
@@ -1012,7 +1012,7 @@ def inplace_fused_experts_fake(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> None:
+        block_shape: Optional[list[int]] = None) -> None:
     pass


@@ -1046,7 +1046,7 @@ def outplace_fused_experts(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> torch.Tensor:
+        block_shape: Optional[list[int]] = None) -> torch.Tensor:
     return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
                               False, activation, apply_router_weight_on_input,
                               use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16,
@@ -1076,7 +1076,7 @@ def outplace_fused_experts_fake(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> torch.Tensor:
+        block_shape: Optional[list[int]] = None) -> torch.Tensor:
     return torch.empty_like(hidden_states)


@@ -1129,7 +1129,7 @@ def fused_experts(hidden_states: torch.Tensor,
                   w2_zp: Optional[torch.Tensor] = None,
                   a1_scale: Optional[torch.Tensor] = None,
                   a2_scale: Optional[torch.Tensor] = None,
-                  block_shape: Optional[List[int]] = None,
+                  block_shape: Optional[list[int]] = None,
                   allow_deep_gemm: bool = False) -> torch.Tensor:
     if (allow_deep_gemm and use_fp8_w8a8
             and _valid_deep_gemm(hidden_states, w1, w2, expert_map)):
@@ -1184,8 +1184,8 @@ def moe_kernel_prepare_input(
     use_int8_w8a16: bool,
     use_int4_w4a16: bool,
     per_channel_quant: bool,
-    block_shape: Optional[List[int]] = None,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    block_shape: Optional[list[int]] = None,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     if use_fp8_w8a8:
         assert B_scale is not None
         if block_shape is None:
@@ -1248,7 +1248,7 @@ def fused_experts_impl(hidden_states: torch.Tensor,
                        w2_zp: Optional[torch.Tensor] = None,
                        a1_scale: Optional[torch.Tensor] = None,
                        a2_scale: Optional[torch.Tensor] = None,
-                       block_shape: Optional[List[int]] = None):
+                       block_shape: Optional[list[int]] = None):
     # Check constraints.
     if use_int4_w4a16:
         assert hidden_states.shape[1] // 2 == w1.shape[
@@ -1452,7 +1452,7 @@ def fused_moe(
     w2_zp: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[List[int]] = None,
+    block_shape: Optional[list[int]] = None,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -1497,7 +1497,7 @@ def fused_moe(
         a1.
     - a2_scale (Optional[torch.Tensor]): Optional scale to be used for
         a2.
-    - block_shape: (Optional[List[int]]): Optional block size for block-wise
+    - block_shape: (Optional[list[int]]): Optional block size for block-wise
         quantization.

     Returns:
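
Note on the pattern applied throughout this diff: since PEP 585 (Python 3.9), the built-in container types are subscriptable, so annotations such as typing.Dict[str, Any], List[int], and Tuple[int, ...] can be written as dict[str, Any], list[int], and tuple[int, ...], while Optional, Callable, and Any still come from typing. A minimal sketch of the before/after annotation style is below; the helper function is purely illustrative and is not part of the vLLM API.

from typing import Any, Optional


def describe_block_config(config: dict[str, Any],
                          block_shape: Optional[list[int]] = None) -> str:
    # Built-in generics (dict/list) replace typing.Dict/typing.List here;
    # Optional still comes from typing. Hypothetical helper, for illustration.
    shape = block_shape if block_shape is not None else [0, 0]
    return f"BLOCK_SIZE_M={config.get('BLOCK_SIZE_M', 64)}, block_shape={shape}"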