Commit d5943f0

Fix typing error: replace some list[int] with List[int]
Signed-off-by: fhl <[email protected]>
1 parent 6302a7d commit d5943f0

Showing 15 changed files with 54 additions and 51 deletions.
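
The diff swaps the PEP 585 builtin generic list[int] for typing.List[int] in the block_shape annotations. The commit message does not spell out the motivation, but the usual reason is compatibility with Python releases before 3.9, where subscripting the builtin list raises TypeError at the point the annotation is evaluated. A minimal, self-contained sketch of the pattern (the resolve_block_shape helper and the [128, 128] default are illustrative, not taken from vLLM):

# Minimal sketch, not part of the commit: typing.List[int] works on
# Python 3.7/3.8, while the builtin list[int] only became subscriptable
# in Python 3.9 (PEP 585) and raises TypeError on earlier versions.
from typing import List, Optional

def resolve_block_shape(block_shape: Optional[List[int]] = None) -> List[int]:
    # Hypothetical helper: fall back to a default block shape when the
    # caller passes None, mirroring the optional block_shape parameters
    # touched throughout this diff.
    return block_shape if block_shape is not None else [128, 128]

print(resolve_block_shape())          # [128, 128]
print(resolve_block_shape([64, 64]))  # [64, 64]

On Python 3.8 this sketch runs as written, whereas writing list[int] in the signature would fail as soon as the function definition is executed.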

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional
+from typing import Optional, List

 import torch

@@ -21,7 +21,7 @@ def __init__(self,
                  use_int8_w8a16: bool = False,
                  use_int4_w4a16: bool = False,
                  per_channel_quant: bool = False,
-                 block_shape: Optional[list[int]] = None,
+                 block_shape: Optional[List[int]] = None,
                  allow_deep_gemm: bool = False):
         super().__init__()
         assert not use_int8_w8a8, "NYI"

vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional
+from typing import Optional, List

 import deep_ep
 import torch
@@ -22,7 +22,7 @@ def __init__(self,
                  dp_size: int,
                  rank_expert_offset: int,
                  quant_dtype: Optional[torch.dtype] = None,
-                 block_shape: Optional[list[int]] = None):
+                 block_shape: Optional[List[int]] = None):
         super().__init__()
         self.buffer = buffer
         self.world_size = world_size

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional, Union
+from typing import Optional, Union, List

 import deep_ep
 import torch
@@ -43,7 +43,7 @@ def __init__(self,
                  dp_size: int,
                  max_tokens_per_rank: int,
                  quant_dtype: Optional[torch.dtype] = None,
-                 block_shape: Optional[list[int]] = None,
+                 block_shape: Optional[List[int]] = None,
                  use_fp8_dispatch: bool = False):
         super().__init__()

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 4 additions & 4 deletions
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused batched MoE kernel."""
-from typing import Optional
+from typing import Optional, List

 import torch
 import triton
@@ -325,7 +325,7 @@ def invoke_moe_batched_triton_kernel(
         use_int8_w8a16: bool,
         use_int4_w4a16: bool,
         config: dict[str, int],
-        block_shape: Optional[list[int]] = None):
+        block_shape: Optional[List[int]] = None):

     assert not use_int4_w4a16
     max_num_tokens = A.size(1)
@@ -496,7 +496,7 @@ def __init__(
         use_int8_w8a8: bool = False,
         use_int8_w8a16: bool = False,
         use_int4_w4a16: bool = False,
-        block_shape: Optional[list[int]] = None,
+        block_shape: Optional[List[int]] = None,
         block_m: Optional[int] = None,
     ):
         super().__init__()
@@ -596,7 +596,7 @@ def __init__(
         use_int8_w8a16: bool = False,
         use_int4_w4a16: bool = False,
         per_channel_quant: bool = False,
-        block_shape: Optional[list[int]] = None,
+        block_shape: Optional[List[int]] = None,
         world_size: int = 1,
         dp_size: int = 1,
     ):

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 14 additions & 14 deletions
@@ -4,7 +4,7 @@
 import functools
 import json
 import os
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, List

 import torch

@@ -481,7 +481,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
                             use_int8_w8a16: bool,
                             use_int4_w4a16: bool,
                             per_channel_quant: bool,
-                            block_shape: Optional[list[int]] = None) -> None:
+                            block_shape: Optional[List[int]] = None) -> None:
     assert topk_weights is not None or not mul_routed_weight
     assert topk_weights is None or topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
@@ -638,7 +638,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
 def get_config_file_name(E: int,
                          N: int,
                          dtype: Optional[str],
-                         block_shape: Optional[list[int]] = None) -> str:
+                         block_shape: Optional[List[int]] = None) -> str:
     device_name = current_platform.get_device_name().replace(" ", "_")
     dtype_selector = "" if not dtype else f",dtype={dtype}"
     block_shape_selector = ("" if not block_shape or not all(block_shape) else
@@ -758,7 +758,7 @@ def get_default_config(
     topk: int,
     dtype: Optional[str],
     is_marlin: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: Optional[List[int]] = None,
 ) -> dict[str, int]:
     if dtype == "fp8_w8a8" and block_shape is not None:
         # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
@@ -817,7 +817,7 @@ def try_get_optimal_moe_config(
     dtype: Optional[str],
     M: int,
     is_marlin: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: Optional[List[int]] = None,
 ):
     from vllm.model_executor.layers.fused_moe import get_config
     override_config = get_config()
@@ -1014,7 +1014,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
                           w2_zp: Optional[torch.Tensor] = None,
                           a1_scale: Optional[torch.Tensor] = None,
                           a2_scale: Optional[torch.Tensor] = None,
-                          block_shape: Optional[list[int]] = None) -> None:
+                          block_shape: Optional[List[int]] = None) -> None:
     fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
                        activation, apply_router_weight_on_input, use_fp8_w8a8,
                        use_int8_w8a8, use_int8_w8a16, use_int4_w4a16,
@@ -1044,7 +1044,7 @@ def inplace_fused_experts_fake(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[list[int]] = None) -> None:
+        block_shape: Optional[List[int]] = None) -> None:
     pass


@@ -1078,7 +1078,7 @@ def outplace_fused_experts(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[list[int]] = None) -> torch.Tensor:
+        block_shape: Optional[List[int]] = None) -> torch.Tensor:
     return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
                               False, activation, apply_router_weight_on_input,
                               use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16,
@@ -1108,7 +1108,7 @@ def outplace_fused_experts_fake(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[list[int]] = None) -> torch.Tensor:
+        block_shape: Optional[List[int]] = None) -> torch.Tensor:
     return torch.empty_like(hidden_states)


@@ -1158,7 +1158,7 @@ def fused_experts(hidden_states: torch.Tensor,
                   w2_zp: Optional[torch.Tensor] = None,
                   a1_scale: Optional[torch.Tensor] = None,
                   a2_scale: Optional[torch.Tensor] = None,
-                  block_shape: Optional[list[int]] = None,
+                  block_shape: Optional[List[int]] = None,
                   allow_deep_gemm: bool = False) -> torch.Tensor:
     # For now, disable DeepGemm for small N (<= 512) until better
     # permute/unpermute ops are available.
@@ -1229,7 +1229,7 @@ def fused_experts_impl(
     w2_zp: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
+    block_shape: Optional[List[int]] = None,
 ) -> torch.Tensor:
     # Check constraints.
     if use_int4_w4a16:
@@ -1430,7 +1430,7 @@ def fused_moe(
     w2_zp: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
+    block_shape: Optional[List[int]] = None,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -1526,7 +1526,7 @@ def __init__(
         use_int8_w8a16: bool,
         use_int4_w4a16: bool,
         per_channel_quant: bool,
-        block_shape: Optional[list[int]] = None,
+        block_shape: Optional[List[int]] = None,
         block_m: Optional[int] = None,
     ):
         super().__init__()
@@ -1700,7 +1700,7 @@ def modular_triton_fused_moe(
     use_int8_w8a16: bool,
     use_int4_w4a16: bool,
     per_channel_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: Optional[List[int]] = None,
 ) -> mk.FusedMoEModularKernel:
     qtype = get_config_qtype(
         use_fp8_w8a8=use_fp8_w8a8,

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
+from typing import Optional, List

 import pplx_kernels as pplx
 import torch
@@ -21,7 +21,7 @@ def __init__(self,
                  rank: int,
                  dp_size: int,
                  quant_dtype: Optional[torch.dtype] = None,
-                 block_shape: Optional[list[int]] = None,
+                 block_shape: Optional[List[int]] = None,
                  per_act_token: bool = False):
         super().__init__()
         assert max_num_tokens > 0

vllm/model_executor/layers/fused_moe/prepare_finalize.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
+from typing import Optional, List

 import torch

@@ -17,7 +17,7 @@ def __init__(
         self,
         quant_dtype: Optional[torch.dtype] = None,
         per_channel_quant: bool = False,
-        block_shape: Optional[list[int]] = None,
+        block_shape: Optional[List[int]] = None,
     ):
         super().__init__()
         self.per_channel_quant = per_channel_quant

vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import IntEnum
 from functools import cache
-from typing import Optional
+from typing import Optional, List

 import torch

@@ -315,7 +315,7 @@ def rocm_aiter_fused_experts(
         w2_scale: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[list[int]] = None) -> torch.Tensor:
+        block_shape: Optional[List[int]] = None) -> torch.Tensor:

     activation_method = (ActivationMethod.SILU
                          if activation == "silu" else ActivationMethod.GELU)

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
+from typing import Optional, List

 import torch

@@ -18,7 +18,7 @@ def __init__(self,
                  use_int8_w8a16: bool = False,
                  use_int4_w4a16: bool = False,
                  per_channel_quant: bool = False,
-                 block_shape: Optional[list[int]] = None,
+                 block_shape: Optional[List[int]] = None,
                  block_m: Optional[int] = None,
                  allow_deep_gemm: bool = False):
         super().__init__()

vllm/model_executor/layers/fused_moe/utils.py

Lines changed: 4 additions & 4 deletions
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from math import prod
-from typing import Optional
+from typing import Optional, List

 import torch

@@ -27,7 +27,7 @@ def _fp8_quantize(
     A: torch.Tensor,
     A_scale: Optional[torch.Tensor],
     per_act_token: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: Optional[List[int]] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Perform fp8 quantization on the inputs. If a block_shape
@@ -49,7 +49,7 @@ def _int8_quantize(
     A: torch.Tensor,
     A_scale: Optional[torch.Tensor],
     per_act_token: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: Optional[List[int]] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Perform int8 quantization on the inputs. If a block_shape
@@ -77,7 +77,7 @@ def moe_kernel_quantize_input(
     A_scale: Optional[torch.Tensor],
     qtype: Optional[torch.dtype],
     per_channel_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: Optional[List[int]] = None,
 ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     if qtype == torch.float8_e4m3fn:
         return _fp8_quantize(A, A_scale, per_channel_quant, block_shape)
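
The utils.py hunk above ends inside the qtype dispatch of moe_kernel_quantize_input. The sketch below mirrors only that dispatch shape; quantize_input_sketch and _stub_quantize are hypothetical stand-ins, not vLLM's _fp8_quantize/_int8_quantize kernels (the real ones honor per_act_token and block_shape to compute per-token or block-wise scales), and the float8 branch assumes a torch build that exposes torch.float8_e4m3fn.

from typing import List, Optional, Tuple

import torch

def _stub_quantize(A: torch.Tensor,
                   A_scale: Optional[torch.Tensor],
                   per_act_token: bool,
                   block_shape: Optional[List[int]],
                   dtype: torch.dtype) -> Tuple[torch.Tensor, torch.Tensor]:
    # Illustrative per-tensor quantization only; per_act_token and
    # block_shape are accepted but ignored in this stand-in.
    max_repr = 127.0 if dtype == torch.int8 else 448.0  # e4m3fn max is 448
    scale = A.abs().amax().clamp(min=1e-6) / max_repr
    q = A / scale
    if dtype == torch.int8:
        q = q.round()
    return q.to(dtype), scale.reshape(1)

def quantize_input_sketch(
    A: torch.Tensor,
    A_scale: Optional[torch.Tensor],
    qtype: Optional[torch.dtype],
    per_channel_quant: bool,
    block_shape: Optional[List[int]] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    # Same dispatch shape as the hunk: quantize for fp8/int8 targets,
    # otherwise pass the activations through untouched.
    if qtype == torch.float8_e4m3fn:
        return _stub_quantize(A, A_scale, per_channel_quant, block_shape,
                              torch.float8_e4m3fn)
    if qtype == torch.int8:
        return _stub_quantize(A, A_scale, per_channel_quant, block_shape,
                              torch.int8)
    return A, A_scale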

0 commit comments
