
Commit 1cb6b1d

merge
Signed-off-by: Bill Nell <[email protected]>
1 parent 980262f commit 1cb6b1d

11 files changed: +44 -44 lines changed

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def workspace_shapes(
         K: int,
         topk: int,
         num_experts: int,
-    ) -> Tuple[int, int, torch.dtype]:
+    ) -> tuple[int, int, torch.dtype]:
         # Note that K, N are transposed
         N, K = K, N
         workspace1 = M * topk * max(2 * N, K)
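Every hunk in this commit follows the same pattern: drop the deprecated typing aliases (Tuple, List, Dict) in favor of the builtin generics standardized by PEP 585, which are valid in annotations on Python 3.9 and later. A minimal hedged sketch of the resulting signature shape; the class and argument list below are stand-ins for illustration, not the real cutlass implementation:

import torch


class CutlassExpertsSketch:
    # Stand-in for the workspace_shapes method touched above: the only
    # change in the commit is the return annotation, which now uses the
    # builtin generic tuple[...] (PEP 585) instead of typing.Tuple[...],
    # so the `from typing import Tuple` import can be dropped.
    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
    ) -> tuple[int, int, torch.dtype]:
        # Same sizing idea as the hunk above: K and N are transposed, and
        # the first workspace must hold the larger of the two GEMM outputs.
        N, K = K, N
        workspace1 = M * topk * max(2 * N, K)
        workspace2 = M * topk * N
        return workspace1, workspace2, torch.bfloat16  # dtype is illustrative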

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import functools
 import importlib.util
-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -83,7 +83,7 @@ def workspace_shapes(
         K: int,
         topk: int,
         num_experts: int,
-    ) -> Tuple[int, int, torch.dtype]:
+    ) -> tuple[int, int, torch.dtype]:
         block_m = self.block_shape[0]
         M_sum = (M * topk) + num_experts * (block_m - 1)
         M_sum = round_up(M_sum, block_m)
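The annotation change aside, the context lines here show how the DeepGEMM path sizes its first workspace: every expert may need up to block_m - 1 rows of padding, and the padded total is then rounded up to a multiple of block_m. A small worked sketch of that arithmetic; round_up is assumed to be the usual ceiling-to-a-multiple helper and the numbers are purely illustrative:

def round_up(x: int, multiple: int) -> int:
    # Assumed behavior of vLLM's round_up helper: the smallest multiple
    # of `multiple` that is greater than or equal to x.
    return ((x + multiple - 1) // multiple) * multiple


# Illustrative numbers only: 7 tokens, top-2 routing, 4 experts, block_m=128.
M, topk, num_experts, block_m = 7, 2, 4, 128
M_sum = (M * topk) + num_experts * (block_m - 1)  # 14 + 4 * 127 = 522
M_sum = round_up(M_sum, block_m)                  # 640, i.e. 5 * 128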

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 6 additions & 6 deletions
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """Fused batched MoE kernel."""
-from typing import List, Optional, Tuple
+from typing import Optional

 import torch
 import triton
@@ -406,7 +406,7 @@ def prepare(
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         assert a1.dim() == 2
         assert topk_ids.dim() == 2
         assert topk_ids.size(0) == a1.size(0)
@@ -495,7 +495,7 @@ def __init__(
         use_int8_w8a8: bool = False,
         use_int8_w8a16: bool = False,
         use_int4_w4a16: bool = False,
-        block_shape: Optional[List[int]] = None,
+        block_shape: Optional[list[int]] = None,
         block_m: Optional[int] = None,
     ):
         super().__init__()
@@ -517,7 +517,7 @@ def workspace_shapes(
         K: int,
         topk: int,
         num_experts: int,
-    ) -> Tuple[int, int, torch.dtype]:
+    ) -> tuple[int, int, torch.dtype]:
         assert a.dim() == 2
         num_dp = self.world_size // self.dp_size
         max_num_tokens = a.size(
@@ -600,7 +600,7 @@ def __init__(
         use_int8_w8a8: bool = False,
         use_int8_w8a16: bool = False,
         use_int4_w4a16: bool = False,
-        block_shape: Optional[List[int]] = None,
+        block_shape: Optional[list[int]] = None,
         world_size: int = 1,
         dp_size: int = 1,
     ):
@@ -624,7 +624,7 @@ def workspace_shapes(
         K: int,
         topk: int,
         num_experts: int,
-    ) -> Tuple[int, int, torch.dtype]:
+    ) -> tuple[int, int, torch.dtype]:
         assert a.dim() == 2
         num_dp = self.world_size // self.dp_size
         max_num_tokens = a.size(

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 14 additions & 14 deletions
@@ -3,7 +3,7 @@
 import functools
 import json
 import os
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Optional

 import torch

@@ -757,8 +757,8 @@ def get_default_config(
     topk: int,
     dtype: Optional[str],
     is_marlin: bool,
-    block_shape: Optional[List[int]] = None,
-) -> Dict[str, int]:
+    block_shape: Optional[list[int]] = None,
+) -> dict[str, int]:
     if dtype == "fp8_w8a8" and block_shape is not None:
         # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
         # BLOCK_SIZE_K must be divisible by block_shape[1]
@@ -816,7 +816,7 @@ def try_get_optimal_moe_config(
     dtype: Optional[str],
     M: int,
     is_marlin: bool = False,
-    block_shape: Optional[List[int]] = None,
+    block_shape: Optional[list[int]] = None,
 ):
     from vllm.model_executor.layers.fused_moe import get_config
     override_config = get_config()
@@ -871,7 +871,7 @@ def fused_topk(
     topk: int,
     renormalize: bool,
     indices_type: Optional[torch.dtype] = None,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")

@@ -1013,7 +1013,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
                           w2_zp: Optional[torch.Tensor] = None,
                           a1_scale: Optional[torch.Tensor] = None,
                           a2_scale: Optional[torch.Tensor] = None,
-                          block_shape: Optional[List[int]] = None) -> None:
+                          block_shape: Optional[list[int]] = None) -> None:
     fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
                        activation, apply_router_weight_on_input, use_fp8_w8a8,
                        use_int8_w8a8, use_int8_w8a16, use_int4_w4a16,
@@ -1043,7 +1043,7 @@ def inplace_fused_experts_fake(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> None:
+        block_shape: Optional[list[int]] = None) -> None:
     pass


@@ -1077,7 +1077,7 @@ def outplace_fused_experts(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> torch.Tensor:
+        block_shape: Optional[list[int]] = None) -> torch.Tensor:
     return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
                               False, activation, apply_router_weight_on_input,
                               use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16,
@@ -1107,7 +1107,7 @@ def outplace_fused_experts_fake(
         w2_zp: Optional[torch.Tensor] = None,
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
-        block_shape: Optional[List[int]] = None) -> torch.Tensor:
+        block_shape: Optional[list[int]] = None) -> torch.Tensor:
     return torch.empty_like(hidden_states)


@@ -1228,7 +1228,7 @@ def fused_experts_impl(
     w2_zp: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[List[int]] = None,
+    block_shape: Optional[list[int]] = None,
 ) -> torch.Tensor:
     # Check constraints.
     if use_int4_w4a16:
@@ -1429,7 +1429,7 @@ def fused_moe(
     w2_zp: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[List[int]] = None,
+    block_shape: Optional[list[int]] = None,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -1525,7 +1525,7 @@ def __init__(
         use_int8_w8a16: bool,
         use_int4_w4a16: bool,
         per_channel_quant: bool,
-        block_shape: Optional[List[int]] = None,
+        block_shape: Optional[list[int]] = None,
         block_m: Optional[int] = None,
     ):
         super().__init__()
@@ -1549,7 +1549,7 @@ def workspace_shapes(
         K: int,
         topk: int,
         num_experts: int,
-    ) -> Tuple[int, int, torch.dtype]:
+    ) -> tuple[int, int, torch.dtype]:
         factor = num_experts if a.dim() == 3 else 1
         workspace1 = M * topk * max(N * 2, K) * factor
         workspace2 = M * topk * N * factor
@@ -1697,7 +1697,7 @@ def modular_triton_fused_moe(
     use_int8_w8a16: bool,
     use_int4_w4a16: bool,
     per_channel_quant: bool,
-    block_shape: Optional[List[int]] = None,
+    block_shape: Optional[list[int]] = None,
 ) -> mk.FusedMoEModularKernel:
     qtype = get_config_qtype(
         use_fp8_w8a8=use_fp8_w8a8,
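Most of the hunks above are pure annotation changes, but the get_default_config context carries a real constraint: with fp8 block-wise quantization the Triton tile sizes must line up with the quantization block shape (BLOCK_SIZE_N divisible by block_shape[0], BLOCK_SIZE_K divisible by block_shape[1]). A hedged sketch of a config builder that respects that constraint; the concrete tile values are placeholders, not the defaults vLLM actually picks:

from typing import Optional


def default_moe_config_sketch(
    dtype: Optional[str],
    block_shape: Optional[list[int]] = None,
) -> dict[str, int]:
    # Placeholder defaults when no block-wise quantization is in play.
    if dtype != "fp8_w8a8" or block_shape is None:
        return {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}
    # Block-wise quant: choose tiles that are exact multiples of the
    # quantization block shape so each tile covers whole scaling blocks.
    block_n, block_k = block_shape
    return {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": block_n,  # divisible by block_shape[0]
        "BLOCK_SIZE_K": block_k,  # divisible by block_shape[1]
    }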

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from abc import abstractmethod
 from dataclasses import dataclass
 from enum import Enum
-from typing import Callable, List, Optional, Tuple
+from typing import Callable, Optional
 from weakref import WeakValueDictionary

 import torch

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 4 additions & 4 deletions
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 from abc import ABC, abstractmethod
-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -41,7 +41,7 @@ def _moe_problem_size(
     w1: torch.Tensor,
     w2: torch.Tensor,
     topk_ids: torch.Tensor,
-) -> Tuple[int, int, int, int, int]:
+) -> tuple[int, int, int, int, int]:
     """
     Extract the MoE problem size from the given tensor arguments:
     - a: The hidden states, input to the MoE layer.
@@ -93,7 +93,7 @@ def prepare(
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         """
         Perform any quantization (and/or) dispatching needed
        for this kernel.
@@ -153,7 +153,7 @@ def workspace_shapes(
         K: int,
         topk: int,
         num_experts: int,
-    ) -> Tuple[int, int, torch.dtype]:
+    ) -> tuple[int, int, torch.dtype]:
         """
         Compute the number of elements for the temporary outputs of the two
         gemms and activation in the fused expert function. Since the
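The docstring in the _moe_problem_size hunk explains what the returned 5-tuple means: the problem dimensions are read directly off the tensor arguments. A hedged sketch of that extraction under the usual expert-weight layout assumptions (w1 shaped (E, 2*N, K), w2 shaped (E, K, N)); the names and layout here are assumptions, not copied from the file:

import torch


def moe_problem_size_sketch(
    a: torch.Tensor,         # hidden states, shape (M, K)
    w1: torch.Tensor,        # first expert weights, assumed (E, 2*N, K)
    w2: torch.Tensor,        # second expert weights, assumed (E, K, N)
    topk_ids: torch.Tensor,  # routing ids, shape (M, topk)
) -> tuple[int, int, int, int, int]:
    E, _, K = w1.shape       # number of experts and hidden size
    N = w2.shape[-1]         # intermediate size
    M = a.shape[0]           # number of tokens
    topk = topk_ids.shape[-1]
    return E, M, N, K, topk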

vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -16,7 +16,7 @@ def _moe_permute(
     global_num_experts: int,
     expert_map: Optional[torch.Tensor],
     block_m: int,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
+) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
            Optional[torch.Tensor]]:
     """
     Determine the sorted_token_ids, expert_ids for the given problem size.
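Again only the annotation changes here, but the docstring describes the operation itself: tokens are reordered so that all rows routed to the same expert become contiguous before the grouped GEMM. A minimal hedged sketch of that sort, ignoring the block_m alignment and expert_map remapping the real kernel also handles:

import torch


def moe_permute_sketch(
    topk_ids: torch.Tensor,  # (M, topk) expert assignment per token
) -> tuple[torch.Tensor, torch.Tensor]:
    # Flatten the (token, slot) assignments and sort by expert id so that
    # rows destined for the same expert end up adjacent in memory.
    flat_ids = topk_ids.flatten()
    expert_ids, sorted_token_ids = torch.sort(flat_ids, stable=True)
    return sorted_token_ids, expert_ids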

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Optional, Tuple
+from typing import Optional

 import pplx_kernels as pplx
 import torch
@@ -21,7 +21,7 @@ def __init__(self,
                  rank: int,
                  dp_size: int,
                  quant_dtype: Optional[torch.dtype] = None,
-                 block_shape: Optional[List[int]] = None):
+                 block_shape: Optional[list[int]] = None):
         super().__init__()
         assert max_num_tokens > 0
         self.a2a = a2a
@@ -42,7 +42,7 @@ def prepare(
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         num_tokens = a1.size(0)  # M
         hidden_dim = a1.size(-1)  # K

vllm/model_executor/layers/fused_moe/prepare_finalize.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -33,7 +33,7 @@ def prepare(
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
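The apply_router_weight_on_input path shown in the context lines folds the routing weight into the activations before dispatch; per the TODO it currently assumes top-1 routing, where each token has exactly one weight that can simply be premultiplied. A hedged sketch of that idea (function name and shapes are illustrative):

import torch


def apply_router_weight_on_input_sketch(
    a1: torch.Tensor,            # (M, K) hidden states
    topk_weights: torch.Tensor,  # (M, topk) routing weights
    topk_ids: torch.Tensor,      # (M, topk) expert ids
) -> torch.Tensor:
    topk = topk_ids.size(1)
    # Mirrors the TODO above: only valid for topk == 1, where the single
    # per-token weight can be broadcast-multiplied into the input.
    assert topk == 1, "apply_router_weight_on_input only supports topk=1"
    return a1 * topk_weights.to(a1.dtype)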

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py

Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Optional, Tuple
+from typing import Optional

 import torch

@@ -17,7 +17,7 @@ def __init__(self,
                  use_int8_w8a16: bool = False,
                  use_int4_w4a16: bool = False,
                  per_channel_quant: bool = False,
-                 block_shape: Optional[List[int]] = None,
+                 block_shape: Optional[list[int]] = None,
                  block_m: Optional[int] = None,
                  allow_deep_gemm: bool = False):
         super().__init__()
@@ -40,7 +40,7 @@ def workspace_shapes(
         K: int,
         topk: int,
         num_experts: int,
-    ) -> Tuple[int, int, torch.dtype]:
+    ) -> tuple[int, int, torch.dtype]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
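The comment kept in this last hunk explains the design choice of the combined Triton/DeepGEMM wrapper: since the DeepGEMM workspaces are strictly larger, sizing buffers for DeepGEMM is always safe even when the call later falls back to Triton. In the general case that pessimism is just an elementwise max over the candidate sizes, as in this hedged sketch (the helper name is a stand-in, not part of the file):

def pessimistic_workspace_sketch(
    deep_gemm_sizes: tuple[int, int],
    triton_sizes: tuple[int, int],
) -> tuple[int, int]:
    # Allocate for the worst case of the two backends. Per the comment
    # above, the DeepGEMM sizes already dominate for these kernels, so the
    # max reduces to them; taking it explicitly keeps the sketch safe.
    return (
        max(deep_gemm_sizes[0], triton_sizes[0]),
        max(deep_gemm_sizes[1], triton_sizes[1]),
    )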
