4 files changed, +8 −8 lines changed
vllm/model_executor/layers/fused_moe
@@ -8,7 +8,7 @@
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    StandardPrepareAndFinalize)
+    MoEPrepareAndFinalizeNoEP)
 from vllm.model_executor.layers.fused_moe.utils import _fp8_perm, _resize_cache
 from vllm.scalar_type import scalar_types
@@ -241,7 +241,7 @@ def cutlass_moe_fp8(
         a2_scale.numel() != 1 if a2_scale is not None else False)
 
     fn = mk.FusedMoEModularKernel(
-        StandardPrepareAndFinalize(
+        MoEPrepareAndFinalizeNoEP(
             per_channel_quant=per_act_token,
             quant_dtype=torch.float8_e4m3fn,
         ),
@@ -10,7 +10,7 @@
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
     _moe_permute)
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    StandardPrepareAndFinalize)
+    MoEPrepareAndFinalizeNoEP)
 from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize,
                                                         _resize_cache)
 from vllm.utils import round_up
@@ -205,8 +205,8 @@ def deep_gemm_moe_fp8(
     - torch.Tensor: The bfloat16 output tensor after applying the MoE layer.
     """
     fn = mk.FusedMoEModularKernel(
-        StandardPrepareAndFinalize(quant_dtype=torch.float8_e4m3fn,
-                                   block_shape=deep_gemm_block_shape()),
+        MoEPrepareAndFinalizeNoEP(quant_dtype=torch.float8_e4m3fn,
+                                  block_shape=deep_gemm_block_shape()),
         DeepGemmExperts(),
     )
     return fn(
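Both hunks above touch the same call-site pattern: a FusedMoEModularKernel is built from a prepare/finalize object plus an experts implementation, and only the prepare/finalize class name changes. The sketch below restates the deep_gemm_moe_fp8 construction after the rename; the deep_gemm_moe import path is an assumption (the diff does not name the files it touches), and the tensors ultimately passed to fn(...) are truncated in the diff and omitted here.

# Sketch of the post-rename composition shown in the hunk above; the
# DeepGemmExperts / deep_gemm_block_shape import path is assumed, not shown
# in the diff.
import torch

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
    DeepGemmExperts, deep_gemm_block_shape)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    MoEPrepareAndFinalizeNoEP)

fn = mk.FusedMoEModularKernel(
    MoEPrepareAndFinalizeNoEP(quant_dtype=torch.float8_e4m3fn,
                              block_shape=deep_gemm_block_shape()),
    DeepGemmExperts(),
)
# fn is then called with the hidden states, expert weights, scales, and
# top-k routing tensors; those arguments are cut off in the diff above.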
@@ -16,7 +16,7 @@
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
     moe_align_block_size)
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    StandardPrepareAndFinalize)
+    MoEPrepareAndFinalizeNoEP)
 from vllm.model_executor.layers.fused_moe.utils import (
     _resize_cache, moe_kernel_quantize_input)
 from vllm.platforms import current_platform
@@ -1706,7 +1706,7 @@ def modular_triton_fused_moe(
         use_int4_w4a16=use_int4_w4a16,
     )
     return mk.FusedMoEModularKernel(
-        StandardPrepareAndFinalize(
+        MoEPrepareAndFinalizeNoEP(
             quant_dtype=qtype,
             per_channel_quant=per_channel_quant,
             block_shape=block_shape,
@@ -10,7 +10,7 @@
     moe_kernel_quantize_input)
 
 
-class StandardPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
+class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
 
     def __init__(
         self,
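Because the final hunk renames the class definition itself, any downstream code importing the old symbol needs the same mechanical update; the constructor keywords used at the call sites above (quant_dtype, per_channel_quant, block_shape) are unchanged. A hedged before/after sketch, using only arguments visible in the hunks above:

# Before this change (old name):
#   from vllm.model_executor.layers.fused_moe.prepare_finalize import (
#       StandardPrepareAndFinalize)
#   pf = StandardPrepareAndFinalize(quant_dtype=torch.float8_e4m3fn)

# After this change (new name), same keyword arguments:
import torch
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    MoEPrepareAndFinalizeNoEP)

pf = MoEPrepareAndFinalizeNoEP(quant_dtype=torch.float8_e4m3fn)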