
Commit 980262f

Commit message: review comments
Signed-off-by: Bill Nell <[email protected]>
1 parent 9461d73 commit 980262f

File tree

2 files changed (+5, -8 lines)


vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 5 additions & 7 deletions
@@ -55,7 +55,8 @@
 fused_moe_pallas = None  # type: ignore
 logger = init_logger(__name__)
 
-MOE_DP_CHUNK_SIZE = 256
+# Note: this limit is somewhat arbitrary and might be changed later.
+MOE_DP_CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
 
 
 @dataclass
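
The data-parallel chunk size is no longer hard-coded to 256; it is now taken from vLLM's envs module via the VLLM_FUSED_MOE_CHUNK_SIZE environment variable. As a rough, hypothetical sketch of that pattern (the parsing and default in the real envs.py are assumptions here; the fallback below simply reuses the previously hard-coded 256):

import os

# Hypothetical stand-in for envs.VLLM_FUSED_MOE_CHUNK_SIZE: read the chunk
# size from the environment and fall back to the old hard-coded value.
VLLM_FUSED_MOE_CHUNK_SIZE = int(os.environ.get("VLLM_FUSED_MOE_CHUNK_SIZE", "256"))

MOE_DP_CHUNK_SIZE = VLLM_FUSED_MOE_CHUNK_SIZE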
@@ -435,8 +436,6 @@ def set_prepare_finalize(
 
         experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
 
-        self.using_pplx = False
-
         if isinstance(prepare_finalize,
                       (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)):
             logger.debug("BatchedTritonExperts %s", self.moe)
@@ -450,8 +449,6 @@ def set_prepare_finalize(
                 use_int4_w4a16=False,
                 block_shape=None,
             )
-            self.using_pplx = isinstance(prepare_finalize,
-                                         PplxPrepareAndFinalize)
         else:
             logger.debug("TritonExperts %s", self.moe)
             experts = TritonExperts(
@@ -499,7 +496,7 @@ def forward_cuda(
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=torch.uint32 if self.using_pplx else None)
+            indices_type=torch.uint32 if self.use_pplx_kernels else None)
 
         if self.rocm_aiter_moe_enabled:
             return self.rocm_aiter_fused_experts(
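
With the ad-hoc self.using_pplx flag removed from set_prepare_finalize (the two hunks above), forward_cuda now keys the routing-index dtype off self.use_pplx_kernels. A minimal, hypothetical sketch of what such a check could look like if it were derived directly from the stored prepare/finalize object (the real property in vLLM may live elsewhere, e.g. on the MoE parallel config):

class PplxPrepareAndFinalize:
    """Placeholder for the pplx-backed prepare/finalize class in the diff."""


class MoEMethodSketch:
    """Illustrative only; not vLLM's actual class."""

    def __init__(self, prepare_finalize=None):
        self.prepare_finalize = prepare_finalize

    @property
    def use_pplx_kernels(self) -> bool:
        # Derive the check from the object's type instead of tracking a
        # separate boolean flag alongside it.
        return isinstance(self.prepare_finalize, PplxPrepareAndFinalize)

With a property like this, the indices_type selection in the hunk above stays a one-liner: torch.uint32 when pplx kernels are in use, otherwise None.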
@@ -828,7 +825,8 @@ def __init__(
            hidden_dim=hidden_size,
            num_local_experts=self.local_num_experts,
            moe_parallel_config=self.moe_parallel_config,
-           in_dtype=params_dtype,  # TODO: is this right?
+           # TODO (bnell): this needs to be fixed for quantized types.
+           in_dtype=params_dtype,
        )
 
        # Note: get_quant_method will look at the layer's local_num_experts
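
The reworded TODO points out that passing params_dtype as in_dtype is only right for unquantized models; with quantized checkpoints the activation dtype generally differs from the dtype the parameters are stored in. A small, hypothetical illustration of the distinction (names and dtypes below are examples, not vLLM API):

import torch

# For a quantized checkpoint the weights may be stored packed (e.g. uint8)
# while activations flow through the MoE layer in bf16/fp16, so the config's
# in_dtype should follow the activation dtype rather than params_dtype.
params_dtype = torch.uint8         # packed quantized weight storage (example)
activation_dtype = torch.bfloat16
in_dtype = activation_dtype        # not params_dtype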

vllm/platforms/cuda.py

Lines changed: 0 additions & 1 deletion
@@ -158,7 +158,6 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                     "currently not supported with CUDA Graphs.")
                 vllm_config.model_config.enforce_eager = True
                 compilation_config.use_cudagraph = False
-                compilation_config.use_inductor = False
 
     @classmethod
     def get_current_memory_usage(cls,
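
After this change, the unsupported-with-CUDA-Graphs path in check_and_update_config still forces eager execution and turns off CUDA graphs, but it no longer overrides use_inductor, so whatever the compilation config requested for inductor is preserved. A simplified, hypothetical sketch of the resulting behavior (the dataclass below is a stand-in, not vLLM's real CompilationConfig):

from dataclasses import dataclass


@dataclass
class CompilationConfigSketch:
    use_cudagraph: bool = True
    use_inductor: bool = True


cfg = CompilationConfigSketch()
# Unsupported-with-CUDA-Graphs handling, mirroring the hunk above:
cfg.use_cudagraph = False
# cfg.use_inductor is intentionally left alone now.
assert cfg.use_inductor is True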
