Commit a55cf41

[Compilation][WideEP] Enable Piecewise CUDAGraph for DeepEPHT (vllm-project#24123)
1 parent 6fb2788 commit a55cf41

File tree

2 files changed: +21, -10 lines

vllm/config/compilation.py

Lines changed: 14 additions & 1 deletion

@@ -546,7 +546,8 @@ def set_splitting_ops_for_v1(self):
             # full cudagraph outside the fx graph. This reduces some cpu
             # overhead when the runtime batch_size is not cudagraph captured.
             # see https://github.com/vllm-project/vllm/pull/20059 for details.
-            self.splitting_ops = self._attention_ops
+            # make a copy to avoid mutating the class-level list via reference.
+            self.splitting_ops = list(self._attention_ops)
         elif len(self.splitting_ops) == 0:
             logger.warning_once("Using piecewise compilation with empty "
                                 "splitting_ops.")
@@ -561,6 +562,18 @@ def set_splitting_ops_for_v1(self):
                 self.cudagraph_mode = CUDAGraphMode.FULL
             self.splitting_ops = []
 
+        if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
+            # exclude MoE dispatch/combine from capture by ensuring
+            # piecewise splitting includes them, so communication remains
+            # outside CUDA graphs while compute can still be graphed.
+            moe_ops = [
+                "vllm.moe_forward",
+                "vllm.moe_forward_shared",
+            ]
+            for op in moe_ops:
+                if op not in self.splitting_ops:
+                    self.splitting_ops.append(op)
+
     def splitting_ops_contain_attention(self) -> bool:
         return self.splitting_ops is not None and all(
             op in self.splitting_ops for op in self._attention_ops)
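
Two details are worth calling out in this hunk: list(self._attention_ops) copies the class-level default before it gets mutated, and the DeepEP high-throughput branch appends the two MoE custom ops as extra split points. Below is a minimal sketch of the combined effect, using a stand-in class and illustrative attention-op names, not the real CompilationConfig.

# Minimal sketch (stand-in class, not vLLM's real CompilationConfig; the
# attention-op names below are illustrative assumptions).
import os


class TinyCompilationConfig:
    # Class-level default shared by every instance unless copied.
    _attention_ops = [
        "vllm.unified_attention",
        "vllm.unified_attention_with_output",
    ]

    def __init__(self):
        self.splitting_ops = None

    def set_splitting_ops(self):
        if self.splitting_ops is None:
            # list(...) makes a per-instance copy; assigning the attribute
            # directly would let the append() below mutate the shared list.
            self.splitting_ops = list(self._attention_ops)
        if os.environ.get("VLLM_ALL2ALL_BACKEND") == "deepep_high_throughput":
            for op in ("vllm.moe_forward", "vllm.moe_forward_shared"):
                if op not in self.splitting_ops:
                    self.splitting_ops.append(op)


os.environ["VLLM_ALL2ALL_BACKEND"] = "deepep_high_throughput"
cfg = TinyCompilationConfig()
cfg.set_splitting_ops()
print(cfg.splitting_ops)                     # attention ops plus the two MoE ops
print(TinyCompilationConfig._attention_ops)  # unchanged, thanks to the copy

With the MoE ops listed in splitting_ops, the piecewise backend compiles and captures the subgraphs between them, so the DeepEP dispatch/combine communication runs eagerly between graph replays.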

vllm/platforms/cuda.py

Lines changed: 7 additions & 9 deletions

@@ -183,16 +183,14 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         compilation_config = vllm_config.compilation_config
         if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
                 and parallel_config.data_parallel_size > 1
-                and compilation_config.cudagraph_mode != CUDAGraphMode.NONE):
+                and compilation_config.cudagraph_mode
+                not in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE]):
             logger.info(
-                "Data Parallel: disabling cudagraphs since DP "
-                "with DeepEP high-throughput kernels are not CUDA Graph "
-                "compatible. The DeepEP low-latency kernels are CUDA Graph "
-                "compatible. Set the all_to_all backend to deepep_low_latency "
-                "to use those kernels instead.")
-            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-            if model_config is not None:
-                model_config.enforce_eager = True
+                "Data Parallel with DeepEP high-throughput: using PIECEWISE "
+                "CUDA graphs and excluding MoE ops from capture. Set "
+                "VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE "
+                "graphs captured as well.")
+            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
     @classmethod
     def get_current_memory_usage(cls,
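
Behaviorally, the platform hook now only downgrades full-graph modes to PIECEWISE when data parallelism is combined with the DeepEP high-throughput backend, instead of disabling CUDA graphs and forcing eager mode as the removed lines did. A simplified sketch of that decision follows, with stand-in types and a free function rather than vLLM's actual VllmConfig/Platform API (the real CUDAGraphMode has more members).

# Simplified sketch of the updated check; stand-in enum and function only.
from enum import Enum


class CUDAGraphMode(Enum):
    NONE = 0
    PIECEWISE = 1
    FULL = 2


def resolve_cudagraph_mode(mode: CUDAGraphMode, all2all_backend: str,
                           data_parallel_size: int) -> CUDAGraphMode:
    if (all2all_backend == "deepep_high_throughput"
            and data_parallel_size > 1
            and mode not in (CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE)):
        # DeepEP HT dispatch/combine cannot be captured, but the compute
        # between them can: fall back to piecewise graphs instead of
        # disabling CUDA graphs entirely (the previous behavior).
        return CUDAGraphMode.PIECEWISE
    return mode


assert resolve_cudagraph_mode(
    CUDAGraphMode.FULL, "deepep_high_throughput", 2) is CUDAGraphMode.PIECEWISE
assert resolve_cudagraph_mode(
    CUDAGraphMode.PIECEWISE, "deepep_high_throughput", 2) is CUDAGraphMode.PIECEWISE
assert resolve_cudagraph_mode(
    CUDAGraphMode.FULL, "deepep_low_latency", 2) is CUDAGraphMode.FULL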
