@@ -548,27 +548,7 @@ def set_splitting_ops_for_v1(self):
             "set_splitting_ops_for_v1 should only be called when "
             "level is CompilationLevel.PIECEWISE")
 
-        if self.splitting_ops is None:
-            # NOTE: When using full cudagraph, instead of setting an empty
-            # list and capture the full cudagraph inside the flattened fx
-            # graph, we keep the piecewise fx graph structure but capture the
-            # full cudagraph outside the fx graph. This reduces some cpu
-            # overhead when the runtime batch_size is not cudagraph captured.
-            # see https://github.com/vllm-project/vllm/pull/20059 for details.
-            self.splitting_ops = self._attention_ops
-        elif len(self.splitting_ops) == 0:
-            logger.warning_once("Using piecewise compilation with empty "
-                                "splitting_ops.")
-            if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
-                logger.warning_once(
-                    "When compilation level is piecewise with empty "
-                    "splitting_ops, PIECEWISE cudagraph_mode will be "
-                    "treated as FULL cudagraph_mode. Please ensure you are "
-                    "using attention backends that support cudagraph or set "
-                    "cudagraph_mode to NONE explicitly if encountering "
-                    "any problems.")
-                self.cudagraph_mode = CUDAGraphMode.FULL
-            self.splitting_ops = []
+        self.splitting_ops = []
 
     def splitting_ops_contain_attention(self) -> bool:
         return self.splitting_ops is not None and all(
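For context, a minimal, self-contained sketch of how the simplified path behaves once this change lands. The CompilationConfigSketch class, the numeric PIECEWISE stand-in, the default _attention_ops value, and the body of splitting_ops_contain_attention past its first line are illustrative assumptions rather than the real vLLM implementation; only the assert message and the `self.splitting_ops = []` assignment mirror the diff above.

    from dataclasses import dataclass, field
    from typing import Optional

    # Stand-in for CompilationLevel.PIECEWISE; the real constant lives in vllm.config.
    PIECEWISE = 3


    @dataclass
    class CompilationConfigSketch:
        level: int = PIECEWISE
        splitting_ops: Optional[list[str]] = None
        # Hypothetical default; the real _attention_ops list is defined inside vLLM.
        _attention_ops: list[str] = field(
            default_factory=lambda: ["vllm.unified_attention"])

        def set_splitting_ops_for_v1(self) -> None:
            assert self.level == PIECEWISE, (
                "set_splitting_ops_for_v1 should only be called when "
                "level is CompilationLevel.PIECEWISE")
            # After this change, any previous value (None, empty, or custom)
            # is always replaced by an empty list.
            self.splitting_ops = []

        def splitting_ops_contain_attention(self) -> bool:
            # Body beyond the first line is assumed here for illustration.
            return self.splitting_ops is not None and all(
                op in self.splitting_ops for op in self._attention_ops)


    cfg = CompilationConfigSketch()
    cfg.set_splitting_ops_for_v1()
    print(cfg.splitting_ops)                      # []
    print(cfg.splitting_ops_contain_attention())  # False

In this sketch, resetting splitting_ops to an empty list means splitting_ops_contain_attention() evaluates to False, since no attention ops remain in the splitting list.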