
Commit b6d162f

padding bugfix
Signed-off-by: Sage Moore <[email protected]>
1 parent 0c54343 commit b6d162f

File tree

2 files changed: +30 -24 lines changed


vllm/forward_context.py

Lines changed: 16 additions & 11 deletions
@@ -93,27 +93,32 @@ def num_tokens_across_dp(num_tokens: int, dp_size: int,
         return num_tokens_tensor
 
     @staticmethod
-    def should_ubatch_across_dp(should_ubatch: bool, num_tokens_per_ubatch: int, dp_size: int,
+    def should_ubatch_across_dp(should_ubatch: bool, orig_num_tokens_per_ubatch: int,
+                                padded_num_tokens_per_ubatch: int, dp_size: int,
                                 dp_rank: int) -> tuple[bool, Optional[torch.Tensor]]:
 
-        tensor = torch.zeros(3, dp_size, device="cpu", dtype=torch.int32)
-        tensor[0][dp_rank] = num_tokens_per_ubatch
-        tensor[1][dp_rank] = 1 if should_ubatch else 0
+        tensor = torch.zeros(3, dp_size, device="cuda", dtype=torch.int32)
+        tensor[0][dp_rank] = orig_num_tokens_per_ubatch
+        tensor[1][dp_rank] = padded_num_tokens_per_ubatch
+        tensor[2][dp_rank] = 1 if should_ubatch else 0
 
 
         from vllm.distributed.parallel_state import get_dp_group
-        dist.all_reduce(tensor, group=get_dp_group().cpu_group)
+        dist.all_reduce(tensor, group=get_dp_group().device_group)
 
-        result: bool = bool(torch.all(tensor[1]== 1).item())
+        result: bool = bool(torch.all(tensor[2]== 1).item())
         if not result:
             return result, None
 
-        min_num_tokens_per_ubatch = tensor[0].min().item()
-        max_num_tokens_per_ubatch = tensor[0].max().item()
-        if max_num_tokens_per_ubatch >= 2 * min_num_tokens_per_ubatch:
-            logger.debug(f"Aborting ubatching {min_num_tokens_per_ubatch} {max_num_tokens_per_ubatch}")
+        orig_num_tokens_tensor = tensor[0, :]
+        padded_num_tokens_tensor = tensor[1, :]
+
+        orig_min_num_tokens = orig_num_tokens_tensor.min().item()
+        padded_max_num_tokens = padded_num_tokens_tensor.max().item()
+        if padded_max_num_tokens >= 2 * orig_min_num_tokens:
+            logger.debug(f"Aborting ubatching {orig_min_num_tokens} {padded_max_num_tokens}")
             return False, None
-        return result, tensor[0, :]
+        return result, padded_num_tokens_tensor
 
     @staticmethod
     def make(
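To illustrate the decision the patched should_ubatch_across_dp now makes, here is a minimal single-process sketch. The helper name should_ubatch_across_dp_sketch and the per_rank list are hypothetical stand-ins: the real code fills only its own DP rank's column and materializes the full table via dist.all_reduce over the DP device group, whereas this sketch builds the whole table locally on CPU.

# Hypothetical single-process sketch of the cross-DP ubatch decision.
# Assumptions: the list-based "all-gather" replaces the real all_reduce,
# and the tensor lives on CPU instead of CUDA for simplicity.
from typing import Optional

import torch


def should_ubatch_across_dp_sketch(
    per_rank: list[tuple[bool, int, int]],  # (should_ubatch, orig_tokens, padded_tokens) per DP rank
) -> tuple[bool, Optional[torch.Tensor]]:
    # Row 0: original tokens per ubatch, row 1: padded tokens per ubatch,
    # row 2: per-rank ubatch votes. In the real code each rank writes only
    # its own column and the all_reduce (sum) fills in the rest.
    dp_size = len(per_rank)
    tensor = torch.zeros(3, dp_size, dtype=torch.int32)
    for rank, (should_ubatch, orig, padded) in enumerate(per_rank):
        tensor[0][rank] = orig
        tensor[1][rank] = padded
        tensor[2][rank] = 1 if should_ubatch else 0

    # Every rank must vote yes, otherwise no one ubatches.
    if not bool(torch.all(tensor[2] == 1).item()):
        return False, None

    # Abort if padding would more than double the smallest rank's real work:
    # compare the largest *padded* ubatch against the smallest *original* one.
    orig_min = tensor[0].min().item()
    padded_max = tensor[1].max().item()
    if padded_max >= 2 * orig_min:
        return False, None

    # All ranks size their ubatches from the padded token counts.
    return True, tensor[1, :]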

vllm/v1/worker/gpu_model_runner.py

Lines changed: 14 additions & 13 deletions
@@ -1539,7 +1539,7 @@ def get_dp_padding_ubatch(
         if ubatch_slices is None:
             (should_ubatch,
              num_tokens_across_dp) = self.should_ubatch_with_num_tokens(
-                 False, 0)
+                 False, 0, 0)
             assert should_ubatch is False
             assert num_tokens_across_dp is None
             return should_ubatch, 0, num_tokens_across_dp
@@ -1581,9 +1581,9 @@ def get_dp_padding_ubatch(
             should_ubatch = False
 
         # Note that we compute the number of padded tokens per ubatch
-        (should_ubatch,
-         num_tokens_across_dp) = self.should_ubatch_with_num_tokens(
-             should_ubatch, num_tokens_per_ubatch)
+        (should_ubatch,
+         num_tokens_across_dp) = self.should_ubatch_with_num_tokens(should_ubatch,
+             num_tokens_unpadded // 2, num_tokens_per_ubatch)
         if not should_ubatch:
             assert num_tokens_across_dp is None
             return should_ubatch, 0, num_tokens_across_dp
@@ -1607,7 +1607,7 @@ def get_dp_padding_ubatch(
     def pad_out_ubatch_first_stage(self, ubatch_slices: UBatchSlices,
                                    num_pad_tokens: int):
         original_num_tokens = ubatch_slices[1].token_slice.stop
-        assert num_pad_tokens < original_num_tokens
+        assert num_pad_tokens < original_num_tokens, f"num_pad_tokens {num_pad_tokens} original_num_tokens {original_num_tokens}"
         total_num_tokens_per_ubatch = (original_num_tokens +
                                        num_pad_tokens) // 2
         padded_first_ubatch_slice = slice(0, total_num_tokens_per_ubatch)
@@ -1631,16 +1631,16 @@ def pad_out_ubatch_second_stage(self, ubatch_slices: UBatchSlices,
         ubatch_slices[1] = UbatchSlice(padded_second_ubatch_slice,
                                        padded_second_ubatch_slice)
 
-    def should_ubatch_with_num_tokens(
-        self,
-        should_ubatch: bool,
-        num_tokens_per_ubatch: int,
-    ) -> tuple[bool, Optional[torch.Tensor]]:
+    def should_ubatch_with_num_tokens(self, should_ubatch: bool, orig_num_tokens_per_ubatch: int,
+                                      padded_num_tokens_per_ubatch: int,
+                                      ) -> tuple[bool, Optional[torch.Tensor]]:
         dp_size = self.vllm_config.parallel_config.data_parallel_size
         dp_rank = self.vllm_config.parallel_config.data_parallel_rank
-        return DPMetadata.should_ubatch_across_dp(should_ubatch,
-                                                  num_tokens_per_ubatch,
-                                                  dp_size, dp_rank)
+        return DPMetadata.should_ubatch_across_dp(should_ubatch,
+                                                  orig_num_tokens_per_ubatch,
+                                                  padded_num_tokens_per_ubatch,
+                                                  dp_size,
+                                                  dp_rank)
 
     def _pool(
         self,
@@ -2472,6 +2472,7 @@ def _dummy_run(
             should_ubatch, _ = self.should_ubatch_with_num_tokens(
                 should_ubatch,
                 num_tokens // 2,
+                num_tokens // 2,
             )
             assert cudagraph_runtime_mode in {
                 CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
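For reference, a hypothetical use of the sketch above, mirroring how get_dp_padding_ubatch now passes both the unpadded per-ubatch count (num_tokens_unpadded // 2) and the padded count; the token numbers are invented for illustration.

# Two DP ranks, both voting to ubatch: padded max (128) < 2x original min (96).
ok, padded = should_ubatch_across_dp_sketch([(True, 96, 128), (True, 112, 128)])
# -> True, tensor([128, 128], dtype=torch.int32)

# Aborted: rank 0's padded ubatch (256) is at least 2x rank 1's original ubatch (96).
ok, padded = should_ubatch_across_dp_sketch([(True, 200, 256), (True, 96, 128)])
# -> False, None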
