@@ -1150,6 +1150,12 @@ def _form_prefill_batch(self, contents):

         target_bs, target_seq, target_blocks = self._get_prompt_bucketing_fn()(
             query_lens, num_context_blocks)
+
+        # dp aware padding
+        target_bs = self.get_dp_padding(target_bs)
+        target_seq = self.get_dp_padding(target_seq)
+        target_blocks = self.get_dp_padding(target_blocks)
+
         token_ids = self._align_and_pad(contents.token_ids,
                                         (target_bs, target_seq),
                                         itertools.repeat(-1))
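The added lines pad every bucketed prefill dimension up to the largest value chosen by any data-parallel rank, so all ranks launch identically shaped graphs; the decode hunk below applies the same rule to its batch-size bucket. A minimal standalone sketch of that padding rule, using a hypothetical list of per-rank values in place of the real get_dp_padding()/DPMetadata collective:

# Hypothetical helper illustrating DP-aware padding; the real code obtains the
# per-rank values through self.get_dp_padding(), which wraps a DP collective.
def pad_to_dp_max(local_value: int, values_on_other_ranks: list[int]) -> int:
    # Every rank pads its local bucket dimension up to the global maximum,
    # so no rank launches a differently shaped graph than its peers.
    return max([local_value, *values_on_other_ranks])

# Example: rank 0 bucketed (bs=2, seq=1024), rank 1 bucketed (bs=4, seq=512);
# after padding, both ranks run a (bs=4, seq=1024) prefill.
assert pad_to_dp_max(2, [4]) == 4
assert pad_to_dp_max(1024, [512]) == 1024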
@@ -1266,6 +1272,9 @@ def _prepare_decode_inputs(self, num_decodes,
         padded_batch_size = self.bucketing_manager.find_decode_bucket(
             num_decodes, sum(num_blocks))[0]

+        # dp aware padding
+        padded_batch_size = self.get_dp_padding(padded_batch_size)
+
         block_tables_list = []
         for i, n in enumerate(num_blocks):
             seq_block_table = block_table_cpu_tensor[i, :n].tolist()
@@ -1365,8 +1374,6 @@ def _prepare_inputs(
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         assert total_num_scheduled_tokens > 0

-        # TODO wuxun: consider dp aware padding for bs, block bucket, etc.
-
         num_reqs = num_prefills + num_decodes

         # Get the number of scheduled tokens for each request.
@@ -1406,7 +1413,6 @@ def _check_config(self, batch_size, seq_len, num_blocks, attn_metadata,
             "Configuration: (%s, %s, %s, %s) was not warmed-up!", phase,
             batch_size, seq_len, num_blocks)

-    # TODO wuxun: dp padding for prefill/decode inputs
    def get_dp_padding(self,
                       num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
        dp_size = self.vllm_config.parallel_config.data_parallel_size
@@ -1426,11 +1432,11 @@ def get_dp_padding(self,
        num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
            num_tokens, dp_size, dp_rank)
        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
-        num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
-                                                dp_size,
-                                                device="cpu",
-                                                dtype=torch.int32)
-        return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
+        # num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
+        #                                         dp_size,
+        #                                         device="cpu",
+        #                                         dtype=torch.int32).item()
+        return max_tokens_across_dp_cpu

    def _execute_model_generic(self,
                               token_ids,
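With this hunk, get_dp_padding reduces the per-rank token counts to a single global maximum and returns it directly instead of a (pad amount, per-rank tensor) pair, which is why callers now assign its result straight to the bucketed dimension (the declared tuple[int, Optional[torch.Tensor]] return annotation no longer reflects that). A minimal sketch of the reduction, using a made-up tensor in place of the DPMetadata collective:

import torch

# Hypothetical per-rank token counts; in the real code they come from
# DPMetadata.num_tokens_across_dp(num_tokens, dp_size, dp_rank).
num_tokens_across_dp = torch.tensor([96, 128, 64, 112], dtype=torch.int32)

# Same reduction the function performs: every rank learns the global maximum
# and pads its own workload up to it.
max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
print(max_tokens_across_dp_cpu)  # 128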
@@ -2481,36 +2487,40 @@ def profile_run(self) -> None:
        # it is important to create tensors inside the loop, rather than
        # multiplying the list, to avoid Dynamo from treating them as
        # tensor aliasing.
-        num_layers = self.model_config.get_num_layers(self.parallel_config)
-        kv_caches = [None] * num_layers
-
-        # Run empty prefill forwards - prefill max batch and prefill max seq
-        self.warmup_scenario(batch_size=1,
-                             seq_or_block=self.max_model_len,
-                             is_prompt=True,
-                             kv_caches=kv_caches)
-        max_seq_len = math.ceil(
-            (self.max_num_tokens // self.max_prefill_batch_size) /
-            self.block_size) * self.block_size
-        self.warmup_scenario(batch_size=self.max_prefill_batch_size,
-                             seq_or_block=max_seq_len,
-                             is_prompt=True,
-                             kv_caches=kv_caches)
+        # num_layers = self.model_config.get_num_layers(self.parallel_config)
+        # kv_caches = [None] * num_layers
+
+        max_num_batched_tokens = self.max_num_tokens
+        max_prefill_batch_size = self.max_prefill_batch_size
+        max_seq_len = (max_num_batched_tokens + max_prefill_batch_size -
+                       1) // max_prefill_batch_size
+        if max_seq_len % self.block_size != 0:
+            max_seq_len = ((max_seq_len + self.block_size - 1) //
+                           self.block_size) * self.block_size
+
+        prompt_cfg = (max_prefill_batch_size, max_seq_len, 0)
+        decode_cfg = None
+
+        self._execute_dummy_scenario(prompt_cfg, decode_cfg)
+
+        # # Run empty prefill forwards - prefill max batch and prefill max seq
+        # self.warmup_scenario(batch_size=1,
+        #                      seq_or_block=self.max_model_len,
+        #                      is_prompt=True,
+        #                      kv_caches=kv_caches)
+        # max_seq_len = math.ceil(
+        #     (self.max_num_tokens // self.max_prefill_batch_size) /
+        #     self.block_size) * self.block_size
+        # self.warmup_scenario(batch_size=self.max_prefill_batch_size,
+        #                      seq_or_block=max_seq_len,
+        #                      is_prompt=True,
+        #                      kv_caches=kv_caches)

    def _dummy_run(self, max_num_batched_tokens: int) -> None:
-        # TODO wuxun: dummy run implementation
        assert max_num_batched_tokens == 1
-        # self.warmup_scenario(max_num_batched_tokens,
-        #                      1,
-        #                      1,
-        #                      is_prompt=False,
-        #                      kv_caches=None,
-        #                      num_iters=1,
-        #                      is_pt_profiler_run=False,
-        #                      align_worker=True,
-        #                      is_dummy_run=True)
-        prompt_cfg = 1, 1, 0
-        decode_cfg = None
+        prompt_cfg = None
+        decode_cfg = 1, 1
+        # add dummy decode run
        self._execute_dummy_scenario(prompt_cfg, decode_cfg)
        return

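The rewritten profile_run() derives its single prefill warm-up shape arithmetically: ceil-divide the token budget by the maximum prefill batch size, round the per-sequence length up to a whole number of blocks, and pass the result to _execute_dummy_scenario(); _dummy_run() likewise switches from a 1-token prefill to a (batch=1, block=1) dummy decode. A worked example of the sequence-length arithmetic with made-up numbers (an 8192-token budget, batch 3, block size 128):

# Worked example of the new profile_run() shape arithmetic; the concrete
# numbers are illustrative, not taken from any real configuration.
max_num_batched_tokens = 8192
max_prefill_batch_size = 3
block_size = 128

# Ceiling division: longest per-sequence length that still fits the budget
# when the prefill batch is full.
max_seq_len = (max_num_batched_tokens + max_prefill_batch_size -
               1) // max_prefill_batch_size            # 2731
# Round up to a whole number of KV-cache blocks.
if max_seq_len % block_size != 0:
    max_seq_len = ((max_seq_len + block_size - 1) // block_size) * block_size
print(max_seq_len)                                      # 2816

# profile_run() then issues one dummy prefill of this shape and no decode:
# prompt_cfg = (max_prefill_batch_size, max_seq_len, 0), decode_cfg = None.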