@@ -1587,7 +1587,7 @@ def _form_prefill_batch(self, contents):
 
     def _create_dummy_prefill_batch_contents(
             self, num_prefills: int) -> list[PrefillInputData]:
-        req_id = -1
+        req_id = str(-1)
         context_len = 0
         query_len = 128
         prompt_tokens = 128
@@ -1616,26 +1616,30 @@ def _create_dummy_prefill_batch_contents(
     def _prepare_prefill_inputs(
             self, num_prefills, num_decodes, num_scheduled_tokens: list[int]
     ) -> tuple[PrefillInputData, Optional[PrefillInputData]]:
-        all_batch_contents, num_pad_across_dp = self._extract_prefill_batch_contents(
-            num_prefills, num_decodes, num_scheduled_tokens)
+        all_batch_contents, num_pad_across_dp = \
+            self._extract_prefill_batch_contents(
+                num_prefills, num_decodes, num_scheduled_tokens)
         all_batches = [
             self._form_prefill_batch(bc) for bc in all_batch_contents
         ]
         merge_contents(all_batches[0], *all_batches[1:])
 
         dummy_prefill_input_batches = None
         if num_pad_across_dp > 0:
-            dummy_prefill_input_batches = self._create_dummy_prefill_batch_contents(
-                num_pad_across_dp)
+            dummy_prefill_input_batches = \
+                self._create_dummy_prefill_batch_contents(num_pad_across_dp)
             merge_contents(dummy_prefill_input_batches[0],
                            *dummy_prefill_input_batches[1:])
         return all_batches[0], dummy_prefill_input_batches[
             0] if dummy_prefill_input_batches else None
 
     def _create_decode_input_data(
-            self, num_decodes, num_scheduled_tokens, context_lens,
-            block_table_cpu_tensor, num_computed_tokens_cpu,
-            token_ids_cpu) -> tuple[DecodeInputData, int]:
+            self,
+            num_decodes,
+            num_scheduled_tokens,
+            context_lens,
+            block_table_cpu_tensor,
+            scheduler_output=None) -> tuple[DecodeInputData, int]:
         # NOTE(kzawora): the +1 is what causes this entire thing to work,
         # as in the paged attention, we don't fetch just the context from cache,
         # but also kvs for the current token
@@ -1842,7 +1846,10 @@ def _create_decode_input_data(
             spec_decode_metadata=spec_decode_metadata), num_pad_across_dp
 
     def _prepare_decode_inputs(
-            self, num_decodes, num_scheduled_tokens
+            self,
+            num_decodes,
+            num_scheduled_tokens,
+            scheduler_output=None
     ) -> tuple[DecodeInputData, Optional[DecodeInputData]]:
         # Decodes run as one single padded batch with shape [batch, 1]
         #
@@ -1861,9 +1868,7 @@ def _prepare_decode_inputs(
         return self._create_decode_input_data(
             num_decodes, num_scheduled_tokens,
             self.input_batch.num_computed_tokens_cpu[:num_decodes],
-            self.input_batch.block_table[0].get_cpu_tensor(),
-            self.input_batch.num_computed_tokens_cpu,
-            self.input_batch.token_ids_cpu)
+            self.input_batch.block_table[0].get_cpu_tensor(), scheduler_output)
 
     def _create_dummy_decode_input_data(self) -> DecodeInputData:
         # create dummy decode input data with batch size 1
@@ -1872,12 +1877,13 @@ def _create_dummy_decode_input_data(self) -> DecodeInputData:
         context_lens = [128]
         block_table_cpu_tensor = torch.zeros([self._PAD_BLOCK_ID],
                                              dtype=torch.int32).reshape(1, -1)
-        num_computed_tokens_cpu = np.array([128], dtype=np.int32)
-        token_ids = np.array(list(int(i) for i in range(context_lens[0])))
+        # num_computed_tokens_cpu = np.array([128], dtype=np.int32)
+        # token_ids = np.array(list(int(i) for i in range(context_lens[0])))
 
-        return self._create_decode_input_data(
-            num_dummy_decodes, num_dummy_scheduled_tokens, context_lens,
-            block_table_cpu_tensor, num_computed_tokens_cpu, token_ids)[0]
+        return self._create_decode_input_data(num_dummy_decodes,
+                                              num_dummy_scheduled_tokens,
+                                              context_lens,
+                                              block_table_cpu_tensor)[0]
 
     def _get_cumsum_and_arange(
         self,
@@ -2052,8 +2058,7 @@ def _check_config(self, batch_size, seq_len, num_blocks, attn_metadata,
         if not seen and not warmup_mode:
             logger.warning("Configuration: %s was not warmed-up!", cfg)
 
-    def get_dp_padding(self,
-                       num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
+    def get_dp_padding(self, num_tokens: int) -> int:
         dp_size = self.vllm_config.parallel_config.data_parallel_size
         dp_rank = self.vllm_config.parallel_config.data_parallel_rank
 
@@ -2364,9 +2369,11 @@ def execute_model(
         with self.profiler.record_event('internal', 'prepare_input_tensors'):
             prefill_input_data, decode_input_data = self._prepare_inputs(
                 scheduler_output, num_prefills, num_decodes)
-        prefill_data, dummy_prefill_input_data_batches_across_dp = prefill_input_data
-        num_pad_prefill_batch_across_dp = 0 if dummy_prefill_input_data_batches_across_dp is None else len(
-            dummy_prefill_input_data_batches_across_dp.request_ids)
+        prefill_data, \
+            dummy_prefill_input_data_batches_across_dp = prefill_input_data
+        num_pad_prefill_batch_across_dp = \
+            0 if dummy_prefill_input_data_batches_across_dp is None \
+            else len(dummy_prefill_input_data_batches_across_dp.request_ids)
         decode_data, dummy_decode_input_data_across_dp = decode_input_data
         #FIXME(kzawora): Currently there's no handling of logprobs. Fix that
         # later.
@@ -2477,7 +2484,7 @@ def execute_model(
                     zip(*shallow_tuple(
                         dummy_prefill_input_data_batches_across_dp))):
                 htorch.core.mark_step()
-                _, dummy_logits_device = \
+                _, _, dummy_logits_device = \
                     self._execute_model_generic(
                         token_ids,
                         position_ids,
@@ -2566,7 +2573,7 @@ def execute_model(
         else:
             if dummy_decode_input_data_across_dp is not None:
                 htorch.core.mark_step()
-                _, dummy_logits_device = self._execute_model_generic(
+                _, _, dummy_logits_device = self._execute_model_generic(
                     dummy_decode_input_data_across_dp.token_ids,
                     dummy_decode_input_data_across_dp.position_ids,
                     dummy_decode_input_data_across_dp.attn_metadata,