Commit 0799440

fix lazy hang
Signed-off-by: Wuxun Zhang <[email protected]>
1 parent 6cb02a5 commit 0799440

File tree

3 files changed: +166 -87 lines changed

vllm_gaudi/distributed/device_communicators/hpu_communicator.py

Lines changed: 45 additions & 26 deletions
@@ -1,35 +1,51 @@
 # SPDX-License-Identifier: Apache-2.0

+from typing import Optional
 import torch
 import torch.distributed as dist
+from torch.distributed import ProcessGroup

 from vllm.distributed.device_communicators.base_device_communicator \
     import DeviceCommunicatorBase
-from vllm.distributed.parallel_state import get_dp_group
 from vllm.forward_context import get_forward_context
+from vllm.distributed.parallel_state import get_dp_group

 import habana_frameworks.torch as htorch  # noqa: F401


-def naive_multicast(x: torch.Tensor,
-                    cu_tokens_across_dp_cpu: torch.Tensor) -> torch.Tensor:
-    assert x.dim() == 2, "Input tensor must be 2D"
-    dp_rank = get_dp_group().rank_in_group
-    dp_world_size = get_dp_group().world_size
-    buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
-                         device=x.device,
-                         dtype=x.dtype)
-    start = 0 if dp_rank == 0 else cu_tokens_across_dp_cpu[dp_rank - 1]
-    end = cu_tokens_across_dp_cpu[dp_rank]
-    buffer[start:end, :].copy_(x)
-    for idx in range(dp_world_size):
-        start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
-        end = cu_tokens_across_dp_cpu[idx]
-        get_dp_group().broadcast(buffer[start:end, :], idx)
-    return buffer
+class HpuCommunicator(DeviceCommunicatorBase):
+
+    def __init__(self,
+                 cpu_group: ProcessGroup,
+                 device: Optional[torch.device] = None,
+                 device_group: Optional[ProcessGroup] = None,
+                 unique_name: str = ""):
+        super().__init__(cpu_group, device, device_group, unique_name)

+        self.dp_group = None
+        self.dp_rank = 0
+        self.dp_world_size = 1
+        # assume EP is enabled along with DP
+        if "ep" in unique_name:
+            self.dp_group = get_dp_group()
+            self.dp_rank = self.dp_group.rank_in_group
+            self.dp_world_size = self.dp_group.world_size

-class HpuCommunicator(DeviceCommunicatorBase):
+    def naive_multicast(self, x: torch.Tensor,
+                        cu_tokens_across_dp_cpu: torch.Tensor) -> torch.Tensor:
+        assert x.dim() == 2, "Input tensor must be 2D"
+        buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
+                             device=x.device,
+                             dtype=x.dtype)
+        start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
+            self.dp_rank - 1]
+        end = cu_tokens_across_dp_cpu[self.dp_rank]
+        buffer[start:end, :].copy_(x)
+        for idx in range(self.dp_world_size):
+            start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
+            end = cu_tokens_across_dp_cpu[idx]
+            self.dp_group.broadcast(buffer[start:end, :], idx)
+        return buffer

     def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
         # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
@@ -67,19 +83,22 @@ def dispatch(
            router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         cu_tokens_across_dp_cpu = get_forward_context(
         ).dp_metadata.cu_tokens_across_dp_cpu
-        hidden_states_across_dp = naive_multicast(hidden_states,
-                                                  cu_tokens_across_dp_cpu)
-        router_logits_across_dp = naive_multicast(router_logits,
-                                                  cu_tokens_across_dp_cpu)
+        hidden_states_across_dp = self.naive_multicast(
+            hidden_states, cu_tokens_across_dp_cpu)
+        router_logits_across_dp = self.naive_multicast(
+            router_logits, cu_tokens_across_dp_cpu)
         return hidden_states_across_dp, router_logits_across_dp

     def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        dp_rank = get_dp_group().rank_in_group
+        if htorch.utils.internal.is_lazy():
+            htorch.core.mark_step()
         cu_tokens_across_dp_cpu = get_forward_context(
         ).dp_metadata.cu_tokens_across_dp_cpu
-        start = 0 if dp_rank == 0 else cu_tokens_across_dp_cpu[dp_rank - 1]
-        end = cu_tokens_across_dp_cpu[dp_rank]

-        all_hidden_states = get_dp_group().all_reduce(hidden_states)
+        start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
+            self.dp_rank - 1]
+        end = cu_tokens_across_dp_cpu[self.dp_rank]
+
+        all_hidden_states = self.dp_group.all_reduce(hidden_states)
         hidden_states = all_hidden_states[start:end, :]
         return hidden_states
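
Editor's note on the combine() change above: under HPU lazy mode, queued ops are only materialized at a mark_step(), so a rank can enter the blocking DP all-reduce with work still unflushed while its peers wait, which appears to be part of the hang this commit fixes. Below is a minimal sketch of that flush-before-collective pattern, assuming an already-initialized torch.distributed process group; the helper name flush_then_all_reduce is illustrative and not part of the diff.

import habana_frameworks.torch as htorch  # HPU lazy-mode utilities
import torch
import torch.distributed as dist

def flush_then_all_reduce(t: torch.Tensor) -> torch.Tensor:
    # Materialize any pending lazy-mode graph before entering the collective,
    # mirroring the is_lazy()/mark_step() guard added to combine().
    if htorch.utils.internal.is_lazy():
        htorch.core.mark_step()
    dist.all_reduce(t)  # every DP rank must reach this call together
    return t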

vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 117 additions & 55 deletions
@@ -1354,11 +1354,8 @@ def _extract_prefill_batch_contents(self, num_prefills, num_decodes,
             # no real prefill batches
             num_prefill_batches = 0

-        num_pad = self.get_dp_padding(num_prefill_batches)
-        if num_pad > 0:
-            for _ in range(num_pad):
-                all_batch_contents.append(BatchContents())
-        return all_batch_contents
+        num_pad_across_dp = self.get_dp_padding(num_prefill_batches)
+        return all_batch_contents, num_pad_across_dp

     def _make_attn_bias(self, context_groups, token_groups):
         dtype = self.dtype
@@ -1426,11 +1423,6 @@ def _form_prefill_batch(self, contents):
         target_bs, target_seq, target_blocks = self._get_prompt_bucketing_fn()(
             query_lens, num_context_blocks)

-        # dp aware padding
-        target_bs += self.get_dp_padding(target_bs)
-        target_seq += self.get_dp_padding(target_seq)
-        target_blocks += self.get_dp_padding(target_blocks)
-
         # NOTE: If model does not support multimodal inputs, we pad here.
         # For models with multimodal support, we may want to get embeddings
         # for the valid tokens before padding.
@@ -1523,33 +1515,49 @@ def _form_prefill_batch(self, contents):
             logits_indices=[logits_indices],
             logits_requests=[logits_requests])

+    def _create_dummy_prefill_batch_contents(
+            self, num_prefills: int) -> list[PrefillInputData]:
+        req_id = -1
+        context_len = 0
+        query_len = 128
+        prompt_tokens = 128
+        token_ids = list(int(i) for i in range(prompt_tokens))
+        num_blocks = round_up(context_len + query_len,
+                              self.block_size) // self.block_size
+        blocks = [0] * num_blocks
+        num_output_logits = context_len + query_len - prompt_tokens + 1
+        logits_positions = list(range(query_len - num_output_logits,
+                                      query_len))
+
+        new_batch_contents = BatchContents(
+            req_ids=[req_id],
+            token_ids=[token_ids],
+            context_lens=[context_len],
+            blocks=[blocks],
+            logits_positions=[logits_positions],
+        )
+
+        outputs = [
+            self._form_prefill_batch(new_batch_contents)
+            for _ in range(num_prefills)
+        ]
+        return outputs
+
     def _prepare_prefill_inputs(
             self, num_prefills, num_decodes,
-            num_scheduled_tokens: list[int]) -> PrefillInputData:
-        all_batch_contents = self._extract_prefill_batch_contents(
+            num_scheduled_tokens: list[int]) -> tuple[PrefillInputData, int]:
+        all_batch_contents, num_pad_across_dp = self._extract_prefill_batch_contents(
             num_prefills, num_decodes, num_scheduled_tokens)
         all_batches = [
             self._form_prefill_batch(bc) for bc in all_batch_contents
         ]
         merge_contents(all_batches[0], *all_batches[1:])
-        return all_batches[0]
-
-    def _prepare_decode_inputs(self, num_decodes,
-                               num_scheduled_tokens) -> DecodeInputData:
-        # Decodes run as one single padded batch with shape [batch, 1]
-        #
-        # We need to set _PAD_SLOT_ID for the padding tokens in the
-        # slot_mapping, such that the attention KV cache insertion
-        # logic knows to ignore those indicies. Otherwise, the
-        # padding data can be dummy since we have a causal mask.
-
-        block_table_cpu_tensor = self.input_batch.block_table[
-            0].get_cpu_tensor()
-        if num_decodes == 0:
-            return DecodeInputData(num_decodes=0)
-        # BLOCK_TABLE [batch, max_num_blocks_per_req]
-        context_lens = self.input_batch.num_computed_tokens_cpu[:num_decodes]
+        return all_batches[0], num_pad_across_dp

+    def _create_decode_input_data(
+            self, num_decodes, num_scheduled_tokens, context_lens,
+            block_table_cpu_tensor, num_computed_tokens_cpu,
+            token_ids_cpu) -> tuple[DecodeInputData, int]:
         # NOTE(kzawora): the +1 is what causes this entire thing to work,
         # as in the paged attention, we don't fetch just the context from cache,
         # but also kvs for the current token
@@ -1561,8 +1569,9 @@ def _prepare_decode_inputs(self, num_decodes,
         padded_batch_size = self.bucketing_manager.find_decode_bucket(
             num_decodes, sum(num_blocks))[0]

-        # # dp aware padding
-        padded_batch_size += self.get_dp_padding(padded_batch_size)
+        # dp aware padding
+        num_pad_across_dp = self.get_dp_padding(padded_batch_size)
+        padded_batch_size += num_pad_across_dp

         block_tables_list = []
         for i, n in enumerate(num_blocks):
@@ -1574,8 +1583,7 @@ def _prepare_decode_inputs(self, num_decodes,
         # We slice at the end, since we use the positions for gathering.
         positions = torch.zeros((padded_batch_size, 1), dtype=torch.int32)
         positions[:num_decodes] = torch.from_numpy(
-            self.input_batch.num_computed_tokens_cpu.reshape(-1,
-                                                             1)[:num_decodes])
+            num_computed_tokens_cpu.reshape(-1, 1)[:num_decodes])
         positions = positions[:padded_batch_size]

         padded_index = torch.zeros((padded_batch_size, 1), dtype=torch.int64)
@@ -1613,11 +1621,8 @@ def _prepare_decode_inputs(self, num_decodes,

         # TOKEN_IDS. [batch, 1]
         token_ids = torch.zeros((padded_batch_size, 1), dtype=torch.int32)
-        token_ids[:num_decodes] = torch.gather(input=torch.from_numpy(
-            self.input_batch.token_ids_cpu),
-                                               dim=1,
-                                               index=index)
-
+        token_ids[:num_decodes] = torch.gather(
+            input=torch.from_numpy(token_ids_cpu), dim=1, index=index)
         # SLOT_MAPPING [batch, 1]
         # The "slot" is the "physical index" of a token in the KV cache.
         # Look up the block_idx in the block table (logical<>physical map)
@@ -1684,7 +1689,42 @@ def _prepare_decode_inputs(self, num_decodes,
                 num_decode_tokens=num_decode_tokens_device,
                 slot_mapping=slot_mapping_device,
                 block_size=self.block_size,
-            ))
+            )), num_pad_across_dp
+
+    def _prepare_decode_inputs(
+            self, num_decodes,
+            num_scheduled_tokens) -> tuple[DecodeInputData, int]:
+        # Decodes run as one single padded batch with shape [batch, 1]
+        #
+        # We need to set _PAD_SLOT_ID for the padding tokens in the
+        # slot_mapping, such that the attention KV cache insertion
+        # logic knows to ignore those indicies. Otherwise, the
+        # padding data can be dummy since we have a causal mask.
+
+        num_pad_across_dp = self.get_dp_padding(num_decodes)
+        if num_decodes == 0:
+            return DecodeInputData(num_decodes=0), num_pad_across_dp
+        # BLOCK_TABLE [batch, max_num_blocks_per_req]
+        context_lens = self.input_batch.num_computed_tokens_cpu[:num_decodes]
+        block_table_cpu_tensor = self.input_batch.block_table[
+            0].get_cpu_tensor()
+        return self._create_decode_input_data(
+            num_decodes, num_scheduled_tokens, context_lens,
+            block_table_cpu_tensor, self.input_batch.num_computed_tokens_cpu,
+            self.input_batch.token_ids_cpu)
+
+    def _create_dummy_decode_input_data(self) -> DecodeInputData:
+        # create dummy decode input data with batch size 1
+        context_lens = [128]
+        block_table_cpu_tensor = torch.zeros([self._PAD_BLOCK_ID],
+                                             dtype=torch.int32).reshape(1, -1)
+        num_computed_tokens_cpu = np.array([128], dtype=np.int32)
+        token_ids = np.array(list(int(i) for i in range(context_lens[0])))
+
+        return self._create_decode_input_data(1, [1], context_lens,
+                                              block_table_cpu_tensor,
+                                              num_computed_tokens_cpu,
+                                              token_ids)[0]

     def _prepare_inputs(
         self,
@@ -1740,18 +1780,7 @@ def get_dp_padding(self,
         dp_size = self.vllm_config.parallel_config.data_parallel_size
         dp_rank = self.vllm_config.parallel_config.data_parallel_rank

-        # For DP: Don't pad when setting enforce_eager.
-        # This lets us set enforce_eager on the prefiller in a P/D setup and
-        # still use CUDA graphs (enabled by this padding) on the decoder.
-        #
-        # TODO(tms) : There are many cases where padding is enabled for
-        # prefills, causing unnecessary and excessive padding of activations.
-
-        # skip padding for non PD disagg case to avoid padding on prefill batch
-        # size and decode batch size
-        if dp_size == 1 or self.vllm_config.model_config.enforce_eager or (
-                self.vllm_config.kv_transfer_config is None
-                or self.vllm_config.kv_transfer_config.kv_connector is None):
+        if dp_size == 1:
             return 0

         num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
@@ -1768,7 +1797,6 @@ def _execute_model_generic(self,
                                warmup_mode=False,
                                inputs_embeds=None,
                                model_mm_kwargs=None):
-
         # FORWARD.
         batch_size = token_ids.size(0)
         seq_len = self._seq_len(attn_metadata)
@@ -2057,8 +2085,10 @@ def execute_model(
         num_prefills = len(pd_info.prompt_req_ids)
         num_reqs = num_decodes + num_prefills
         with self.profiler.record_event('internal', 'prepare_input_tensors'):
-            prefill_data, decode_data = self._prepare_inputs(
+            prefill_input_data, decode_input_data = self._prepare_inputs(
                 scheduler_output, num_prefills, num_decodes)
+            prefill_data, num_pad_prefill_batch_across_dp = prefill_input_data
+            decode_data, num_pad_decode_batch_across_dp = decode_input_data
         #FIXME(kzawora): Currently there's no handling of logprobs. Fix that
         # later.
         prefill_sampled_token_ids = []
@@ -2124,6 +2154,7 @@ def execute_model(
                         model_mm_kwargs=model_mm_kwargs,
                         warmup_mode=warmup_mode)
                 htorch.core.mark_step()
+
                 # Skip separate sampling for structured output
                 if structured_output:
                     logits_prompt.append(logits_device)
@@ -2154,9 +2185,27 @@ def execute_model(
                         prompt_batch_idx=idx,
                         is_prompt=True)
                     self.profiler.record_counter(self.event_start, counters)
+
            if self.is_driver_worker and self.profiler.enabled:
                self.profiler_counter_helper.reset_prompt_seq_stats()

+        else:
+            if num_pad_prefill_batch_across_dp > 0:
+                htorch.core.mark_step()
+                dummy_prefill_input_data_list = self._create_dummy_prefill_batch_contents(
+                    num_pad_prefill_batch_across_dp)
+                for dummy_prefill_input_data in dummy_prefill_input_data_list:
+                    htorch.core.mark_step()
+                    _, dummy_logits_device = \
+                        self._execute_model_generic(
+                            dummy_prefill_input_data.token_ids[0],
+                            dummy_prefill_input_data.position_ids[0],
+                            dummy_prefill_input_data.attn_metadata[0],
+                            dummy_prefill_input_data.logits_indices[0],
+                            self.kv_caches,
+                            warmup_mode=warmup_mode)
+                    htorch.core.mark_step()
+
         ######################### DECODES #########################
         # Decodes run as one single batch with [padded_decode_bs, 1]
         if num_decodes > 0:
@@ -2205,6 +2254,19 @@ def execute_model(
                     prompt_batch_idx=None,
                     is_prompt=False)
                 self.profiler.record_counter(self.event_start, counters)
+        else:
+            if num_pad_decode_batch_across_dp > 0:
+                dummy_decode_input_data = self._create_dummy_decode_input_data(
+                )
+                htorch.core.mark_step()
+                _, dummy_logits_device = self._execute_model_generic(
+                    dummy_decode_input_data.token_ids,
+                    dummy_decode_input_data.position_ids,
+                    dummy_decode_input_data.attn_metadata,
+                    dummy_decode_input_data.logits_indices,
+                    self.kv_caches,
+                    warmup_mode=warmup_mode)
+                htorch.core.mark_step()

         if structured_output:
             # Scheduler places cached before prompt
@@ -2315,6 +2377,7 @@ def execute_model(
             prompt_logprobs_dict=prompt_logprobs_dict,  # type: ignore[arg-type]
             pooler_output=[],
         )
+
         return model_runner_output

     def load_model(self) -> None:
@@ -2735,8 +2798,8 @@ def __del__(self):

     @torch.inference_mode()
     def profile_run(self) -> None:
+        return
         """Profile to measure peak memory during forward pass."""
-
         # use an empty tensor instead of `None`` to force Dynamo to pass
         # it by reference, rather by specializing on the value `None`.
         # the `dtype` argument does not matter, and we use `float32` as
@@ -2750,7 +2813,6 @@ def profile_run(self) -> None:
         max_seq_len = math.ceil(
             (self.max_num_tokens // self.max_prefill_batch_size) /
             self.block_size) * self.block_size
-        max_seq_len = min(max_seq_len, self.max_model_len)
         self._execute_dummy_scenario(
             (self.max_prefill_batch_size, max_seq_len, 0), None)
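
Editor's note on the runner changes above: the prepare-input helpers now also return a DP padding count, and execute_model() runs dummy prefill/decode batches on ranks that have no real work, so every data-parallel rank issues the same sequence of collectives instead of hanging. A simplified, hypothetical sketch of that control flow; run_forward and make_dummy_batch are illustrative placeholders, not vLLM APIs.

def execute_with_dp_padding(real_batches, num_pad_across_dp,
                            run_forward, make_dummy_batch):
    # Real work on this rank.
    for batch in real_batches:
        run_forward(batch)
    # Ranks with fewer batches still execute padded forward passes, keeping
    # collectives (e.g. the MoE dispatch/combine) aligned across DP ranks.
    for _ in range(num_pad_across_dp):
        run_forward(make_dummy_batch())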
