@@ -1227,7 +1226,6 @@ def _form_prefill_batch(self, contents):
 
         query_lens = _async_h2d_tensor(query_lens, torch.int32)
         token_ids = _async_h2d_tensor(token_ids, torch.int32)
-
         token_positions = _async_h2d_tensor(token_positions, torch.int32)
         token_slots = _async_h2d_tensor(token_slots, torch.int64)
         logits_indices = _async_h2d_tensor(logits_indices, torch.int32)
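For context, each `_async_h2d_tensor` call above stages a host-to-device copy of freshly built batch metadata. A minimal sketch of such a helper, assuming PyTorch's `non_blocking` copy semantics (the file's actual helper may differ):

```python
import torch

def _async_h2d_tensor(data, dtype, device='hpu'):
    # Build the tensor on host, then issue a non-blocking copy so the
    # transfer can overlap with the remaining host-side batch preparation.
    return torch.tensor(data, dtype=dtype, device='cpu').to(
        device, non_blocking=True)
```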
@@ -1294,7 +1293,6 @@ def _prepare_decode_inputs(self, num_decodes,
                                                  num_decodes, sum(num_blocks))[0]
 
         # dp aware padding
-        assert padded_batch_size is not None
         padded_batch_size += self.get_dp_padding(padded_batch_size)
 
         block_tables_list = []
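For context, the dp-aware padding step grows the local batch until every data-parallel rank runs the same padded shape, which keeps collectives and compiled graph shapes in lockstep. A hedged sketch of what a `get_dp_padding` helper can look like, assuming a `torch.distributed` max-reduction over a DP process group (`self.dp_size` and `self.dp_group` are hypothetical names, not necessarily this file's):

```python
import torch
import torch.distributed as dist

def get_dp_padding(self, padded_batch_size: int) -> int:
    # All DP ranks must agree on one batch size: take the max across
    # the group and return how many extra rows this rank must pad.
    if self.dp_size <= 1 or not dist.is_initialized():
        return 0
    local = torch.tensor(padded_batch_size, dtype=torch.int32)
    dist.all_reduce(local, op=dist.ReduceOp.MAX, group=self.dp_group)
    return int(local.item()) - padded_batch_size
```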
@@ -1754,8 +1752,6 @@ def execute_model(
 
         ######################### PREFILLS #########################
         if num_prefills > 0:
-            # Wuxun: merged prefill forward if enabled
-            # 2D bucketing or merged prefill bucketing
             htorch.core.mark_step()
             for idx, (req_id, prompt_len, token_ids, position_ids,
                       attn_metadata, logits_indices,
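For context, `htorch.core.mark_step()` cuts the lazily accumulated op sequence on Gaudi, so each per-request prefill iteration becomes its own compiled (and cached) HPU graph. A minimal illustration of the pattern, with a hypothetical `forward_fn` standing in for the real prefill body:

```python
import habana_frameworks.torch as htorch

def run_prefills(requests, forward_fn):
    # mark_step() at the loop boundary flushes the lazy graph, keeping
    # every request's forward pass a separately compiled unit.
    outputs = []
    for req in requests:
        htorch.core.mark_step()
        outputs.append(forward_fn(req))
    htorch.core.mark_step()
    return outputs
```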
@@ -2098,6 +2094,121 @@ def log_graph_warmup_summary(self, buckets, is_prompt, total_mem):
               f'used_mem:{format_bytes(total_mem)}')
         logger.info(msg)
 
+<<<<<<< HEAD
+=======
+    def warmup_scenario(self,
+                        batch_size,
+                        seq_or_block,
+                        num_blocks,
+                        is_prompt,
+                        kv_caches,
+                        num_iters=3,
+                        is_pt_profiler_run=True,
+                        align_worker=False,
+                        is_dummy_run=False) -> None:
+        """Dummy warmup run for memory usage and graph compilation."""
+
+        query_seq_len = seq_or_block if is_prompt else 1
+        input_ids = torch.zeros((batch_size, query_seq_len),
+                                dtype=torch.int32,
+                                device='cpu')
+        position_ids = torch.zeros((batch_size, query_seq_len),
+                                   dtype=torch.int32,
+                                   device='cpu')
+        slot_mapping = torch.zeros((batch_size, query_seq_len),
+                                   dtype=torch.int64,
+                                   device='cpu')
+
+        input_ids_device = _async_h2d_tensor_copy(input_ids, self.device)
+        position_ids_device = _async_h2d_tensor_copy(position_ids, self.device)
+        slot_mapping_device = _async_h2d_tensor_copy(slot_mapping, self.device)
+
+        use_graphs = is_dummy_run or self._use_graphs()
+        phase = "prompt" if is_prompt else "decode"
+        scenario_name = ("warmup_"
+                         f"{phase}_"
+                         f"bs{batch_size}_"
+                         f"seq{query_seq_len}_"
+                         f"ctx{num_blocks}_"
+                         f"graphs{'T' if use_graphs else 'F'}")
+        input_ids = torch.zeros((batch_size, query_seq_len),
+                                dtype=torch.int32,
+                                device='cpu')
+        position_ids = torch.zeros((batch_size, query_seq_len),
+                                   dtype=torch.int32,
+                                   device='cpu')
+        slot_mapping = torch.zeros((batch_size, query_seq_len),
+                                   dtype=torch.int64,
+                                   device='cpu')
+
+        input_ids_device = _async_h2d_tensor_copy(input_ids, self.device)
+        position_ids_device = _async_h2d_tensor_copy(position_ids, self.device)
+        slot_mapping_device = _async_h2d_tensor_copy(slot_mapping, self.device)
+        self.profiler.start('internal', scenario_name)
+
+        times = num_iters if use_graphs or is_pt_profiler_run else 1
+        for time_index in range(times):
+            if is_prompt:
+                seq_lens = torch.zeros((batch_size),
+                                       dtype=torch.int32,
+                                       device='cpu')
+                seq_lens.fill_(seq_or_block)
+                seq_lens_device = _async_h2d_tensor_copy(seq_lens, self.device)
+                block_list_device = None
+                if num_blocks:
+                    prefix_block_tables = torch.ones(
+                        (batch_size, num_blocks),
+                        dtype=torch.int32,
+                        device='cpu') * self._PAD_BLOCK_ID
+                    block_list_device = _async_h2d_tensor_copy(
+                        prefix_block_tables.flatten(), self.device)
+                attn_metadata = \
+                    HPUAttentionMetadataV1.make_prefill_metadata(
+                        attn_bias=None,
+                        seq_lens_tensor=seq_lens_device,
+                        context_lens_tensor=seq_lens_device,
+                        slot_mapping=slot_mapping_device,
+                        block_list=block_list_device,
+                        block_size=self.block_size)
+            else:
+                block_tables = [
+                    x.tolist()
+                    for x in np.array_split(np.arange(num_blocks), batch_size)
+                ]
+                block_list, block_groups, block_usage = \
+                    self.get_habana_paged_attn_buffers(
+                        slot_mapping=slot_mapping,
+                        block_tables=block_tables,
+                        batch_size=batch_size)
+                block_list_device = _async_h2d_tensor_copy(
+                    block_list, self.device)
+                block_usage_device = _async_h2d_tensor_copy(
+                    block_usage, self.device)
+                block_groups_device = _async_h2d_tensor_copy(
+                    block_groups, self.device)
+                attn_metadata = HPUAttentionMetadataV1.make_decode_metadata(
+                    block_list=block_list_device,
+                    block_usage=block_usage_device,
+                    block_groups=block_groups_device,
+                    num_decode_tokens=batch_size,
+                    input_positions=None,
+                    slot_mapping=slot_mapping_device,
+                    block_size=self.block_size)
+
+            logits_indices = torch.arange(0, batch_size, device='cpu')
+            logits_indices_device = _async_h2d_tensor_copy(logits_indices,
+                                                           self.device)
+            # Dummy run.
+            htorch.core.mark_step()
+            _ = self._execute_model_generic(input_ids_device,
+                                            position_ids_device,
+                                            attn_metadata,
+                                            logits_indices_device,
+                                            kv_caches, True)
+            # TODO: do sampling on logits, warmup sampler and prefill joiner
+            htorch.core.mark_step()
+        self.profiler.end()
+        return None
+
+>>>>>>> 68ee934 (fix)
 
     def log_warmup(self, phase, i, max_i, batch_size, seq_len, num_blocks):
         free_mem = format_bytes(
             HabanaMemoryProfiler.current_free_device_memory())
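For context, a warmup driver typically calls `warmup_scenario` once per bucket so that every (batch size, length) shape gets compiled before serving. A hedged usage sketch, with illustrative bucket lists and a hypothetical `runner` handle (the actual warmup loop in this file may differ):

```python
# Hypothetical bucket sweep; shapes are illustrative only.
prompt_buckets = [(1, 128), (4, 1024)]    # (batch_size, seq_len)
decode_buckets = [(32, 128), (64, 256)]   # (batch_size, num_blocks)

for bs, seq_len in prompt_buckets:
    runner.warmup_scenario(batch_size=bs, seq_or_block=seq_len,
                           num_blocks=0, is_prompt=True,
                           kv_caches=kv_caches)
for bs, blocks in decode_buckets:
    runner.warmup_scenario(batch_size=bs, seq_or_block=blocks,
                           num_blocks=blocks, is_prompt=False,
                           kv_caches=kv_caches)
```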