Commit d4a4c41

enable profile run
Signed-off-by: Wuxun Zhang <[email protected]>
1 parent 1aaa6fc commit d4a4c41

File tree

2 files changed: +27 -21 lines changed


vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 21 additions & 16 deletions
@@ -1152,9 +1152,9 @@ def _form_prefill_batch(self, contents):
             query_lens, num_context_blocks)

         # dp aware padding
-        target_bs = self.get_dp_padding(target_bs)
-        target_seq = self.get_dp_padding(target_seq)
-        target_blocks = self.get_dp_padding(target_blocks)
+        target_bs += self.get_dp_padding(target_bs)
+        target_seq += self.get_dp_padding(target_seq)
+        target_blocks += self.get_dp_padding(target_blocks)

         token_ids = self._align_and_pad(contents.token_ids,
                                         (target_bs, target_seq),
@@ -1273,7 +1273,7 @@ def _prepare_decode_inputs(self, num_decodes,
             num_decodes, sum(num_blocks))[0]

         # dp aware padding
-        padded_batch_size = self.get_dp_padding(padded_batch_size)
+        padded_batch_size += self.get_dp_padding(padded_batch_size)

         block_tables_list = []
         for i, n in enumerate(num_blocks):
@@ -1427,7 +1427,7 @@ def get_dp_padding(self,

         if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
             # Early exit.
-            return 0, None
+            return 0

         num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
             num_tokens, dp_size, dp_rank)
@@ -1436,7 +1436,7 @@ def get_dp_padding(self,
         #     dp_size,
         #     device="cpu",
         #     dtype=torch.int32).item()
-        return max_tokens_across_dp_cpu
+        return max_tokens_across_dp_cpu - num_tokens

     def _execute_model_generic(self,
                                token_ids,
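
Note on the hunks above: get_dp_padding now returns only the extra padding this rank needs to reach the maximum count across data-parallel ranks (instead of a (padded_value, None) tuple), which is why the call sites switch from "=" to "+=". A minimal sketch of that contract, assuming the per-rank counts have already been gathered; dp_padding_delta and tokens_across_dp are illustrative names, not part of this diff:

    # Hypothetical stand-in for the arithmetic get_dp_padding performs after
    # gathering per-rank token counts.
    def dp_padding_delta(num_tokens: int, tokens_across_dp: list[int]) -> int:
        # Extra tokens this rank must add so all DP ranks run the same shape.
        return max(tokens_across_dp) - num_tokens

    # Call sites then grow their current size by the delta:
    target_bs = 4
    target_bs += dp_padding_delta(target_bs, tokens_across_dp=[4, 6, 5])
    assert target_bs == 6
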
@@ -1643,11 +1643,9 @@ def apply_grammar_bitmask(
             logits_cpu.to(self.device, non_blocking=True).to(logits.dtype))

     @torch.inference_mode()
-    def execute_model(
-        self,
-        scheduler_output: "SchedulerOutput",
-        warmup_mode=False,
-    ) -> ModelRunnerOutput:
+    def execute_model(self,
+                      scheduler_output: "SchedulerOutput",
+                      warmup_mode=False) -> ModelRunnerOutput:
         # NOTE(kzawora): Since scheduler doesn't differentiate between prefills
         # and decodes, we must handle mixed batches. In _update_states we make
         # sure that first self.input_batch.num_decodes requests are decodes,
@@ -1751,8 +1749,12 @@ def execute_model(
             htorch.core.mark_step()
             prefill_hidden_states_ts, logits_device = \
                 self._execute_model_generic(
-                    token_ids, position_ids, attn_metadata, logits_indices,
-                    self.kv_caches, warmup_mode=warmup_mode)
+                    token_ids,
+                    position_ids,
+                    attn_metadata,
+                    logits_indices,
+                    self.kv_caches,
+                    warmup_mode=warmup_mode)
             htorch.core.mark_step()
             # Skip separate sampling for structured output
             if structured_output:
@@ -2477,7 +2479,6 @@ def __del__(self):

     @torch.inference_mode()
     def profile_run(self) -> None:
-        return
         """Profile to measure peak memory during forward pass."""

         # use an empty tensor instead of `None`` to force Dynamo to pass
@@ -2497,10 +2498,14 @@ def profile_run(self) -> None:
         if max_seq_len % self.block_size != 0:
             max_seq_len = ((max_seq_len + self.block_size - 1) //
                            self.block_size) * self.block_size
+        max_seq_len = min(max_seq_len, self.max_model_len)

-        prompt_cfg = (max_prefill_batch_size, max_seq_len, 0)
-        decode_cfg = None
+        # different DP engine may have different config
+        max_seq_len += self.get_dp_padding(max_seq_len)
+        max_prefill_batch_size += self.get_dp_padding(max_prefill_batch_size)

+        prompt_cfg = (max_prefill_batch_size, max_seq_len - 1, 0)
+        decode_cfg = None
         self._execute_dummy_scenario(prompt_cfg, decode_cfg)

         # # Run empty prefill forwards - prefill max batch and prefill max seq
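
The profile_run hunks re-enable the dummy profiling pass: the early return is dropped, max_seq_len is rounded up to a block boundary but clamped to the model's maximum length, and both the sequence length and the prefill batch size get the same DP padding so every data-parallel engine profiles an identical worst-case shape. A small worked sketch of the round-and-clamp arithmetic, with block_size=128 and max_model_len=4096 chosen purely for illustration:

    block_size, max_model_len = 128, 4096   # hypothetical values
    max_seq_len = 4000
    if max_seq_len % block_size != 0:
        # Round up to the next multiple of block_size: 4000 -> 4096.
        max_seq_len = ((max_seq_len + block_size - 1) // block_size) * block_size
    # Never profile a sequence longer than the model allows.
    max_seq_len = min(max_seq_len, max_model_len)
    assert max_seq_len == 4096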

vllm_gaudi/v1/worker/hpu_worker.py

Lines changed: 6 additions & 5 deletions
@@ -165,14 +165,16 @@ def determine_available_memory(self) -> int:
         single_kv_block_size_bytes = 0
         for layer_name, layer_spec in kv_cache_spec.items():
             if isinstance(layer_spec, FullAttentionSpec):
-                dtype = layer_spec.dtype
+                # dtype = layer_spec.dtype

                 # Use an empty tensor instead of `None`` to force Dynamo to pass
                 # it by reference, rather by specializing on the value ``None``.
-                hpu_k_cache = torch.tensor([], dtype=dtype, device='hpu')
-                hpu_v_cache = torch.tensor([], dtype=dtype, device='hpu')
+                # hpu_k_cache = torch.tensor([], dtype=dtype, device='hpu')
+                # hpu_v_cache = torch.tensor([], dtype=dtype, device='hpu')

-                kv_caches[layer_name] = (hpu_k_cache, hpu_v_cache)
+                # kv_caches[layer_name] = (hpu_k_cache, hpu_v_cache)
+                # avoid issue of reading kv cache during profiling
+                kv_caches[layer_name] = None

                 single_kv_block_size_bytes += layer_spec.page_size_bytes

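In this determine_available_memory hunk, the placeholder HPU tensors are no longer allocated: each attention layer is registered with kv_caches[layer_name] = None so the profiling forward never reads an uninitialized KV cache, while single_kv_block_size_bytes still accumulates layer_spec.page_size_bytes to size one KV block. A minimal sketch of that accounting, using a stub spec in place of FullAttentionSpec (the layer names and byte counts below are purely illustrative):

    from dataclasses import dataclass

    @dataclass
    class StubAttentionSpec:  # illustrative stand-in for FullAttentionSpec
        page_size_bytes: int

    kv_cache_spec = {
        "layers.0.attn": StubAttentionSpec(page_size_bytes=2 * 1024 * 1024),
        "layers.1.attn": StubAttentionSpec(page_size_bytes=2 * 1024 * 1024),
    }

    kv_caches = {}
    single_kv_block_size_bytes = 0
    for layer_name, layer_spec in kv_cache_spec.items():
        kv_caches[layer_name] = None  # nothing to read during profiling
        single_kv_block_size_bytes += layer_spec.page_size_bytes

    assert single_kv_block_size_bytes == 4 * 1024 * 1024
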
@@ -287,7 +289,6 @@ def init_worker_distributed_environment(
     local_rank: int = -1,
 ) -> None:
     """Initialize the distributed environment."""
-    print("Wuxun debug>> ", parallel_config)
     init_distributed_environment(parallel_config.world_size,
                                  rank,
                                  distributed_init_method,
