From 7ac1bb3f78bbfe6bc2cfac4c1de56e2384bab72b Mon Sep 17 00:00:00 2001
From: slokesha
Date: Tue, 14 Oct 2025 23:17:18 +0000
Subject: [PATCH 1/3] replaced apply_rotary_emb_torch() with rotary_embedding imp

Signed-off-by: slokesha
---
 vllm/model_executor/models/siglip2navit.py | 47 +++++++++++-----------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index dc0022a169c4..cf45a32dce13 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -16,6 +16,7 @@
 from vllm.platforms import _Backend, current_platform
 
 from .vision import get_vit_attn_backend
+from vllm.model_executor.layers.rotary_embedding import _apply_rotary_emb_torch
 
 is_hpu = current_platform.is_hpu()
 
@@ -144,26 +145,26 @@ def rotate_half(x, interleaved=False):
                          two=2)
 
 
-def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
-    """
-    x: (batch_size, seqlen, nheads, headdim)
-    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
-    """
-    ro_dim = cos.shape[-1] * 2
-    assert ro_dim <= x.shape[-1]
-    cos = repeat(
-        cos,
-        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
-    sin = repeat(
-        sin,
-        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
-    return torch.cat(
-        [
-            x[..., :ro_dim] * cos +
-            rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]
-        ],
-        dim=-1,
-    )
+# def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
+#     """
+#     x: (batch_size, seqlen, nheads, headdim)
+#     cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+#     """
+#     ro_dim = cos.shape[-1] * 2
+#     assert ro_dim <= x.shape[-1]
+#     cos = repeat(
+#         cos,
+#         "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+#     sin = repeat(
+#         sin,
+#         "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+#     return torch.cat(
+#         [
+#             x[..., :ro_dim] * cos +
+#             rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]
+#         ],
+#         dim=-1,
+#     )
 
 
 def apply_rotary_pos_emb(
@@ -179,11 +180,11 @@ def apply_rotary_pos_emb(
         from flash_attn.layers.rotary import apply_rotary_emb
         apply_rotary_emb_func = apply_rotary_emb
     else:
-        apply_rotary_emb_func = apply_rotary_emb_torch
+        apply_rotary_emb_func = _apply_rotary_emb_torch
     q_embed = apply_rotary_emb_func(q.float(), cos.float(),
-                                    sin.float()).type_as(q)
+                                    sin.float(), False).type_as(q)
     k_embed = apply_rotary_emb_func(k.float(), cos.float(),
-                                    sin.float()).type_as(k)
+                                    sin.float(), False).type_as(k)
     return q_embed, k_embed
 
 
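Illustration (not part of the patches): compared with the commented-out local helper, the imported _apply_rotary_emb_torch is called with a fourth positional argument, which the call sites above now hard-code to False. A minimal shape-level sketch of that call, assuming a vLLM install and the (batch_size, seqlen, nheads, headdim) / (seqlen, rotary_dim / 2) layouts documented in the old helper's docstring:

import torch

from vllm.model_executor.layers.rotary_embedding import _apply_rotary_emb_torch

# Shapes follow the docstring of the helper commented out above; the values
# are random and only exercise the call made in apply_rotary_pos_emb().
q = torch.randn(1, 16, 4, 64)   # (batch_size, seqlen, nheads, headdim)
cos = torch.randn(16, 32)       # (seqlen, rotary_dim / 2)
sin = torch.randn(16, 32)

# The trailing False mirrors the hard-coded fourth argument in the patch; its
# meaning is defined by vLLM's rotary_embedding module, not by this file.
q_embed = _apply_rotary_emb_torch(q.float(), cos.float(), sin.float(),
                                  False).type_as(q)
print(q_embed.shape)            # torch.Size([1, 16, 4, 64])
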
1 (d 2)") +# return torch.cat( +# [ +# x[..., :ro_dim] * cos + +# rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:] +# ], +# dim=-1, +# ) def apply_rotary_pos_emb( @@ -179,11 +180,11 @@ def apply_rotary_pos_emb( from flash_attn.layers.rotary import apply_rotary_emb apply_rotary_emb_func = apply_rotary_emb else: - apply_rotary_emb_func = apply_rotary_emb_torch + apply_rotary_emb_func = _apply_rotary_emb_torch q_embed = apply_rotary_emb_func(q.float(), cos.float(), - sin.float()).type_as(q) + sin.float(), False).type_as(q) k_embed = apply_rotary_emb_func(k.float(), cos.float(), - sin.float()).type_as(k) + sin.float(), False).type_as(k) return q_embed, k_embed From 6215912b15906b0d703dd44dc85a469b70f983e0 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Sun, 10 Aug 2025 04:31:42 +0000 Subject: [PATCH 2/3] add debug profile --- vllm/worker/hpu_worker.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 83fe668fd123..81816179cf00 100755 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -14,6 +14,11 @@ import habana_frameworks.torch as htorch # noqa:F401 import torch import torch.distributed + +from vllm_hpu_extension.debug import init_debug_logger +from vllm_hpu_extension.profiler import (HabanaMemoryProfiler, format_bytes, + setup_profiler) +from vllm_hpu_extension.runtime import get_config from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes import vllm.envs as envs @@ -38,6 +43,12 @@ logger = init_logger(__name__) +def setup_step_profiler(steps): + if steps is None: + return None + step_start, step_end = steps + active = step_end - step_start + 1 + return setup_profiler(warmup=0, active=active) class HPUWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a HPU. @@ -122,6 +133,10 @@ def __init__( on_trace_ready=fn(torch_profiler_trace_dir, use_gzip=True)) else: self.profiler = None + self.step = 0 + self.profile_steps = get_config().VLLM_PROFILE_STEPS + self.step_profiler = setup_step_profiler(self.profile_steps) + self.step_debug = init_debug_logger('steps') def _is_encoder_decoder_model(self): return self.model_config.is_encoder_decoder @@ -191,6 +206,10 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, ) -> Optional[List[SamplerOutput]]: + if self.step_debug: + self.step_debug(f'step={self.step}') + if self.step_profiler and self.step == self.profile_steps[0]: + self.step_profiler.start() # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! 
From bd3e8cfe9b7cc7c9ffc47af7cdcac96645f94fd4 Mon Sep 17 00:00:00 2001
From: slokesha
Date: Wed, 22 Oct 2025 13:10:02 -0700
Subject: [PATCH 3/3] Using HPU rotary_embedding

Signed-off-by: slokesha
---
 vllm/model_executor/models/siglip2navit.py | 75 +++++++++++++---------
 1 file changed, 45 insertions(+), 30 deletions(-)

diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index cf45a32dce13..38630e6bfe2f 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -16,7 +16,7 @@
 from vllm.platforms import _Backend, current_platform
 
 from .vision import get_vit_attn_backend
-from vllm.model_executor.layers.rotary_embedding import _apply_rotary_emb_torch
+from vllm.model_executor.layers.rotary_embedding import get_rope
 
 is_hpu = current_platform.is_hpu()
 
@@ -145,26 +145,26 @@ def rotate_half(x, interleaved=False):
                          two=2)
 
 
-# def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
-#     """
-#     x: (batch_size, seqlen, nheads, headdim)
-#     cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
-#     """
-#     ro_dim = cos.shape[-1] * 2
-#     assert ro_dim <= x.shape[-1]
-#     cos = repeat(
-#         cos,
-#         "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
-#     sin = repeat(
-#         sin,
-#         "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
-#     return torch.cat(
-#         [
-#             x[..., :ro_dim] * cos +
-#             rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]
-#         ],
-#         dim=-1,
-#     )
+def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
+    """
+    x: (batch_size, seqlen, nheads, headdim)
+    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+    """
+    ro_dim = cos.shape[-1] * 2
+    assert ro_dim <= x.shape[-1]
+    cos = repeat(
+        cos,
+        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+    sin = repeat(
+        sin,
+        "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+    return torch.cat(
+        [
+            x[..., :ro_dim] * cos +
+            rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]
+        ],
+        dim=-1,
+    )
 
 
 def apply_rotary_pos_emb(
@@ -180,11 +180,11 @@ def apply_rotary_pos_emb(
         from flash_attn.layers.rotary import apply_rotary_emb
         apply_rotary_emb_func = apply_rotary_emb
     else:
-        apply_rotary_emb_func = _apply_rotary_emb_torch
+        apply_rotary_emb_func = apply_rotary_emb_torch
     q_embed = apply_rotary_emb_func(q.float(), cos.float(),
-                                    sin.float(), False).type_as(q)
+                                    sin.float()).type_as(q)
     k_embed = apply_rotary_emb_func(k.float(), cos.float(),
-                                    sin.float(), False).type_as(k)
+                                    sin.float()).type_as(k)
     return q_embed, k_embed
 
 
1 (d 2)") + return torch.cat( + [ + x[..., :ro_dim] * cos + + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:] + ], + dim=-1, + ) def apply_rotary_pos_emb( @@ -180,11 +180,11 @@ def apply_rotary_pos_emb( from flash_attn.layers.rotary import apply_rotary_emb apply_rotary_emb_func = apply_rotary_emb else: - apply_rotary_emb_func = _apply_rotary_emb_torch + apply_rotary_emb_func = apply_rotary_emb_torch q_embed = apply_rotary_emb_func(q.float(), cos.float(), - sin.float(), False).type_as(q) + sin.float()).type_as(q) k_embed = apply_rotary_emb_func(k.float(), cos.float(), - sin.float(), False).type_as(k) + sin.float()).type_as(k) return q_embed, k_embed @@ -212,6 +212,17 @@ def __init__(self, config): self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) self.use_rope = config.use_rope + max_position = getattr(config, "max_position_embeddings", 4096 * 32) + rope_theta = getattr(config, "rope_theta", 10000.0) + rope_scaling = getattr(config, "rope_scaling", None) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=rope_theta, + rope_scaling=rope_scaling, + ) # Detect attention implementation. self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) @@ -244,12 +255,16 @@ def forward( values = values.view(seq_length, self.num_heads, self.head_dim) if self.use_rope: - cos, sin = position_embeddings - queries, keys = apply_rotary_pos_emb(queries.unsqueeze(0), - keys.unsqueeze(0), cos, sin, - self.is_flash_attn_backend) - queries = queries.squeeze(0) - keys = keys.squeeze(0) + # cos, sin = position_embeddings + # queries, keys = apply_rotary_pos_emb(queries.unsqueeze(0), + # keys.unsqueeze(0), cos, sin, + # self.is_flash_attn_backend) + # queries = queries.squeeze(0) + # keys = keys.squeeze(0) + seq_len = queries.shape[0] + positions = torch.arange(seq_len, device=queries.device) + queries, keys = self.rotary_emb(positions, queries.unsqueeze(0), keys.unsqueeze(0)) + queries, keys = queries.squeeze(0), keys.squeeze(0) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if self.is_flash_attn_backend: