diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index dc0022a169c4..38630e6bfe2f 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -16,6 +16,7 @@
 from vllm.platforms import _Backend, current_platform
 
 from .vision import get_vit_attn_backend
+from vllm.model_executor.layers.rotary_embedding import get_rope
 
 is_hpu = current_platform.is_hpu()
 
@@ -211,6 +212,17 @@ def __init__(self, config):
         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
         self.use_rope = config.use_rope
+        max_position = getattr(config, "max_position_embeddings", 4096 * 32)
+        rope_theta = getattr(config, "rope_theta", 10000.0)
+        rope_scaling = getattr(config, "rope_scaling", None)
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
 
         # Detect attention implementation.
         self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True)
@@ -243,12 +255,11 @@ def forward(
         values = values.view(seq_length, self.num_heads, self.head_dim)
 
         if self.use_rope:
-            cos, sin = position_embeddings
-            queries, keys = apply_rotary_pos_emb(queries.unsqueeze(0),
-                                                 keys.unsqueeze(0), cos, sin,
-                                                 self.is_flash_attn_backend)
-            queries = queries.squeeze(0)
-            keys = keys.squeeze(0)
+            seq_len = queries.shape[0]
+            positions = torch.arange(seq_len, device=queries.device)
+            queries, keys = self.rotary_emb(positions, queries.unsqueeze(0),
+                                            keys.unsqueeze(0))
+            queries, keys = queries.squeeze(0), keys.squeeze(0)
 
         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
         if self.is_flash_attn_backend:
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 83fe668fd123..81816179cf00 100755
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -14,6 +14,10 @@
 import habana_frameworks.torch as htorch  # noqa:F401
 import torch
 import torch.distributed
-from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes
+
+from vllm_hpu_extension.debug import init_debug_logger
+from vllm_hpu_extension.profiler import (HabanaMemoryProfiler, format_bytes,
+                                         setup_profiler)
+from vllm_hpu_extension.runtime import get_config
 
 import vllm.envs as envs
@@ -38,6 +42,14 @@
 
 logger = init_logger(__name__)
 
+
+def setup_step_profiler(steps):
+    if steps is None:
+        return None
+    step_start, step_end = steps
+    active = step_end - step_start + 1
+    return setup_profiler(warmup=0, active=active)
+
 
 class HPUWorker(LocalOrDistributedWorkerBase):
     """A worker class that executes (a partition of) the model on a HPU.
@@ -122,6 +134,10 @@ def __init__(
                 on_trace_ready=fn(torch_profiler_trace_dir, use_gzip=True))
         else:
             self.profiler = None
+        self.step = 0
+        self.profile_steps = get_config().VLLM_PROFILE_STEPS
+        self.step_profiler = setup_step_profiler(self.profile_steps)
+        self.step_debug = init_debug_logger('steps')
 
     def _is_encoder_decoder_model(self):
         return self.model_config.is_encoder_decoder
@@ -191,6 +207,10 @@ def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None,
     ) -> Optional[List[SamplerOutput]]:
+        if self.step_debug:
+            self.step_debug(f'step={self.step}')
+        if self.step_profiler and self.step == self.profile_steps[0]:
+            self.step_profiler.start()
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
         # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
@@ -249,11 +269,27 @@ def execute_model(
                     msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
                            f"{cpu_fallback_local_metric.stats()}, {input_stats}")
                     logger.warning(msg)
+            if self.step_profiler:
+                if self.step >= self.profile_steps[0]:
+                    self.step_profiler.step()
+                if self.step == self.profile_steps[1]:
+                    self.step_profiler.stop()
+                    self.step_profiler = None
+                    raise RuntimeError('Step profiling finished!')
+            self.step += 1
 
             return output
 
         output = LocalOrDistributedWorkerBase.execute_model(
             self, execute_model_req)
+        if self.step_profiler:
+            if self.step >= self.profile_steps[0]:
+                self.step_profiler.step()
+            if self.step == self.profile_steps[1]:
+                self.step_profiler.stop()
+                self.step_profiler = None
+                raise RuntimeError('Step profiling finished!')
+        self.step += 1
         return output
 
     @torch.inference_mode()
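
Review note on the siglip2navit.py hunks: they swap the cos/sin-based apply_rotary_pos_emb call for a rotary module built with vLLM's get_rope, driven by a flat torch.arange over the packed patch sequence. The sketch below is a minimal pure-PyTorch reference of what that rotary step computes, assuming NeoX-style (rotate-half) rotation and a plain 1D position index; rotate_half and apply_rope_reference are illustrative helpers, not part of vLLM or this patch.

```python
# Minimal PyTorch reference for the rotary step in the patch above, assuming
# NeoX-style (rotate-half) rotary embeddings over a flat position index.
# Illustrative sketch only, not vLLM's fused RotaryEmbedding kernel.
import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension into two halves and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rope_reference(q: torch.Tensor, k: torch.Tensor,
                         base: float = 10000.0):
    # q, k: [seq_len, num_heads, head_dim] packed vision patches.
    seq_len, _, head_dim = q.shape
    positions = torch.arange(seq_len, dtype=torch.float32)
    inv_freq = 1.0 / (base**(torch.arange(0, head_dim, 2,
                                          dtype=torch.float32) / head_dim))
    freqs = torch.outer(positions, inv_freq)   # [seq_len, head_dim // 2]
    emb = torch.cat((freqs, freqs), dim=-1)    # [seq_len, head_dim]
    cos = emb.cos()[:, None, :]                # broadcast over heads
    sin = emb.sin()[:, None, :]
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin


# Example: 64 packed patches, 12 heads, head_dim 64.
q = torch.randn(64, 12, 64)
k = torch.randn(64, 12, 64)
q_rot, k_rot = apply_rope_reference(q, k)
print(q_rot.shape, k_rot.shape)  # torch.Size([64, 12, 64]) twice
```

One design point worth confirming in review: if the vision tower originally encoded 2D (per-axis) patch positions through position_embeddings, replacing them with a flat arange over the packed sequence changes the positional signal, so outputs should be checked against the reference model.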
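Review note on the hpu_worker.py hunks: they add a bounded step-profiling window. get_config().VLLM_PROFILE_STEPS is unpacked as a (step_start, step_end) pair, the profiler returned by setup_profiler() is started at step_start, stepped on every step in the window, then stopped at step_end and the run aborted with a RuntimeError. The sketch below mirrors that control flow with a stand-in profiler so it runs without vllm_hpu_extension; DummyStepProfiler and run_steps are hypothetical names used only for illustration.

```python
# Stand-in sketch of the step-window bookkeeping added to
# HPUWorker.execute_model. DummyStepProfiler replaces the profiler returned by
# vllm_hpu_extension's setup_profiler() so the sketch runs anywhere; the real
# worker reads the (step_start, step_end) pair from
# get_config().VLLM_PROFILE_STEPS.
class DummyStepProfiler:

    def start(self):
        print("profiler started")

    def step(self):
        print("profiler stepped")

    def stop(self):
        print("profiler stopped")


def run_steps(profile_steps, total_steps=10):
    step_profiler = DummyStepProfiler() if profile_steps else None
    step = 0
    for _ in range(total_steps):
        if step_profiler and step == profile_steps[0]:
            step_profiler.start()
        # ... model execution would happen here ...
        if step_profiler:
            if step >= profile_steps[0]:
                step_profiler.step()
            if step == profile_steps[1]:
                step_profiler.stop()
                step_profiler = None
                # The worker aborts once the window has been captured.
                raise RuntimeError('Step profiling finished!')
        step += 1


# Profile engine steps 3..5 inclusive, then stop, mirroring the diff.
try:
    run_steps((3, 5))
except RuntimeError as exc:
    print(exc)
```

Raising after the window appears to be a deliberate early exit so the captured trace is finalized without running the remainder of the workload; callers of execute_model should expect the run to terminate once profiling completes.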