
Commit 0793bf1

[V1][MLA][SW-234434] Enable MLA for V1 - ported from vllm-gaudi (#1628)
https://jira.habana-labs.com/browse/SW-234434

## Essential Elements of an Effective PR Description Checklist

- [x] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing a test command.
- [ ] The test results, such as pasting a before/after results comparison or e2e results.

## Purpose

Backport the vllm-gaudi V1 MLA enablement to vllm-fork for a PRC customer request.

## Test on DeepSeek-V2-Lite-Chat

```
HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true \
PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 VLLM_CONTIGUOUS_PA=False \
lm_eval --model vllm \
  --model_args "pretrained=DeepSeek-V2-Lite-Chat/,tensor_parallel_size=1,distributed_executor_backend=mp,trust_remote_code=true,max_model_len=4096,use_v2_block_manager=True,dtype=bfloat16,max_num_seqs=128" \
  --tasks gsm8k --num_fewshot "5" \
  --batch_size "auto" --log_samples --output_path gsm8k_acc_DeepSeek-V2-Lite-Chat.json
```

| Tasks | Version | Filter           | n-shot | Metric      |   | Value  |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|-------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.6581 | ± | 0.0131 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.6482 | ± | 0.0132 |

## Test on DeepSeek-R1

```
HABANA_VISIBLE_DEVICES=all \
VLLM_CONTIGUOUS_PA=False \
VLLM_USE_V1=1 \
PT_HPU_LAZY_MODE=1 \
VLLM_SKIP_WARMUP=true \
PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
PT_HPU_WEIGHT_SHARING=0 \
lm_eval --model vllm \
  --model_args "pretrained=DeepSeek-R1,tensor_parallel_size=8,distributed_executor_backend=mp,trust_remote_code=true,max_model_len=16384,use_v2_block_manager=True,dtype=bfloat16,max_num_seqs=128,gpu_memory_utilization=0.9,enable_expert_parallel=True," \
  --tasks gsm8k --num_fewshot "8" \
  --batch_size "128" --limit 256 --log_samples --output_path gsm8k_acc_${MODEL_NAME}.json
```

vllm (pretrained=DeepSeek-R1,tensor_parallel_size=8,distributed_executor_backend=mp,trust_remote_code=true,max_model_len=16384,use_v2_block_manager=True,dtype=bfloat16,max_num_seqs=128,gpu_memory_utilization=0.9,enable_expert_parallel=True,), gen_kwargs: (None), limit: 256.0, num_fewshot: 8, batch_size: 128

| Tasks | Version | Filter           | n-shot | Metric      |   | Value  |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|-------:|---|-------:|
| gsm8k |       3 | flexible-extract |      8 | exact_match | ↑ | 0.9688 | ± | 0.0109 |
|       |         | strict-match     |      8 | exact_match | ↑ | 0.9609 | ± | 0.0121 |

Signed-off-by: Chendi.Xue <[email protected]>
1 parent bf3e6b0 commit 0793bf1
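
For a quicker smoke test than the full gsm8k runs above, a minimal offline-inference sketch of the same V1 MLA path. Assumptions: the prompt, sampling settings, and local model path are illustrative; the environment variables mirror the DeepSeek-V2-Lite-Chat command in the PR description and must be set before vLLM is imported.

```python
# Minimal V1 MLA smoke test on HPU (illustrative; not part of this commit).
import os

os.environ["HABANA_VISIBLE_DEVICES"] = "all"
os.environ["VLLM_USE_V1"] = "1"             # route to the V1 engine
os.environ["PT_HPU_LAZY_MODE"] = "1"        # HPU lazy execution mode
os.environ["VLLM_CONTIGUOUS_PA"] = "False"  # contiguous PA disabled in the tests above
os.environ["VLLM_SKIP_WARMUP"] = "true"     # skip warmup for a quick run

from vllm import LLM, SamplingParams

llm = LLM(
    model="DeepSeek-V2-Lite-Chat/",  # local checkpoint path, as in the test command
    trust_remote_code=True,
    dtype="bfloat16",
    max_model_len=4096,
    max_num_seqs=128,
)
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```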

File tree

3 files changed: +36 additions, −6 deletions

- vllm/attention/backends/hpu_attn.py
- vllm/platforms/hpu.py
- vllm/v1/worker/hpu_model_runner.py
vllm/attention/backends/hpu_attn.py

Lines changed: 24 additions & 4 deletions
```diff
@@ -250,18 +250,35 @@ def forward(
         if kv_cache is not None and len(kv_cache) == 2:
             self.latent_cache_k(latent_vec_k, kv_cache[0], slot_mapping)
             k_cache = kv_cache[0]
+        else:
+            k_cache = None

         if is_prefill:
-            return self._forward_prefill(q, k_c_normed, k_pe, attn_metadata,
-                                         batch_size)
+            return self._forward_prefill(q, latent_vec_k, k_cache,
+                                         attn_metadata, batch_size)
         else:
             return self._forward_decode(decode_ql_nope, q_pe, k_cache,
                                         attn_metadata, batch_size)

     def _forward_prefill(  # type: ignore
-            self, q: torch.Tensor, k_c_normed: torch.Tensor,
-            k_pe: torch.Tensor, attn_metadata: HPUAttentionMetadata,
+            self, q: torch.Tensor, latent_vec_k: torch.Tensor,
+            k_cache: torch.Tensor, attn_metadata: HPUAttentionMetadata,
             batch_size: int) -> torch.Tensor:
+        ##### get prefix cache #####
+        if attn_metadata.block_list is not None:
+            current = latent_vec_k
+            past = self.latent_cache_k.fetch_from_cache(
+                k_cache.unflatten(0, (-1, attn_metadata.block_size)),
+                attn_metadata.block_list)
+            past = past.view(-1, past.shape[-1])
+            current = torch.concat((past, current), dim=0)
+            latent_vec_k = current
+        # =========================== #
+
+        k_c_normed, k_pe = latent_vec_k.split(
+            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        k_pe = k_pe.view(-1, 1, self.qk_rope_head_dim)
+
         kv_nope = self.kv_b_proj(k_c_normed)[0]\
             .view(-1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
         k_nope, v = kv_nope\
@@ -290,11 +307,14 @@ def _forward_prefill(  # type: ignore
             value=v_padded,
             is_causal=True,
             attn_bias=attn_metadata.attn_bias,
+            position_bias=None,
             valid_seq_lengths=attn_metadata.seq_lens_tensor,
             scale=self.scale,
             matmul_qk_op=self.matmul_qk,
             softmax_op=self.softmax,
             matmul_av_op=self.matmul_av,
+            keys_fetch_func=self.latent_cache_k.fetch_from_cache,
+            values_fetch_func=None,
             fsdpa_op=self.fused_scaled_dot_product_attention.apply \
                 if self.fused_scaled_dot_product_attention is not None else None)
         attn_output = out.view(batch_size, -1, self.num_heads, q.shape[-1])
```
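
The `_forward_prefill` changes do two things: when `attn_metadata.block_list` is set, cached latent blocks are fetched and prepended to the freshly computed latent vectors, and the split into `k_c_normed` / `k_pe` now happens inside the method rather than in the caller. Below is a standalone sketch of the same tensor manipulation; the block and feature sizes are illustrative assumptions in the DeepSeek-V2-Lite range, and plain tensor indexing stands in for `latent_cache_k.fetch_from_cache`, whose exact signature is not shown here.

```python
# Illustrative prefix-cache concat + latent split (not the backend code itself).
import torch

kv_lora_rank, qk_rope_head_dim = 512, 64        # assumed MLA dimensions
latent_dim = kv_lora_rank + qk_rope_head_dim    # 576 features per cached token
block_size, num_blocks, new_tokens = 128, 4, 8

# Flat latent cache as stored on device: (num_blocks * block_size, latent_dim)
k_cache = torch.randn(num_blocks * block_size, latent_dim)
block_list = torch.tensor([2, 0])               # blocks holding the cached prefix

# Fetch the prefix blocks and flatten them back to (prefix_tokens, latent_dim)
past = k_cache.unflatten(0, (-1, block_size))[block_list]
past = past.view(-1, past.shape[-1])

# Prepend the cached prefix to the current tokens' latent vectors
latent_vec_k = torch.concat((past, torch.randn(new_tokens, latent_dim)), dim=0)

# Split into the compressed KV latent and the rotary key part
k_c_normed, k_pe = latent_vec_k.split([kv_lora_rank, qk_rope_head_dim], dim=-1)
k_pe = k_pe.view(-1, 1, qk_rope_head_dim)
print(k_c_normed.shape, k_pe.shape)  # torch.Size([264, 512]) torch.Size([264, 1, 64])
```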

vllm/platforms/hpu.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -40,9 +40,12 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
                              block_size: int, use_v1: bool,
                              use_mla: bool) -> str:
-        if use_v1:
+        if use_v1 and not use_mla:
             logger.info("Using HPUAttentionV1 backend.")
             return "vllm.v1.attention.backends.hpu_attn.HPUAttentionBackendV1"
+        if use_v1 and use_mla:
+            logger.info("Using HPUAttentionMLA backend.")
+            return "vllm.attention.backends.hpu_attn.HPUMLAAttentionBackend"
         if use_mla:
             logger.info("Using HPUAttentionMLA backend.")
             return "vllm.attention.backends.hpu_attn.HPUMLAAttentionBackend"
```

vllm/v1/worker/hpu_model_runner.py

Lines changed: 8 additions & 1 deletion
```diff
@@ -2349,11 +2349,18 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
                 kv_cache_shape = self.attn_backend.get_kv_cache_shape(
                     num_blocks + 1, kv_cache_spec.block_size,
                     kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
+                v_cache_shape = None if self.model_config.use_mla \
+                    else kv_cache_shape
                 dtype = kv_cache_spec.dtype
                 key_cache = torch.zeros(kv_cache_shape,
                                         dtype=dtype,
                                         device=self.device)
-                value_cache = torch.zeros_like(key_cache)
+                if v_cache_shape is not None:
+                    value_cache = torch.zeros(v_cache_shape,
+                                              dtype=dtype,
+                                              device=self.device)
+                else:
+                    value_cache = None
                 kv_caches[layer_name] = (key_cache, value_cache)
             else:
                 # TODO: add new branches when introducing more types of
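```

The runner-side change mirrors the backend change: for MLA layers only the key (latent) cache is materialized and `value_cache` stays `None`, so the value half of the cache is never allocated. A self-contained sketch of that behavior; the shapes, dtype, device, and helper name are illustrative assumptions, since the real shapes come from `attn_backend.get_kv_cache_shape()`.

```python
# Sketch of MLA-aware KV-cache allocation (illustration only, not the runner code).
from typing import Optional, Tuple

import torch


def allocate_kv_cache(kv_cache_shape, dtype, device,
                      use_mla: bool) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    key_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device)
    # With MLA the cached latent vector is enough to reconstruct both keys and
    # values, so no separate value cache is allocated.
    value_cache = None if use_mla else torch.zeros_like(key_cache)
    return key_cache, value_cache


k, v = allocate_kv_cache((16, 128, 1, 576), torch.bfloat16, "cpu", use_mla=True)
print(k.shape, v)  # torch.Size([16, 128, 1, 576]) None
```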
