@@ -10,7 +10,8 @@
 
 import torch
 import vllm_hpu_extension.ops as ops
-from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
+from vllm_hpu_extension.utils import (Matmul, ModuleFusedSDPA, Softmax,
+                                      VLLMKVCache)
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer,
@@ -137,9 +138,17 @@ def __init__(
 
         self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
                                               '0').lower() in ['1', 'true']
+        self.fused_scaled_dot_product_attention = None
         if self.prefill_usefusedsdpa:
             assert alibi_slopes is None, \
                 'Prefill with FusedSDPA not supported with alibi slopes!'
+            try:
+                from habana_frameworks.torch.hpex.kernels import FusedSDPA
+                self.fused_scaled_dot_product_attention = ModuleFusedSDPA(
+                    FusedSDPA)
+            except ImportError:
+                logger().warning("Could not import HPU FusedSDPA kernel. "
+                                 "vLLM will use native implementation.")
 
         suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
         if head_size not in suppored_head_sizes:
@@ -227,6 +236,7 @@ def forward(
                 matmul_qk_op=self.matmul_qk,
                 softmax_op=self.softmax,
                 matmul_av_op=self.matmul_av,
+                fsdpa_op=self.fused_scaled_dot_product_attention,
             )
             output = out.reshape(batch_size, seq_len, hidden_size)
         else:
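For context, the new prefill path is opt-in: __init__ above only wraps the Habana kernel in ModuleFusedSDPA when the VLLM_PROMPT_USE_FUSEDSDPA environment variable is truthy and the import succeeds; otherwise fsdpa_op stays None and prompt attention keeps using the native matmul/softmax path. A minimal sketch of enabling the flag (the variable name and accepted values come from the diff; the assumption is that it must be set before vLLM constructs the HPU attention backend, since the flag is read in __init__):

    import os

    # Opt in to the HPU FusedSDPA prefill kernel before the attention backend
    # is built; per the diff, '1' and 'true' are the accepted truthy values.
    os.environ["VLLM_PROMPT_USE_FUSEDSDPA"] = "1"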