
Commit 0677c69

aviator19941 authored and xintin committed
Move scaled_dot_product_attention_bhsd under iree.turbine (iree-org#870)
This PR moves `scaled_dot_product_attention_bhsd` under iree.turbine so that all of the reference kernels live in a single util file.

---------

Signed-off-by: aviator19941 <[email protected]>
Signed-off-by: xintin <[email protected]>
1 parent 43551dd commit 0677c69
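With this change, tests import the reference kernel from the library module rather than from the shared test utilities. The before/after below summarizes the import change; the exact relative form of the old import varies by test file (see the diffs that follow):

# Before: pulled in through the shared test utils
# from ..common.utils import scaled_dot_product_attention_bhsd

# After: imported from the new library module
from iree.turbine.kernel.wave.utils.reference_kernel_utils import (
    scaled_dot_product_attention_bhsd,
)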

File tree

6 files changed (+77, -65 lines)

iree/turbine/kernel/wave/utils/reference_kernel_utils.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+# Copyright 2025 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+
+def scaled_dot_product_attention_bhsd(
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    is_causal: bool = False,
+    sliding_window: int = -1,
+    custom_mask: Tensor | None = None,
+) -> Tensor:
+    """
+    This version mimics PyTorch's `torch.nn.functional.scaled_dot_product_attention`
+    with optional causal masking and improved numerical stability.
+    Intended for comparison and debugging purposes.
+    Args:
+        query (Tensor): query tensor of shape [B, H, S_q, D].
+        key (Tensor): key tensor of shape [B, H, S_k, D].
+        value (Tensor): value tensor of shape [B, H, S_k, D].
+        is_causal (bool): If True, applies causal masking to the attention logits.
+    Returns:
+        Tensor: Output tensor of shape [B, H, S_q, D] after applying attention.
+    """
+    if query.dtype != torch.float32:
+        query = query.to(torch.float32)
+    if key.dtype != torch.float32:
+        key = key.to(torch.float32)
+    if value.dtype != torch.float32:
+        value = value.to(torch.float32)
+
+    scale: float = query.shape[-1] ** -0.5
+    attn_logits: Tensor = torch.matmul(query, key.transpose(-2, -1)) * scale
+
+    if sliding_window >= 0:
+        assert is_causal, f"Sliding window only supported with causal"
+
+    if is_causal:
+        seq_len_q, seq_len_k = attn_logits.shape[-2], attn_logits.shape[-1]
+        causal_mask: Tensor = torch.tril(
+            torch.ones(
+                (seq_len_q, seq_len_k), device=attn_logits.device, dtype=torch.bool
+            )
+        )
+        if sliding_window >= 0:
+            causal_mask = causal_mask.triu(-sliding_window)
+        attn_logits = attn_logits.masked_fill(~causal_mask, float("-inf"))
+
+    if custom_mask is not None:
+        bool_mask = custom_mask.to(torch.bool)
+        bool_mask = bool_mask[:, None, :, None]
+        assert bool_mask.shape == (query.shape[0], 1, query.shape[2], 1)
+        attn_logits = attn_logits.masked_fill(bool_mask, float("-inf"))
+
+    # Improve numerical stability using log-sum-exp trick
+    attn_logits = attn_logits - attn_logits.max(dim=-1, keepdim=True).values
+    attn_weights: Tensor = F.softmax(attn_logits, dim=-1)
+    attn_weights = torch.nan_to_num(attn_weights, nan=0.0)
+
+    return torch.matmul(attn_weights, value)
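For comparison and debugging, the relocated helper can be checked directly against PyTorch's built-in SDPA. The snippet below is a minimal sketch, not part of this commit; the shapes and tolerances are illustrative assumptions:

import torch
import torch.nn.functional as F

from iree.turbine.kernel.wave.utils.reference_kernel_utils import (
    scaled_dot_product_attention_bhsd,
)

# Arbitrary [B, H, S, D] shapes, chosen only for illustration.
q = torch.randn(2, 4, 128, 64)
k = torch.randn(2, 4, 128, 64)
v = torch.randn(2, 4, 128, 64)

# Both paths compute causal attention in float32, so they should agree closely.
ref = scaled_dot_product_attention_bhsd(q, k, v, is_causal=True)
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
torch.testing.assert_close(ref, out, rtol=1e-4, atol=1e-4)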

tests/kernel/wave/attention/gqa_vanilla_attention_test.py

Lines changed: 3 additions & 1 deletion
@@ -26,7 +26,6 @@
     enable_scheduling_barriers,
     require_e2e,
     require_cdna3,
-    scaled_dot_product_attention_bhsd,
 )
 from ..common.shapes import get_test_shapes
 from iree.turbine.kernel.wave.templates.gqa_vanilla_attention import (
@@ -35,6 +34,9 @@
 from iree.turbine.kernel.wave.templates.attention_common import AttentionShape
 from iree.turbine.kernel.wave.scheduling.schedule import SchedulingType
 from iree.turbine.kernel.wave.compile import wave_compile, WaveCompileOptions
+from iree.turbine.kernel.wave.utils.reference_kernel_utils import (
+    scaled_dot_product_attention_bhsd,
+)
 
 
 @require_e2e

tests/kernel/wave/attention/vanilla_attention_test.py

Lines changed: 3 additions & 1 deletion
@@ -34,7 +34,6 @@
     param_bool,
     require_cdna3,
     require_e2e,
-    scaled_dot_product_attention_bhsd,
 )
 from ..common.shapes import get_test_shapes
 from iree.turbine.kernel.wave.templates.vanilla_attention import (
@@ -45,6 +44,9 @@
 from iree.turbine.kernel.wave.templates.attention_common import AttentionShape
 from iree.turbine.kernel.wave.scheduling.schedule import SchedulingType
 from iree.turbine.kernel.wave.compile import wave_compile, WaveCompileOptions
+from iree.turbine.kernel.wave.utils.reference_kernel_utils import (
+    scaled_dot_product_attention_bhsd,
+)
 
 
 @require_e2e

tests/kernel/wave/common/utils.py

Lines changed: 0 additions & 63 deletions
@@ -9,9 +9,6 @@
 from iree.turbine.kernel.wave.utils.run_utils import (
     get_default_arch,
 )
-import torch
-import torch.nn.functional as F
-from torch import Tensor
 
 require_e2e = pytest.mark.require_e2e
 expensive_test = pytest.mark.expensive_test
@@ -42,63 +39,3 @@ def param_bool(name, shortname=None, values=None):
     values = values or [False, True]
     ids = [f"{shortname}" if v else f"no_{shortname}" for v in values]
     return pytest.mark.parametrize(name, [pytest.param(v) for v in values], ids=ids)
-
-
-def scaled_dot_product_attention_bhsd(
-    query: Tensor,
-    key: Tensor,
-    value: Tensor,
-    is_causal: bool = False,
-    sliding_window: int = -1,
-    custom_mask: Tensor | None = None,
-) -> Tensor:
-    """
-    This version mimics PyTorch's `torch.nn.functional.scaled_dot_product_attention`
-    with optional causal masking and improved numerical stability.
-    Intended for comparison and debugging purposes.
-
-    Args:
-        query (Tensor): query tensor of shape [B, H, S_q, D].
-        key (Tensor): key tensor of shape [B, H, S_k, D].
-        value (Tensor): value tensor of shape [B, H, S_k, D].
-        is_causal (bool): If True, applies causal masking to the attention logits.
-
-    Returns:
-        Tensor: Output tensor of shape [B, H, S_q, D] after applying attention.
-    """
-    if query.dtype != torch.float32:
-        query = query.to(torch.float32)
-    if key.dtype != torch.float32:
-        key = key.to(torch.float32)
-    if value.dtype != torch.float32:
-        value = value.to(torch.float32)
-
-    scale: float = query.shape[-1] ** -0.5
-    attn_logits: Tensor = torch.matmul(query, key.transpose(-2, -1)) * scale
-
-    if sliding_window >= 0:
-        assert is_causal, f"Sliding window only supported with causal"
-
-    if is_causal:
-        seq_len_q, seq_len_k = attn_logits.shape[-2], attn_logits.shape[-1]
-        causal_mask: Tensor = torch.tril(
-            torch.ones(
-                (seq_len_q, seq_len_k), device=attn_logits.device, dtype=torch.bool
-            )
-        )
-        if sliding_window >= 0:
-            causal_mask = causal_mask.triu(-sliding_window)
-        attn_logits = attn_logits.masked_fill(~causal_mask, float("-inf"))
-
-    if custom_mask is not None:
-        bool_mask = custom_mask.to(torch.bool)
-        bool_mask = bool_mask[:, None, :, None]
-        assert bool_mask.shape == (query.shape[0], 1, query.shape[2], 1)
-        attn_logits = attn_logits.masked_fill(bool_mask, float("-inf"))
-
-    # Improve numerical stability using log-sum-exp trick
-    attn_logits = attn_logits - attn_logits.max(dim=-1, keepdim=True).values
-    attn_weights: Tensor = F.softmax(attn_logits, dim=-1)
-    attn_weights = torch.nan_to_num(attn_weights, nan=0.0)
-
-    return torch.matmul(attn_weights, value)

tests/kernel/wave/nn/functional/wave_quant_attention_test.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,8 @@
 from ...common.utils import (
     require_e2e,
     require_cdna3,
+)
+from iree.turbine.kernel.wave.utils.reference_kernel_utils import (
     scaled_dot_product_attention_bhsd,
 )
 
tests/kernel/wave/nn/functional/wave_sdpa_test.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,8 @@
 from ...common.utils import (
     require_e2e,
     require_cdna3,
+)
+from iree.turbine.kernel.wave.utils.reference_kernel_utils import (
     scaled_dot_product_attention_bhsd,
 )
 
0 commit comments