Commit f0ee01e

harsh-nod authored and nithinsubbiah committed

[Wave] Add support for sliding window attention (#626)

Signed-off-by: Harsh Menon <[email protected]>
Signed-off-by: nithinsubbiah <[email protected]>
1 parent e25b3ca commit f0ee01e

File tree

4 files changed: +82 -5 lines changed


iree/turbine/kernel/wave/templates/vanilla_attention.py

Lines changed: 10 additions & 0 deletions
@@ -19,7 +19,14 @@ def get_vanilla_attention_kernel(
     dynamic_dims: bool,
     is_causal: bool = False,
     is_v_transposed: bool = False,
+    sliding_window_size: int = -1,
 ):
+
+    if sliding_window_size > 0 and not is_causal:
+        raise NotImplementedError(
+            "Sliding window is only supported for causal attention."
+        )
+
     # Input sizes
     B = tkl.sym.B
     M = tkl.sym.M
@@ -78,6 +85,7 @@ def base_attention_core(q, k, v, c):
         c_reg = tkl.Register[B, N, M, tkl.f32](0.0)
         init_sum = tkl.Register[B, M, tkl.f32](0.0)
         init_max = tkl.Register[B, M, tkl.f32](-1e6)
+        sliding_window = tkl.Register[M, K2, tkl.i64](sliding_window_size)
         ZEROF = tkl.Register[M, K2, tkl.f32](0.0)
         MIN_INF = tkl.Register[M, K2, tkl.f32](-1e6)

@@ -106,6 +114,8 @@ def repeat(
             m_index = tkw.self_index(M, tkl.i64)
             m_index = tkw.broadcast(m_index, target_shape=[M, K2])
             mask = (m_index >= k2_index) & mask
+            if sliding_window_size > 0:
+                mask = (m_index - k2_index <= sliding_window) & mask
             mask = tkw.cast(mask, tkw.i1)
             bias = tkw.select(mask, ZEROF, MIN_INF)
             x_j = x_j + bias
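
For intuition: the combined predicate above keeps key position k2 for query position m only when k2 <= m (causal) and m - k2 <= sliding_window_size. A minimal PyTorch sketch of the same dense mask, for illustration only (the function name and arguments are not part of the kernel):

import torch

def dense_sliding_window_mask(q_len: int, kv_len: int, window: int) -> torch.Tensor:
    # m_index / k2_index mirror the query and key position indices used in the kernel.
    m_index = torch.arange(q_len).unsqueeze(1)    # shape [q_len, 1]
    k2_index = torch.arange(kv_len).unsqueeze(0)  # shape [1, kv_len]
    causal = m_index >= k2_index                  # mask = (m_index >= k2_index) & mask
    in_window = (m_index - k2_index) <= window    # mask = (m_index - k2_index <= sliding_window) & mask
    return causal & in_window

Positions where this mask is False receive the MIN_INF bias before the softmax, so they contribute effectively zero attention weight.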

iree/turbine/kernel/wave/utils/torch_utils.py

Lines changed: 4 additions & 0 deletions
@@ -49,3 +49,7 @@ def device_randperm(*args, **kwargs):

 def device_zeros(*args, **kwargs):
     return to_default_device(torch.zeros(*args, **kwargs))
+
+
+def device_ones(*args, **kwargs):
+    return to_default_device(torch.ones(*args, **kwargs))
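
A small usage sketch of the new helper, assuming the torch_utils module above; the shape and dtype are illustrative:

import torch
from iree.turbine.kernel.wave.utils.torch_utils import device_ones

# All-True boolean tensor placed on the default device, analogous to device_zeros.
mask = device_ones((128, 256), dtype=torch.bool)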

lit_tests/kernel/wave/attention/attention.py

Lines changed: 46 additions & 1 deletion
@@ -20,7 +20,6 @@
 )
 from iree.turbine.kernel.wave.scheduling.schedule import SchedulingType
 from iree.turbine.kernel.wave.compile import WaveCompileOptions, wave_compile
-import torch

 # Input sizes
 B = tkl.sym.B
@@ -429,3 +428,49 @@ def test_attention_bshd():
     # CHECK-COUNT-8: {{.*}} = arith.addf %{{.*}}, %{{.*}} : vector<4xf32>
     # CHECK-COUNT-8: {{.*}} = gpu.shuffle xor {{.*}}
     # CHECK-COUNT-8: {{.*}} = amdgpu.mfma
+
+
+@run_test
+def test_attention_sliding_window():
+    shape = AttentionShape(
+        num_query_heads=8,
+        num_kv_heads=8,
+        query_seq_len=128,
+        head_size_kv=128,
+        head_size=64,
+        kv_seq_len=256,
+    )
+    mfma_variant = (tkw.MMAType.F32_16x16x16_F16,) * 2
+    base_attention, hyperparams, _, _ = get_vanilla_attention_kernel(
+        shape, mfma_variant, False, is_causal=True, sliding_window_size=1024
+    )
+
+    options = WaveCompileOptions(
+        subs=hyperparams,
+        canonicalize=True,
+        run_bench=False,
+        schedule=SchedulingType.NONE,
+        use_scheduling_barriers=False,
+        compile_to_mlir=True,
+    )
+    base_attention = wave_compile(options, base_attention)
+    print(base_attention.asm)
+
+    # CHECK-LABEL: func.func @base_attention
+    # CHECK: %[[NEG_INF:.+]] = arith.constant dense<-1.000000e+06> : vector<4xf32>
+    # CHECK: %[[WINDOW_SIZE:.+]] = arith.constant dense<1024> : vector<4xi64>
+    # CHECK: %[[ZERO:.+]] = arith.constant dense<0.000000e+00> : vector<4xf32>
+    # CHECK: {{.*}} = scf.for
+    # CHECK-COUNT-32: {{.*}} = amdgpu.mfma
+    # CHECK-COUNT-4: {{.*}} = arith.cmpi slt, {{.*}} : vector<4xindex>
+    # CHECK-COUNT-8: {{.*}} = arith.cmpi sge, {{.*}} : vector<4xi64>
+    # CHECK-COUNT-8: {{.*}} = arith.andi {{.*}} : vector<4xi1>
+    # This is computing the index difference: m_index - k2_index
+    # CHECK-COUNT-8: {{.*}} = arith.subi {{.*}} : vector<4xi64>
+    # And then comparing to the window size: m_index - k2_index <= window_size
+    # CHECK-COUNT-8: {{.*}} = arith.cmpi sle, {{.*}}, %[[WINDOW_SIZE]] : vector<4xi64>
+    # CHECK-COUNT-8: {{.*}} = arith.andi {{.*}} : vector<4xi1>
+    # CHECK-COUNT-8: {{.*}} = arith.select %{{.*}}, %[[ZERO]], %[[NEG_INF]] : vector<4xi1>, vector<4xf32>
+    # CHECK-COUNT-8: {{.*}} = arith.addf %{{.*}}, %{{.*}} : vector<4xf32>
+    # CHECK-COUNT-8: {{.*}} = gpu.shuffle xor {{.*}}
+    # CHECK-COUNT-32: {{.*}} = amdgpu.mfma
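
The arith.select / arith.addf patterns checked above correspond to the additive-bias masking trick: masked positions receive a large negative bias before the softmax so their weights vanish. A tiny standalone PyTorch illustration (the values are made up):

import torch

scores = torch.tensor([1.0, 2.0, 3.0])
keep = torch.tensor([True, True, False])          # result of the causal/sliding-window comparisons
bias = torch.where(keep, torch.tensor(0.0), torch.tensor(-1e6))  # select(mask, ZEROF, MIN_INF)
weights = torch.softmax(scores + bias, dim=0)     # x_j = x_j + bias, then softmax
# weights[2] is ~0, so the masked key contributes essentially nothing.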

tests/kernel/wave/attention/vanilla_attention_test.py

Lines changed: 22 additions & 4 deletions
@@ -21,6 +21,7 @@
 from iree.turbine.kernel.wave.utils.torch_utils import (
     device_randn,
     device_zeros,
+    device_ones,
 )
 from iree.turbine.kernel.wave.compile import WaveCompileOptions, wave_compile
 from iree.turbine.kernel.wave.constraints import MMAType
@@ -217,6 +218,7 @@ def testAttentionPure(
 @require_e2e
 @pytest.mark.parametrize("shape", get_test_shapes("all_attention"))
 @pytest.mark.parametrize("enable_scheduling", [SchedulingType.NONE])
+@pytest.mark.parametrize("sliding_window", ([-1, 1024]))
 @param_bool("dynamic_dims", "dyn", [False])
 @pytest.mark.parametrize(
     "mfma_variant",
@@ -228,6 +230,7 @@ def testAttentionPure(
 def testAttentionCausal(
     shape: tuple[int],
     enable_scheduling: SchedulingType,
+    sliding_window: int,
     dynamic_dims: bool,
     mfma_variant: tuple[MMAType],
     request,
@@ -248,7 +251,12 @@ def testAttentionCausal(
         dynamic_symbols,
         dynamic_symbols_map,
     ) = get_vanilla_attention_kernel(
-        shape, mfma_variant, dynamic_dims, is_causal=True, is_v_transposed=True
+        shape,
+        mfma_variant,
+        dynamic_dims,
+        is_causal=True,
+        is_v_transposed=True,
+        sliding_window_size=sliding_window,
     )
     q_shape = (shape.num_query_heads, shape.query_seq_len, shape.head_size)
     k_shape = (shape.num_kv_heads, shape.kv_seq_len, shape.head_size)
@@ -284,9 +292,19 @@ def testAttentionCausal(
     dk_sqrt = math.sqrt(1.0 / shape.head_size)
     # TODO: Add scaling of QK as part of kernel.
     asm = base_attention(q * dk_sqrt * log2e, k, v.permute([0, 2, 1]), output)
-    torch_ref = torch.nn.functional.scaled_dot_product_attention(
-        q, k, v, is_causal=True
-    )
+    if sliding_window >= 0:
+
+        def sliding_window_mask(q_seq_length, kv_seq_length, window_size):
+            mask = device_ones((q_seq_length, kv_seq_length), dtype=torch.bool)
+            mask = mask.tril().triu(-sliding_window)
+            return mask.to(dtype=torch.bool)
+
+        mask = sliding_window_mask(
+            shape.query_seq_len, shape.kv_seq_len, sliding_window
+        )
+        torch_ref = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
+    else:
+        torch_ref = F.scaled_dot_product_attention(q, k, v, is_causal=True)

     if dump_generated_mlir:
         filename = f"wave_attention_{'x'.join(map(str, shape))}.mlir"
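
The tril().triu(-window) chain in sliding_window_mask builds the reference mask in two steps: tril() keeps keys at or before the query position (causal), and triu(-window) drops keys more than window positions behind it. A small worked example, with shape and window size chosen only for illustration:

import torch

window = 2
mask = torch.ones((5, 5), dtype=torch.bool).tril().triu(-window)
# Row i allows key columns max(0, i - window) .. i; e.g. row 4 -> columns 2, 3, 4.
print(mask.int())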
