[Wave] Use reciprocal to compute softcap logits (#674)

harsh-nod · web-flow · commit 836dc8da5da5 · 2025-04-02T07:05:48.000-07:00
This PR computes the softcap logits by multiplying by the reciprocal of
the logit cap instead of dividing by it, which is faster.

Signed-off-by: Harsh Menon <harsh@nod-labs.com>
diff --git a/iree/turbine/kernel/wave/templates/extend_attention.py b/iree/turbine/kernel/wave/templates/extend_attention.py
@@ -256,7 +256,8 @@ def first_loop(
             x_j = tkw.permute(inner_acc, target_shape=[H, N_Q, N_KV])
             x_j = x_j * layer_scale_reg
             if logit_cap > 0:
-                x_j = logit_cap_reg * tkw.tanh(x_j / logit_cap_reg)
+                logit_cap_reg_inv = tkw.reciprocal(logit_cap_reg)
+                x_j = logit_cap_reg * tkw.tanh(x_j * logit_cap_reg_inv)
             n_kv_index = tkw.self_index(N_KV, tkl.i32)
             mask = tkw.apply_expr(n_kv_index, lambda x: x < N_KV)
             mask = tkw.broadcast(mask, target_shape=[N_Q, N_KV])
@@ -308,7 +309,8 @@ def second_loop(
             x_j = tkw.permute(inner_acc, target_shape=[H, N_Q, N_KV])
             x_j = x_j * layer_scale_reg
             if logit_cap > 0:
-                x_j = logit_cap_reg * tkw.tanh(x_j / logit_cap_reg)
+                logit_cap_reg_inv = tkw.reciprocal(logit_cap_reg)
+                x_j = logit_cap_reg * tkw.tanh(x_j * logit_cap_reg_inv)
             n_kv_index = tkw.self_index(N_KV, tkl.i32)
             mask = tkw.apply_expr(n_kv_index, lambda x: x < N_KV)
             mask = tkw.broadcast(mask, target_shape=[N_Q, N_KV])
diff --git a/lit_tests/kernel/wave/attention/extend_attention.py b/lit_tests/kernel/wave/attention/extend_attention.py
@@ -184,7 +184,7 @@ def test_causal_extend_attention():
     # CHECK-COUNT-8:            amdgpu.mfma
 
     # softcap/logitcap modifier:
-    # CHECK-COUNT-2:            arith.divf
+    # CHECK-COUNT-4:            arith.mulf
     # CHECK-COUNT-2:            math.tanh
     # CHECK-COUNT-2:            arith.mulf
 
@@ -216,7 +216,7 @@ def test_causal_extend_attention():
     # CHECK-COUNT-8:            amdgpu.mfma
 
     # softcap/logitcap modifier:
-    # CHECK-COUNT-2:            arith.divf
+    # CHECK-COUNT-4:            arith.mulf
     # CHECK-COUNT-2:            math.tanh
     # CHECK-COUNT-2:            arith.mulf
 
@@ -301,7 +301,7 @@ def test_causal_extend_attention_32x32x8():
     # CHECK-COUNT-8:            amdgpu.mfma
 
     # softcap/logitcap modifier:
-    # CHECK-COUNT-1:            arith.divf
+    # CHECK-COUNT-2:            arith.mulf
     # CHECK-COUNT-1:            math.tanh
     # CHECK-COUNT-1:            arith.mulf
 
@@ -325,7 +325,7 @@ def test_causal_extend_attention_32x32x8():
     # CHECK-COUNT-8:            amdgpu.mfma
 
     # softcap/logitcap modifier:
-    # CHECK-COUNT-1:            arith.divf
+    # CHECK-COUNT-2:            arith.mulf
     # CHECK-COUNT-1:            math.tanh
     # CHECK-COUNT-1:            arith.mulf