Skip to content

Commit d8a3312

Browse files
authored
[Wave] Introduce softsign kernel to replace tanh_approx (iree-org#829)
Add a new `softsign` kernel variant that replaces the previous `tanh_approx` implementation. Benchmarks show a 10–15% speedup over `tanh_approx`, but with a modest accuracy drop (e.g., Grok accuracy falls from ~83% to ~79%). This commit provides the softsign implementation alongside the existing approximations so users can choose the appropriate speed/accuracy tradeoff.
1 parent a91324c commit d8a3312

File tree

4 files changed

+107
-1
lines changed

4 files changed

+107
-1
lines changed

iree/turbine/kernel/ops/wave_ops.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,15 @@ def abs(src: "Register") -> "Register":
149149
...
150150

151151

152+
def softsign(
153+
src: "Register",
154+
logit_cap: float = 30.0,
155+
apply_scaling: bool = False,
156+
head_dim: int = None,
157+
) -> "Register":
158+
...
159+
160+
152161
def tanh_approx(src: "Register") -> "Register":
153162
...
154163

@@ -900,6 +909,23 @@ def infer_type(self):
900909
self.type = src_type
901910

902911

912+
@define_interface_op("softsign")
913+
@dataclass
914+
class SoftsignOp(CustomOp, ABC):
915+
arg: fx.Node
916+
logit_cap: float = 30.0
917+
apply_scaling: bool = False
918+
head_dim: int = None
919+
920+
@property
921+
def indexing_dims(self) -> list[IndexSymbol]:
922+
return get_custom(self.arg).indexing_dims
923+
924+
def infer_type(self):
925+
src_type = get_custom(self.arg).type
926+
self.type = src_type
927+
928+
903929
@define_op("select")
904930
@dataclass
905931
class SelectOp(CustomOp):

iree/turbine/kernel/wave/codegen/handlers.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
tanh,
8686
tanh_approx,
8787
workgroup_barrier,
88+
softsign,
8889
)
8990
from ...compiler.base import CodegenError, ValidationError, NDEBUG
9091
from ...compiler.builder import IRProxyValue
@@ -842,6 +843,61 @@ def handle_tanh_approx(source: Value, options: WaveCompileOptions) -> OpResult:
842843
return result
843844

844845

846+
@handle_op(softsign)
847+
def handle_softsign(emitter: WaveEmitter, node: fx.Node) -> None:
848+
"""
849+
Implements softsign-like logit cap using reciprocal:
850+
logit = logit / (1 + abs(logit / cap))
851+
= logit * (1 / (1 + abs(logit * (1 / cap))))
852+
= logit * reciprocal(1 + abs(logit * reciprocal(cap)))
853+
"""
854+
try:
855+
src_arg, logit_cap, apply_scaling, head_dim = node.args
856+
except ValueError as e:
857+
raise ValidationError("Malformed arguments for softsign") from e
858+
859+
source = cast_py_value(emitter, src_arg).ir_value
860+
element_type = get_type_or_element_type(source.type)
861+
opts = emitter.options
862+
863+
# Compute effective cap
864+
if apply_scaling:
865+
if head_dim is None:
866+
raise ValidationError("`head_dim` must be provided if `apply_scaling=True`")
867+
eff_cap = logit_cap * (head_dim**-0.5)
868+
else:
869+
eff_cap = logit_cap
870+
871+
# # Constants
872+
873+
reci_cap = 1.0 / eff_cap
874+
reci_cap_const = get_constant_attr(reci_cap, element_type)
875+
one = arith_d.ConstantOp(
876+
source.type,
877+
DenseElementsAttr.get_splat(source.type, get_constant_attr(1.0, element_type)),
878+
)
879+
reciprocal_cap = arith_d.ConstantOp(
880+
source.type, DenseElementsAttr.get_splat(source.type, reci_cap_const)
881+
)
882+
883+
# scaled = logit * (1 / cap)
884+
scaled = arith_d.mulf(source, reciprocal_cap, fastmath=get_fast_math_flags(opts))
885+
886+
# abs_scaled = abs(logit * (1 / cap))
887+
abs_scaled = math_d.absf(scaled)
888+
889+
# denom = 1 + abs(...)
890+
denom = arith_d.addf(one, abs_scaled, fastmath=get_fast_math_flags(opts))
891+
892+
# reciprocal_denom = 1 / denom
893+
reciprocal_denom = arith_d.divf(one, denom, fastmath=get_fast_math_flags(opts))
894+
895+
# result = logit * (1 / denom)
896+
result = arith_d.mulf(source, reciprocal_denom, fastmath=get_fast_math_flags(opts))
897+
898+
emitter.bind_node_proxy(node, IRProxyValue(result))
899+
900+
845901
@handle_unary_op(tanh)
846902
def handle_tanh(source: Value, options: WaveCompileOptions) -> OpResult:
847903
element_type = get_type_or_element_type(source.type)

iree/turbine/kernel/wave/templates/extend_attention.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,13 @@ def first_loop(
280280
if logit_cap > 0:
281281
logit_cap_reg_inv = tkw.reciprocal(logit_cap_reg)
282282
x_j = logit_cap_reg * tkw.tanh_approx(x_j * logit_cap_reg_inv)
283+
# We could use tkw.softsign to provide ~10% performance improvement, but this will compromise accuracy.
284+
# x_j = logit_cap_reg * tkw.softsign(
285+
# x_j * logit_cap_reg_inv,
286+
# logit_cap=30.0,
287+
# apply_scaling=True,
288+
# head_dim=128,
289+
# )
283290
n_kv_index = tkw.self_index(N_KV, tkl.i32)
284291
mask = tkw.apply_expr(n_kv_index, lambda x: x < N_KV)
285292
mask = tkw.broadcast(mask, target_shape=[N_Q, N_KV])
@@ -343,6 +350,13 @@ def second_loop(
343350
if logit_cap > 0:
344351
logit_cap_reg_inv = tkw.reciprocal(logit_cap_reg)
345352
x_j = logit_cap_reg * tkw.tanh_approx(x_j * logit_cap_reg_inv)
353+
# We could use tkw.softsign to provide ~10% performance improvement, but this will compromise accuracy.
354+
# x_j = logit_cap_reg * tkw.softsign(
355+
# x_j * logit_cap_reg_inv,
356+
# logit_cap=30.0,
357+
# apply_scaling=True,
358+
# head_dim=128,
359+
# )
346360
n_kv_index = tkw.self_index(N_KV, tkl.i32)
347361
mask = tkw.apply_expr(n_kv_index, lambda x: x < N_KV)
348362
mask = tkw.broadcast(mask, target_shape=[N_Q, N_KV])

lit_tests/kernel/wave/codegen.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -893,6 +893,7 @@ def test(
893893
res_b = tkw.abs(b_reg)
894894
res = tkw.tanh(res)
895895
res = tkw.tanh_approx(res)
896+
res = tkw.softsign(res, logit_cap=30.0, apply_scaling=True, head_dim=128)
896897
res = tkw.roundeven(res)
897898
tkw.write(res, a, elements_per_thread=4)
898899
tkw.write(res_b, b, elements_per_thread=4)
@@ -931,8 +932,17 @@ def test(
931932
# CHECK: %[[R:.+]] = arith.addf %[[TEMP]], %[[RECIP]] : vector<4xf16>
932933
# CHECK: %[[TANH_APPROX:.+]] = math.copysign %[[R]], %[[TANH]] : vector<4xf16>
933934

935+
# Tests softsign
936+
# CHECK: %[[ONE:.+]] = arith.constant dense<1.000000e+00> : vector<4xf16>
937+
# CHECK: %[[CAP:.+]] = arith.constant dense<3.771970e-01> : vector<4xf16>
938+
# CHECK: %[[SCALED:.+]] = arith.mulf %[[TANH_APPROX]], %[[CAP]] : vector<4xf16>
939+
# CHECK: %[[ABS2:.+]] = math.absf %[[SCALED]] : vector<4xf16>
940+
# CHECK: %[[ADD:.+]] = arith.addf %[[ONE]], %[[ABS2]] : vector<4xf16>
941+
# CHECK: %[[RECIP_DENOM:.+]] = arith.divf %[[ONE]], %[[ADD]] : vector<4xf16>
942+
# CHECK: %[[SOFTSIGN:.+]] = arith.mulf %[[TANH_APPROX]], %[[RECIP_DENOM]] : vector<4xf16>
943+
934944
# Tests roundeven
935-
# CHECK: %[[ROUNDEVEN:.+]] = math.roundeven %[[TANH_APPROX]]
945+
# CHECK: %[[ROUNDEVEN:.+]] = math.roundeven %[[SOFTSIGN]]
936946

937947

938948
# Important to check lowering of scheduling/barrier ops.

0 commit comments

Comments
 (0)