
Commit f10d3eb

apaszke authored and Google-ML-Automation committed
[Mosaic GPU] Allow contracting ops into FMAs
Using FMAs can significantly increase ALU throughput, and contraction can only increase precision. We use this capability to reduce the number of operations needed to evaluate the softmax part of attention.

PiperOrigin-RevId: 701226007
1 parent ab79066 commit f10d3eb
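For context on the commit message: the softmax rewrite relies on the identity exp(x) = 2**(x * log2(e)). Once the multiplication by log2(e) is explicit, it sits right next to the running-max subtraction, and the contraction flags added below let the backend fuse that pair into a single FMA. A standalone NumPy sketch of the identity (illustrative only, not code from the commit):

```python
# exp(x) == 2**(x * log2(e)) -- the identity behind switching the kernel
# from jnp.exp to jnp.exp2 plus one scaling multiply.
import math
import numpy as np

x = np.linspace(-5, 5, 11, dtype=np.float32)
log2e = np.float32(math.log2(math.e))
np.testing.assert_allclose(np.exp(x), np.exp2(x * log2e), rtol=1e-5)
```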

File tree

3 files changed (+51 -18 lines)

jax/_src/pallas/mosaic_gpu/lowering.py

Lines changed: 8 additions & 1 deletion
@@ -1199,6 +1199,13 @@ def _exp_lowering_rule(ctx: LoweringRuleContext, x):
   return a.exp(approx=ctx.module_ctx.approx_math)
 
 
+@register_lowering_rule(lax.exp2_p)
+def _exp2_lowering_rule(ctx: LoweringRuleContext, x):
+  [x_aval] = ctx.avals_in
+  a = _ensure_fa(x, x_aval.dtype)
+  return a.exp2(approx=ctx.module_ctx.approx_math)
+
+
 @register_lowering_rule(lax.reduce_sum_p)
 def _reduce_sum_lowering_rule(ctx: LoweringRuleContext, x, *, axes):
   [x_aval] = ctx.avals_in
@@ -1216,7 +1223,7 @@ def _reduce_sum_lowering_rule(ctx: LoweringRuleContext, x, *, axes):
         raise NotImplementedError
       if not jnp.issubdtype(x_aval.dtype, jnp.floating):
         raise NotImplementedError
-      return x.reduce(arith_dialect.addf, axes[0])
+      return x.reduce("add", axes[0])
     case _:
       raise NotImplementedError(f"Unsupported layout {x.layout}")

jax/experimental/mosaic/gpu/fragmented_array.py

Lines changed: 36 additions & 13 deletions
@@ -700,7 +700,7 @@ def __neg__(self):
 
   def __add__(self, other):
     if ir.FloatType.isinstance(self.mlir_dtype):
-      return self._pointwise(arith.addf, other)
+      return self._pointwise(addf, other)
     elif ir.IntegerType.isinstance(self.mlir_dtype):
       return self._pointwise(arith.addi, other)
     else:
@@ -711,7 +711,7 @@ def __radd__(self, other):
 
   def __mul__(self, other):
     if ir.FloatType.isinstance(self.mlir_dtype):
-      return self._pointwise(arith.mulf, other)
+      return self._pointwise(mulf, other)
     elif ir.IntegerType.isinstance(self.mlir_dtype):
       return self._pointwise(arith.muli, other)
     else:
@@ -722,15 +722,15 @@ def __rmul__(self, other):
 
   def __sub__(self, other):
     if ir.FloatType.isinstance(self.mlir_dtype):
-      return self._pointwise(arith.subf, other)
+      return self._pointwise(subf, other)
    elif ir.IntegerType.isinstance(self.mlir_dtype):
       return self._pointwise(arith.subi, other)
     else:
       return NotImplemented
 
   def __rsub__(self, other):
     if ir.FloatType.isinstance(self.mlir_dtype):
-      return self._pointwise(lambda s, o: arith.subf(o, s), other)
+      return self._pointwise(lambda s, o: subf(o, s), other)
     elif ir.IntegerType.isinstance(self.mlir_dtype):
       return self._pointwise(lambda s, o: arith.subi(o, s), other)
     else:
@@ -904,16 +904,20 @@ def exp(self, *, approx: bool = False):
     if not ir.FloatType.isinstance(self.mlir_dtype):
       raise NotImplementedError
     if approx:
-      f32 = ir.F32Type.get()
-      if self.mlir_dtype != f32:
-        raise NotImplementedError
-      log2e = arith.constant(f32, ir.FloatAttr.get(f32, 1.4426950408889634))
-      def fast_exp(x):
-        scaled = arith.mulf(x, log2e)
-        return llvm.inline_asm(f32, [scaled], "ex2.approx.ftz.f32 $0, $1;", "=f,f")
-      return self._pointwise(self._lift_fast_instr(fast_exp))
+      dtype = self.mlir_dtype
+      log2e = arith.constant(dtype, ir.FloatAttr.get(dtype, 1.4426950408889634))
+      return (self * log2e).exp2(approx=approx)
     return self._pointwise(mlir_math.exp)
 
+  def exp2(self, *, approx: bool = False):
+    if not ir.FloatType.isinstance(self.mlir_dtype):
+      raise NotImplementedError
+    if approx:
+      if not ir.F32Type.isinstance(self.mlir_dtype):
+        raise NotImplementedError(self.mlir_dtype)
+      return self._pointwise(self._lift_fast_instr("ex2.approx.ftz.f32"))
+    return self._pointwise(mlir_math.exp2)
+
   def sin(self, *, approx: bool = False):
     if not ir.FloatType.isinstance(self.mlir_dtype):
       raise NotImplementedError
@@ -1125,7 +1129,7 @@ def upcast_to_bf16(reg, high):
   # NOTE: scratch can be reused immediately once this function returns.
   def reduce_sum(self, scratch) -> ir.Value:
     if ir.FloatType.isinstance(self.mlir_dtype):
-      op = arith.addf
+      op = addf
     elif ir.IntegerType.isinstance(self.mlir_dtype):
       op = arith.addi
     else:
@@ -1167,6 +1171,13 @@ def reduce_sum(self, scratch) -> ir.Value:
   def reduce(self, op: str | Callable[[ir.Value, ir.Value], ir.Value], axis):
     if isinstance(op, str):
       match op:
+        case "add":
+          if ir.FloatType.isinstance(self.mlir_dtype):
+            op = addf
+          elif ir.IntegerType.isinstance(self.mlir_dtype):
+            op = arith.addi
+          else:
+            raise NotImplementedError(self.mlir_dtype)
         case "max":
           if ir.F32Type.isinstance(self.mlir_dtype):
             op = self._lift_fast_instr("max.NaN.f32")
@@ -1653,3 +1664,15 @@ def tree_unflatten(cls, aux, flat_registers):
     layout, reg_shape, is_signed = aux
     registers = np.asarray(flat_registers, dtype=object).reshape(reg_shape)
     return cls(_registers=registers, _layout=layout, _is_signed=is_signed)
+
+
+# We allow contractions, to potentially take advantage of FMA instructions.
+# They can change the results, but the precision should only increase.
+def addf(a: ir.Value, b: ir.Value):
+  return arith.addf(a, b, fastmath=arith.FastMathFlags.contract)
+
+def subf(a: ir.Value, b: ir.Value):
+  return arith.subf(a, b, fastmath=arith.FastMathFlags.contract)
+
+def mulf(a: ir.Value, b: ir.Value):
+  return arith.mulf(a, b, fastmath=arith.FastMathFlags.contract)
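The new `addf`/`subf`/`mulf` helpers are the crux of the change: `FastMathFlags.contract` tells the compiler it may fuse a neighbouring multiply and add/subtract into one FMA, which rounds once instead of twice, hence "the precision should only increase". A standalone NumPy illustration of the single- vs. double-rounding difference (an explanatory sketch of the reasoning, not code from the commit):

```python
# Emulate mul-then-add (two roundings) vs. a fused multiply-add (one rounding)
# in float32. The fused form keeps low-order bits that the separate ops lose.
import numpy as np

a = np.float32(1.0000001)   # rounds to 1 + 2**-23
b = np.float32(1.0000001)
c = np.float32(-1.0000002)  # rounds to -(1 + 2**-22)

separate = a * b + c  # a*b is rounded to float32 before the add -> exactly 0.0
# One rounding at the end, emulated by doing the arithmetic exactly in float64:
fused = np.float32(np.float64(a) * np.float64(b) + np.float64(c))  # ~1.42e-14
print(separate, fused)
```

Without the `contract` flag the compiler must preserve the two separate roundings; with it, the backend is free to emit a single FMA instruction, which is both faster and at least as accurate.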

jax/experimental/pallas/ops/gpu/attention_mgpu.py

Lines changed: 7 additions & 4 deletions
@@ -16,6 +16,7 @@
 import dataclasses
 import functools
 import itertools
+import math
 import jax
 from jax import lax
 from jax._src import test_util as jtu  # noqa: F401
@@ -118,11 +119,13 @@ def compute_qk(acc_ref):
           plgpu.barrier_arrive(k_consumed_barrier)
 
         # Softmax
-        m_ij = jnp.maximum(m_i, qk.max(axis=1))
-        alpha = jnp.exp(m_i - m_ij)
+        # We keep m scaled by log2e to use FMA instructions when computing p.
+        log2e = math.log2(math.e)
+        m_ij = jnp.maximum(m_i, qk.max(axis=1) * log2e)
+        alpha = jnp.exp2(m_i - m_ij)
         m_i = m_ij
-        p = jnp.exp(qk - lax.broadcast_in_dim(m_ij, (block_q, block_kv), [0]))
-        acc *= lax.broadcast_in_dim(alpha, (block_q, head_dim), [0])
+        p = jnp.exp2(qk * log2e - lax.broadcast_in_dim(m_ij, qk.shape, [0]))
+        acc *= lax.broadcast_in_dim(alpha, acc.shape, [0])
         l_i *= alpha
         p16 = p.astype(dtype)
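The new comment ("We keep m scaled by log2e...") captures why this helps: with the running max stored pre-scaled by log2(e), computing `p` needs `qk * log2e` followed by a subtraction, exactly the multiply-plus-subtract shape that the contraction flags in `fragmented_array.py` let the backend fuse into one FMA per element, and the exponential itself becomes a plain `exp2`. A standalone NumPy check of the algebra (not part of the kernel; shapes and values are arbitrary):

```python
# exp(qk - m) == exp2(qk * log2(e) - m * log2(e)): the scaled-max formulation
# used by the kernel produces the same softmax numerator.
import math
import numpy as np

rng = np.random.default_rng(0)
qk = rng.normal(size=(4, 8)).astype(np.float32)   # stand-in for a QK tile
m = qk.max(axis=1, keepdims=True)                 # unscaled running row max
log2e = math.log2(math.e)

p_ref = np.exp(qk - m)                            # original formulation
p_fma = np.exp2(qk * log2e - m * log2e)           # log2e-scaled formulation
np.testing.assert_allclose(p_ref, p_fma, rtol=1e-5)
```

In approximate mode, `exp` already lowers to `ex2.approx` with a log2(e) scaling multiply; by keeping `m` pre-scaled, that multiply merges with the max subtraction into a single FMA instead of remaining a separate instruction.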
