
Commit 462de12

Support scaled_dot with rhs scale (#5107)
This enables support for scaled_dot with an rhs scale. Only one scale is still supported at a time, on either lhs or rhs. For simplicity, for MMAv2 we just transpose the operands in this case and sink the transpose op on the destination. This prepares us for MMAv3 support, where the scales should always be on lhs.
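
For context, a minimal sketch of what this unlocks at the Triton language level, modeled on the updated test below: the lhs passes None for its scale and only the rhs carries an e8m0 scale tensor. The kernel name, pointer layout, and row-major strides here are illustrative assumptions, not part of this commit.

import triton
import triton.language as tl

@triton.jit
def scaled_dot_rhs_kernel(a_ptr, b_ptr, b_scale_ptr, out_ptr,
                          BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    PACKED_K: tl.constexpr = BLOCK_K // 2   # e2m1 packs two values per byte along K
    SCALE_K: tl.constexpr = BLOCK_K // 32   # one e8m0 scale per 32 elements along K
    a = tl.load(a_ptr + tl.arange(0, BLOCK_M)[:, None] * BLOCK_K + tl.arange(0, BLOCK_K)[None, :])
    b = tl.load(b_ptr + tl.arange(0, PACKED_K)[:, None] * BLOCK_N + tl.arange(0, BLOCK_N)[None, :])
    b_scale = tl.load(b_scale_ptr + tl.arange(0, BLOCK_N)[:, None] * SCALE_K + tl.arange(0, SCALE_K)[None, :])
    # No lhs scale: only the rhs is a block-scaled (mx) operand.
    c = tl.dot_scaled(a, None, "e4m3", b, b_scale, "e2m1")
    out_ptrs = out_ptr + tl.arange(0, BLOCK_M)[:, None] * BLOCK_N + tl.arange(0, BLOCK_N)[None, :]
    tl.store(out_ptrs, c.to(tl.bfloat16))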
1 parent f9a4bbb commit 462de12

File tree

6 files changed: +263, -56 lines changed

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 6 additions & 4 deletions
@@ -86,7 +86,8 @@ def TT_BitcastOp : TT_Op<"bitcast", [Elementwise,
   // TODO: Add verifier
 }
 
-def TT_FpToFpOp : TT_Op<"fp_to_fp", [SameOperandsAndResultShape,
+def TT_FpToFpOp : TT_Op<"fp_to_fp", [Elementwise,
+                                     SameOperandsAndResultShape,
                                      SameOperandsAndResultEncoding,
                                      Pure,
                                      /*DeclareOpInterfaceMethods<CastOpInterface>*/]> {
@@ -675,6 +676,7 @@ def TT_DotOp : TT_Op<"dot", [Pure,
 // DotScaled Op
 //
 def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
+                                          AttrSizedOperandSegments,
                                           DotLike,
                                           TypesMatchWith<"result's type matches accumulator's type",
                                                          "d", "c", "$_self">]> {
@@ -692,7 +694,7 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
     RankedTensorOf<[TT_Float,I8]>:$lhs,
     RankedTensorOf<[TT_Float,I8]>:$rhs,
     TT_FloatTensor:$c,
-    RankedTensorOf<[I8]>:$lhs_scale,
+    Optional<RankedTensorOf<[I8]>>:$lhs_scale,
     Optional<RankedTensorOf<[I8]>>:$rhs_scale,
     TT_ScaleDotElemTypeAttr:$lhs_type,
     TT_ScaleDotElemTypeAttr:$rhs_type
@@ -702,8 +704,8 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
   // Not sure why I need to fully specify the optional group, but otherwise it complains when loading the mlir file
   let assemblyFormat = [{
-    $lhs `,` $lhs_scale `,` $rhs (`,`) : (`,` $rhs_scale^ `,`)? $c `lhs` `=` $lhs_type `rhs` `=` $rhs_type attr-dict
-    `:` type($lhs) `,` type($lhs_scale) `*` type($rhs) (`,` type($rhs_scale)^)? `->` type($d)
+    $lhs (`scale` $lhs_scale^)? `,` $rhs (`scale` $rhs_scale^)? `,` $c `lhs` `=` $lhs_type `rhs` `=` $rhs_type attr-dict
+    `:` type($lhs) (`,` type($lhs_scale)^)? `*` type($rhs) (`,` type($rhs_scale)^)? `->` type($d)
   }];
 }

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 142 additions & 0 deletions
@@ -567,6 +567,147 @@ class DecomposeScaledBlocked
   }
 };
 
+static void updateValueType(Value v, Attribute encoding,
+                            ArrayRef<int64_t> shape) {
+  auto tensorType = cast<RankedTensorType>(v.getType());
+  auto newType =
+      RankedTensorType::get(shape, tensorType.getElementType(), encoding);
+  v.setType(newType);
+}
+
+static TransOp updateUsers(Value result, const SetVector<Operation *> &slice) {
+  TransOp transOp;
+  if (llvm::any_of(result.getUsers(),
+                   [&](Operation *user) { return slice.count(user) == 0; })) {
+    OpBuilder builder(result.getContext());
+    builder.setInsertionPointAfterValue(result);
+    transOp =
+        builder.create<TransOp>(result.getLoc(), result, ArrayRef({1, 0}));
+    result.replaceUsesWithIf(transOp.getResult(), [&](OpOperand &operand) {
+      return operand.getOwner() != transOp.getOperation() &&
+             slice.count(operand.getOwner()) == 0;
+    });
+  }
+  return transOp;
+}
+
+// Sink the transpose in the IR. This is done to avoid generating a convert
+// layout when we have a transpose right after a dot, as the mma layout cannot
+// be propagated through a transpose op. Once we have layouts that can
+// represent a transposed MMA we can remove this transformation.
+static void sinkTransposeOp(TransOp input) {
+  SmallVector<TransOp> queue = {input};
+  while (!queue.empty()) {
+    TransOp transOp = queue.back();
+    Value currentValue = transOp.getResult();
+    queue.pop_back();
+    mlir::ForwardSliceOptions options;
+    options.filter = [](Operation *op) {
+      if (op->hasTrait<OpTrait::Elementwise>() && op->getNumOperands() == 1)
+        return true;
+      if (isa<scf::YieldOp>(op))
+        return isa<scf::ForOp>(op->getParentOp());
+      if (isa<ConvertLayoutOp>(op))
+        return true;
+      return false;
+    };
+    SetVector<Operation *> slice;
+    mlir::getForwardSlice(currentValue, &slice, options);
+    for (Operation *op : slice) {
+      if (op->hasTrait<OpTrait::Elementwise>()) {
+        // Update users of transpose op.
+        if (op->getOperand(0) == transOp.getResult())
+          op->setOperand(0, transOp.getOperand());
+        // Update the type of the result.
+        for (Value result : op->getResults()) {
+          auto srcType = cast<RankedTensorType>(op->getOperand(0).getType());
+          updateValueType(result, srcType.getEncoding(), srcType.getShape());
+          updateUsers(result, slice);
+        }
+        continue;
+      }
+      if (auto cvtOp = dyn_cast<ConvertLayoutOp>(op)) {
+        // Update users of transpose op.
+        if (op->getOperand(0) == transOp.getResult())
+          op->setOperand(0, transOp.getOperand());
+        auto resultEncoding = cvtOp.getType().getEncoding();
+        auto newDstEncoding = inferSrcEncoding(transOp, resultEncoding);
+        auto srcType = cast<RankedTensorType>(cvtOp.getOperand().getType());
+        updateValueType(cvtOp.getResult(), *newDstEncoding, srcType.getShape());
+        updateUsers(cvtOp.getResult(), slice);
+        continue;
+      }
+      assert(isa<scf::YieldOp>(op));
+      auto forOp = dyn_cast<scf::ForOp>(op->getParentOp());
+      assert(forOp);
+      for (OpOperand &operand : op->getOpOperands()) {
+        Operation *def = operand.get().getDefiningOp();
+        if (def && (slice.count(def)) || def == transOp.getOperation()) {
+          if (def == transOp.getOperation())
+            operand.set(transOp.getOperand());
+          Type newType = operand.get().getType();
+          forOp.getResult(operand.getOperandNumber()).setType(newType);
+          TransOp retTrans =
+              updateUsers(forOp.getResult(operand.getOperandNumber()), slice);
+          // Recursively try to propagate the new transpose inserted.
+          if (retTrans)
+            queue.push_back(retTrans);
+          forOp.getRegionIterArg(operand.getOperandNumber()).setType(newType);
+          TransOp argTrans = updateUsers(
+              forOp.getRegionIterArg(operand.getOperandNumber()), slice);
+          if (argTrans)
+            queue.push_back(argTrans);
+          OpBuilder builder(forOp);
+          OpOperand &init = forOp.getInitsMutable()[operand.getOperandNumber()];
+          Value initTranspose = builder.create<TransOp>(
+              forOp.getLoc(), init.get(), ArrayRef({1, 0}));
+          init.set(initTranspose);
+        }
+      }
+    }
+  }
+}
+
+// Transpose scaled_dot ops that have their scale on rhs so that the scale ends
+// up on lhs.
+static Operation *transposeDotOp(DotScaledOp dotOp) {
+  OpBuilder builder(dotOp);
+  Value lhs = dotOp.getLhs();
+  std::array<int, 2> transOrder = {1, 0};
+  Value lhsTransposed = builder.create<TransOp>(lhs.getLoc(), lhs, transOrder);
+  Value rhs = dotOp.getRhs();
+  Value rhsTransposed = builder.create<TransOp>(rhs.getLoc(), rhs, transOrder);
+  Value c = dotOp.getC();
+  Value cTransposed = builder.create<TransOp>(c.getLoc(), c, transOrder);
+  Value result = builder.create<DotScaledOp>(
+      dotOp.getLoc(), cTransposed.getType(), rhsTransposed, lhsTransposed,
+      cTransposed, dotOp.getRhsScale(), dotOp.getLhsScale(), dotOp.getRhsType(),
+      dotOp.getLhsType());
+  Operation *transposedResult =
+      builder.create<TransOp>(result.getLoc(), result, transOrder);
+  dotOp.replaceAllUsesWith(transposedResult);
+  dotOp.erase();
+  return transposedResult;
+}
+
+static void transposeDots(ModuleOp m) {
+  // TODO: extend to regular dot when it is profitable. For instance when we
+  // may want to use rhs from register for mmav3.
+  SmallVector<DotScaledOp> toTranspose;
+  m.walk([&](DotScaledOp dotOp) -> void {
+    if (dotOp.getLhsScale() == nullptr && dotOp.getRhsScale() != nullptr)
+      toTranspose.push_back(dotOp);
+  });
+  SmallVector<Operation *> transposes;
+  for (DotScaledOp dotOp : toTranspose) {
+    Operation *transpose = transposeDotOp(dotOp);
+    transposes.push_back(transpose);
+  }
+
+  for (Operation *transpose : transposes) {
+    sinkTransposeOp(cast<TransOp>(transpose));
+  }
+}
+
 #define GEN_PASS_DEF_TRITONGPUACCELERATEMATMUL
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
 
@@ -582,6 +723,7 @@ class TritonGPUAccelerateMatmulPass
     ModuleOp m = getOperation();
 
     auto computeCapability = getNVIDIAComputeCapability(m);
+    transposeDots(m);
 
     mlir::RewritePatternSet patterns(context);
     patterns.add<BlockedToMMA, DecomposeScaledBlocked>(context,
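
The rewrite above relies on the standard transpose identity for matrix products: a dot whose scale sits on rhs is replaced by a dot on the swapped, transposed operands (so the scaled operand becomes the lhs), followed by a transpose of the result, which sinkTransposeOp then pushes past elementwise ops, convert_layouts, and loop yields. A small torch sketch of just the algebraic equivalence, with plain float tensors standing in for the scaled operands; it illustrates the shape/ordering argument only, not the mx decoding:

import torch

M, N, K = 32, 64, 128
a = torch.randn(M, K, dtype=torch.float64)
b = torch.randn(K, N, dtype=torch.float64)   # imagine the scale attached to this operand

c_direct = a @ b                 # scaled_dot(a, b) with the scale on rhs
c_rewritten = (b.mT @ a.mT).mT   # scaled_dot(b^T, a^T) with the scale on lhs, then transpose

torch.testing.assert_close(c_direct, c_rewritten)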

python/src/ir.cc

Lines changed: 3 additions & 2 deletions
@@ -1481,12 +1481,13 @@ void init_triton_ir(py::module &&m) {
                                  maxNumImpreciseAcc);
           })
      .def("create_dot_scaled",
-          [](TritonOpBuilder &self, mlir::Value &lhs, mlir::Value &lhs_scale,
+          [](TritonOpBuilder &self, mlir::Value &lhs,
+             std::optional<mlir::Value> &lhs_scale,
              ScaleDotElemType lhs_format, mlir::Value &rhs,
              std::optional<mlir::Value> &rhs_scale,
              ScaleDotElemType rhs_format, mlir::Value &c) -> mlir::Value {
             return self.create<DotScaledOp>(
-                c.getType(), lhs, rhs, c, lhs_scale,
+                c.getType(), lhs, rhs, c, lhs_scale.value_or(Value()),
                 rhs_scale.value_or(Value()), lhs_format, rhs_format);
           })
      .def("create_floor",

python/test/unit/language/test_core.py

Lines changed: 65 additions & 42 deletions
@@ -3367,48 +3367,55 @@ def kernel(X, stride_xm, stride_xk, Y, stride_yk, stride_yn, W, stride_wn, strid
     assert 'wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3' in ptx
 
 
-@pytest.mark.parametrize("M, N, K, col_a, col_b, type_a, type_b, num_warps, mma, kpack",
-                         [(M, N, K, col_a, col_b, type_a, type_b, 4, mma, kpack)
+@pytest.mark.parametrize("M, N, K, col_a, col_b, rhs_scale, normal_type, mxfp_type, num_warps, mma, kpack",
+                         [(M, N, K, col_a, col_b, rhs_scale, normal_type, mxfp_type, 4, mma, kpack)
                           for M, N, K in itertools.product([32, 64, 128], [32, 64, 128], [64, 128])
                           for col_a, col_b in itertools.product([True, False], repeat=2)
-                          for type_a in ["e2m1", "e4m3", "e5m2"]
-                          for type_b in ["e4m3", "e5m2", "bf16"]
+                          for rhs_scale in [False, True]
+                          for normal_type in ["e2m1", "e4m3", "e5m2"]
+                          for mxfp_type in ["e4m3", "e5m2", "bf16"]
                           for mma in ([32, 16] if is_hip() else [16])
                           for kpack in ([1, 2] if is_hip() else [1])])
-def test_scaled_dot(M, N, K, col_a, col_b, type_a, type_b, num_warps, mma, kpack, device):
+def test_scaled_dot(M, N, K, col_a, col_b, rhs_scale, normal_type, mxfp_type, num_warps, mma, kpack, device):
     if is_cuda():
         cc = torch.cuda.get_device_capability()
         if cc < (8, 9):
             pytest.skip("float8e4nv not supported on CUDA < 8.9")
     if is_hip():
+        if rhs_scale:
+            pytest.skip("scales on rhs not yet supported for HIP")
         if not is_hip_cdna():
             pytest.skip("scaled_dot only implemented for HIP CDNA")
-        if "e4m3" in (type_a, type_b) and not is_hip_mi300():
-            pytest.skip(f"scaled_dot({type_a}, {type_b}) only implemented for MI300")
+        if "e4m3" in (normal_type, mxfp_type) and not is_hip_mi300():
+            pytest.skip(f"scaled_dot({normal_type}, {mxfp_type}) only implemented for MI300")
         if mma == 16 and K == 64:
             pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot")
 
     @triton.jit
-    def dot_scale_kernel(a_base, stride_a0, stride_a1, a_scale, b_base, stride_b0, stride_b1, out,
+    def dot_scale_kernel(a_base, stride_a0, stride_a1, a_scale, b_base, stride_b0, stride_b1, b_scale, out,
                          BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, type_a: tl.constexpr,
                          type_b: tl.constexpr):
-        tl.static_assert((type_b == "e4m3" or type_b == "e5m2") or type_b == "bf16", "type_b must be fp8 or bf16")
-        IS_FP8: tl.constexpr = type_a == "e4m3" or type_a == "e5m2"
-        DIV_FACTOR: tl.constexpr = 1 if IS_FP8 else 2
-        PACKED_BLOCK_K_A: tl.constexpr = BLOCK_K // DIV_FACTOR
-        PACKED_BLOCK_K_B: tl.constexpr = BLOCK_K
+        DIV_FACTOR_A: tl.constexpr = 2 if type_a == "e2m1" else 1
+        DIV_FACTOR_B: tl.constexpr = 2 if type_b == "e2m1" else 1
+        PACKED_BLOCK_K_A: tl.constexpr = BLOCK_K // DIV_FACTOR_A
+        PACKED_BLOCK_K_B: tl.constexpr = BLOCK_K // DIV_FACTOR_B
         a_ptr = a_base + tl.arange(0, BLOCK_M)[:, None] * stride_a0 + tl.arange(0,
                                                                                 PACKED_BLOCK_K_A)[None, :] * stride_a1
         b_ptr = b_base + tl.arange(0, PACKED_BLOCK_K_B)[:, None] * stride_b0 + tl.arange(0,
                                                                                          BLOCK_N)[None, :] * stride_b1
 
-        SCALE_BLOCK_K: tl.constexpr = BLOCK_K // 32
-        scale_a_ptr = a_scale + tl.arange(0, BLOCK_M)[:, None] * SCALE_BLOCK_K + tl.arange(0, SCALE_BLOCK_K)[None, :]
-
         a = tl.load(a_ptr)
         b = tl.load(b_ptr)
-        a_scale = tl.load(scale_a_ptr)
-        c = tl.dot_scaled(a, a_scale, type_a, b, None, type_b)
+        SCALE_BLOCK_K: tl.constexpr = BLOCK_K // 32
+        if a_scale is not None:
+            scale_a_ptr = a_scale + tl.arange(0, BLOCK_M)[:, None] * SCALE_BLOCK_K + tl.arange(0,
+                                                                                               SCALE_BLOCK_K)[None, :]
+            a_scale = tl.load(scale_a_ptr)
+        if b_scale is not None:
+            scale_b_ptr = b_scale + tl.arange(0, BLOCK_N)[:, None] * SCALE_BLOCK_K + tl.arange(0,
+                                                                                               SCALE_BLOCK_K)[None, :]
+            b_scale = tl.load(scale_b_ptr)
+        c = tl.dot_scaled(a, a_scale, type_a, b, b_scale, type_b)
         out_ptr = out + tl.arange(0, BLOCK_M)[:, None] * BLOCK_N + tl.arange(0, BLOCK_N)[None, :]
         tl.store(out_ptr, c.to(tl.bfloat16))
 
@@ -3481,22 +3488,31 @@ def mxfp_to_bf16_kernel(
         offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
         tl.store(mxfp_ptr + offsets, tl.ravel(mxfp), mask=offsets < N * 32)
 
-    def dot_scale_ref(x, scale, y, type_x, type_y):
-        e_bits, m_bits = {"e2m1": (2, 1), "e4m3": (4, 3), "e5m2": (5, 2)}[type_x]
-        type_y = {"e4m3": torch.float8_e4m3fn, "e5m2": torch.float8_e5m2, "bf16": torch.bfloat16}[type_y]
-
-        comp_dtype = torch.bfloat16
-
-        x = x.contiguous()
-        x_upcast = x.new_empty(scale.shape[:-1] + (32 * scale.shape[-1], ), dtype=comp_dtype)
-
-        N = x_upcast.numel()
-        BLOCK_SIZE = 512
-        grid = ((N + BLOCK_SIZE - 1) // BLOCK_SIZE, )
-        mxfp_to_bf16_kernel[grid](x, scale, x_upcast, scale.numel(), e_bits, m_bits, BLOCK_SIZE, num_warps=num_warps)
-        assert x_upcast.isfinite().all()
-
-        y_upcast = y.view(type_y).to(comp_dtype)
+    def dot_scale_ref(x, scale_x, y, scale_y, type_x, type_y):
+
+        def upcast(v, scale, type, transposed):
+            comp_dtype = torch.bfloat16
+            if scale is None:
+                type = {"e4m3": torch.float8_e4m3fn, "e5m2": torch.float8_e5m2, "bf16": torch.bfloat16}[type]
+                return v.view(type).to(comp_dtype)
+            e_bits, m_bits = {"e2m1": (2, 1), "e4m3": (4, 3), "e5m2": (5, 2)}[type]
+            # Packing is always on the K dimension, so we transpose before upcasting, then transpose back.
+            if transposed:
+                v = v.mT.contiguous()
+            v = v.contiguous()
+            v_upcast = v.new_empty(scale.shape[:-1] + (32 * scale.shape[-1], ), dtype=comp_dtype)
+            N = v_upcast.numel()
+            BLOCK_SIZE = 512
+            grid = ((N + BLOCK_SIZE - 1) // BLOCK_SIZE, )
+            mxfp_to_bf16_kernel[grid](v, scale, v_upcast, scale.numel(), e_bits, m_bits, BLOCK_SIZE,
+                                      num_warps=num_warps)
+            assert v_upcast.isfinite().all()
+            if transposed:
+                v_upcast = v_upcast.mT
+            return v_upcast
+
+        x_upcast = upcast(x, scale_x, type_x, False)
+        y_upcast = upcast(y, scale_y, type_y, True)
 
         class AccumulateInFp32:
 
@@ -3525,13 +3541,22 @@ def make_arg(shape, ty, col_major=False, max_val=255):
             ret = ret.mT
         return ret
 
-    DIV_FACTOR = 2 if type_a == "e2m1" else 1
-    x = make_arg((M, K // DIV_FACTOR), type_a, col_major=col_a)
-    y = make_arg((K, N), type_b, col_major=col_b)
+    type_a = normal_type if not rhs_scale else mxfp_type
+    type_b = mxfp_type if not rhs_scale else normal_type
+
+    DIV_FACTOR_A = 2 if type_a == "e2m1" else 1
+    DIV_FACTOR_B = 2 if type_b == "e2m1" else 1
+    x = make_arg((M, K // DIV_FACTOR_A), type_a, col_major=col_a)
+    y = make_arg((K // DIV_FACTOR_B, N), type_b, col_major=col_b)
 
     # sample scales that don't overflow as otherwise it's implementation defined (underflowing is alright)
     # Max scale = 2**15
     scale_x = make_arg((M, K // 32), "e8m0", max_val=127 + 15)
+    scale_y = make_arg((N, K // 32), "e8m0", max_val=127 + 15)
+    if rhs_scale:
+        scale_x = None
+    else:
+        scale_y = None
 
     def make_finite(x, dtype):
         # e5m2 has too many non-finite values when sampled uniformly (1 / 32) and
@@ -3546,16 +3571,14 @@ def make_finite(x, dtype):
 
     x = make_finite(x, type_a)
     y = make_finite(y, type_b)
-
     kernel_kwargs = {"num_warps": num_warps}
     if is_hip():
         kernel_kwargs["kpack"] = kpack
         kernel_kwargs["matrix_instr_nonkdim"] = mma
     z = x.new_empty((M, N), dtype=torch.bfloat16)
-    pgm = dot_scale_kernel[(1, )](x, *x.stride(), scale_x, y, *y.stride(), z, M, N, K, type_a, type_b, **kernel_kwargs)
-
-    z_ref = dot_scale_ref(x, scale_x, y, type_a, type_b)
-
+    pgm = dot_scale_kernel[(1, )](x, *x.stride(), scale_x, y, *y.stride(), scale_y, z, M, N, K, type_a, type_b,
+                                  **kernel_kwargs)
+    z_ref = dot_scale_ref(x, scale_x, y, scale_y, type_a, type_b)
     # Bigger tolerance for AMD MI200 devices.
     # MI200 devices use reduced precision fp16 and bf16 and flush input and output denormal values
     # to zero. Detailed info is at:
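
For intuition about what dot_scale_ref checks, here is a rough torch-only restatement of the reference semantics, assuming the e8m0 scale byte encodes a factor of 2**(byte - 127) applied to each group of 32 elements along K (consistent with the K // 32 scale shapes and the "Max scale = 2**15" comment in the test). This is only an illustration; the test itself upcasts through mxfp_to_bf16_kernel.

import torch

def scaled_matmul_ref(a_bf16, b_bf16, scale_a=None, scale_b=None):
    # a_bf16: (M, K), b_bf16: (K, N), already upcast to bfloat16.
    # scale_a: uint8 (M, K // 32) or None; scale_b: uint8 (N, K // 32) or None.
    def apply(v, scale, along_rows):
        if scale is None:
            return v.float()
        # Broadcast each e8m0 scale over its group of 32 K elements.
        factors = torch.pow(2.0, scale.float() - 127).repeat_interleave(32, dim=-1)
        return v.float() * (factors if along_rows else factors.mT)
    a = apply(a_bf16, scale_a, True)    # scales laid out along rows of a
    b = apply(b_bf16, scale_b, False)   # scales laid out along rows of b^T
    return (a @ b).to(torch.bfloat16)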
