
Commit 54237b0

Add Gemm+Elementwise+Gemm support
1 parent b6c726a commit 54237b0

25 files changed: +1371, -486 lines

mlir/include/mlir/Dialect/Rock/IR/Rock.h

Lines changed: 0 additions & 6 deletions
@@ -49,12 +49,6 @@ class FusionRoot : public TraitBase<ConcreteType, FusionRoot> {};
 } // namespace OpTrait
 } // namespace mlir

-// Following ifdef could be used to change
-// the attention operator to be a fused gemm-gemm
-// kernel for debugging purposes. This will also
-// adjust the test harness to verify the same as well
-// #define ROCK_DEBUG_ATTENTION_REMOVE_SOFTMAX
-
 namespace mlir {
 namespace rock {
 //===----------------------------------------------------------------------===//

mlir/include/mlir/Dialect/Rock/IR/RockAttrDefs.td

Lines changed: 6 additions & 4 deletions
@@ -63,11 +63,13 @@ def KernelTypeConvBwdData : I32EnumAttrCase<"ConvBwdData", 1>;
 def KernelTypeConvBwdWeight : I32EnumAttrCase<"ConvBwdWeight", 2>;
 def KernelTypeGemm : I32EnumAttrCase<"Gemm", 3>;
 def KernelTypeAttention : I32EnumAttrCase<"Attention", 4>;
+def KernelTypeGemmElementwiseGemm : I32EnumAttrCase<"GemmElementwiseGemm", 5>;

-def KernelType : Rock_I32Enum<"KernelType", "Any of the possible types of a rock kernel",
-                   [KernelTypeConv, KernelTypeConvBwdData,
-                    KernelTypeConvBwdWeight, KernelTypeGemm,
-                    KernelTypeAttention]>;
+def KernelType
+    : Rock_I32Enum<"KernelType", "Any of the possible types of a rock kernel",
+                   [KernelTypeConv, KernelTypeConvBwdData,
+                    KernelTypeConvBwdWeight, KernelTypeGemm,
+                    KernelTypeAttention, KernelTypeGemmElementwiseGemm]>;

 /// TransformType
 def PassThrough : I32EnumAttrCase<"PassThrough", 0>;

mlir/include/mlir/Dialect/Rock/IR/RockOps.td

Lines changed: 79 additions & 41 deletions
@@ -205,38 +205,33 @@ def Rock_ReduceOp :
   }];
 }

-def Rock_AttentionOp :
-    Rock_Op<"attention", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, RockFusionRoot, AttrSizedOperandSegments]>,
-    Arguments<(ins
-    TensorOrMemRefOf<[F32, F16, BF16, I8]>:$queries,
-    TensorOrMemRefOf<[F32, F16, BF16, I8]>:$keys,
-    TensorOrMemRefOf<[F32, F16, BF16]>:$values,
-    Variadic<AnyTensorOrMemRef>:$preSoftmaxElemWiseInputs,
-    Optional<TensorOrMemRefOf<[I32]>>:$currentSeqLen,
-    TensorOrMemRefOf<[F32, BF16, F16]>:$out,
-    UnitAttr:$qTransposed,
-    UnitAttr:$kTransposed,
-    UnitAttr:$vTransposed,
-    UnitAttr:$oTransposed,
-    StrAttr:$arch,
-    Rock_GemmFeaturesAttr:$features,
-    OptionalAttr<I32Attr>:$numCU,
-    OptionalAttr<RockTuningParamAttrInterface>:$params0,
-    OptionalAttr<RockTuningParamAttrInterface>:$params1,
-    I32Attr:$firstGemmIdx
-    )>,
-    Results<(outs Optional<TensorOf<[F32, F16, BF16]>>:$result)> {
+def Rock_AttentionOp
+    : Rock_Op<"attention", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+                            RockFusionRoot, AttrSizedOperandSegments]>,
+      Arguments<(ins TensorOrMemRefOf<[F32, F16, BF16, I8]>:$queries,
+          TensorOrMemRefOf<[F32, F16, BF16, I8]>:$keys,
+          TensorOrMemRefOf<[F32, F16, BF16]>:$values,
+          Variadic<AnyTensorOrMemRef>:$preSoftmaxElemWiseInputs,
+          Optional<TensorOrMemRefOf<[I32]>>:$currentSeqLen,
+          TensorOrMemRefOf<[F32, F16, BF16]>:$out, UnitAttr:$qTransposed,
+          UnitAttr:$kTransposed, UnitAttr:$vTransposed, UnitAttr:$oTransposed,
+          StrAttr:$arch, Rock_GemmFeaturesAttr:$features,
+          OptionalAttr<I32Attr>:$numCU,
+          OptionalAttr<RockTuningParamAttrInterface>:$params0,
+          OptionalAttr<RockTuningParamAttrInterface>:$params1,
+          I32Attr:$firstGemmIdx)>,
+      Results<(outs Optional<TensorOf<[F32, F16, BF16]>>:$result)> {
   let summary = "Attention operation of transformer models";
   let description = [{
-    Performs the operation out = SOFTMAX((queries * keys) .* scale) * values.
+    Performs the operation out = SOFTMAX(queries * keys) * values.

     This operation performs attention mechanism of transformer models.

     Those creating a `rock.attention` must specify the GPU architecture being targetted
     and the number of compute units (numCu) available. The parameters
     `gridSize`, and `blockSize` are optional as they can be inferred by
     a tuning process or a heuristic, but they must be set before the `attention` is
-    lowered into the `gridwise_attention` stage of the code generation pipeline.
+    lowered into the `gridwise_attention_accel` stage of the code generation pipeline.

     `features` specifies what hardware features can be used in the generated code.
   }];
@@ -255,6 +250,50 @@ def Rock_AttentionOp :
   }];
 }

+def Rock_GemmElementwiseGemmOp
+    : Rock_Op<"gemm_elementwise_gemm", [DeclareOpInterfaceMethods<
+                                            MemoryEffectsOpInterface>,
+                                        RockFusionRoot]>,
+      AllElementTypesMatch<["a", "b", "c"]>,
+      Arguments<(ins TensorOrMemRefOf<[F32]>:$a, TensorOrMemRefOf<[F32]>:$b,
+          TensorOrMemRefOf<[F32]>:$c,
+          Variadic<AnyTensorOrMemRef>:$elemwiseInputs,
+          TensorOrMemRefOf<[F32]>:$out, UnitAttr:$aTransposed,
+          UnitAttr:$bTransposed, UnitAttr:$cTransposed, UnitAttr:$oTransposed,
+          StrAttr:$arch, Rock_GemmFeaturesAttr:$features,
+          OptionalAttr<I32Attr>:$numCU,
+          OptionalAttr<RockTuningParamAttrInterface>:$params0,
+          OptionalAttr<RockTuningParamAttrInterface>:$params1,
+          I32Attr:$firstGemmIdx)>,
+      Results<(outs Optional<TensorOf<[F32]>>:$result)> {
+  let summary = "GEMM-elementwise-GEMM operation";
+  let description = [{
+    Performs the operation out = (a * b) * c.
+
+    This operation performs fused GEMM-elementwise-GEMM.
+
+    Those creating a `rock.gemm_elementwise_gemm` must specify the GPU architecture being targetted
+    and the number of compute units (numCu) available. The parameters
+    `gridSize`, and `blockSize` are optional as they can be inferred by
+    a tuning process or a heuristic, but they must be set before the `gemm_elementwise_gemm` is
+    lowered into the `gridwise_attention_accel` stage of the code generation pipeline.
+
+    `features` specifies what hardware features can be used in the generated code.
+  }];
+  let hasVerifier = 1;
+  let regions = (region AnyRegion:$preSecondGemmBody);
+  let assemblyFormat = [{
+    `{` `\n`
+    ` ` `ab` `=` (`tr` $aTransposed^)? $a `*` (`tr` $bTransposed^)? $b `:` type($a) `,` type($b) `\n`
+    (`ab` `=` `elementwise` (`otherIns` `(` $elemwiseInputs^ `:` type($elemwiseInputs) `)`)? $preSecondGemmBody^ `\n`)?
+    (`tr` $oTransposed^)? $out `=` `ab` `*` (`tr` $cTransposed^)? $c `:` type($c) `->` type($out) `\n`
+    `}` attr-dict (`->` type($result)^)?
+  }];
+  let extraClassDeclaration = [{
+    ::mlir::OpOperand* getOutArgument() { return &(*this)->getOpOperands().back(); }
+  }];
+}
+
 def Rock_InitKernelOp :
     Rock_Op<"init_kernel", []>,
     Arguments<(ins AnyTensorOrMemRef:$buffer,
@@ -432,24 +471,23 @@ def Rock_GridwiseGemmAccelOp :
 }

 // gridwise_attention_accel
-def Rock_GridwiseAttentionAccelOp :
-    Rock_Op<"gridwise_attention_accel", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, RockFusionRoot, AttrSizedOperandSegments]>,
-    Arguments<(ins MemRefRankOf<[F32, F16, BF16, I8], [3]>:$queries,
-    MemRefRankOf<[F32, F16, BF16, I8], [3]>:$keys,
-    MemRefRankOf<[F32, F16, BF16,], [3]>:$values,
-    Variadic<AnyTensorOrMemRef>:$preSoftmaxElemWiseInputs,
-    Optional<MemRefRankOf<[I32], [1]>>:$currentSeqLen,
-    MemRefRankOf<[F32, F16, BF16], [3]>:$out,
-    StrAttr:$arch,
-    Rock_GemmFeaturesAttr:$features,
-    I32Attr:$blockSize,
-    I32Attr:$gridSize,
-    UnitAttr:$disableQBypassLDS,
-    OptionalAttr<IndexAttr>:$prePadG0M,
-    OptionalAttr<IndexAttr>:$prePadG0N,
-    RockAccelTuningParamAttrInterface:$params0,
-    RockAccelTuningParamAttrInterface:$params1,
-    I32Attr:$firstGemmIdx)> {
+def Rock_GridwiseAttentionAccelOp
+    : Rock_Op<"gridwise_attention_accel",
+              [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+               RockFusionRoot, AttrSizedOperandSegments]>,
+      Arguments<(ins MemRefRankOf<[F32, F16, BF16, I8], [3]>:$queries,
+          MemRefRankOf<[F32, F16, BF16, I8], [3]>:$keys,
+          MemRefRankOf<[F32, F16, BF16, ], [3]>:$values,
+          Variadic<AnyTensorOrMemRef>:$preSoftmaxElemWiseInputs,
+          Optional<MemRefRankOf<[I32], [1]>>:$currentSeqLen,
+          MemRefRankOf<[F32, F16, BF16], [3]>:$out, StrAttr:$arch,
+          Rock_GemmFeaturesAttr:$features, I32Attr:$blockSize,
+          I32Attr:$gridSize, UnitAttr:$disableQBypassLDS,
+          OptionalAttr<IndexAttr>:$prePadG0M,
+          OptionalAttr<IndexAttr>:$prePadG0N,
+          RockAccelTuningParamAttrInterface:$params0,
+          RockAccelTuningParamAttrInterface:$params1, I32Attr:$firstGemmIdx,
+          DefaultValuedOptionalAttr<BoolAttr, "true">:$enableSoftmax)> {
   let summary = "Gridwise attention accelerated version";
   let description = [{
     The `rock.gridwise_attention_accel` op computes gridwise attention with acceleration.
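Note: based on the `assemblyFormat` added above, a hand-written `rock.gemm_elementwise_gemm` would look roughly like the sketch below. The shapes, SSA names, `gfx942` target, and attribute spellings are illustrative assumptions (not taken from this commit's tests), and the optional `elementwise` clause with its `preSecondGemmBody` region is omitted for brevity.

// Minimal sketch (assumed operands): out = (a * b) * c, with b stored transposed.
// a : 1x128x64, tr b : 1x32x64 (shared K = 64), c : 1x32x16, out : 1x128x16.
rock.gemm_elementwise_gemm {
 ab = %a * tr %b : memref<1x128x64xf32>, memref<1x32x64xf32>
 %out = ab * %c : memref<1x32x16xf32> -> memref<1x128x16xf32>
} {arch = "gfx942", features = #rock<GemmFeatures mfma|dot|atomic_add>, firstGemmIdx = 0 : i32}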

mlir/lib/Dialect/Rock/IR/RockDialect.cpp

Lines changed: 77 additions & 39 deletions
@@ -33,6 +33,7 @@
 #include "mlir/IR/TypeRange.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
 #include "mlir/Parser/Parser.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
@@ -46,6 +47,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
 #include <algorithm>
@@ -486,10 +488,13 @@ ConvOpType mlir::rock::convOpTypeFromKernelType(KernelType kernelType) {
     return ConvOpType::BwdWeight;
   case KernelType::Gemm:
     llvm_unreachable(
-        "Gemm ops shouldn't be in convolution-specific lowering passes");
+        "GEMM ops shouldn't be in convolution-specific lowering passes");
   case KernelType::Attention:
     llvm_unreachable(
         "Attention ops shouldn't be in convolution-specific lowering passes");
+  case KernelType::GemmElementwiseGemm:
+    llvm_unreachable(
+        "GEMM+GEMM ops shouldn't be in convolution-specific lowering passes");
   }
   llvm_unreachable("Unsuppported KernelType");
 }
@@ -566,17 +571,20 @@ static LogicalResult verifyGemmTypes(Operation *op, GemmFeatures features,
           "Mfma gridwise does not support E4M3/E5M2 data types ");
     }
   }
-  if (isa<FloatType>(elemTypeA) && !isa<FloatType>(elemTypeC)) {
-    return op->emitOpError("floating-point input type ")
-           << elemTypeA
-           << " requires a floating-point output type, but the output type is "
-           << elemTypeC;
-  }
-  if (isa<IntegerType>(elemTypeA) && !isa<IntegerType>(elemTypeC)) {
-    return op->emitOpError("integer input type ")
-           << elemTypeA
-           << " requires an integer output type, but the output type is "
-           << elemTypeC;
+  if (elemTypeC) {
+    if (isa<FloatType>(elemTypeA) && !isa<FloatType>(elemTypeC)) {
+      return op->emitOpError("floating-point input type ")
+             << elemTypeA
+             << " requires a floating-point output type, but the output type "
+                "is "
+             << elemTypeC;
+    }
+    if (isa<IntegerType>(elemTypeA) && !isa<IntegerType>(elemTypeC)) {
+      return op->emitOpError("integer input type ")
+             << elemTypeA
+             << " requires an integer output type, but the output type is "
+             << elemTypeC;
+    }
   }
   return success();
 }
@@ -2068,77 +2076,107 @@ LogicalResult BlockwiseFillOp::verify() {
 }

 //===-----------------------------------------------------===//
-// AttentionOp
+// GemmElementwiseGemmOp
 //===-----------------------------------------------------===//

-LogicalResult AttentionOp::verify() {
-  ShapedType qType = getQueries().getType();
+template <typename Op>
+static LogicalResult verifyAttentionOp(Op op, Value q, Value k, Value v,
+                                       Value currentSeqLen, bool qTransposed,
+                                       bool kTransposed, bool vTransposed) {
+  ShapedType qType = cast<ShapedType>(q.getType());
   int64_t qBatchDim = qType.getShape().size() == 3 ? qType.getShape()[0] : 1;
   ArrayRef<int64_t> qLastDims = qType.getShape().slice(qType.getRank() - 2);
-  auto [queryM, queryK] = getQTransposed()
-                              ? std::tuple{qLastDims[1], qLastDims[0]}
-                              : std::tuple{qLastDims[0], qLastDims[1]};
+  auto [queryM, queryK] = qTransposed ? std::tuple{qLastDims[1], qLastDims[0]}
+                                      : std::tuple{qLastDims[0], qLastDims[1]};

-  ShapedType kType = getKeys().getType();
+  ShapedType kType = cast<ShapedType>(k.getType());
   int64_t kBatchDim = kType.getShape().size() == 3 ? kType.getShape()[0] : 1;
   ArrayRef<int64_t> kLastDims = kType.getShape().slice(kType.getRank() - 2);
-  auto [keyK, keyN] = getKTransposed() ? std::tuple{kLastDims[1], kLastDims[0]}
-                                       : std::tuple{kLastDims[0], kLastDims[1]};
+  auto [keyK, keyN] = kTransposed ? std::tuple{kLastDims[1], kLastDims[0]}
+                                  : std::tuple{kLastDims[0], kLastDims[1]};

-  ShapedType vType = getValues().getType();
+  ShapedType vType = cast<ShapedType>(v.getType());
   int64_t vBatchDim = vType.getShape().size() == 3 ? vType.getShape()[0] : 1;
   ArrayRef<int64_t> vLastDims = vType.getShape().slice(vType.getRank() - 2);
-  auto [valueK, valueN] = getVTransposed()
-                              ? std::tuple{vLastDims[1], vLastDims[0]}
-                              : std::tuple{vLastDims[0], vLastDims[1]};
+  auto [valueK, valueN] = vTransposed ? std::tuple{vLastDims[1], vLastDims[0]}
                                       : std::tuple{vLastDims[0], vLastDims[1]};

   if (qBatchDim != kBatchDim || kBatchDim != vBatchDim) {
-    return emitError("Batch dimensions do not match");
+    return op.emitError("Batch dimensions do not match");
   }
   if (queryK != keyK) {
-    return emitError("reduction dimensions of first gemm do not match");
+    return op.emitError("reduction dimensions of first gemm do not match");
   }
   if (keyN != valueK) {
-    return emitError("reduction dimensions of second gemm do not match");
+    return op.emitError("reduction dimensions of second gemm do not match");
   }

   // check output type
-  ShapedType oType = getOut().getType();
+  ShapedType oType = op.getOut().getType();
   int64_t oBatchDim = oType.getShape().size() == 3 ? oType.getShape()[0] : 1;

   ArrayRef<int64_t> oLastDims = oType.getShape().slice(oType.getRank() - 2);
   auto [outputSeqLen, outputHeadDim] =
-      getOTransposed() ? std::tuple{oLastDims[1], oLastDims[0]}
-                       : std::tuple{oLastDims[0], oLastDims[1]};
+      op.getOTransposed() ? std::tuple{oLastDims[1], oLastDims[0]}
                           : std::tuple{oLastDims[0], oLastDims[1]};

   if (qType.getShape().size() != oType.getShape().size()) {
-    return emitError("Number of dimensions do not match (Q and Output)");
+    return op.emitError("Number of dimensions do not match (Q and Output)");
   }
   if (qBatchDim != oBatchDim) {
-    return emitError("Batch dimensions do not match (Q and Output)");
+    return op.emitError("Batch dimensions do not match (Q and Output)");
   }
   if (queryM != outputSeqLen) {
-    return emitError("Sequence length does not match (Q and Output)");
+    return op.emitError("Sequence length does not match (Q and Output)");
   }
   if (valueN != outputHeadDim) {
-    return emitError("Head dimensions do not match (V and Output)");
+    return op.emitError("Head dimensions do not match (V and Output)");
   }

   // check currentSeqLen (KV Cache)
-  auto currentSeqLen = getCurrentSeqLen();
   if (currentSeqLen) {
-    ShapedType seqLenType = currentSeqLen.getType();
+    ShapedType seqLenType = cast<ShapedType>(currentSeqLen.getType());
     if (seqLenType.getShape().size() != 1) {
-      return emitError("Number of dimensions is not one (currentSeqLen)");
+      return op.emitError("Number of dimensions is not one (currentSeqLen)");
    }
    if (seqLenType.getShape()[0] != oBatchDim) {
-      return emitError(
+      return op.emitError(
           "Batch dimensions do not match (currentSeqLen and Output)");
    }
  }
  return success();
 }

+LogicalResult GemmElementwiseGemmOp::verify() {
+  return verifyAttentionOp(*this, getA(), getB(), getC(),
+                           /*currentSeqLen=*/nullptr, getATransposed(),
+                           getBTransposed(), getCTransposed());
+}
+
+void GemmElementwiseGemmOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  auto *read = MemoryEffects::Read::get();
+  auto *write = MemoryEffects::Write::get();
+  effects.emplace_back(read, &getOutMutable());
+  effects.emplace_back(write, &getOutMutable());
+
+  effects.emplace_back(read, &getAMutable());
+  effects.emplace_back(read, &getBMutable());
+  effects.emplace_back(read, &getCMutable());
+  for (auto &regionArg : getElemwiseInputsMutable())
+    effects.emplace_back(read, &regionArg);
+}
+
+//===-----------------------------------------------------===//
+// AttentionOp
+//===-----------------------------------------------------===//
+
+LogicalResult AttentionOp::verify() {
+  return verifyAttentionOp(*this, getQueries(), getKeys(), getValues(),
+                           getCurrentSeqLen(), getQTransposed(),
+                           getKTransposed(), getVTransposed());
+}
+
 void AttentionOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   auto *read = MemoryEffects::Read::get();

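Note: the shared `verifyAttentionOp` template above treats both ops as a chained pair of GEMMs and only checks shape compatibility. As a hypothetical sketch (shapes, names, and attributes invented for illustration), the op below would be rejected by `GemmElementwiseGemmOp::verify()` with "reduction dimensions of first gemm do not match", since A contributes K = 64 while B contributes K = 48; the same template also drives `AttentionOp::verify()`, with queries/keys/values in the roles of a/b/c plus the extra 1-D `currentSeqLen` check.

// Hypothetical invalid input: first-GEMM K dims disagree (64 vs. 48).
rock.gemm_elementwise_gemm {
 ab = %a * %b : memref<1x128x64xf32>, memref<1x48x256xf32>
 %out = ab * %c : memref<1x256x32xf32> -> memref<1x128x32xf32>
} {arch = "gfx942", features = #rock<GemmFeatures mfma|dot|atomic_add>, firstGemmIdx = 0 : i32}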