@@ -213,7 +213,7 @@ def Rock_AttentionOp :
       TensorOrMemRefOf<[F32, F16, BF16]>:$values,
       Variadic<AnyTensorOrMemRef>:$preSoftmaxElemWiseInputs,
       Optional<TensorOrMemRefOf<[I32]>>:$currentSeqLen,
-      TensorOrMemRefOf<[F32, BF16, F16 ]>:$out,
+      TensorOrMemRefOf<[F32, F16, BF16 ]>:$out,
       UnitAttr:$qTransposed,
       UnitAttr:$kTransposed,
       UnitAttr:$vTransposed,
@@ -228,15 +228,15 @@ def Rock_AttentionOp :
     Results<(outs Optional<TensorOf<[F32, F16, BF16]>>:$result)> {
   let summary = "Attention operation of transformer models";
   let description = [{
-    Performs the operation out = SOFTMAX(( queries * keys) .* scale ) * values.
+    Performs the operation out = SOFTMAX(queries * keys) * values.

     This operation performs attention mechanism of transformer models.

     Those creating a `rock.attention` must specify the GPU architecture being targetted
     and the number of compute units (numCu) available. The parameters
     `gridSize`, and `blockSize` are optional as they can be inferred by
     a tuning process or a heuristic, but they must be set before the `attention` is
-    lowered into the `gridwise_attention ` stage of the code generation pipeline.
+    lowered into the `gridwise_attention_accel ` stage of the code generation pipeline.

     `features` specifies what hardware features can be used in the generated code.
   }];
@@ -255,6 +255,55 @@ def Rock_AttentionOp :
   }];
 }

+def Rock_GemmElementwiseGemmOp:
+    Rock_Op<"gemm_elementwise_gemm", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, RockFusionRoot]>,
+    AllElementTypesMatch<["a", "b", "c"]>,
+    Arguments<(ins
+      TensorOrMemRefOf<[F32]>:$a,
+      TensorOrMemRefOf<[F32]>:$b,
+      TensorOrMemRefOf<[F32]>:$c,
+      Variadic<AnyTensorOrMemRef>:$elemwiseInputs,
+      TensorOrMemRefOf<[F32]>:$out,
+      UnitAttr:$aTransposed,
+      UnitAttr:$bTransposed,
+      UnitAttr:$cTransposed,
+      UnitAttr:$oTransposed,
+      StrAttr:$arch,
+      Rock_GemmFeaturesAttr:$features,
+      OptionalAttr<I32Attr>:$numCU,
+      OptionalAttr<RockTuningParamAttrInterface>:$params0,
+      OptionalAttr<RockTuningParamAttrInterface>:$params1,
+      I32Attr:$firstGemmIdx
+    )>,
+    Results<(outs Optional<TensorOf<[F32]>>:$result)> {
+  let summary = "GEMM-elementwise-GEMM operation";
+  let description = [{
+    Performs the operation out = (a * b) * c.
+
+    This operation performs fused GEMM-elementwise-GEMM.
+
+    Those creating a `rock.gemm_elementwise_gemm` must specify the GPU architecture being targeted
+    and the number of compute units (numCU) available. The parameters
+    `gridSize` and `blockSize` are optional as they can be inferred by
+    a tuning process or a heuristic, but they must be set before the `gemm_elementwise_gemm` is
+    lowered into the `gridwise_attention_accel` stage of the code generation pipeline.
+
+    `features` specifies what hardware features can be used in the generated code.
+  }];
+  let hasVerifier = 1;
+  let regions = (region AnyRegion:$preSecondGemmBody);
+  let assemblyFormat = [{
+    `{` `\n`
+    ` ` `ab` `=` (`tr` $aTransposed^)? $a `*` (`tr` $bTransposed^)? $b `:` type($a) `,` type($b) `\n`
+    (`ab` `=` `elementwise` (`otherIns` `(` $elemwiseInputs^ `:` type($elemwiseInputs) `)`)? $preSecondGemmBody^ `\n`)?
+    (`tr` $oTransposed^)? $out `=` `ab` `*` (`tr` $cTransposed^)? $c `:` type($c) `->` type($out) `\n`
+    `}` attr-dict (`->` type($result)^)?
+  }];
+  let extraClassDeclaration = [{
+    ::mlir::OpOperand* getOutArgument() { return &(*this)->getOpOperands().back(); }
+  }];
+}
+
 def Rock_InitKernelOp :
     Rock_Op<"init_kernel", []>,
     Arguments<(ins AnyTensorOrMemRef:$buffer,
@@ -449,7 +498,8 @@ def Rock_GridwiseAttentionAccelOp :
       OptionalAttr<IndexAttr>:$prePadG0N,
       RockAccelTuningParamAttrInterface:$params0,
       RockAccelTuningParamAttrInterface:$params1,
-      I32Attr:$firstGemmIdx)> {
+      I32Attr:$firstGemmIdx,
+      DefaultValuedOptionalAttr<BoolAttr, "true">:$enableSoftmax)> {
   let summary = "Gridwise attention accelerated version";
   let description = [{
     The `rock.gridwise_attention_accel` op computes gridwise attention with acceleration.
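For reference, a minimal sketch of how the new `rock.gemm_elementwise_gemm` might print under the declarative assembly format added above. The SSA names, memref shapes, `arch` string, and the exact spelling of the `features` attribute are illustrative assumptions, not taken from the patch, and the optional `elementwise` stage (the `preSecondGemmBody` region) is omitted:

```mlir
// Hypothetical example of the custom assembly form defined by assemblyFormat.
// First GEMM: ab = a * b^T (bTransposed set); second GEMM: out = ab * c.
// Attribute values and the #rock<features ...> spelling are illustrative only.
rock.gemm_elementwise_gemm {
  ab = %a * tr %b : memref<1x384x64xf32>, memref<1x384x64xf32>
  %out = ab * %c : memref<1x384x64xf32> -> memref<1x384x64xf32>
} {arch = "gfx90a", features = #rock<features mfma|dot|atomic_add>, firstGemmIdx = 0 : i32}
```

Because `$out` is passed as a destination operand and the `$result` value is optional, the memref form above produces no SSA result; the trailing `-> type($result)` only appears when the op is used on tensors.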