
Commit ec1cdfe

Attention: return LSE (log-sum-exp) (#1882)
This PR introduces the ability for attention to return LSE (log-sum-exp). Note that migraphx requires L to be base-2 (log2).
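
For reference, here is the standard row-wise definition (not spelled out in the commit message): for one query row with pre-softmax scores s_1, ..., s_n,

$$
\mathrm{LSE} \;=\; \log \sum_{j=1}^{n} e^{s_j} \;=\; m + \log \sum_{j=1}^{n} e^{s_j - m}, \qquad m = \max_j s_j,
$$

and the base-2 value that migraphx expects is simply $\mathrm{LSE}_2 = \mathrm{LSE} / \ln 2 = \log_2 \sum_j e^{s_j}$.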
1 parent 38fc327 commit ec1cdfe

30 files changed: +497 −106 lines
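
As a minimal, standalone reference for what the returned LSE contains, here is a plain C++ sketch of the per-row computation in base 2 (an illustration under the conventions above, not the rocMLIR kernel; rowLse2 is a hypothetical helper name):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically stable log-sum-exp of one row of pre-softmax scores,
// returned in base 2 to match the convention noted in the commit message.
// Hypothetical helper for illustration only.
double rowLse2(const std::vector<double> &scores) {
  double m = *std::max_element(scores.begin(), scores.end());
  double sum = 0.0;
  for (double s : scores)
    sum += std::exp(s - m);
  // log2(sum_j exp(s_j)) = (m + log(sum_j exp(s_j - m))) / ln 2
  return (m + std::log(sum)) / std::log(2.0);
}

int main() {
  std::vector<double> row = {0.5, 1.25, -0.75}; // one hypothetical score row
  double lse2 = rowLse2(row);
  // exp2(lse2) equals sum_j exp(s_j), so softmax weights are exp(s_j) / exp2(lse2).
  std::printf("LSE (base 2) = %f\n", lse2);
  return 0;
}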

mlir/include/mlir/Dialect/Rock/IR/RockOps.td

Lines changed: 14 additions & 9 deletions
@@ -165,7 +165,7 @@ def Rock_GemmOp :
     transposed. For example, if `aTransposed` is set, then the argument A should be
     a [G] x K x M memory.

-    Those creating a `rock.gemm` must specify the GPU architecture being targetted
+    Those creating a `rock.gemm` must specify the GPU architecture being targeted
     and the number of compute units (numCu) available. The parameters
     `derivedBlockSize`, `gridSize`, and `params` are optional as they can be inferred by
     a tuning process or a heuristic, but they must be set before the `gemm` is
@@ -215,10 +215,11 @@ def Rock_AttentionOp
     TensorOrMemRefOf<[F32, F16, BF16]>:$values,
     Variadic<AnyTensorOrMemRef>:$preSoftmaxElemWiseInputs,
     Optional<TensorOrMemRefOf<[I32]>>:$currentSeqLen,
-    TensorOrMemRefOf<[F32, F16, BF16]>:$out, UnitAttr:$qTransposed,
-    UnitAttr:$kTransposed, UnitAttr:$vTransposed, UnitAttr:$oTransposed,
-    UnitAttr:$causal, StrAttr:$arch, Rock_GemmFeaturesAttr:$features,
-    OptionalAttr<I32Attr>:$numCU,
+    TensorOrMemRefOf<[F32, F16, BF16]>:$out,
+    Optional<TensorOrMemRefOf<[F32, F16, BF16]>>:$lse,
+    UnitAttr:$qTransposed, UnitAttr:$kTransposed, UnitAttr:$vTransposed,
+    UnitAttr:$oTransposed, UnitAttr:$causal, StrAttr:$arch,
+    Rock_GemmFeaturesAttr:$features, OptionalAttr<I32Attr>:$numCU,
     OptionalAttr<RockTuningParamAttrInterface>:$params0,
     OptionalAttr<RockTuningParamAttrInterface>:$params1,
     I32Attr:$firstGemmIdx)>,
@@ -240,7 +241,9 @@ def Rock_AttentionOp

     If causal is enabled, we implement causal masking.

-    Those creating a `rock.attention` must specify the GPU architecture being targetted
+    LSE (log-sum-exp) is an optional output typically used for flash decoding.
+
+    Those creating a `rock.attention` must specify the GPU architecture being targeted
     and the number of compute units (numCu) available. The parameters
     `gridSize`, and `blockSize` are optional as they can be inferred by
     a tuning process or a heuristic, but they must be set before the `attention` is
@@ -255,6 +258,7 @@ def Rock_AttentionOp
     ` ` `qk` `=` (`tr` $qTransposed^)? $queries `*` (`tr` $kTransposed^)? $keys `:` type($queries) `,` type($keys) `\n`
     (`currentSeqLen` `=` `(` $currentSeqLen^ `:` type($currentSeqLen) `)` `\n`)?
     (`causal` `\n` $causal^)?
+    (`lse` `=` $lse^ `:` type($lse) `\n`)?
     (`qk` `=` `elementwise` (`otherIns` `(` $preSoftmaxElemWiseInputs^ `:` type($preSoftmaxElemWiseInputs) `)`)? $preSoftmaxBody^ `\n`)?
     (`tr` $oTransposed^)? $out `=` `softmax` `(` `qk` `)` `*` (`tr` $vTransposed^)? $values `:` type($values) `->` type($out) `\n`
     `}` attr-dict (`->` type($result)^)?
@@ -294,7 +298,7 @@ def Rock_GemmElementwiseGemmOp
     transposed. For example, if `aTransposed` is set, then the argument `a` should be
     a [G] x K x M memory.

-    Those creating a `rock.gemm_elementwise_gemm` must specify the GPU architecture being targetted
+    Those creating a `rock.gemm_elementwise_gemm` must specify the GPU architecture being targeted
     and the number of compute units (numCu) available. The parameters
     `gridSize`, and `blockSize` are optional as they can be inferred by
     a tuning process or a heuristic, but they must be set before the `gemm_elementwise_gemm` is
@@ -346,7 +350,7 @@ def Rock_ConvElementwiseGemmOp
     transposed. For example, if `cTransposed` is set, then the argument `c` should be
     a [G] x O x M memory.

-    Those creating a `rock.conv_elementwise_gemm` must specify the GPU architecture being targetted
+    Those creating a `rock.conv_elementwise_gemm` must specify the GPU architecture being targeted
     and the number of compute units (numCu) available. The parameters
     `gridSize`, and `blockSize` are optional as they can be inferred by
     a tuning process or a heuristic, but they must be set before the `conv_elementwise_gemm` is
@@ -551,7 +555,8 @@ def Rock_GridwiseAttentionAccelOp
     MemRefRankOf<[F32, F16, BF16], [3]>:$values,
     Variadic<AnyTensorOrMemRef>:$preSoftmaxElemWiseInputs,
     Optional<MemRefRankOf<[I32], [1]>>:$currentSeqLen,
-    MemRefRankOf<[F32, F16, BF16], [3]>:$out, UnitAttr:$causal,
+    MemRefRankOf<[F32, F16, BF16], [3]>:$out,
+    Optional<MemRefRankOf<[F32, F16, BF16], [2]>>:$lse, UnitAttr:$causal,
     StrAttr:$arch, Rock_GemmFeaturesAttr:$features, I32Attr:$blockSize,
     I32Attr:$gridSize, UnitAttr:$disableQBypassLDS,
     OptionalAttr<IndexAttr>:$prePadG0M,
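
A brief note on the "used for flash decoding" remark added above (standard flash-decoding algebra, not part of this diff): if attention for a query row is split across P disjoint key/value partitions, each producing a partial output O_p and a per-partition log-sum-exp L_p, the exact result is recovered by

$$
L = \log \sum_{p=1}^{P} e^{L_p}, \qquad O = \sum_{p=1}^{P} e^{L_p - L}\, O_p,
$$

with log and exp replaced by their base-2 counterparts when L_p is stored as log2. This recombination is why the kernel must expose LSE in addition to the normalized output.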

mlir/include/mlir/Dialect/Rock/utility/loweringUtils.h

Lines changed: 4 additions & 0 deletions
@@ -135,6 +135,10 @@ Type vectorTypeOrSelf(Type elementType, int64_t len);
 Value padMatrix(Value matrix, OpBuilder &b, Location loc, StringRef firstDim,
                 int64_t firstDimPad, StringRef secondDim, int64_t secondDimPad);

+// Apply padding to a vector in its `firstDim` if applicable.
+Value padVector(Value vector, OpBuilder &b, Location loc, StringRef firstDim,
+                int64_t firstDimPad);
+
 /// Normalize the argument into the form requested.
 /// If a group dimension is not present, add one.
 /// If doTranspose is true, meaning the user's transpose requests don't match

mlir/lib/Conversion/TosaToRock/TosaToRock.cpp

Lines changed: 1 addition & 0 deletions
@@ -1693,6 +1693,7 @@ struct AttentionRewritePattern : public OpRewritePattern<tosa::MatMulOp> {
     rock::AttentionOp attnOp = rewriter.create<rock::AttentionOp>(
         loc, outputType, firstMatMulOp.getA(), firstMatMulOp.getB(), op.getB(),
         elementwiseOtherArgs, currentSeqLen, output,
+        /*lse=*/nullptr,
         /*qTransposed=*/nullptr,
         /*kTransposed=*/nullptr,
         /*vTransposed=*/nullptr,

mlir/lib/Dialect/Rock/IR/RockDialect.cpp

Lines changed: 25 additions & 5 deletions
@@ -1869,7 +1869,11 @@ LogicalResult GridwiseAttentionAccelOp::verify() {
   int64_t gemm0kpack = gemm0TuningParams.getKpack();
   int64_t gemm0NPerBlock = gemm0TuningParams.getNPerBlock();
   if (gemm0NPerBlock % gemm0kpack != 0) {
-    return emitError("NPerBlock should be divisble by kpack.");
+    return emitError("NPerBlock should be divisible by kpack.");
+  }
+
+  if (!getEnableSoftmax() && getLse()) {
+    return emitError("LSE only works for attention.");
   }

   int64_t linalgOpCount = 0;
@@ -2126,7 +2130,7 @@ GemmGemmSize GemmElementwiseGemmOp::getGemmGemmSize() {
 }

 static LogicalResult verifyGemmPlusGemmLikeOp(RockGemmGemmWrapperInterface op,
-                                              Value currentSeqLen) {
+                                              Value currentSeqLen, Value lse) {
   ShapedType qType = cast<ShapedType>(op.getAType());
   int64_t qBatchDim = qType.getShape().size() == 3 ? qType.getShape()[0] : 1;
   ArrayRef<int64_t> qLastDims = qType.getShape().slice(qType.getRank() - 2);
@@ -2191,11 +2195,26 @@ static LogicalResult verifyGemmPlusGemmLikeOp(RockGemmGemmWrapperInterface op,
           "Batch dimensions do not match (currentSeqLen and Output)");
     }
   }
+
+  // check LSE (log-sum-exp)
+  if (lse) {
+    ShapedType lseType = cast<ShapedType>(lse.getType());
+    if (lseType.getShape().size() != 2) {
+      return op.emitError("Number of dimensions is not two (LSE)");
+    }
+    if (lseType.getShape()[0] != oBatchDim) {
+      return op.emitError("Batch dimensions do not match (LSE and Output)");
+    }
+    if (lseType.getShape()[1] != queryM) {
+      return op.emitError("SeqLenQ dimensions do not match (LSE and Q)");
+    }
+  }
   return success();
 }

 LogicalResult GemmElementwiseGemmOp::verify() {
-  return verifyGemmPlusGemmLikeOp(*this, /*currentSeqLen=*/nullptr);
+  return verifyGemmPlusGemmLikeOp(*this, /*currentSeqLen=*/nullptr,
+                                  /*lse=*/nullptr);
 }

 void GemmElementwiseGemmOp::getEffects(
@@ -2290,7 +2309,8 @@ GemmGemmSize ConvElementwiseGemmOp::getGemmGemmSize() {
 }

 LogicalResult ConvElementwiseGemmOp::verify() {
-  return verifyGemmPlusGemmLikeOp(*this, /*currentSeqLen=*/nullptr);
+  return verifyGemmPlusGemmLikeOp(*this, /*currentSeqLen=*/nullptr,
+                                  /*lse=*/nullptr);
 }

 void ConvElementwiseGemmOp::getEffects(
@@ -2354,7 +2374,7 @@ GemmGemmSize AttentionOp::getGemmGemmSize() {
 }

 LogicalResult AttentionOp::verify() {
-  return verifyGemmPlusGemmLikeOp(*this, getCurrentSeqLen());
+  return verifyGemmPlusGemmLikeOp(*this, getCurrentSeqLen(), getLse());
 }

 void AttentionOp::getEffects(
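
Reading the new checks together (an inference from the verifier above, not an explicit statement in the commit): with batch/group size G and query sequence length seqLenQ, the optional LSE operand is expected to be two-dimensional,

$$
\mathrm{lse} \in \mathbb{R}^{G \times \mathit{seqLen}_Q},
$$

i.e. one log-sum-exp scalar per query row, matching the batch dimension of the output and the sequence dimension of Q.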

mlir/lib/Dialect/Rock/Transforms/GemmToGridwise.cpp

Lines changed: 12 additions & 27 deletions
@@ -92,19 +92,12 @@ struct GemmElementwiseGemmRewritePattern
   LogicalResult matchAndRewrite(GemmElementwiseGemmOp op,
                                 GemmElementwiseGemmOpAdaptor adaptor,
                                 ConversionPatternRewriter &rw) const override;
-
-  LogicalResult computeGridSize(ConversionPatternRewriter &rw,
-                                GemmElementwiseGemmOp op, Value a, Value b,
-                                Value c) const;
 };

 struct AttentionRewritePattern : public OpConversionPattern<AttentionOp> {
   using OpConversionPattern<AttentionOp>::OpConversionPattern;
   LogicalResult matchAndRewrite(AttentionOp op, AttentionOpAdaptor adaptor,
                                 ConversionPatternRewriter &rw) const override;
-
-  LogicalResult computeGridSize(ConversionPatternRewriter &rw, AttentionOp op,
-                                Value queries, Value keys, Value values) const;
 };

 template <typename Op>
@@ -139,7 +132,7 @@ computeGridSizeAttentionGemmElmtGemm(ConversionPatternRewriter &rw, Op op,
 static LogicalResult
 commonAttentionGemmElmtGemm(ConversionPatternRewriter &rw,
                             RockGemmGemmWrapperInterface op, Value a, Value b,
-                            Value c, Value out, Value currentSeqLen,
+                            Value c, Value out, Value lse, Value currentSeqLen,
                             UnitAttr causal, ValueRange elementwiseInputs,
                             Region &preSecondOpRegion, bool enableSoftmax) {
   Location loc = op->getLoc();
@@ -150,17 +143,17 @@ commonAttentionGemmElmtGemm(ConversionPatternRewriter &rw,
   bool isAccel = rock::isAccel(op.getGemmFeatures());
   if (!isAccel) {
     return op.emitError("Currently, op is only supported on GPUs "
-                        "with matrix accelerator extentions");
+                        "with matrix accelerator extensions");
   }
   if (!op.getGemm0Params().has_value()) {
     return op.emitError("gemm0 params is missing and it should've been "
-                        "assigned by affix-tuing-params");
+                        "assigned by affix-tuning-params");
   }
   RockAccelTuningParamAttrInterface params0 =
       cast<RockAccelTuningParamAttrInterface>(op.getGemm0Params().value());
   if (!op.getGemm1Params().has_value()) {
     return op.emitError("gemm1 params is missing and it should've been "
-                        "assigned by affix-tuing-params");
+                        "assigned by affix-tuning-params");
   }
   RockAccelTuningParamAttrInterface params1 =
       cast<RockAccelTuningParamAttrInterface>(op.getGemm1Params().value());
@@ -177,6 +170,7 @@ commonAttentionGemmElmtGemm(ConversionPatternRewriter &rw,
   ArrayRef<int64_t> aShape = cast<MemRefType>(a.getType()).getShape();
   ArrayRef<int64_t> bShape = cast<MemRefType>(b.getType()).getShape();
   ArrayRef<int64_t> cShape = cast<MemRefType>(c.getType()).getShape();
+  assert(cShape[1] == bShape[2]);
   GemmSize gemm0Size(/*g=*/aShape[0], /*m=*/bShape[2],
                      /*k=*/aShape[1],
                      /*n=*/aShape[2]);
@@ -200,6 +194,8 @@ commonAttentionGemmElmtGemm(ConversionPatternRewriter &rw,
   // fusions legit. So the extra pad needs to be swapped and applied.
   out = padMatrix(out, rw, loc, "gemm1N", gemm1ExtraPad.n, "gemm1M",
                   gemm1ExtraPad.m);
+  if (lse)
+    lse = padVector(lse, rw, loc, "gemm1N", gemm1ExtraPad.n);

   if (failed(computeGridSizeAttentionGemmElmtGemm(rw, op, a, b, c))) {
     return op.emitError("failed to compute the grid size of "
@@ -218,7 +214,7 @@ commonAttentionGemmElmtGemm(ConversionPatternRewriter &rw,
     prePadG0NAttr = rw.getIndexAttr(gemm0Size.n);
   }
   auto newOp = rw.create<GridwiseAttentionAccelOp>(
-      loc, a, b, c, elementwiseInputs, currentSeqLen, out, causal,
+      loc, a, b, c, elementwiseInputs, currentSeqLen, out, lse, causal,
      rw.getStringAttr(op.getArch()),
      rw.getAttr<rock::GemmFeaturesAttr>(op.getGemmFeatures()), blockSizeAttr,
      gridSizeAttr,
@@ -585,34 +581,23 @@ AttentionRewritePattern::matchAndRewrite(AttentionOp op,
                                          ConversionPatternRewriter &rw) const {
   return commonAttentionGemmElmtGemm(
       rw, op, adaptor.getQueries(), adaptor.getKeys(), adaptor.getValues(),
-      adaptor.getOut(), adaptor.getCurrentSeqLen(), adaptor.getCausalAttr(),
-      adaptor.getPreSoftmaxElemWiseInputs(), op.getPreSoftmaxBody(),
+      adaptor.getOut(), adaptor.getLse(), adaptor.getCurrentSeqLen(),
+      adaptor.getCausalAttr(), adaptor.getPreSoftmaxElemWiseInputs(),
+      op.getPreSoftmaxBody(),
       /*enableSoftmax=*/true);
 }

-LogicalResult
-AttentionRewritePattern::computeGridSize(ConversionPatternRewriter &rw,
-                                         AttentionOp op, Value queries,
-                                         Value keys, Value values) const {
-  return computeGridSizeAttentionGemmElmtGemm(rw, op, queries, keys, values);
-}
-
 LogicalResult GemmElementwiseGemmRewritePattern::matchAndRewrite(
     GemmElementwiseGemmOp op, GemmElementwiseGemmOpAdaptor adaptor,
     ConversionPatternRewriter &rw) const {
   return commonAttentionGemmElmtGemm(
       rw, op, adaptor.getA(), adaptor.getB(), adaptor.getC(), adaptor.getOut(),
+      /*lse=*/nullptr,
       /*currentSeqLen=*/nullptr, /*causal=*/nullptr,
       adaptor.getElemwiseInputs(), op.getPreSecondGemmBody(),
       /*enableSoftmax=*/false);
 }

-LogicalResult GemmElementwiseGemmRewritePattern::computeGridSize(
-    ConversionPatternRewriter &rw, GemmElementwiseGemmOp op, Value a, Value b,
-    Value c) const {
-  return computeGridSizeAttentionGemmElmtGemm(rw, op, a, b, c);
-}
-
 void RockGemmToGridwisePass::runOnOperation() {
   MLIRContext *ctx = &getContext();

mlir/lib/Dialect/Rock/Transforms/GridLayoutEmitter.cpp

Lines changed: 3 additions & 3 deletions
@@ -70,14 +70,14 @@ GridCoordinates rock::layout::makeGroupedGridLayout(PatternRewriter &b,
   // be slowest changing in the grid.
   int64_t numChiplets = rock::lookupArchInfo(arch).maxNumXCC;
   if (numChiplets > 1) {
-    // It was emphircally found that two chiplets as a group
+    // It was empirically found that two chiplets as a group
     // computing a spatial mxn tile has better locality throughout.
     int64_t numChipletsPerGroup = std::ceil(numChiplets / 2);
     int64_t gridSize = info.gBlocks * info.mBlocks * info.nBlocks;
     bid = rearrangeWorkgroupsForXCC(loc, b, bid, gridSize, numChipletsPerGroup);
   }

-  // Heurisitc to compute groupSize
+  // Heuristic to compute groupSize
   // This also covers the cases where the output width is larger
   // than the input width
   int64_t bitWidthIn = info.inputType.getIntOrFloatBitWidth();
@@ -137,7 +137,7 @@ GridCoordinates rock::layout::makeGxNGridLayout(PatternRewriter &b,
   // be slowest changing in the grid.
   int64_t numChiplets = rock::lookupArchInfo(arch).maxNumXCC;
   if (numChiplets > 1) {
-    // It was emphircally found that two chiplets as a group
+    // It was empirically found that two chiplets as a group
    // computing a spatial mxn tile has better locality throughout.
    int64_t numChipletsPerGroup = std::ceil(numChiplets / 2);
    bid = rearrangeWorkgroupsForXCC(loc, b, bid, gridSize, numChipletsPerGroup);

mlir/lib/Dialect/Rock/Transforms/GridLayoutEmitter.h

Lines changed: 2 additions & 2 deletions
@@ -50,8 +50,8 @@ struct GridLayoutInfo {
   Type outputType;
 };

-/// This function emits the right triplet of <group,block_m,block_n> identifers,
-/// given a flat blockId. This has been adapted from:
+/// This function emits the right triplet of <group,block_m,block_n>
+/// identifiers, given a flat blockId. This has been adapted from:
 /// https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py
 ///
 GridCoordinates makeGroupedGridLayout(PatternRewriter &b, Location loc,
