Commit 116343e

newling authored and keshavvinayak01 committed
[Codegen] Select for pad value just before yielding (iree-org#21581)
BEFORE: all ops are padded with zero in `gpu-apply-padding` unless special-cased, as for online attention.

AFTER: a general solution for linalg operations where padding with 0 is not appropriate. Uses arith.select to ensure the correct reduction identity is used. See the PR description for more information.

---------

Signed-off-by: James Newling <[email protected]>
Signed-off-by: keshavvinayak01 <[email protected]>
1 parent 8fc362d commit 116343e
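The gist of the change: rather than inferring a safe padding value per operand, the pass pads with an arbitrary value (currently 0) and rewrites the linalg op's body so that padded elements are replaced by the combiner's neutral element before they reach the reduction. A minimal sketch of the resulting IR, assuming a 1-d f16 max-reduction padded from `%sz` up to 64 elements (hypothetical names and shapes, not taken from the PR's tests):

```mlir
// Hypothetical IR sketch. %pad is the padded input; %sz is the
// pre-padding size of the reduced dimension (a tensor.dim on the
// original operand).
%r = linalg.generic {
    indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>],
    iterator_types = ["reduction"]}
    ins(%pad : tensor<64xf16>) outs(%init : tensor<f16>) {
^bb0(%in: f16, %acc: f16):
  // Before the rewrite, the body was just: %m = arith.maximumf %in, %acc.
  %idx   = linalg.index 0 : index
  %valid = arith.cmpi ult, %idx, %sz : index
  %ninf  = arith.constant 0xFC00 : f16  // -inf: the maximumf identity
  %sel   = arith.select %valid, %in, %ninf : f16
  %m     = arith.maximumf %sel, %acc : f16
  linalg.yield %m : f16
} -> tensor<f16>
```

The neutral element comes from arith::getNeutralElement, so the rewrite bails out with a warning on combiners it cannot classify.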

File tree

6 files changed: +455 -16 lines changed


compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyPaddingLevel.cpp

Lines changed: 173 additions & 13 deletions
```diff
@@ -7,24 +7,23 @@
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h"
-#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtInterfaces.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/InterleavedRange.h"
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

 #define DEBUG_TYPE "iree-codegen-gpu-apply-padding-level"

@@ -61,20 +60,95 @@ getTiledOps(Operation *funcOp, IREE::GPU::TilingLevel tilingLevel) {
   return targets;
 }

+/// For a reduction dimension of a linalg operation `linalgOp`,
+///
+/// 1) `loopIndex` is the index of the loop in `linalgOp` that is reduced,
+/// 2) `operand` is the first operand indexed by the reduction dimension,
+/// 3) `operandDim` is the dimension of the operand that's reduced.
+///
+/// Example. Consider the generic below with 2 reduction dimensions.
+///
+/// ```mlir
+/// #map = affine_map<(d0, d1) -> (d1, d0)>
+/// #map1 = affine_map<(d0, d1) -> (d0, d1)>
+/// #map2 = affine_map<(d0, d1) -> ()>
+/// [...]
+/// %0 = linalg.generic {indexing_maps = [#map, #map1, #map2],
+///                      iterator_types = ["reduction", "reduction"]}
+///      ins(%arg0, %arg1 : tensor<?x?xf16>, tensor<?x?xf16>)
+///      outs(%arg2 : tensor<f16>)
+/// [...]
+/// ```
+///
+/// d0 has reduction info (loopIndex = 0, operand = %arg0, operandDim = 1),
+/// d1 has reduction info (loopIndex = 1, operand = %arg0, operandDim = 0).
+namespace {
+struct ReductionDimInfo {
+public:
+  unsigned loopIndex;
+  unsigned operandDim;
+  Value operand;
+};
+} // namespace
+
+static FailureOr<SmallVector<ReductionDimInfo>>
+getReductionInfo(linalg::LinalgOp linalgOp) {
+  SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray();
+  SmallVector<utils::IteratorType> iteratorTypes =
+      linalgOp.getIteratorTypesArray();
+
+  unsigned numReductionDims =
+      llvm::count(iteratorTypes, utils::IteratorType::reduction);
+  SmallVector<ReductionDimInfo> reductionDimInfos;
+  reductionDimInfos.reserve(numReductionDims);
+
+  for (unsigned loopIdx = 0, endIdx = iteratorTypes.size(); loopIdx < endIdx;
+       ++loopIdx) {
+    if (!linalg::isReductionIterator(iteratorTypes[loopIdx]))
+      continue;
+
+    AffineExpr loopIdxExpr = getAffineDimExpr(loopIdx, linalgOp.getContext());
+    for (auto [operandIndex, indexingMap] : llvm::enumerate(indexingMaps)) {
+      if (std::optional<unsigned> index =
+              indexingMap.getResultPosition(loopIdxExpr)) {
+        Value operand = linalgOp->getOperand(operandIndex);
+        reductionDimInfos.push_back({loopIdx, index.value(), operand});
+        break;
+      }
+    }
+  }
+
+  if (reductionDimInfos.size() != numReductionDims)
+    return failure();
+
+  return reductionDimInfos;
+}
+
 static LogicalResult applyPaddingLevel(RewriterBase &rewriter,
                                        TilingInterface tilingInterfaceOp,
                                        IREE::GPU::TilingLevel tilingLevel) {
+  // 1.a. Get padding values. The default should be poison, instead of 0.
+  //
+  // TODO(newling) pad with poison. Requires
+  //
+  // https://github.com/iree-org/iree/pull/21573
+  // https://github.com/llvm/llvm-project/pull/152003
+  // https://github.com/iree-org/iree/issues/21575
+  //
+  // The linalg operations can be padded with any value because we rewrite the
+  // basic block to select the reduction identity for the yielded value if the
+  // index corresponds to the padded part of the tensor.
+  //
+  // Non-linalg operations require special handling.
+  // TODO: Extract the special handling into an upstream PaddingOpInterface.

-  // 1.a. Get padding values.
   SmallVector<Attribute> paddingValues;
   for (Value operand : tilingInterfaceOp.getOperation()->getOperands()) {
     paddingValues.push_back(
         rewriter.getZeroAttr(getElementTypeOrSelf(operand.getType())));
   }

-  // 1.b. Special adjustment for OnlineAttention mask padding that needs to be
-  // mindful of softmax and pad to -inf.
-  // TODO: Extract into an upstream PaddingOpInterface.
+  // 1.b. Special adjustment for OnlineAttention mask padding.
   if (auto onlineAttentionOp = dyn_cast<IREE::LinalgExt::OnlineAttentionOp>(
           tilingInterfaceOp.getOperation())) {
     TypedValue<ShapedType> mask = onlineAttentionOp.getMask();
@@ -108,10 +182,11 @@ static LogicalResult applyPaddingLevel(RewriterBase &rewriter,
       getAsIndexOpFoldResult(rewriter.getContext(), tileSizes);

   // 3. Set options.
-  auto options = linalg::PadTilingInterfaceOptions()
-                     .setPaddingSizes(padSizes)
-                     .setPaddingValues(paddingValues)
-                     .setPadToMultipleOf(true);
+  linalg::PadTilingInterfaceOptions options =
+      linalg::PadTilingInterfaceOptions()
+          .setPaddingSizes(padSizes)
+          .setPaddingValues(paddingValues)
+          .setPadToMultipleOf(true);

   LLVM_DEBUG(DBGS() << "Start padding " << *tilingInterfaceOp << "\n";
              DBGS() << "--with tile sizes: "
@@ -121,6 +196,19 @@ static LogicalResult applyPaddingLevel(RewriterBase &rewriter,
              DBGS() << "--with padToMultipleOf: " << options.padToMultipleOf
                     << "\n");

+  // For linalg ops, we will rewrite the basic block in a way that means padded
+  // parts of tensors are never read. This is useful to avoid inferring what
+  // padding values should be for non-trivial basic blocks.
+  FailureOr<SmallVector<ReductionDimInfo>> reductionDimInfo;
+  if (auto linalgOp =
+          dyn_cast<linalg::LinalgOp>(tilingInterfaceOp.getOperation())) {
+    reductionDimInfo = getReductionInfo(linalgOp);
+    if (failed(reductionDimInfo)) {
+      tilingInterfaceOp.emitWarning("failed to map reduction dimensions");
+      return failure();
+    }
+  }
+
   // 4. Pad.
   SmallVector<tensor::PadOp> padOps;
   FailureOr<TilingInterface> maybePaddedOp =
@@ -130,9 +218,81 @@ static LogicalResult applyPaddingLevel(RewriterBase &rewriter,
     return failure();
   }

-  // 5. For each PadOp, create a linalg::CopyOp to allow dim propagations.
   TilingInterface paddedOp = *maybePaddedOp;
-  for (auto padOp : padOps) {
+
+  if (auto paddedLinalgOp =
+          dyn_cast<linalg::LinalgOp>(paddedOp.getOperation())) {
+    Block *block = paddedLinalgOp.getBlock();
+
+    SmallVector<Operation *> reductions;
+    for (auto [index, initOpOperand] :
+         llvm::enumerate(paddedLinalgOp.getDpsInitsMutable())) {
+      SmallVector<Operation *> combinerOps;
+      matchReduction(paddedLinalgOp.getRegionOutputArgs(), index, combinerOps);
+      reductions.insert(reductions.begin(), combinerOps.begin(),
+                        combinerOps.end());
+    }
+
+    for (Operation *reduction : reductions) {
+      std::optional<TypedAttr> reductionIdentity =
+          arith::getNeutralElement(reduction);
+      if (!reductionIdentity.has_value()) {
+        paddedOp.emitWarning("failed to get neutral element for reduction");
+        return failure();
+      }
+
+      // Get the sizes of the reduction dimensions before padding:
+      rewriter.setInsertionPoint(paddedOp.getOperation());
+      SmallVector<std::pair<unsigned, Value>> reductionDimSizes;
+      assert(succeeded(reductionDimInfo) &&
+             "obtained with confirmation earlier");
+      for (auto &&dimInfo : reductionDimInfo.value()) {
+        Value redDimSize = rewriter.create<tensor::DimOp>(
+            paddedOp.getLoc(), dimInfo.operand, dimInfo.operandDim);
+        reductionDimSizes.push_back({dimInfo.loopIndex, redDimSize});
+      }
+
+      // Add a check within the block to see if the current iteration over the
+      // loops is inside or outside the padded part of the iteration space.
+      rewriter.setInsertionPoint(reduction);
+      SmallVector<Value> conds;
+      for (auto &&[redDim, redDimSize] : reductionDimSizes) {
+        Value redDimIndex =
+            linalg::IndexOp::create(rewriter, paddedOp.getLoc(), redDim);
+        Value cond = arith::CmpIOp::create(rewriter, paddedOp.getLoc(),
+                                           arith::CmpIPredicate::ult,
+                                           redDimIndex, redDimSize);
+        conds.push_back(cond);
+      }
+      Value reductionIdentityValue = rewriter.create<arith::ConstantOp>(
+          paddedOp.getLoc(), reductionIdentity.value());
+      assert(conds.size() > 0);
+      Value cond = conds[0];
+      for (Value nxtCond : llvm::drop_begin(conds, 1)) {
+        cond = rewriter.create<arith::AndIOp>(paddedOp.getLoc(), cond, nxtCond);
+      }
+
+      // Find the reduction op operand that is reduced with the carried output.
+      if (reduction->getNumOperands() != 2) {
+        paddedOp.emitWarning("expected a reduction operation with 2 operands");
+        return failure();
+      }
+      Value carry = block->getArguments().back();
+      unsigned uncarryIndex = reduction->getOperand(0) == carry ? 1 : 0;
+      Value uncarried = reduction->getOperand(uncarryIndex);
+
+      // Select the reduction identity value if in the padding region.
+      Value selected = arith::SelectOp::create(
+          rewriter, paddedOp.getLoc(), cond, uncarried, reductionIdentityValue);
+      IRMapping mapping;
+      mapping.map(reduction->getOperand(uncarryIndex), selected);
+      Operation *redClone = rewriter.clone(*reduction, mapping);
+      rewriter.replaceOp(reduction, redClone);
+    }
+  }
+
+  // 5. For each PadOp, create a linalg::CopyOp to allow dim propagations.
+  for (tensor::PadOp padOp : padOps) {
     OpBuilder::InsertionGuard g(rewriter);
     rewriter.setInsertionPointAfter(padOp);

```
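When an op has more than one reduction dimension, the loop over `reductionDimSizes` above emits one `ult` bound check per reduction loop and folds them together with `arith.andi` before the single `arith.select`. A hypothetical body sketch for a 2-d sum-reduction (`%in`/`%acc` are the block arguments; `%sz0`/`%sz1` are `tensor.dim` values taken from the operand before padding; all names assumed):

```mlir
%i0  = linalg.index 0 : index
%i1  = linalg.index 1 : index
%c0  = arith.cmpi ult, %i0, %sz0 : index
%c1  = arith.cmpi ult, %i1, %sz1 : index
%ok  = arith.andi %c0, %c1 : i1   // inside the unpadded region on both loops?
%id  = arith.constant 0.0 : f16   // the addf neutral element
%sel = arith.select %ok, %in, %id : f16
%sum = arith.addf %sel, %acc : f16
linalg.yield %sum : f16
```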
compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel

Lines changed: 2 additions & 1 deletion
```diff
@@ -21,7 +21,8 @@ iree_lit_test_suite(
             "decompose_horizontally_fused_gemms.mlir",
             "gpu_alloc_private_memory_for_dps_ops.mlir",
             "gpu_apply_derived_thread_config.mlir",
-            "gpu_apply_padding.mlir",
+            "gpu_apply_padding_online_attention.mlir",
+            "gpu_apply_padding_partial_reduction.mlir",
             "gpu_apply_tiling_level.mlir",
             "gpu_bubble_resource_casts.mlir",
             "gpu_check_resource_usage.mlir",
```

compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
```diff
@@ -17,7 +17,8 @@ iree_lit_test_suite(
     "decompose_horizontally_fused_gemms.mlir"
     "gpu_alloc_private_memory_for_dps_ops.mlir"
     "gpu_apply_derived_thread_config.mlir"
-    "gpu_apply_padding.mlir"
+    "gpu_apply_padding_online_attention.mlir"
+    "gpu_apply_padding_partial_reduction.mlir"
    "gpu_apply_tiling_level.mlir"
     "gpu_bubble_resource_casts.mlir"
     "gpu_check_resource_usage.mlir"
```
