
Conversation

@Groverkss (Member) commented Jul 2, 2025

This patch adds a better maskedload/maskedstore lowering in the AMDGPU backend for loads that are either fully masked or fully unmasked. For these cases, we can generate an oob buffer load with no if condition when the source is in the fat_raw_buffer address space, or a normal load guarded by an if condition when it is not.
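
To make the fast path concrete, here is a sketch of the transformation on a fat_raw_buffer memref, adapted from the tests in this patch. The lowered form is my hedged rendering: it assumes createVectorLoadForMaskedLoad emits a plain vector.load followed by an arith.select on the mask, which is what the existing CHECK lines in maskedload-to-load.mlir suggest.

```mlir
// Before: a maskedload whose mask is an arith.select between all-true and
// all-false, i.e. the whole vector is either fully loaded or fully skipped.
func.func @example(%mem : memref<8x8xf16, #amdgpu.address_space<fat_raw_buffer>>,
                   %idx : index, %cond : i1, %passthru : vector<4xf16>) -> vector<4xf16> {
  %true = arith.constant dense<true> : vector<4xi1>
  %false = arith.constant dense<false> : vector<4xi1>
  %mask = arith.select %cond, %true, %false : vector<4xi1>
  %res = vector.maskedload %mem[%idx, %idx], %mask, %passthru
      : memref<8x8xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xi1>, vector<4xf16> into vector<4xf16>
  return %res : vector<4xf16>
}

// After (sketch): no branch is needed, because oob lanes of a fat raw buffer
// load read back as zero, so issuing the load unconditionally is safe. The
// pass-through lanes are reinstated by a select on the mask:
//   %load = vector.load %mem[%idx, %idx]
//       : memref<8x8xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf16>
//   %res  = arith.select %mask, %load, %passthru : vector<4xi1>, vector<4xf16>
```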

@llvmbot (Member) commented Jul 2, 2025

@llvm/pr-subscribers-mlir-gpu
@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-amdgpu

Author: Kunwar Grover (Groverkss)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/146748.diff

2 Files Affected:

  • (modified) mlir/lib/Dialect/AMDGPU/Transforms/MaskedloadToLoad.cpp (+94-1)
  • (modified) mlir/test/Dialect/AMDGPU/maskedload-to-load.mlir (+25)
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/MaskedloadToLoad.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/MaskedloadToLoad.cpp
index 9a368f372c296..b290dc46910e3 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/MaskedloadToLoad.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/MaskedloadToLoad.cpp
@@ -61,6 +61,36 @@ static Value createVectorLoadForMaskedLoad(OpBuilder &builder, Location loc,
   return res;
 }
 
+/// Check if the given value comes from a:
+///
+/// arith.select %cond, TRUE/FALSE, TRUE/FALSE
+///
+/// i.e. the condition is either always true or always false.
+///
+/// Returns the condition to use for scf.if (condition) { true } else { false }.
+static FailureOr<Value> matchFullSelect(OpBuilder &b, Value val) {
+  auto selectOp = val.getDefiningOp<arith::SelectOp>();
+  if (!selectOp)
+    return failure();
+  std::optional<int64_t> trueInt = getConstantIntValue(selectOp.getTrueValue());
+  std::optional<int64_t> falseInt =
+      getConstantIntValue(selectOp.getFalseValue());
+  if (!trueInt || !falseInt)
+    return failure();
+  // getConstantIntValue returns -1 for "true" for bools.
+  if (trueInt.value() == -1 && falseInt.value() == 0)
+    return selectOp.getCondition();
+
+  if (trueInt.value() == 0 && falseInt.value() == -1) {
+    Value cond = selectOp.getCondition();
+    Value one = b.create<arith::ConstantIntOp>(cond.getLoc(), /*value=*/true,
+                                               /*width=*/1);
+    Value inverse = b.create<arith::XOrIOp>(cond.getLoc(), cond, one);
+    return inverse;
+  }
+  return failure();
+}
+
 static constexpr char kMaskedloadNeedsMask[] =
     "amdgpu.buffer_maskedload_needs_mask";
 
@@ -78,6 +108,16 @@ struct MaskedLoadLowering final : OpRewritePattern<vector::MaskedLoadOp> {
       return failure();
     }
 
+    // Check if this is either a fully inbounds load or a fully oob load. If
+    // so, take the fast path and don't generate an if condition, because we
+    // know doing the oob load is always safe.
+    if (succeeded(matchFullSelect(rewriter, maskedOp.getMask()))) {
+      Value load =
+          createVectorLoadForMaskedLoad(rewriter, maskedOp.getLoc(), maskedOp);
+      rewriter.replaceOp(maskedOp, load);
+      return success();
+    }
+
     Location loc = maskedOp.getLoc();
     Value src = maskedOp.getBase();
 
@@ -148,11 +188,64 @@ struct MaskedLoadLowering final : OpRewritePattern<vector::MaskedLoadOp> {
   }
 };
 
+struct FullMaskedLoadToConditionalLoad
+    : OpRewritePattern<vector::MaskedLoadOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+public:
+  LogicalResult matchAndRewrite(vector::MaskedLoadOp loadOp,
+                                PatternRewriter &rewriter) const override {
+    FailureOr<Value> maybeCond = matchFullSelect(rewriter, loadOp.getMask());
+    if (failed(maybeCond)) {
+      return failure();
+    }
+
+    Value cond = maybeCond.value();
+    auto trueBuilder = [&](OpBuilder &builder, Location loc) {
+      Value res = createVectorLoadForMaskedLoad(builder, loc, loadOp);
+      rewriter.create<scf::YieldOp>(loc, res);
+    };
+    auto falseBuilder = [&](OpBuilder &builder, Location loc) {
+      rewriter.create<scf::YieldOp>(loc, loadOp.getPassThru());
+    };
+    auto ifOp = rewriter.create<scf::IfOp>(loadOp.getLoc(), cond, trueBuilder,
+                                           falseBuilder);
+    rewriter.replaceOp(loadOp, ifOp);
+    return success();
+  }
+};
+
+struct FullMaskedStoreToConditionalStore
+    : OpRewritePattern<vector::MaskedStoreOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+public:
+  LogicalResult matchAndRewrite(vector::MaskedStoreOp storeOp,
+                                PatternRewriter &rewriter) const override {
+    FailureOr<Value> maybeCond = matchFullSelect(rewriter, storeOp.getMask());
+    if (failed(maybeCond)) {
+      return failure();
+    }
+    Value cond = maybeCond.value();
+
+    auto trueBuilder = [&](OpBuilder &builder, Location loc) {
+      rewriter.create<vector::StoreOp>(loc, storeOp.getValueToStore(),
+                                       storeOp.getBase(), storeOp.getIndices());
+      rewriter.create<scf::YieldOp>(loc);
+    };
+    auto ifOp = rewriter.create<scf::IfOp>(storeOp.getLoc(), cond, trueBuilder);
+    rewriter.replaceOp(storeOp, ifOp);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::amdgpu::populateAmdgpuMaskedloadToLoadPatterns(
     RewritePatternSet &patterns, PatternBenefit benefit) {
-  patterns.add<MaskedLoadLowering>(patterns.getContext(), benefit);
+  patterns.add<MaskedLoadLowering, FullMaskedLoadToConditionalLoad,
+               FullMaskedStoreToConditionalStore>(patterns.getContext(),
+                                                  benefit);
 }
 
 struct AmdgpuMaskedloadToLoadPass final
diff --git a/mlir/test/Dialect/AMDGPU/maskedload-to-load.mlir b/mlir/test/Dialect/AMDGPU/maskedload-to-load.mlir
index febe46bf7a759..d6682ba14eeca 100644
--- a/mlir/test/Dialect/AMDGPU/maskedload-to-load.mlir
+++ b/mlir/test/Dialect/AMDGPU/maskedload-to-load.mlir
@@ -114,3 +114,28 @@ func.func @transfer_scalar(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_
 // CHECK: %[[IF:.*]] = scf.if
 // CHECK: %[[LOAD:.*]] = vector.load %[[ARG0]][%[[ARG1]], %[[ARG1]]]
 // CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[ARG3]]
+
+// -----
+
+func.func @transfer_to_maskedload_fatrawbuffer(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, %idx : index, %mask : vector<4xi1>, %passthru : vector<4xf32>) -> vector<4xf32> {
+  %res = vector.maskedload %mem[%idx, %idx], %mask, %passthru : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xi1>, vector<4xf32> into vector<4xf32>
+  return %res : vector<4xf32>
+}
+
+// -----
+
+func.func @full_select_maskedload_fatrawbuffer_to_load(%mem : memref<8x8xf16, #amdgpu.address_space<fat_raw_buffer>>, %idx : index, %cond : i1, %passthru : vector<4xf16>) -> vector<4xf16> {
+  %true = arith.constant dense<true> : vector<4xi1>
+  %false = arith.constant dense<false> : vector<4xi1>
+  %mask = arith.select %cond, %true, %false : vector<4xi1>
+  %res = vector.maskedload %mem[%idx, %idx], %mask, %passthru : memref<8x8xf16, #amdgpu.address_space<fat_raw_buffer>>, vector<4xi1>, vector<4xf16> into vector<4xf16>
+  return %res : vector<4xf16>
+}
+
+func.func @full_select_maskedload_to_load(%mem : memref<8x8xf16>, %idx : index, %cond : i1, %passthru : vector<4xf16>) -> vector<4xf16> {
+  %true = arith.constant dense<true> : vector<4xi1>
+  %false = arith.constant dense<false> : vector<4xi1>
+  %mask = arith.select %cond, %true, %false : vector<4xi1>
+  %res = vector.maskedload %mem[%idx, %idx], %mask, %passthru : memref<8x8xf16>, vector<4xi1>, vector<4xf16> into vector<4xf16>
+  return %res : vector<4xf16>
+}
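
For completeness, a hedged sketch of what the two new patterns are expected to produce when the memref is not in the fat_raw_buffer address space (again my own rendering, under the same assumption about createVectorLoadForMaskedLoad). The full-select mask's scalar condition becomes the scf.if condition; when the select's arms are swapped, matchFullSelect first inverts the condition by xor'ing it with true.

```mlir
// FullMaskedLoadToConditionalLoad on @full_select_maskedload_to_load above:
%res = scf.if %cond -> (vector<4xf16>) {
  // Fully unmasked: do an ordinary load; the select on the mask folds later.
  %load = vector.load %mem[%idx, %idx] : memref<8x8xf16>, vector<4xf16>
  %sel = arith.select %mask, %load, %passthru : vector<4xi1>, vector<4xf16>
  scf.yield %sel : vector<4xf16>
} else {
  // Fully masked: nothing is read; the pass-through vector is the result.
  scf.yield %passthru : vector<4xf16>
}

// FullMaskedStoreToConditionalStore guards a plain store the same way
// (%val here is a hypothetical value to store):
scf.if %cond {
  vector.store %val, %mem[%idx, %idx] : memref<8x8xf16>, vector<4xf16>
}
```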

@llvmbot (Member) commented Jul 2, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Kunwar Grover (Groverkss)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/146748.diff


@Groverkss Groverkss force-pushed the fullselect-masked-load branch from da495c7 to 86115e7 on July 10, 2025 at 11:28
@Groverkss Groverkss requested a review from kuhar July 10, 2025 11:28
@Groverkss Groverkss changed the title [mlir][AMDGPU] Add better load/store lowering for full select mask [mlir][AMDGPU] Add better load/store lowering for full mask Jul 10, 2025
@Groverkss Groverkss merged commit f964922 into llvm:main Jul 10, 2025
9 checks passed