Rewrites pattern to be closer to device lib impl.

Muzammiluddin-Syed-ECE · Muzammiluddin-Syed-ECE · commit df55f2001906 · 2025-04-11T10:35:21.000-05:00
Signed-off-by: Muzammiluddin Syed &lt;muzasyed@amd.com&gt;
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -62,6 +62,13 @@ void populateGpuLowerSubgroupReduceToShufflePatterns(
     RewritePatternSet &patterns, unsigned subgroupSize,
     unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
 
+/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
+/// ops over scalar types. Assumes that the subgroup has
+/// `subgroupSize` lanes. Applicable only to AMD GPUs.
+void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
+                                                 unsigned subgroupSize,
+                                                 PatternBenefit benefit = 1);
+
 /// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
 /// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
 void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp
@@ -67,7 +67,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shr, permArg);
+                                amdgpu::DPPPerm::row_shl, permArg);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
@@ -76,39 +76,41 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shr, permArg);
+                                amdgpu::DPPPerm::row_shl, permArg);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
 
   if (ci.clusterSize >= 8) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 4);
-    Value dppResult =
-        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shr, permArg);
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+        b.getUnitAttr());
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
 
   if (ci.clusterSize >= 16) {
-    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 8);
     Value dppResult =
         b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
-                                amdgpu::DPPPerm::row_shr, permArg);
+                                amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
   }
 
   const int allRows = 0xf;
   const int allBanks = 0xf;
-
+  auto int32Type = IntegerType::get(b.getContext(), 32);
   if (ci.clusterSize >= 32) {
     auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
     Value dppResult = b.create<amdgpu::DPPOp>(
         loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
         b.getUnitAttr(), 0xa, allBanks, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
-                                        result, dppResult);
+    result, dppResult);
+    if (ci.subgroupSize == 32) {
+      Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+      result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);    
+    }
   }
 
   if (ci.clusterSize == 64) {
@@ -118,11 +120,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
         b.getUnitAttr(), allRows, allBanks, false);
     result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
                                         result, dppResult);
+    Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+    result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
   }
 
-  auto int32Type = IntegerType::get(b.getContext(), 32);
-  Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
-  result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
   assert(result.getType() == input.getType());
   return result;
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -12,6 +12,8 @@
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -362,6 +364,106 @@ struct VectorSubgroupReduceToShuffles final
   unsigned shuffleBitwidth = 0;
   bool matchClustered = false;
 };
+
+Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
+                                 gpu::AllReduceOperation mode,
+                                 const ClusterInfo &ci) {
+  Value result = input;
+  if (ci.clusterSize >= 2) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1);
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_shl, permArg);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 4) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2);
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_shl, permArg);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 8) {
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror,
+        b.getUnitAttr());
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  if (ci.clusterSize >= 16) {
+    Value dppResult =
+        b.create<amdgpu::DPPOp>(loc, result.getType(), result, result,
+                                amdgpu::DPPPerm::row_mirror, b.getUnitAttr());
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+  }
+
+  const int allRows = 0xf;
+  const int allBanks = 0xf;
+  auto int32Type = IntegerType::get(b.getContext(), 32);
+  if (ci.clusterSize >= 32) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15);
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15,
+        b.getUnitAttr(), 0xa, allBanks, false);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+    if (ci.subgroupSize == 32) {
+      Value lane01 = b.create<LLVM::ConstantOp>(loc, int32Type, 1);
+      result =
+          b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane01);
+    }
+  }
+
+  if (ci.clusterSize == 64) {
+    auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31);
+    Value dppResult = b.create<amdgpu::DPPOp>(
+        loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31,
+        b.getUnitAttr(), allRows, allBanks, false);
+    result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode),
+                                        result, dppResult);
+    Value lane63 = b.create<LLVM::ConstantOp>(loc, int32Type, 63);
+    result = b.create<ROCDL::ReadlaneOp>(loc, input.getType(), result, lane63);
+  }
+
+  assert(result.getType() == input.getType());
+  return result;
+}
+
+struct ScalarSubgroupReduceToDPP final
+    : OpRewritePattern<gpu::SubgroupReduceOp> {
+  ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
+                            bool matchClustered, PatternBenefit benefit)
+      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+        matchClustered(matchClustered) {}
+
+  LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getClusterSize().has_value() != matchClustered) {
+      return rewriter.notifyMatchFailure(
+          op, llvm::formatv("op is {0}clustered but pattern is configured to "
+                            "only match {1}clustered ops",
+                            matchClustered ? "non-" : "",
+                            matchClustered ? "" : "non-"));
+    }
+    auto ci = getAndValidateClusterInfo(op, subgroupSize);
+    if (failed(ci))
+      return failure();
+    Location loc = op.getLoc();
+    rewriter.replaceOp(op, createSubgroupDPPReduction(
+                               rewriter, loc, op.getValue(), op.getOp(), *ci));
+    return success();
+  }
+
+private:
+  unsigned subgroupSize = 0;
+  bool matchClustered = false;
+};
 } // namespace
 
 void mlir::populateGpuBreakDownSubgroupReducePatterns(
@@ -372,6 +474,13 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
   patterns.add<ScalarizeSingleElementReduce>(patterns.getContext(), benefit);
 }
 
+void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
+                                          /*matchClustered=*/true, benefit);
+}
+
 void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
     RewritePatternSet &patterns, unsigned subgroupSize,
     unsigned shuffleBitwidth, PatternBenefit benefit) {