diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index a13ad33df29cd..5c63ad5f32b71 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -74,13 +74,15 @@ void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
 /// `subgroupSize` lanes. Applicable only to AMD GPUs.
 void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
                                                  unsigned subgroupSize,
+                                                 unsigned shuffleBitwidth,
                                                  amdgpu::Chipset chipset,
                                                  PatternBenefit benefit = 1);
 
 /// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns`
 /// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
 void populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
-    RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset,
     PatternBenefit benefit = 1);
 
 /// Collect all patterns to rewrite ops within the GPU dialect.
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
index 073493971e6b7..a55f0e1f09a36 100644
--- a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
+++ b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
@@ -29,6 +29,8 @@ class LaunchOp;
 
 /// Returns the matching vector combining kind.
 vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode);
+/// Returns the matching gpu allreduce mode.
+gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind);
 } // namespace gpu
 
 /// Get a gpu.func created from outlining the region of a gpu.launch op with the
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
new file mode 100644
index 0000000000000..f766dab8c02df
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
@@ -0,0 +1,41 @@
+//===- ReductionUtils.h - Reduction Utilities -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
+#define MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+
+/// Describes the cluster of lanes a `gpu.subgroup_reduce` op reduces over.
+struct ClusterInfo {
+  unsigned clusterStride;
+  unsigned clusterSize;
+  unsigned subgroupSize;
+};
+
+/// Validates the cluster attributes of `op` against `subgroupSize` and
+/// returns the effective cluster parameters.
+FailureOr<ClusterInfo> getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+                                                 unsigned subgroupSize);
+
+/// Emits a subgroup reduction over `input` using AMDGPU DPP ops. `packFn` and
+/// `unpackFn` convert between the reduction type and the native shuffle type.
+FailureOr<Value>
+createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
+                           Value input, gpu::AllReduceOperation mode,
+                           const ClusterInfo &ci, amdgpu::Chipset chipset,
+                           function_ref<Value(Value)> packFn,
+                           function_ref<Value(Value)> unpackFn);
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 013311ec027da..1074760aa959e 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
   LINK_LIBS PUBLIC
   MLIRAffineUtils
+  MLIRAMDGPUDialect
   MLIRArithDialect
   MLIRAsyncDialect
   MLIRBufferizationDialect
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index f2fc9a4e39bcd..57af63cbe5eca 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -148,34 +149,34 @@ struct ScalarizeSingleElementReduce final
   }
 };
 
-struct ClusterInfo {
-  unsigned clusterStride;
-  unsigned clusterSize;
-  unsigned subgroupSize;
-};
-
-static FailureOr<ClusterInfo>
-getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
-  assert(llvm::isPowerOf2_32(subgroupSize));
-
-  std::optional<uint32_t> clusterSize = op.getClusterSize();
-  assert(!clusterSize ||
-         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
-  if (clusterSize && *clusterSize > subgroupSize)
-    return op.emitOpError()
-           << "cluster size " << *clusterSize
-           << " is greater than subgroup size " << subgroupSize;
-  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
-
-  auto clusterStride = op.getClusterStride();
-  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
-  if (clusterStride >= subgroupSize)
-    return op.emitOpError()
-           << "cluster stride " << clusterStride
-           << " is not less than subgroup size " << subgroupSize;
-
-  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
-}
 
 /// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn`
 /// and `unpackFn` to convert to the native shuffle type and to the reduction
@@ -367,113 +368,113 @@ struct VectorSubgroupReduceToShuffles final
   bool matchClustered = false;
 };
 
-FailureOr<Value>
-createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
-                           Value input, gpu::AllReduceOperation mode,
-                           const ClusterInfo &ci, amdgpu::Chipset chipset) {
-  Location loc = op.getLoc();
-  Value dpp;
-  Value res = input;
-  constexpr int allRows = 0xf;
-  constexpr int allBanks = 0xf;
-  const bool boundCtrl = true;
-  if (ci.clusterSize >= 2) {
-    // Perform reduction between all lanes N <-> N+1.
-    dpp = rewriter.create<amdgpu::DPPOp>(
-        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-        rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-
-  if (ci.clusterSize >= 4) {
-    // Perform reduction between all lanes N <-> N+2.
-    dpp = rewriter.create<amdgpu::DPPOp>(
-        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-        rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-  if (ci.clusterSize >= 8) {
-    // Perform reduction between all lanes N <-> 7-N,
-    // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
-    dpp = rewriter.create<amdgpu::DPPOp>(
-        loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
-        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-  if (ci.clusterSize >= 16) {
-    // Perform reduction between all lanes N <-> 15-N,
-    // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
-    dpp = rewriter.create<amdgpu::DPPOp>(
-        loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
-        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-  if (ci.clusterSize >= 32) {
-    if (chipset.majorVersion <= 9) {
-      // Broadcast last value from each row to next row.
-      // Use row mask to avoid polluting rows 1 and 3.
-      dpp = rewriter.create<amdgpu::DPPOp>(
-          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
-          rewriter.getUnitAttr(), 0xa, allBanks,
-          /*bound_ctrl*/ false);
-      res = vector::makeArithReduction(
-          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
-    } else if (chipset.majorVersion <= 12) {
-      // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
-      Value uint32Max = rewriter.create<arith::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
-      dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
-                                                  uint32Max, uint32Max,
-                                                  /*fi=*/true,
-                                                  /*bound_ctrl=*/false);
-      res = vector::makeArithReduction(
-          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
-      if (ci.subgroupSize == 32) {
-        Value lane0 = rewriter.create<arith::ConstantOp>(
-            loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
-        res =
-            rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
-      }
-    } else {
-      return rewriter.notifyMatchFailure(
-          op, "Subgroup reduce lowering to DPP not currently supported for "
-              "this device.");
-    }
-  }
-  if (ci.clusterSize >= 64) {
-    if (chipset.majorVersion <= 9) {
-      // Broadcast 31st lane value to rows 2 and 3.
-      // Use row mask to avoid polluting rows 0 and 1.
-      dpp = rewriter.create<amdgpu::DPPOp>(
-          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31,
-          rewriter.getUnitAttr(), 0xc, allBanks,
-          /*bound_ctrl*/ false);
-
-    } else if (chipset.majorVersion <= 12) {
-      // Assume reduction across 32 lanes has been done.
-      // Perform final reduction manually by summing values in lane 0 and
-      // lane 32.
-      Value lane0 = rewriter.create<arith::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
-      Value lane32 = rewriter.create<arith::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
-      dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
-      res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
-    } else {
-      return rewriter.notifyMatchFailure(
-          op, "Subgroup reduce lowering to DPP not currently supported for "
-              "this device.");
-    }
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-  assert(res.getType() == input.getType());
-  return res;
-}
 
 /// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
 /// ops over scalar types. Assumes that the subgroup has
@@ -481,9 +482,9 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
 struct ScalarSubgroupReduceToDPP final
     : OpRewritePattern<gpu::SubgroupReduceOp> {
   ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
-                            bool matchClustered, amdgpu::Chipset chipset,
-                            PatternBenefit benefit)
+                            unsigned shuffleBitwidth, bool matchClustered,
+                            amdgpu::Chipset chipset, PatternBenefit benefit)
       : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
-        matchClustered(matchClustered), chipset(chipset) {}
+        shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered),
+        chipset(chipset) {}
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
@@ -505,12 +506,42 @@ struct ScalarSubgroupReduceToDPP final
                  "clusters of contiguous lanes.");
 
     Type valueTy = op.getType();
     if (!valueTy.isIntOrFloat())
       return rewriter.notifyMatchFailure(
           op, "value type is not a compatible scalar");
+    unsigned elemBitwidth = valueTy.getIntOrFloatBitWidth();
+    if (elemBitwidth > shuffleBitwidth)
+      return rewriter.notifyMatchFailure(
+          op, "value type is wider than the supported shuffle bitwidth");
+
+    Location loc = op.getLoc();
+    // Since this is already a native shuffle scalar, no packing is necessary.
+    if (elemBitwidth == shuffleBitwidth) {
+      auto identityFn = [](Value v) { return v; };
+      FailureOr<Value> dpp =
+          createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(),
+                                     *ci, chipset, identityFn, identityFn);
+      if (failed(dpp))
+        return failure();
+      rewriter.replaceOp(op, dpp.value());
+      return success();
+    }
+
+    auto shuffleIntType = rewriter.getIntegerType(shuffleBitwidth);
+    auto equivIntType = rewriter.getIntegerType(elemBitwidth);
+    auto packFn = [loc, &rewriter, equivIntType,
+                   shuffleIntType](Value unpackedVal) -> Value {
+      auto asInt =
+          rewriter.create<arith::BitcastOp>(loc, equivIntType, unpackedVal);
+      return rewriter.create<arith::ExtUIOp>(loc, shuffleIntType, asInt);
+    };
+    auto unpackFn = [loc, &rewriter, equivIntType,
+                     valueTy](Value packedVal) -> Value {
+      auto asInt =
+          rewriter.create<arith::TruncIOp>(loc, equivIntType, packedVal);
+      return rewriter.create<arith::BitcastOp>(loc, valueTy, asInt);
+    };
 
     FailureOr<Value> dpp = createSubgroupDPPReduction(
-        rewriter, op, op.getValue(), op.getOp(), *ci, chipset);
+        rewriter, op, op.getValue(), op.getOp(), *ci, chipset, packFn,
+        unpackFn);
     if (failed(dpp))
       return failure();
@@ -520,6 +551,7 @@ struct ScalarSubgroupReduceToDPP final
 private:
   unsigned subgroupSize = 0;
+  unsigned shuffleBitwidth = 0;
   bool matchClustered = false;
   amdgpu::Chipset chipset;
 };
@@ -534,19 +566,19 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
 }
 
 void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
-    RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
-    PatternBenefit benefit) {
-  patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
-                                          /*matchClustered=*/false, chipset,
-                                          benefit);
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/false, chipset, benefit);
 }
 
 void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
-    RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
-    PatternBenefit benefit) {
-  patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
-                                          /*matchClustered=*/true, chipset,
-                                          benefit);
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/true, chipset, benefit);
 }
 
 void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
diff --git a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
index 69094c518a159..e7489eaac4988 100644
--- a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
@@ -1,14 +1,17 @@
 add_mlir_dialect_library(MLIRGPUUtils
   Utils.cpp
   DistributionUtils.cpp
+  ReductionUtils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils
 
   LINK_LIBS PUBLIC
-  MLIRArithDialect
   MLIRAffineDialect
+  MLIRArithDialect
+  MLIRAMDGPUDialect
   MLIRGPUDialect
+  MLIRROCDLDialect
   MLIRSupport
   MLIRIR
   )
diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
new file mode 100644
index 0000000000000..2f50a1ec87cba
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
@@ -0,0 +1,171 @@
+//===- ReductionUtils.cpp - Subgroup Reduction Utilities ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities for lowering `gpu.subgroup_reduce` ops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+
+#include <cstdint>
+
+using namespace mlir;
+
+FailureOr<ClusterInfo>
+mlir::getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+                                unsigned subgroupSize) {
+  assert(llvm::isPowerOf2_32(subgroupSize));
+
+  std::optional<uint32_t> clusterSize = op.getClusterSize();
+  assert(!clusterSize ||
+         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+  if (clusterSize && *clusterSize > subgroupSize)
+    return op.emitOpError()
+           << "cluster size " << *clusterSize
+           << " is greater than subgroup size " << subgroupSize;
+  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+  auto clusterStride = op.getClusterStride();
+  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+  if (clusterStride >= subgroupSize)
+    return op.emitOpError()
+           << "cluster stride " << clusterStride
+           << " is not less than subgroup size " << subgroupSize;
+
+  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+FailureOr<Value> mlir::createSubgroupDPPReduction(
+    PatternRewriter &rewriter, gpu::SubgroupReduceOp op, Value input,
+    gpu::AllReduceOperation mode, const ClusterInfo &ci,
+    amdgpu::Chipset chipset, function_ref<Value(Value)> packFn,
+    function_ref<Value(Value)> unpackFn) {
+  Location loc = op.getLoc();
+  Value dpp;
+  Value res = input;
+  constexpr int allRows = 0xf;
+  constexpr int allBanks = 0xf;
+  const bool boundCtrl = true;
+  // `res` stays in the original reduction type; only the value routed through
+  // the DPP/permlane/readlane ops is packed into the shuffle type and unpacked
+  // again before combining.
+  if (ci.clusterSize >= 2) {
+    // Perform reduction between all lanes N <-> N+1.
+    Value packed = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, packed.getType(), packed, packed, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(
+        rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+  }
+
+  if (ci.clusterSize >= 4) {
+    // Perform reduction between all lanes N <-> N+2.
+    Value packed = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, packed.getType(), packed, packed, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(
+        rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+  }
+  if (ci.clusterSize >= 8) {
+    // Perform reduction between all lanes N <-> 7-N,
+    // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+    Value packed = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, packed.getType(), packed, packed,
+        amdgpu::DPPPerm::row_half_mirror, rewriter.getUnitAttr(), allRows,
+        allBanks, boundCtrl);
+    res = vector::makeArithReduction(
+        rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+  }
+  if (ci.clusterSize >= 16) {
+    // Perform reduction between all lanes N <-> 15-N,
+    // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+    Value packed = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, packed.getType(), packed, packed, amdgpu::DPPPerm::row_mirror,
+        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(
+        rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+  }
+  if (ci.clusterSize >= 32) {
+    if (chipset.majorVersion <= 9) {
+      // Broadcast last value from each row to next row.
+      // Use row mask to avoid polluting rows 1 and 3.
+      Value packed = packFn(res);
+      dpp = rewriter.create<amdgpu::DPPOp>(
+          loc, packed.getType(), packed, packed, amdgpu::DPPPerm::row_bcast_15,
+          rewriter.getUnitAttr(), 0xa, allBanks,
+          /*bound_ctrl*/ false);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+    } else if (chipset.majorVersion <= 12) {
+      // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+      Value uint32Max = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
+      Value packed = packFn(res);
+      dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, packed.getType(),
+                                                  packed, packed, uint32Max,
+                                                  uint32Max,
+                                                  /*fi=*/true,
+                                                  /*bound_ctrl=*/false);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+      if (ci.subgroupSize == 32) {
+        Value lane0 = rewriter.create<arith::ConstantOp>(
+            loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+        packed = packFn(res);
+        res = unpackFn(rewriter.create<ROCDL::ReadlaneOp>(
+            loc, packed.getType(), packed, lane0));
+      }
+    } else {
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reduce lowering to DPP not currently supported for "
+              "this device.");
+    }
+  }
+  if (ci.clusterSize >= 64) {
+    if (chipset.majorVersion <= 9) {
+      // Broadcast 31st lane value to rows 2 and 3.
+      // Use row mask to avoid polluting rows 0 and 1.
+      Value packed = packFn(res);
+      dpp = rewriter.create<amdgpu::DPPOp>(
+          loc, packed.getType(), packed, packed, amdgpu::DPPPerm::row_bcast_31,
+          rewriter.getUnitAttr(), 0xc, allBanks,
+          /*bound_ctrl*/ false);
+      dpp = unpackFn(dpp);
+    } else if (chipset.majorVersion <= 12) {
+      // Assume reduction across 32 lanes has been done.
+      // Perform final reduction manually by summing values in lane 0 and
+      // lane 32.
+      Value lane0 = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+      Value lane32 = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
+      Value packed = packFn(res);
+      dpp = unpackFn(rewriter.create<ROCDL::ReadlaneOp>(loc, packed.getType(),
+                                                        packed, lane32));
+      res = unpackFn(rewriter.create<ROCDL::ReadlaneOp>(loc, packed.getType(),
+                                                        packed, lane0));
+    } else {
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reduce lowering to DPP not currently supported for "
+              "this device.");
+    }
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  assert(res.getType() == input.getType());
+  return res;
+}
diff --git a/mlir/lib/Dialect/GPU/Utils/Utils.cpp b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
index 1f09875b3e273..53b1e0883055c 100644
--- a/mlir/lib/Dialect/GPU/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
@@ -41,4 +41,30 @@ vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode) {
   llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
 }
 
+gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind) {
+  switch (kind) {
+#define MAP_CASE(X)                                                            \
+  case vector::CombiningKind::X:                                               \
+    return gpu::AllReduceOperation::X
+
+    MAP_CASE(ADD);
+    MAP_CASE(MUL);
+    MAP_CASE(MINUI);
+    MAP_CASE(MINSI);
+    MAP_CASE(MINNUMF);
+    MAP_CASE(MAXSI);
+    MAP_CASE(MAXUI);
+    MAP_CASE(MAXNUMF);
+    MAP_CASE(AND);
+    MAP_CASE(OR);
+    MAP_CASE(XOR);
+    MAP_CASE(MINIMUMF);
+    MAP_CASE(MAXIMUMF);
+
+#undef MAP_CASE
+  }
+
+  llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
+}
+
 } // namespace mlir::gpu
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index fe402da4cc105..4ebcf897fd532 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -93,9 +93,9 @@ struct TestGpuSubgroupReduceLoweringPass
     auto maybeChipset = amdgpu::Chipset::parse(target);
     if (succeeded(maybeChipset)) {
       populateGpuLowerSubgroupReduceToDPPPatterns(
-          patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+          patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset,
+          PatternBenefit(2));
       populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
-          patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+          patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset,
+          PatternBenefit(2));
     }
     populateGpuLowerSubgroupReduceToShufflePatterns(
         patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
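Example registration of the widened entry points (illustrative sketch only, not part of the patch; it mirrors the TestGpuRewrite.cpp change above, and the enclosing helper name is hypothetical):

// Sketch: how a downstream pass might register the DPP lowering patterns with
// the new shuffleBitwidth parameter. Assumes an AMD gfx9/gfx10/gfx11/gfx12
// target string such as "gfx942"; `populateMyReducePatterns` is illustrative.
#include "llvm/ADT/StringRef.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

static void populateMyReducePatterns(RewritePatternSet &patterns,
                                     llvm::StringRef chipsetName) {
  FailureOr<amdgpu::Chipset> chipset = amdgpu::Chipset::parse(chipsetName);
  if (failed(chipset))
    return;
  // Scalars narrower than 32 bits (e.g. f16) are packed into the 32-bit
  // shuffle type by the pattern's packFn/unpackFn around each DPP step.
  populateGpuLowerSubgroupReduceToDPPPatterns(patterns, /*subgroupSize=*/64,
                                              /*shuffleBitwidth=*/32, *chipset,
                                              PatternBenefit(2));
  populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
      patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *chipset,
      PatternBenefit(2));
}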