diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index 5cc65082a7e56..5c63ad5f32b71 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_ #define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_ +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Utils/GPUUtils.h" #include "mlir/IR/PatternMatch.h" @@ -68,6 +69,22 @@ void populateGpuLowerClusteredSubgroupReduceToShufflePatterns( RewritePatternSet &patterns, unsigned subgroupSize, unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1); +/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp` +/// ops over scalar types. Assumes that the subgroup has +/// `subgroupSize` lanes. Applicable only to AMD GPUs. +void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns, + unsigned subgroupSize, + unsigned shuffleBitwidth, + amdgpu::Chipset chipset, + PatternBenefit benefit = 1); + +/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns` +/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`. +void populateGpuLowerClusteredSubgroupReduceToDPPPatterns( + RewritePatternSet &patterns, unsigned subgroupSize, + unsigned shuffleBitwidth, amdgpu::Chipset chipset, + PatternBenefit benefit = 1); + /// Collect all patterns to rewrite ops within the GPU dialect. inline void populateGpuRewritePatterns(RewritePatternSet &patterns) { populateGpuAllReducePatterns(patterns); diff --git a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h index 073493971e6b7..a55f0e1f09a36 100644 --- a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h +++ b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h @@ -29,6 +29,8 @@ class LaunchOp; /// Returns the matching vector combining kind. vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode); +/// Returns the matching gpu allreduce mode. +gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind); } // namespace gpu /// Get a gpu.func created from outlining the region of a gpu.launch op with the diff --git a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h new file mode 100644 index 0000000000000..f766dab8c02df --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h @@ -0,0 +1,41 @@ +//===- ReductionUtils.h - Reduction Utilities -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
+#define MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+
+struct ClusterInfo {
+  unsigned clusterStride;
+  unsigned clusterSize;
+  unsigned subgroupSize;
+};
+
+FailureOr<ClusterInfo> getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+                                                 unsigned subgroupSize);
+
+FailureOr<Value>
+createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
+                           Value input, gpu::AllReduceOperation mode,
+                           const ClusterInfo &ci, amdgpu::Chipset chipset,
+                           function_ref<Value(Value)> packFn,
+                           function_ref<Value(Value)> unpackFn);
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 013311ec027da..1074760aa959e 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
 
   LINK_LIBS PUBLIC
   MLIRAffineUtils
+  MLIRAMDGPUDialect
   MLIRArithDialect
   MLIRAsyncDialect
   MLIRBufferizationDialect
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 43eff3eddcc49..57af63cbe5eca 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -10,15 +10,20 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MathExtras.h"
 #include <cassert>
@@ -144,34 +149,34 @@ struct ScalarizeSingleElementReduce final
   }
 };
 
-struct ClusterInfo {
-  unsigned clusterStride;
-  unsigned clusterSize;
-  unsigned subgroupSize;
-};
-
-static FailureOr<ClusterInfo>
-getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
-  assert(llvm::isPowerOf2_32(subgroupSize));
-
-  std::optional<uint32_t> clusterSize = op.getClusterSize();
-  assert(!clusterSize ||
-         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
-  if (clusterSize && *clusterSize > subgroupSize)
-    return op.emitOpError()
-           << "cluster size " << *clusterSize
-           << " is greater than subgroup size " << subgroupSize;
-  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
-
-  auto clusterStride = op.getClusterStride();
-  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
-  if (clusterStride >= subgroupSize)
-    return op.emitOpError()
-           << "cluster stride " << clusterStride
-           << " is not less than subgroup size " << subgroupSize;
-
-  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
-}
 
 /// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn`
 /// and `unpackFn` to convert to the native shuffle type and to the reduction
@@ -362,6 +367,194 @@ struct VectorSubgroupReduceToShuffles final
   unsigned shuffleBitwidth = 0;
   bool matchClustered = false;
 };
+
+/// Lowers scalar `gpu.subgroup_reduce` ops to a sequence of `amdgpu.dpp` ops.
+/// Assumes that the subgroup has `subgroupSize` lanes. Applicable only to AMD
+/// GPUs.
+struct ScalarSubgroupReduceToDPP final
+    : OpRewritePattern<gpu::SubgroupReduceOp> {
+  ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
+                            unsigned shuffleBitwidth, bool matchClustered,
+                            amdgpu::Chipset chipset, PatternBenefit benefit)
+      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+        shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered),
+        chipset(chipset) {}
+
+  LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getClusterSize().has_value() != matchClustered) {
+      return rewriter.notifyMatchFailure(
+          op, llvm::formatv("op is {0}clustered but pattern is configured to "
+                            "only match {1}clustered ops",
+                            matchClustered ? "non-" : "",
+                            matchClustered ? "" : "non-"));
+    }
+    auto ci = getAndValidateClusterInfo(op, subgroupSize);
+    if (failed(ci))
+      return failure();
+
+    if (ci->clusterStride != 1)
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reductions using DPP are currently only available for "
+              "clusters of contiguous lanes.");
+
+    Type valueTy = op.getType();
+    unsigned elemBitwidth =
+        getElementTypeOrSelf(valueTy).getIntOrFloatBitWidth();
+    if (!valueTy.isIntOrFloat() || elemBitwidth > shuffleBitwidth)
+      return rewriter.notifyMatchFailure(
+          op, "value type is not a compatible scalar");
+
+    Location loc = op.getLoc();
+    // Since this is already a native shuffle scalar, no packing is necessary.
+    if (elemBitwidth == shuffleBitwidth) {
+      auto identityFn = [](Value v) { return v; };
+      FailureOr<Value> dpp =
+          createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(),
+                                     *ci, chipset, identityFn, identityFn);
+      if (failed(dpp))
+        return failure();
+      rewriter.replaceOp(op, dpp.value());
+      return success();
+    }
+
+    auto shuffleIntType = rewriter.getIntegerType(shuffleBitwidth);
+    auto equivIntType = rewriter.getIntegerType(elemBitwidth);
+    auto packFn = [loc, &rewriter, equivIntType,
+                   shuffleIntType](Value unpackedVal) -> Value {
+      auto asInt =
+          rewriter.create<arith::BitcastOp>(loc, equivIntType, unpackedVal);
+      return rewriter.create<arith::ExtUIOp>(loc, shuffleIntType, asInt);
+    };
+    auto unpackFn = [loc, &rewriter, equivIntType,
+                     valueTy](Value packedVal) -> Value {
+      auto asInt =
+          rewriter.create<arith::TruncIOp>(loc, equivIntType, packedVal);
+      return rewriter.create<arith::BitcastOp>(loc, valueTy, asInt);
+    };
+
+    FailureOr<Value> dpp =
+        createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(),
+                                   *ci, chipset, packFn, unpackFn);
+    if (failed(dpp))
+      return failure();
+
+    rewriter.replaceOp(op, dpp.value());
+    return success();
+  }
+
+private:
+  unsigned subgroupSize = 0;
+  unsigned shuffleBitwidth = 0;
+  bool matchClustered = false;
+  amdgpu::Chipset chipset;
+};
 } // namespace
 
 void mlir::populateGpuBreakDownSubgroupReducePatterns(
@@ -372,6 +565,22 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
   patterns.add<ScalarizeSingleElementReduce>(patterns.getContext(), benefit);
 }
 
+void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/false, chipset, benefit);
+}
+
+void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/true, chipset, benefit);
+}
+
 void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
     RewritePatternSet &patterns, unsigned subgroupSize,
    unsigned shuffleBitwidth, PatternBenefit benefit) {
diff --git a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
index 69094c518a159..e7489eaac4988 100644
--- a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
@@ -1,14 +1,17 @@
 add_mlir_dialect_library(MLIRGPUUtils
   Utils.cpp
   DistributionUtils.cpp
+  ReductionUtils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils
 
   LINK_LIBS PUBLIC
-  MLIRArithDialect
   MLIRAffineDialect
+  MLIRArithDialect
+  MLIRAMDGPUDialect
   MLIRGPUDialect
+  MLIRROCDLDialect
   MLIRSupport
   MLIRIR
   )
diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
new file mode 100644
index 0000000000000..2f50a1ec87cba
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
@@ -0,0 +1,171 @@
+//===- ReductionUtils.cpp - Reduction utilities for GPU ops --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements subgroup reduction utility methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+
+#include <cstdint>
+
+using namespace mlir;
+
+FailureOr<ClusterInfo>
+mlir::getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+                                unsigned subgroupSize) {
+  assert(llvm::isPowerOf2_32(subgroupSize));
+
+  std::optional<uint32_t> clusterSize = op.getClusterSize();
+  assert(!clusterSize ||
+         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+  if (clusterSize && *clusterSize > subgroupSize)
+    return op.emitOpError()
+           << "cluster size " << *clusterSize
+           << " is greater than subgroup size " << subgroupSize;
+  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+  auto clusterStride = op.getClusterStride();
+  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+  if (clusterStride >= subgroupSize)
+    return op.emitOpError()
+           << "cluster stride " << clusterStride
+           << " is not less than subgroup size " << subgroupSize;
+
+  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+FailureOr<Value> mlir::createSubgroupDPPReduction(
+    PatternRewriter &rewriter, gpu::SubgroupReduceOp op, Value input,
+    gpu::AllReduceOperation mode, const ClusterInfo &ci,
+    amdgpu::Chipset chipset, function_ref<Value(Value)> packFn,
+    function_ref<Value(Value)> unpackFn) {
+
+  Location loc = op.getLoc();
+  Value dpp;
+  Value res = input;
+  constexpr int allRows = 0xf;
+  constexpr int allBanks = 0xf;
+  const bool boundCtrl = true;
+  if (ci.clusterSize >= 2) {
+    // Perform reduction between all lanes N <-> N+1.
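+    // quad_perm([1, 0, 3, 2]) swaps lanes 0<->1 and 2<->3 within each group
+    // of four lanes, so every lane combines with its immediate neighbor.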
+    res = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+    dpp = unpackFn(dpp);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+
+  if (ci.clusterSize >= 4) {
+    // Perform reduction between all lanes N <-> N+2.
+    res = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+    dpp = unpackFn(dpp);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  if (ci.clusterSize >= 8) {
+    // Perform reduction between all lanes N <-> 7-N,
+    // e.g. lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+    res = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
+        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+    dpp = unpackFn(dpp);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  if (ci.clusterSize >= 16) {
+    // Perform reduction between all lanes N <-> 15-N,
+    // e.g. lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+    res = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
+        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+    dpp = unpackFn(dpp);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  if (ci.clusterSize >= 32) {
+    if (chipset.majorVersion <= 9) {
+      // Broadcast last value from each row to next row.
+      // Use row mask to avoid polluting rows 0 and 2.
+      res = packFn(res);
+      dpp = rewriter.create<amdgpu::DPPOp>(
+          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
+          rewriter.getUnitAttr(), 0xa, allBanks,
+          /*bound_ctrl*/ false);
+      dpp = unpackFn(dpp);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+    } else if (chipset.majorVersion <= 12) {
+      // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+      Value uint32Max = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
+      res = packFn(res);
+      dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+                                                  uint32Max, uint32Max,
+                                                  /*fi=*/true,
+                                                  /*bound_ctrl=*/false);
+      dpp = unpackFn(dpp);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+      if (ci.subgroupSize == 32) {
+        Value lane0 = rewriter.create<arith::ConstantOp>(
+            loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+        res =
+            rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+      }
+    } else {
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reduce lowering to DPP not currently supported for "
+              "this device.");
+    }
+  }
+  if (ci.clusterSize >= 64) {
+    if (chipset.majorVersion <= 9) {
+      // Broadcast 31st lane value to rows 2 and 3.
+      // Use row mask to avoid polluting rows 0 and 1.
+      res = packFn(res);
+      dpp = rewriter.create<amdgpu::DPPOp>(
+          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31,
+          rewriter.getUnitAttr(), 0xc, allBanks,
+          /*bound_ctrl*/ false);
+      dpp = unpackFn(dpp);
+
+    } else if (chipset.majorVersion <= 12) {
+      // Assume reduction across 32 lanes has been done.
+      // Perform final reduction manually by summing values in lane 0 and
+      // lane 32.
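+      // Each readlane below yields a wave-uniform value, so after the final
+      // combine every lane holds the reduction of all 64 lanes.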
+      Value lane0 = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+      Value lane32 = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
+      dpp =
+          rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
+      res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+    } else {
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reduce lowering to DPP not currently supported for "
+              "this device.");
+    }
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  assert(res.getType() == input.getType());
+  return res;
+}
diff --git a/mlir/lib/Dialect/GPU/Utils/Utils.cpp b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
index 1f09875b3e273..53b1e0883055c 100644
--- a/mlir/lib/Dialect/GPU/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
@@ -41,4 +41,30 @@ vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode) {
   llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
 }
 
+gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind) {
+  switch (kind) {
+#define MAP_CASE(X)                                                            \
+  case vector::CombiningKind::X:                                               \
+    return gpu::AllReduceOperation::X
+
+    MAP_CASE(ADD);
+    MAP_CASE(MUL);
+    MAP_CASE(MINUI);
+    MAP_CASE(MINSI);
+    MAP_CASE(MINNUMF);
+    MAP_CASE(MAXSI);
+    MAP_CASE(MAXUI);
+    MAP_CASE(MAXNUMF);
+    MAP_CASE(AND);
+    MAP_CASE(OR);
+    MAP_CASE(XOR);
+    MAP_CASE(MINIMUMF);
+    MAP_CASE(MAXIMUMF);
+
+#undef MAP_CASE
+  }
+
+  llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
+}
+
 } // namespace mlir::gpu
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 9f2aa1be52fc3..139edf6882df6 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -6,14 +6,20 @@
 // RUN:   --test-gpu-subgroup-reduce-lowering="expand-to-shuffles" %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-SHFL
 
+// RUN: mlir-opt --allow-unregistered-dialect \
+// RUN:   --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
+// RUN:   | FileCheck %s --check-prefix=CHECK-DPP
+
 // CHECK-SUB:  gpu.module @kernels {
 // CHECK-SHFL: gpu.module @kernels {
+// CHECK-DPP: gpu.module @kernels {
 gpu.module @kernels {
 
   // CHECK-SUB-LABEL:  gpu.func @kernel0(
   // CHECK-SUB-SAME:     %[[ARG0:.+]]: vector<5xf16>)
   //
   // CHECK-SHFL-LABEL: gpu.func @kernel0(
+  // CHECK-DPP-LABEL: gpu.func @kernel0(
   gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
     // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
     // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -26,16 +32,19 @@ gpu.module @kernels {
     // CHECK-SUB: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16
     // CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16>
     // CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> ()
+    // CHECK-DPP-COUNT-6: amdgpu.dpp
     %sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>)
     "test.consume"(%sum0) : (vector<5xf16>) -> ()
 
     // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform
     // CHECK-SUB: "test.consume"
+    // CHECK-DPP-COUNT-6: amdgpu.dpp
     %sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>)
     "test.consume"(%sum1) : (vector<5xf16>) -> ()
 
     // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
     // CHECK-SUB: "test.consume"
+    // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}}
    %sum2 =
gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum2) : (vector<5xf16>) -> () @@ -52,27 +61,34 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>) // // CHECK-SHFL-LABEL: gpu.func @kernel1( + // + // CHECK-DPP-LABEL: gpu.func @kernel1( gpu.func @kernel1(%arg0: vector<1xf32>) kernel { // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 // CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32> // CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> () + // CHECK-DPP-COUNT-6: amdgpu.dpp %sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum0) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32 // CHECK-SUB: "test.consume" + // CHECK-DPP-COUNT-6: amdgpu.dpp %sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum1) : (vector<1xf32>) -> () // Note stride is dropped because it is == 1. // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32 // CHECK-SUB: "test.consume" + // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}} quad_perm + // CHECK-DPP: amdgpu.dpp {{.+}} row_half_mirror %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum2) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32 // CHECK-SUB: "test.consume" + // CHECK-DPP-NOT: amdgpu.dpp %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum3) : (vector<1xf32>) -> () @@ -86,6 +102,8 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>) // // CHECK-SHFL-LABEL: gpu.func @kernel2( + // CHECK-DPP-LABEL: gpu.func @kernel2( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel { // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8> // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> () @@ -103,6 +121,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + // CHECK-DPP-LABEL: gpu.func @kernel3( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel3(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -122,6 +142,8 @@ gpu.module @kernels { // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32 // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32 // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> () + + // CHECK-DPP-COUNT-6: amdgpu.dpp %sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -131,6 +153,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + // CHECK-DPP-LABEL: gpu.func @kernel3_clustered( + // CHECK-DPP-SAME: %[[ARG0:.+]]: i32) gpu.func @kernel3_clustered(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -144,6 +168,14 @@ gpu.module @kernels { // CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32 // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 // CHECK-SHFL: 
"test.consume"(%[[A2]]) : (i32) -> () + + // CHECK-DPP: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 + // CHECK-DPP: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 + // CHECK-DPP: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 + // CHECK-DPP: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 + // CHECK-DPP: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 + // CHECK-DPP: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-DPP: "test.consume"(%[[A2]]) : (i32) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -153,6 +185,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + // CHECK-DPP-LABEL: gpu.func @kernel3_clustered_strided( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel3_clustered_strided(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32 @@ -175,6 +209,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) + // CHECK-DPP-LABEL: gpu.func @kernel4( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel4(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -211,6 +247,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) + // CHECK-DPP-LABEL: gpu.func @kernel4_clustered( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -226,6 +264,7 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) + // CHECK-DPP-LABEL: gpu.func @kernel5( gpu.func @kernel5(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -237,6 +276,7 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () + // CHECK-DPP-COUNT-6: amdgpu.dpp %sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () @@ -246,6 +286,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) + // CHECK-DPP-LABEL: gpu.func @kernel5_clustered + // CHECK-DPP-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5_clustered(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -257,6 +299,16 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () + + // CHECK-DPP: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 + // CHECK-DPP: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR3:.+]] = 
arith.addi %[[VAR1]], %[[VAR2]] : i16 + // CHECK-DPP: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 + // CHECK-DPP: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 + // CHECK-DPP: "test.consume"(%[[VAR7]]) : (i16) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () @@ -266,6 +318,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel6( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) + // CHECK-DPP-LABEL: gpu.func @kernel6( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel6(%arg0: vector<3xi8>) kernel { // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8> // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8> @@ -289,6 +343,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) + // CHECK-DPP-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel_cluster_size_is_subgroup_size(%arg0: vector<3xi8>) kernel { // CHECK-SHFL-COUNT-5: gpu.shuffle xor %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (vector<3xi8>) -> (vector<3xi8>) diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index a49d304baf5c6..4ebcf897fd532 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -10,10 +10,13 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/Index/IR/IndexDialect.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/PatternMatch.h" @@ -54,7 +57,9 @@ struct TestGpuSubgroupReduceLoweringPass : PassWrapper(pass) {} void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry + .insert(); } StringRef getArgument() const final { @@ -70,6 +75,12 @@ struct TestGpuSubgroupReduceLoweringPass llvm::cl::desc("Expand subgroup_reduce ops to shuffle ops."), llvm::cl::init(false)}; + Option target{ + *this, "target", + llvm::cl::desc("Target backend name which will be used to provide " + "compatible lowerings of subgroup reduce."), + llvm::cl::init("")}; + void runOnOperation() override { RewritePatternSet patterns(&getContext()); @@ -77,8 +88,15 @@ struct TestGpuSubgroupReduceLoweringPass // perform fewer failing matches. 
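+    // The breakdown patterns get benefit 3 so they apply before the DPP
+    // (benefit 2) and shuffle (benefit 1) lowerings registered below.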
populateGpuBreakDownSubgroupReducePatterns(patterns, /*maxShuffleBitwidth=*/32, - PatternBenefit(2)); + PatternBenefit(3)); if (expandToShuffles) { + auto maybeChipset = amdgpu::Chipset::parse(target); + if (succeeded(maybeChipset)) { + populateGpuLowerSubgroupReduceToDPPPatterns( + patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2)); + populateGpuLowerClusteredSubgroupReduceToDPPPatterns( + patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2)); + } populateGpuLowerSubgroupReduceToShufflePatterns( patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32); populateGpuLowerClusteredSubgroupReduceToShufflePatterns(