From 029b2ccce15d08900dd3aeaed1968e1b011fb6f0 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Tue, 25 Mar 2025 14:04:06 -0500 Subject: [PATCH 01/28] Creates AMDToGPUPass to house a subgroup reduce lowering pattern to DPP ops. Signed-off-by: Muzammiluddin Syed --- .../mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h | 32 ++++ mlir/include/mlir/Conversion/Passes.h | 1 + mlir/include/mlir/Conversion/Passes.td | 16 ++ mlir/lib/Conversion/CMakeLists.txt | 1 + .../lib/Conversion/GPUToAMDGPU/CMakeLists.txt | 22 +++ .../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 176 ++++++++++++++++++ mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt | 1 + 7 files changed, 249 insertions(+) create mode 100644 mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h create mode 100644 mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt create mode 100644 mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h new file mode 100644 index 0000000000000..2d3bb384235ca --- /dev/null +++ b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h @@ -0,0 +1,32 @@ +//===- GPUToAMDGPU.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_ +#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_ + + +#include "mlir/IR/PatternMatch.h" +#include +#include + +namespace mlir { + +class LLVMTypeConverter; +class RewritePatternSet; +class TypeConverter; +class Pass; + +#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS +#include "mlir/Conversion/Passes.h.inc" + +void populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns, + unsigned subgroupSize, + PatternBenefit benefit); + +} // namespace mlir + +#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_ \ No newline at end of file diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index ccd862f67c068..1189423799092 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -34,6 +34,7 @@ #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" #include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h" #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" +#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h" #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index bbba495e613b2..b28b4900e6814 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -643,6 +643,22 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> { ]; } +//===----------------------------------------------------------------------===// +// GPUToAMDGPU +//===----------------------------------------------------------------------===// + +def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> { + let summary = "Generate AMDGPU operations for gpu 
operations"; + let dependentDialects = [ + "amdgpu::AMDGPUDialect", + "LLVM::LLVMDialect", + "ROCDL::ROCDLDialect", + ]; + let options = [Option<"subgroupSize", "subgroup-size", "unsigned", + /*default=*/"64", + "Size of subgroup">]; +} + //===----------------------------------------------------------------------===// // ConvertIndexToLLVMPass //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index b6c21440c571c..b957a4473f1e6 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -24,6 +24,7 @@ add_subdirectory(FuncToEmitC) add_subdirectory(FuncToLLVM) add_subdirectory(FuncToSPIRV) add_subdirectory(GPUCommon) +add_subdirectory(GPUToAMDGPU) add_subdirectory(GPUToLLVMSPV) add_subdirectory(GPUToNVVM) add_subdirectory(GPUToROCDL) diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt new file mode 100644 index 0000000000000..9b82b5dc63d9c --- /dev/null +++ b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt @@ -0,0 +1,22 @@ +add_mlir_conversion_library(MLIRGPUToAMDGPU + GPUToAMDGPU.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRLLVMCommonConversion + MLIRLLVMDialect + MLIRGPUDialect + MLIRAMDGPUDialect + MLIRAMDGPUUtils + MLIRROCDLDialect + MLIRPass + MLIRTransforms + ) diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp new file mode 100644 index 0000000000000..bab83c12157a9 --- /dev/null +++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp @@ -0,0 +1,176 @@ +//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h" + +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Pass/Pass.h" + +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" + +#include "mlir/Transforms/WalkPatternRewriteDriver.h" +#include "llvm/Support/FormatVariadic.h" + +namespace mlir { +#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS +#include "mlir/Conversion/Passes.h.inc" +} // namespace mlir + +using namespace mlir; + +namespace { +struct ClusterInfo { + unsigned clusterStride; + unsigned clusterSize; + unsigned subgroupSize; +}; + +static FailureOr +getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) { + assert(llvm::isPowerOf2_32(subgroupSize)); + + std::optional clusterSize = op.getClusterSize(); + assert(!clusterSize || + llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this. + if (clusterSize && *clusterSize > subgroupSize) + return op.emitOpError() + << "cluster size " << *clusterSize + << " is greater than subgroup size " << subgroupSize; + unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); + + auto clusterStride = op.getClusterStride(); + assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this. 
+ if (clusterStride >= subgroupSize) + return op.emitOpError() + << "cluster stride " << clusterStride + << " is not less than subgroup size " << subgroupSize; + + return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize}; +} + +Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, + gpu::AllReduceOperation mode, + const ClusterInfo &ci) { + Value result = input; + if (ci.clusterSize >= 2) { + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1); + Value dppResult = + b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_shr, permArg); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + if (ci.clusterSize >= 4) { + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2); + Value dppResult = + b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_shr, permArg); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + if (ci.clusterSize >= 8) { + Value dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, + b.getUnitAttr()); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + if (ci.clusterSize >= 16) { + Value dppResult = + b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_mirror, b.getUnitAttr()); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + if (ci.clusterSize >= 32) { + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15); + Value dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, + b.getUnitAttr(), 10, 15, false); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + if (ci.clusterSize == 64) { + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31); + Value dppResult = b.create( + loc, 
result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, + b.getUnitAttr(), 12, 15, false); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + auto int32Type = IntegerType::get(b.getContext(), 32); + Value lane63 = b.create(loc, int32Type, 63); + result = b.create(loc, input.getType(), result, lane63); + assert(result.getType() == input.getType()); + return result; +} + +struct ScalarSubgroupReduceToShuffles final + : OpRewritePattern { + ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize, + bool matchClustered, PatternBenefit benefit) + : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), + matchClustered(matchClustered) {} + + LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, + PatternRewriter &rewriter) const override { + if (op.getClusterSize().has_value() != matchClustered) { + return rewriter.notifyMatchFailure( + op, llvm::formatv("op is {0}clustered but pattern is configured to " + "only match {1}clustered ops", + matchClustered ? "non-" : "", + matchClustered ? 
"" : "non-")); + } + + auto ci = getAndValidateClusterInfo(op, subgroupSize); + if (failed(ci)) + return failure(); + + Location loc = op.getLoc(); + rewriter.replaceOp(op, createSubgroupDPPReduction( + rewriter, loc, op.getValue(), op.getOp(), *ci)); + return success(); + } + +private: + unsigned subgroupSize = 0; + bool matchClustered = false; +}; + +struct ConvertGPUToAMDGPUPass + : public impl::ConvertGPUToAMDGPUPassBase { + using Base::Base; + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + int subgroupSizeInt = static_cast(subgroupSize); + populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt, + PatternBenefit(1)); + walkAndApplyPatterns(getOperation(), std::move(patterns)); + } +}; +} // namespace + +void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns, + unsigned subgroupSize, + PatternBenefit benefit) { + patterns.add( + patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit); +} diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt index 945e3ccdfa87b..52484ac69a3e2 100644 --- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt @@ -15,6 +15,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms MLIRMathToLLVM MLIRMathToROCDL MLIRAMDGPUToROCDL + MLIRGPUToAMDGPU MLIRFuncToLLVM MLIRGPUDialect MLIRGPUToGPURuntimeTransforms From 427c81705a1be5178cacbe50a213d5b3ee9f68b3 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 2 Apr 2025 17:48:56 -0500 Subject: [PATCH 02/28] Fix for numerical issues in MatVec tests Signed-off-by: Muzammiluddin Syed --- mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp index bab83c12157a9..b07ed0a7c636a 100644 --- 
a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp +++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp @@ -82,26 +82,31 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, } if (ci.clusterSize >= 8) { - Value dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, - b.getUnitAttr()); + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 4); + Value dppResult = + b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_shr, permArg); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } if (ci.clusterSize >= 16) { + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 8); Value dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_mirror, b.getUnitAttr()); + amdgpu::DPPPerm::row_shr, permArg); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } + const int allRows = 0xf; + const int allBanks = 0xf; + if (ci.clusterSize >= 32) { auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15); Value dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, - b.getUnitAttr(), 10, 15, false); + b.getUnitAttr(), 0xa, allBanks, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } @@ -110,7 +115,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31); Value dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, - b.getUnitAttr(), 12, 15, false); + b.getUnitAttr(), allRows, allBanks, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } From 655251b5fd2713b3eacb38953425c5d71288beb6 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Thu, 3 Apr 2025 15:08:59 -0500 Subject: [PATCH 03/28] Rewrites pattern to be closer to 
device lib impl. Signed-off-by: Muzammiluddin Syed --- .../mlir/Dialect/GPU/Transforms/Passes.h | 7 ++ .../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 27 ++--- .../GPU/Transforms/SubgroupReduceLowering.cpp | 109 ++++++++++++++++++ 3 files changed, 130 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index 5cc65082a7e56..41e0759e958b5 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -62,6 +62,13 @@ void populateGpuLowerSubgroupReduceToShufflePatterns( RewritePatternSet &patterns, unsigned subgroupSize, unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1); +/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp` +/// ops over scalar types. Assumes that the subgroup has +/// `subgroupSize` lanes. Applicable only to AMD GPUs. +void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns, + unsigned subgroupSize, + PatternBenefit benefit = 1); + /// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns` /// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`. 
void populateGpuLowerClusteredSubgroupReduceToShufflePatterns( diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp index b07ed0a7c636a..590fa7d9b4ffc 100644 --- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp +++ b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp @@ -67,7 +67,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1); Value dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shr, permArg); + amdgpu::DPPPerm::row_shl, permArg); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } @@ -76,39 +76,41 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2); Value dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shr, permArg); + amdgpu::DPPPerm::row_shl, permArg); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } if (ci.clusterSize >= 8) { - auto permArg = b.getIntegerAttr(b.getIntegerType(32), 4); - Value dppResult = - b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shr, permArg); + Value dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, + b.getUnitAttr()); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } if (ci.clusterSize >= 16) { - auto permArg = b.getIntegerAttr(b.getIntegerType(32), 8); Value dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shr, permArg); + amdgpu::DPPPerm::row_mirror, b.getUnitAttr()); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } const int allRows = 0xf; const int allBanks = 0xf; - + auto int32Type = IntegerType::get(b.getContext(), 32); if (ci.clusterSize >= 
32) { auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15); Value dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, b.getUnitAttr(), 0xa, allBanks, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); + result, dppResult); + if (ci.subgroupSize == 32) { + Value lane01 = b.create(loc, int32Type, 1); + result = b.create(loc, input.getType(), result, lane01); + } } if (ci.clusterSize == 64) { @@ -118,11 +120,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, b.getUnitAttr(), allRows, allBanks, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); + Value lane63 = b.create(loc, int32Type, 63); + result = b.create(loc, input.getType(), result, lane63); } - auto int32Type = IntegerType::get(b.getContext(), 32); - Value lane63 = b.create(loc, int32Type, 63); - result = b.create(loc, input.getType(), result, lane63); assert(result.getType() == input.getType()); return result; } diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 43eff3eddcc49..f07ef6cf154a9 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -12,6 +12,8 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/GPU/Utils/GPUUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" @@ -362,6 +364,106 @@ struct VectorSubgroupReduceToShuffles final unsigned shuffleBitwidth = 0; bool matchClustered = false; }; + +Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, + gpu::AllReduceOperation mode, + const ClusterInfo &ci) { + Value result = input; + 
if (ci.clusterSize >= 2) { + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1); + Value dppResult = + b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_shl, permArg); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + if (ci.clusterSize >= 4) { + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2); + Value dppResult = + b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_shl, permArg); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + if (ci.clusterSize >= 8) { + Value dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, + b.getUnitAttr()); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + if (ci.clusterSize >= 16) { + Value dppResult = + b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_mirror, b.getUnitAttr()); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } + + const int allRows = 0xf; + const int allBanks = 0xf; + auto int32Type = IntegerType::get(b.getContext(), 32); + if (ci.clusterSize >= 32) { + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15); + Value dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, + b.getUnitAttr(), 0xa, allBanks, false); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + if (ci.subgroupSize == 32) { + Value lane01 = b.create(loc, int32Type, 1); + result = + b.create(loc, input.getType(), result, lane01); + } + } + + if (ci.clusterSize == 64) { + auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31); + Value dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, + b.getUnitAttr(), allRows, allBanks, false); + result = vector::makeArithReduction(b, 
loc, gpu::convertReductionKind(mode), + result, dppResult); + Value lane63 = b.create(loc, int32Type, 63); + result = b.create(loc, input.getType(), result, lane63); + } + + assert(result.getType() == input.getType()); + return result; +} + +struct ScalarSubgroupReduceToDPP final + : OpRewritePattern { + ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize, + bool matchClustered, PatternBenefit benefit) + : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), + matchClustered(matchClustered) {} + + LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, + PatternRewriter &rewriter) const override { + if (op.getClusterSize().has_value() != matchClustered) { + return rewriter.notifyMatchFailure( + op, llvm::formatv("op is {0}clustered but pattern is configured to " + "only match {1}clustered ops", + matchClustered ? "non-" : "", + matchClustered ? "" : "non-")); + } + auto ci = getAndValidateClusterInfo(op, subgroupSize); + if (failed(ci)) + return failure(); + Location loc = op.getLoc(); + rewriter.replaceOp(op, createSubgroupDPPReduction( + rewriter, loc, op.getValue(), op.getOp(), *ci)); + return success(); + } + +private: + unsigned subgroupSize = 0; + bool matchClustered = false; +}; } // namespace void mlir::populateGpuBreakDownSubgroupReducePatterns( @@ -372,6 +474,13 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns( patterns.add(patterns.getContext(), benefit); } +void mlir::populateGpuLowerSubgroupReduceToDPPPatterns( + RewritePatternSet &patterns, unsigned subgroupSize, + PatternBenefit benefit) { + patterns.add(patterns.getContext(), subgroupSize, + /*matchClustered=*/true, benefit); +} + void mlir::populateGpuLowerSubgroupReduceToShufflePatterns( RewritePatternSet &patterns, unsigned subgroupSize, unsigned shuffleBitwidth, PatternBenefit benefit) { From 081d6f77b9331366fd332e4c42d192df003dbfe9 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Thu, 3 Apr 2025 15:15:36 -0500 Subject: [PATCH 04/28] Removes AMDToGPUPass, 
moving pattern into existing pass Signed-off-by: Muzammiluddin Syed --- .../mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h | 32 --- mlir/include/mlir/Conversion/Passes.h | 1 - mlir/include/mlir/Conversion/Passes.td | 16 -- mlir/lib/Conversion/CMakeLists.txt | 1 - .../lib/Conversion/GPUToAMDGPU/CMakeLists.txt | 22 --- .../Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp | 182 ------------------ mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt | 1 - 7 files changed, 255 deletions(-) delete mode 100644 mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h delete mode 100644 mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt delete mode 100644 mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h deleted file mode 100644 index 2d3bb384235ca..0000000000000 --- a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h +++ /dev/null @@ -1,32 +0,0 @@ -//===- GPUToAMDGPU.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_ -#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_ - - -#include "mlir/IR/PatternMatch.h" -#include -#include - -namespace mlir { - -class LLVMTypeConverter; -class RewritePatternSet; -class TypeConverter; -class Pass; - -#define GEN_PASS_DECL_CONVERTGPUTOAMDGPUPASS -#include "mlir/Conversion/Passes.h.inc" - -void populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns, - unsigned subgroupSize, - PatternBenefit benefit); - -} // namespace mlir - -#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPU_H_ \ No newline at end of file diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index 1189423799092..ccd862f67c068 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -34,7 +34,6 @@ #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" #include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h" #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" -#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h" #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index b28b4900e6814..bbba495e613b2 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -643,22 +643,6 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> { ]; } -//===----------------------------------------------------------------------===// -// GPUToAMDGPU -//===----------------------------------------------------------------------===// - -def ConvertGPUToAMDGPUPass : Pass<"convert-gpu-to-amdgpu"> { - let summary = "Generate AMDGPU operations for gpu 
operations"; - let dependentDialects = [ - "amdgpu::AMDGPUDialect", - "LLVM::LLVMDialect", - "ROCDL::ROCDLDialect", - ]; - let options = [Option<"subgroupSize", "subgroup-size", "unsigned", - /*default=*/"64", - "Size of subgroup">]; -} - //===----------------------------------------------------------------------===// // ConvertIndexToLLVMPass //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index b957a4473f1e6..b6c21440c571c 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -24,7 +24,6 @@ add_subdirectory(FuncToEmitC) add_subdirectory(FuncToLLVM) add_subdirectory(FuncToSPIRV) add_subdirectory(GPUCommon) -add_subdirectory(GPUToAMDGPU) add_subdirectory(GPUToLLVMSPV) add_subdirectory(GPUToNVVM) add_subdirectory(GPUToROCDL) diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt deleted file mode 100644 index 9b82b5dc63d9c..0000000000000 --- a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -add_mlir_conversion_library(MLIRGPUToAMDGPU - GPUToAMDGPU.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/GPUToAMDGPU - - DEPENDS - MLIRConversionPassIncGen - - LINK_COMPONENTS - Core - - LINK_LIBS PUBLIC - MLIRLLVMCommonConversion - MLIRLLVMDialect - MLIRGPUDialect - MLIRAMDGPUDialect - MLIRAMDGPUUtils - MLIRROCDLDialect - MLIRPass - MLIRTransforms - ) diff --git a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp b/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp deleted file mode 100644 index 590fa7d9b4ffc..0000000000000 --- a/mlir/lib/Conversion/GPUToAMDGPU/GPUToAMDGPU.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===- GPUToAMDGPU.cpp - GPU to AMDGPU dialect conversion -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPU.h" - -#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/Pass/Pass.h" - -#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" - -#include "mlir/Transforms/WalkPatternRewriteDriver.h" -#include "llvm/Support/FormatVariadic.h" - -namespace mlir { -#define GEN_PASS_DEF_CONVERTGPUTOAMDGPUPASS -#include "mlir/Conversion/Passes.h.inc" -} // namespace mlir - -using namespace mlir; - -namespace { -struct ClusterInfo { - unsigned clusterStride; - unsigned clusterSize; - unsigned subgroupSize; -}; - -static FailureOr -getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) { - assert(llvm::isPowerOf2_32(subgroupSize)); - - std::optional clusterSize = op.getClusterSize(); - assert(!clusterSize || - llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this. - if (clusterSize && *clusterSize > subgroupSize) - return op.emitOpError() - << "cluster size " << *clusterSize - << " is greater than subgroup size " << subgroupSize; - unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); - - auto clusterStride = op.getClusterStride(); - assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this. 
- if (clusterStride >= subgroupSize) - return op.emitOpError() - << "cluster stride " << clusterStride - << " is not less than subgroup size " << subgroupSize; - - return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize}; -} - -Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, - gpu::AllReduceOperation mode, - const ClusterInfo &ci) { - Value result = input; - if (ci.clusterSize >= 2) { - auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1); - Value dppResult = - b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shl, permArg); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - } - - if (ci.clusterSize >= 4) { - auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2); - Value dppResult = - b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shl, permArg); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - } - - if (ci.clusterSize >= 8) { - Value dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, - b.getUnitAttr()); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - } - - if (ci.clusterSize >= 16) { - Value dppResult = - b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_mirror, b.getUnitAttr()); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - } - - const int allRows = 0xf; - const int allBanks = 0xf; - auto int32Type = IntegerType::get(b.getContext(), 32); - if (ci.clusterSize >= 32) { - auto permArg = b.getIntegerAttr(b.getIntegerType(32), 15); - Value dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, - b.getUnitAttr(), 0xa, allBanks, false); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - if (ci.subgroupSize == 32) 
{ - Value lane01 = b.create(loc, int32Type, 1); - result = b.create(loc, input.getType(), result, lane01); - } - } - - if (ci.clusterSize == 64) { - auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31); - Value dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, - b.getUnitAttr(), allRows, allBanks, false); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - Value lane63 = b.create(loc, int32Type, 63); - result = b.create(loc, input.getType(), result, lane63); - } - - assert(result.getType() == input.getType()); - return result; -} - -struct ScalarSubgroupReduceToShuffles final - : OpRewritePattern { - ScalarSubgroupReduceToShuffles(MLIRContext *ctx, unsigned subgroupSize, - bool matchClustered, PatternBenefit benefit) - : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), - matchClustered(matchClustered) {} - - LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, - PatternRewriter &rewriter) const override { - if (op.getClusterSize().has_value() != matchClustered) { - return rewriter.notifyMatchFailure( - op, llvm::formatv("op is {0}clustered but pattern is configured to " - "only match {1}clustered ops", - matchClustered ? "non-" : "", - matchClustered ? 
"" : "non-")); - } - - auto ci = getAndValidateClusterInfo(op, subgroupSize); - if (failed(ci)) - return failure(); - - Location loc = op.getLoc(); - rewriter.replaceOp(op, createSubgroupDPPReduction( - rewriter, loc, op.getValue(), op.getOp(), *ci)); - return success(); - } - -private: - unsigned subgroupSize = 0; - bool matchClustered = false; -}; - -struct ConvertGPUToAMDGPUPass - : public impl::ConvertGPUToAMDGPUPassBase { - using Base::Base; - - void runOnOperation() override { - RewritePatternSet patterns(&getContext()); - int subgroupSizeInt = static_cast(subgroupSize); - populateAMDGPUOptimizedSubgroupReducePatterns(patterns, subgroupSizeInt, - PatternBenefit(1)); - walkAndApplyPatterns(getOperation(), std::move(patterns)); - } -}; -} // namespace - -void mlir::populateAMDGPUOptimizedSubgroupReducePatterns(RewritePatternSet &patterns, - unsigned subgroupSize, - PatternBenefit benefit) { - patterns.add( - patterns.getContext(), subgroupSize, /*matchClustered=*/true, benefit); -} diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt index 52484ac69a3e2..945e3ccdfa87b 100644 --- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt @@ -15,7 +15,6 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms MLIRMathToLLVM MLIRMathToROCDL MLIRAMDGPUToROCDL - MLIRGPUToAMDGPU MLIRFuncToLLVM MLIRGPUDialect MLIRGPUToGPURuntimeTransforms From 0d560c219bf11d4b5e6b9eb3eff7680c66a6ba5e Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Thu, 10 Apr 2025 14:06:51 -0500 Subject: [PATCH 05/28] Adding permlanex16 and other dpp related ops to mlir dialect Signed-off-by: Muzammiluddin Syed --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 +++- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 ++++++ mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 14 +++++++++++++ .../GPU/Transforms/SubgroupReduceLowering.cpp | 21 +++++++++++-------- mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir | 8 
+++++++ 5 files changed, 43 insertions(+), 10 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 108d7237ff703..17c1162170073 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -524,7 +524,8 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm", I32EnumAttrCase<"row_mirror", 8>, I32EnumAttrCase<"row_half_mirror", 9>, I32EnumAttrCase<"row_bcast_15", 10>, - I32EnumAttrCase<"row_bcast_31", 11> + I32EnumAttrCase<"row_bcast_31", 11>, + I32EnumAttrCase<"row_share", 12> ]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::amdgpu"; @@ -557,6 +558,7 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result", - Reverse within a half-row (`row_half_mirror`) - Broadcast the 15th lane of each row to the next row (`row_bcast`) - Broadcast lane 31 to rows 2 and 3 (`row_bcast`) + - Broadcast a lane [0-15] within row 0 to all lanes of row 0 (`row_share`) }]; let results = (outs AnyType:$result); let assemblyFormat = [{ diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 5f697bdeef566..4d343c8f3200c 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -1293,6 +1293,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern { ROW_HALF_MIRROR = 0x141, BCAST15 = 0x142, BCAST31 = 0x143, + ROW_SHARE0 = 0x150 }; auto kind = DppOp.getKind(); @@ -1350,6 +1351,11 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern { case DPPPerm::row_bcast_31: DppCtrl = DppCtrl::BCAST31; break; + case DPPPerm::row_share: + if (auto intAttr = cast(*permArgument)) { + DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0; + } + break; } // Check for row_mask, bank_mask, bound_ctrl if they exist and create diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp 
b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 549a4376a4a04..af4438f028542 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -468,6 +468,20 @@ LogicalResult DPPOp::verify() { } break; } + + case DPPPerm::row_share: { + if (!permArgument) { + return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) + + "' value not specified"); + } + if (auto intAttr = dyn_cast(permArgument)) { + uint32_t attrValue = intAttr.getInt(); + if (attrValue < 0 || attrValue > 15) { + return emitOpError( + "Attribute value for 'row_share' must be between 0 and 15"); + } + } + } break; } return success(); } diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index f07ef6cf154a9..3e64681ad2dd2 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -370,7 +370,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, const ClusterInfo &ci) { Value result = input; if (ci.clusterSize >= 2) { - auto permArg = b.getIntegerAttr(b.getIntegerType(32), 1); + auto permArg = b.getI32IntegerAttr(1); Value dppResult = b.create(loc, result.getType(), result, result, amdgpu::DPPPerm::row_shl, permArg); @@ -379,7 +379,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, } if (ci.clusterSize >= 4) { - auto permArg = b.getIntegerAttr(b.getIntegerType(32), 2); + auto permArg = b.getI32IntegerAttr(2); Value dppResult = b.create(loc, result.getType(), result, result, amdgpu::DPPPerm::row_shl, permArg); @@ -405,16 +405,19 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, const int allRows = 0xf; const int allBanks = 0xf; - auto int32Type = IntegerType::get(b.getContext(), 32); + auto uint32Type = b.getIntegerType(32, false); if (ci.clusterSize >= 32) { - auto permArg = 
b.getIntegerAttr(b.getIntegerType(32), 15); - Value dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, - b.getUnitAttr(), 0xa, allBanks, false); + // auto permArg = b.getI32IntegerAttr(15); + // Value dppResult = b.create( + // loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, + // b.getUnitAttr(), 0xa, allBanks, false); + auto uIntMax = llvm::APInt::getMaxValue(32u); + Value uIntMaxConst = b.create(loc, uint32Type, uIntMax); + Value dppResult = b.create(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); if (ci.subgroupSize == 32) { - Value lane01 = b.create(loc, int32Type, 1); + Value lane01 = b.create(loc, b.getI32Type(), 1); result = b.create(loc, input.getType(), result, lane01); } @@ -427,7 +430,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, b.getUnitAttr(), allRows, allBanks, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); - Value lane63 = b.create(loc, int32Type, 63); + Value lane63 = b.create(loc, b.getI32Type(), 63); result = b.create(loc, input.getType(), result, lane63); } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir index 14691e73e62d7..64b3328b70ab4 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir @@ -137,3 +137,11 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 { %0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16 return %0 : f16 } + +func.func @dpp_row_share(%arg0: i32, %arg1: i32) -> i32 { + // CHECK-LABEL: func @dpp_row_share + // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32 + // CHECK: return %0 : i32 + %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32 + return %0 : i32 +} From 
015e9b9353df71200cff96d75f84fa3c583101b1 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Fri, 11 Apr 2025 10:30:10 -0500 Subject: [PATCH 06/28] Fixing permlanex16 intrinsic failure Signed-off-by: Muzammiluddin Syed --- mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 3e64681ad2dd2..b6bd67fa0ce53 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -405,14 +405,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, const int allRows = 0xf; const int allBanks = 0xf; - auto uint32Type = b.getIntegerType(32, false); if (ci.clusterSize >= 32) { - // auto permArg = b.getI32IntegerAttr(15); - // Value dppResult = b.create( - // loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, - // b.getUnitAttr(), 0xa, allBanks, false); auto uIntMax = llvm::APInt::getMaxValue(32u); - Value uIntMaxConst = b.create(loc, uint32Type, uIntMax); + Value uIntMaxConst = b.create(loc, b.getI32Type(), uIntMax); Value dppResult = b.create(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); From 945f0e83e96722b8dbecd1317baced566e8b3ff8 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Fri, 11 Apr 2025 11:27:53 -0500 Subject: [PATCH 07/28] simplify verbose typing Signed-off-by: Muzammiluddin Syed --- mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index b6bd67fa0ce53..b9eae59584e94 100644 --- 
a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -419,7 +419,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, } if (ci.clusterSize == 64) { - auto permArg = b.getIntegerAttr(b.getIntegerType(32), 31); + auto permArg = b.getI32IntegerAttr(31); Value dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, b.getUnitAttr(), allRows, allBanks, false); From 1b356ed68d3a5f2067736a7ad3dc437fea31a7fc Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Fri, 11 Apr 2025 22:13:11 -0500 Subject: [PATCH 08/28] testing numerics Signed-off-by: Muzammiluddin Syed --- .../GPU/Transforms/SubgroupReduceLowering.cpp | 59 ++++++++++++------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index b9eae59584e94..0790edc15921e 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -369,46 +369,63 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, gpu::AllReduceOperation mode, const ClusterInfo &ci) { Value result = input; + Value dppResult; + const int allRows = 0xf; + const int allBanks = 0xf; + const bool boundCtrl = true; if (ci.clusterSize >= 2) { auto permArg = b.getI32IntegerAttr(1); - Value dppResult = + dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shl, permArg); + amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } if (ci.clusterSize >= 4) { auto permArg = b.getI32IntegerAttr(2); - Value dppResult = + dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shl, permArg); + amdgpu::DPPPerm::row_shl, permArg, allRows, 
allBanks, boundCtrl); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } - if (ci.clusterSize >= 8) { - Value dppResult = b.create( + if (ci.clusterSize <= 8) { + dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, - b.getUnitAttr()); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); + b.getUnitAttr(), allRows, allBanks, boundCtrl); + } else if (ci.clusterSize == 8) { + auto permArg = b.getI32IntegerAttr(4); + dppResult = + b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl); } - - if (ci.clusterSize >= 16) { - Value dppResult = + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + + if (ci.clusterSize <= 16) { + dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror, + b.getUnitAttr(), allRows, allBanks, boundCtrl); + } else if (ci.clusterSize == 16) { + auto permArg = b.getI32IntegerAttr(8); + dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_mirror, b.getUnitAttr()); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); + amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl); } + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); - const int allRows = 0xf; - const int allBanks = 0xf; if (ci.clusterSize >= 32) { - auto uIntMax = llvm::APInt::getMaxValue(32u); - Value uIntMaxConst = b.create(loc, b.getI32Type(), uIntMax); - Value dppResult = b.create(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false); + auto permArg = b.getI32IntegerAttr(15); + dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, + b.getUnitAttr(), 0xa, allBanks, false); + // if (chipset.majorVersion == 9) + 
// auto uIntMax = llvm::APInt::getMaxValue(32u); + // Value uIntMaxConst = b.create(loc, b.getI32Type(), uIntMax); + // Value dppResult = b.create(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); if (ci.subgroupSize == 32) { @@ -420,7 +437,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, if (ci.clusterSize == 64) { auto permArg = b.getI32IntegerAttr(31); - Value dppResult = b.create( + dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, b.getUnitAttr(), allRows, allBanks, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), From 7fd30c051c743dde370616db6d0942f9dfac03d2 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Fri, 11 Apr 2025 22:44:39 -0500 Subject: [PATCH 09/28] fixing Signed-off-by: Muzammiluddin Syed --- .../GPU/Transforms/SubgroupReduceLowering.cpp | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 0790edc15921e..b47553e41c501 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -391,31 +391,35 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, result, dppResult); } - if (ci.clusterSize <= 8) { + if (ci.clusterSize == 8) { dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, b.getUnitAttr(), allRows, allBanks, boundCtrl); - } else if (ci.clusterSize == 8) { + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } else if (ci.clusterSize >= 8) { auto permArg = b.getI32IntegerAttr(4); - dppResult = - b.create(loc, result.getType(), result, result, - 
amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl); + dppResult = b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_shl, permArg, + allRows, allBanks, boundCtrl); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); } - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - if (ci.clusterSize <= 16) { + if (ci.clusterSize == 16) { dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror, b.getUnitAttr(), allRows, allBanks, boundCtrl); - } else if (ci.clusterSize == 16) { + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + result, dppResult); + } else if (ci.clusterSize >= 16) { auto permArg = b.getI32IntegerAttr(8); - dppResult = - b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl); - } - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + dppResult = b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_shl, permArg, + allRows, allBanks, boundCtrl); + result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); + } if (ci.clusterSize >= 32) { auto permArg = b.getI32IntegerAttr(15); From 0c28b4d08ff6158908c498f32341faaddb6e4909 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Fri, 11 Apr 2025 23:16:43 -0500 Subject: [PATCH 10/28] fixing Signed-off-by: Muzammiluddin Syed --- mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index b47553e41c501..889c378ab0a9f 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -420,7 +420,7 @@ Value 
createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } - + Value lane01 = b.create(loc, b.getI32Type(), 1); if (ci.clusterSize >= 32) { auto permArg = b.getI32IntegerAttr(15); dppResult = b.create( @@ -433,7 +433,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); if (ci.subgroupSize == 32) { - Value lane01 = b.create(loc, b.getI32Type(), 1); result = b.create(loc, input.getType(), result, lane01); } @@ -446,8 +445,8 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, b.getUnitAttr(), allRows, allBanks, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); - Value lane63 = b.create(loc, b.getI32Type(), 63); - result = b.create(loc, input.getType(), result, lane63); + // Value lane63 = b.create(loc, b.getI32Type(), 63); + result = b.create(loc, input.getType(), result, lane01); } assert(result.getType() == input.getType()); From bfda71216024b6c14166f9eb988a07518a66548b Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Fri, 11 Apr 2025 23:32:43 -0500 Subject: [PATCH 11/28] fixing Signed-off-by: Muzammiluddin Syed --- mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 889c378ab0a9f..d774197dc6d15 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -420,7 +420,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } - Value lane01 = b.create(loc, 
b.getI32Type(), 1); + Value lane00 = b.create(loc, b.getI32Type(), 0); if (ci.clusterSize >= 32) { auto permArg = b.getI32IntegerAttr(15); dppResult = b.create( @@ -434,7 +434,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, result, dppResult); if (ci.subgroupSize == 32) { result = - b.create(loc, input.getType(), result, lane01); + b.create(loc, input.getType(), result, lane00); } } @@ -446,7 +446,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); // Value lane63 = b.create(loc, b.getI32Type(), 63); - result = b.create(loc, input.getType(), result, lane01); + result = b.create(loc, input.getType(), result, lane00); } assert(result.getType() == input.getType()); From 54c08ef52b4ba507c9314fb2e89f00fd77d2fa85 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Sun, 13 Apr 2025 22:17:55 -0500 Subject: [PATCH 12/28] trying again Signed-off-by: Muzammiluddin Syed --- .../GPU/Transforms/SubgroupReduceLowering.cpp | 67 ++++++++++++------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index d774197dc6d15..8dd637b28d4ae 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -374,53 +374,71 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, const int allBanks = 0xf; const bool boundCtrl = true; if (ci.clusterSize >= 2) { - auto permArg = b.getI32IntegerAttr(1); + // auto permArg = b.getI32IntegerAttr(1); + auto permArg = b.getI32ArrayAttr({1, 0, 3, 2}); dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl); + amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl); result = 
vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } if (ci.clusterSize >= 4) { - auto permArg = b.getI32IntegerAttr(2); + // auto permArg = b.getI32IntegerAttr(2); + auto permArg = b.getI32ArrayAttr({2, 3, 0, 1}); dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shl, permArg, allRows, allBanks, boundCtrl); + amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } - if (ci.clusterSize == 8) { + // if (ci.clusterSize == 8) { + // dppResult = b.create( + // loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, + // b.getUnitAttr(), allRows, allBanks, boundCtrl); + // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + // result, dppResult); + // } else if (ci.clusterSize >= 8) { + // auto permArg = b.getI32IntegerAttr(4); + // dppResult = b.create(loc, result.getType(), result, result, + // amdgpu::DPPPerm::row_shr, permArg, + // allRows, allBanks, boundCtrl); + // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + // result, dppResult); + // } + if (ci.clusterSize >= 8) { dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, b.getUnitAttr(), allRows, allBanks, boundCtrl); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); - } else if (ci.clusterSize >= 8) { - auto permArg = b.getI32IntegerAttr(4); - dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shl, permArg, - allRows, allBanks, boundCtrl); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); } - if (ci.clusterSize == 16) { + // if (ci.clusterSize == 16) { + // dppResult = b.create( + // loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror, + // b.getUnitAttr(), allRows, allBanks, 
boundCtrl); + // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + // result, dppResult); + // } else if (ci.clusterSize >= 16) { + // auto permArg = b.getI32IntegerAttr(8); + // dppResult = b.create(loc, result.getType(), result, result, + // amdgpu::DPPPerm::row_shr, permArg, + // allRows, allBanks, boundCtrl); + // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + // result, dppResult); + // } + if (ci.clusterSize >= 16) { dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror, b.getUnitAttr(), allRows, allBanks, boundCtrl); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); - } else if (ci.clusterSize >= 16) { - auto permArg = b.getI32IntegerAttr(8); - dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_shl, permArg, - allRows, allBanks, boundCtrl); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); } - Value lane00 = b.create(loc, b.getI32Type(), 0); + + Value lane31 = b.create(loc, b.getI32Type(), 31); + Value lane63 = b.create(loc, b.getI32Type(), 63); if (ci.clusterSize >= 32) { auto permArg = b.getI32IntegerAttr(15); dppResult = b.create( @@ -434,7 +452,7 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, result, dppResult); if (ci.subgroupSize == 32) { result = - b.create(loc, input.getType(), result, lane00); + b.create(loc, input.getType(), result, lane31); } } @@ -442,11 +460,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, auto permArg = b.getI32IntegerAttr(31); dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, - b.getUnitAttr(), allRows, allBanks, false); + b.getUnitAttr(), 0xc, allBanks, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); - // Value lane63 = b.create(loc, 
b.getI32Type(), 63); - result = b.create(loc, input.getType(), result, lane00); + result = b.create(loc, input.getType(), result, lane63); } assert(result.getType() == input.getType()); From 6535bda2437e630ec19953985d59564d05fea336 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Mon, 14 Apr 2025 00:59:52 -0500 Subject: [PATCH 13/28] Fixing implementation Signed-off-by: Muzammiluddin Syed --- .../GPU/Transforms/SubgroupReduceLowering.cpp | 72 +++++++------------ 1 file changed, 26 insertions(+), 46 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 8dd637b28d4ae..0c923828093b9 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" @@ -367,14 +368,14 @@ struct VectorSubgroupReduceToShuffles final Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, gpu::AllReduceOperation mode, - const ClusterInfo &ci) { + const ClusterInfo &ci, + amdgpu::Chipset chipset) { Value result = input; Value dppResult; const int allRows = 0xf; const int allBanks = 0xf; const bool boundCtrl = true; if (ci.clusterSize >= 2) { - // auto permArg = b.getI32IntegerAttr(1); auto permArg = b.getI32ArrayAttr({1, 0, 3, 2}); dppResult = b.create(loc, result.getType(), result, result, @@ -384,7 +385,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, } if (ci.clusterSize >= 4) { - // auto permArg = b.getI32IntegerAttr(2); auto permArg = b.getI32ArrayAttr({2, 3, 0, 1}); dppResult = b.create(loc, result.getType(), result, result, @@ -393,20 +393,6 @@ Value createSubgroupDPPReduction(OpBuilder 
&b, Location loc, Value input, result, dppResult); } - // if (ci.clusterSize == 8) { - // dppResult = b.create( - // loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, - // b.getUnitAttr(), allRows, allBanks, boundCtrl); - // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - // result, dppResult); - // } else if (ci.clusterSize >= 8) { - // auto permArg = b.getI32IntegerAttr(4); - // dppResult = b.create(loc, result.getType(), result, result, - // amdgpu::DPPPerm::row_shr, permArg, - // allRows, allBanks, boundCtrl); - // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - // result, dppResult); - // } if (ci.clusterSize >= 8) { dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, @@ -415,20 +401,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, result, dppResult); } - // if (ci.clusterSize == 16) { - // dppResult = b.create( - // loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror, - // b.getUnitAttr(), allRows, allBanks, boundCtrl); - // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - // result, dppResult); - // } else if (ci.clusterSize >= 16) { - // auto permArg = b.getI32IntegerAttr(8); - // dppResult = b.create(loc, result.getType(), result, result, - // amdgpu::DPPPerm::row_shr, permArg, - // allRows, allBanks, boundCtrl); - // result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - // result, dppResult); - // } if (ci.clusterSize >= 16) { dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror, @@ -440,14 +412,19 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, Value lane31 = b.create(loc, b.getI32Type(), 31); Value lane63 = b.create(loc, b.getI32Type(), 63); if (ci.clusterSize >= 32) { - auto permArg = b.getI32IntegerAttr(15); - dppResult = b.create( - loc, result.getType(), result, 
result, amdgpu::DPPPerm::row_bcast_15, - b.getUnitAttr(), 0xa, allBanks, false); - // if (chipset.majorVersion == 9) - // auto uIntMax = llvm::APInt::getMaxValue(32u); - // Value uIntMaxConst = b.create(loc, b.getI32Type(), uIntMax); - // Value dppResult = b.create(loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false); + if (chipset.majorVersion <= 9) { + auto permArg = b.getI32IntegerAttr(15); + dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, + b.getUnitAttr(), 0xa, allBanks, false); + } else if (chipset.majorVersion == 10) { + auto uIntMax = llvm::APInt::getMaxValue(32u); + Value uIntMaxConst = + b.create(loc, b.getI32Type(), uIntMax); + Value dppResult = b.create( + loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, + true, false); + } result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); if (ci.subgroupSize == 32) { @@ -458,9 +435,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, if (ci.clusterSize == 64) { auto permArg = b.getI32IntegerAttr(31); - dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, - b.getUnitAttr(), 0xc, allBanks, false); + dppResult = b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::row_bcast_31, + b.getUnitAttr(), 0xc, allBanks, false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); result = b.create(loc, input.getType(), result, lane63); @@ -473,9 +450,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, struct ScalarSubgroupReduceToDPP final : OpRewritePattern { ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize, - bool matchClustered, PatternBenefit benefit) + bool matchClustered, Chipset chipset, + PatternBenefit benefit) : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), - matchClustered(matchClustered) {} + 
matchClustered(matchClustered), chipset(chipset) {} LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, PatternRewriter &rewriter) const override { @@ -498,6 +476,7 @@ struct ScalarSubgroupReduceToDPP final private: unsigned subgroupSize = 0; bool matchClustered = false; + Chipset chipset; }; } // namespace @@ -510,10 +489,11 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns( } void mlir::populateGpuLowerSubgroupReduceToDPPPatterns( - RewritePatternSet &patterns, unsigned subgroupSize, + RewritePatternSet &patterns, unsigned subgroupSize, Chipset chipset, PatternBenefit benefit) { patterns.add(patterns.getContext(), subgroupSize, - /*matchClustered=*/true, benefit); + /*matchClustered=*/true, chipset, + benefit); } void mlir::populateGpuLowerSubgroupReduceToShufflePatterns( From 85e3b6271bf7568685bc57051a420b9ef02bc5bf Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Mon, 14 Apr 2025 16:11:33 -0500 Subject: [PATCH 14/28] Adding DPP test Signed-off-by: Muzammiluddin Syed --- .../mlir/Dialect/GPU/Transforms/Passes.h | 2 + .../GPU/Transforms/SubgroupReduceLowering.cpp | 57 +++++++++++-------- .../Dialect/GPU/subgroup-reduce-lowering.mlir | 33 +++++++++++ mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 24 ++++++-- 4 files changed, 89 insertions(+), 27 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index 41e0759e958b5..5b185e262deb0 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_ #define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_ +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Utils/GPUUtils.h" #include "mlir/IR/PatternMatch.h" @@ -67,6 +68,7 @@ void populateGpuLowerSubgroupReduceToShufflePatterns( /// `subgroupSize` lanes. Applicable only to AMD GPUs. 
void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns, unsigned subgroupSize, + amdgpu::Chipset chipset, PatternBenefit benefit = 1); /// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns` diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 0c923828093b9..a327730851ed4 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/MathExtras.h" #include #include +#include using namespace mlir; @@ -370,25 +371,27 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, gpu::AllReduceOperation mode, const ClusterInfo &ci, amdgpu::Chipset chipset) { - Value result = input; Value dppResult; + Value result = input; const int allRows = 0xf; const int allBanks = 0xf; const bool boundCtrl = true; + Value lane31 = b.create(loc, b.getI32Type(), 31); + Value lane63 = b.create(loc, b.getI32Type(), 63); if (ci.clusterSize >= 2) { auto permArg = b.getI32ArrayAttr({1, 0, 3, 2}); - dppResult = - b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl); + dppResult = b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::quad_perm, permArg, + allRows, allBanks, boundCtrl); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } if (ci.clusterSize >= 4) { auto permArg = b.getI32ArrayAttr({2, 3, 0, 1}); - dppResult = - b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::quad_perm, permArg, allRows, allBanks, boundCtrl); + dppResult = b.create(loc, result.getType(), result, result, + amdgpu::DPPPerm::quad_perm, permArg, + allRows, allBanks, boundCtrl); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); } @@ -409,19 +412,15 @@ Value 
createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, result, dppResult); } - Value lane31 = b.create(loc, b.getI32Type(), 31); - Value lane63 = b.create(loc, b.getI32Type(), 63); if (ci.clusterSize >= 32) { if (chipset.majorVersion <= 9) { - auto permArg = b.getI32IntegerAttr(15); dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, - b.getUnitAttr(), 0xa, allBanks, false); + b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false); } else if (chipset.majorVersion == 10) { - auto uIntMax = llvm::APInt::getMaxValue(32u); Value uIntMaxConst = - b.create(loc, b.getI32Type(), uIntMax); - Value dppResult = b.create( + b.create(loc, b.getI32Type(), -1); + dppResult = b.create( loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false); } @@ -434,10 +433,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, } if (ci.clusterSize == 64) { - auto permArg = b.getI32IntegerAttr(31); - dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::row_bcast_31, - b.getUnitAttr(), 0xc, allBanks, false); + dppResult = b.create( + loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, + b.getUnitAttr(), 0xc, allBanks, /*bound_ctrl*/ false); result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); result = b.create(loc, input.getType(), result, lane63); @@ -447,10 +445,13 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, return result; } +/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp` +/// ops over scalar types. Assumes that the subgroup has +/// `subgroupSize` lanes. Applicable only to AMD GPUs. 
struct ScalarSubgroupReduceToDPP final : OpRewritePattern { ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize, - bool matchClustered, Chipset chipset, + bool matchClustered, amdgpu::Chipset chipset, PatternBenefit benefit) : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), matchClustered(matchClustered), chipset(chipset) {} @@ -467,16 +468,26 @@ struct ScalarSubgroupReduceToDPP final auto ci = getAndValidateClusterInfo(op, subgroupSize); if (failed(ci)) return failure(); + + if (ci->clusterStride != 1) + return failure(); + + Type valueTy = op.getType(); + if (!valueTy.isIntOrFloat()) + return rewriter.notifyMatchFailure( + op, "value type is not a compatible scalar"); + Location loc = op.getLoc(); - rewriter.replaceOp(op, createSubgroupDPPReduction( - rewriter, loc, op.getValue(), op.getOp(), *ci)); + rewriter.replaceOp(op, + createSubgroupDPPReduction(rewriter, loc, op.getValue(), + op.getOp(), *ci, chipset)); return success(); } private: unsigned subgroupSize = 0; bool matchClustered = false; - Chipset chipset; + amdgpu::Chipset chipset; }; } // namespace @@ -489,7 +500,7 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns( } void mlir::populateGpuLowerSubgroupReduceToDPPPatterns( - RewritePatternSet &patterns, unsigned subgroupSize, Chipset chipset, + RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset, PatternBenefit benefit) { patterns.add(patterns.getContext(), subgroupSize, /*matchClustered=*/true, chipset, diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 9f2aa1be52fc3..8ac1a5561aad6 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -6,14 +6,20 @@ // RUN: --test-gpu-subgroup-reduce-lowering="expand-to-shuffles" %s \ // RUN: | FileCheck %s --check-prefix=CHECK-SHFL +// RUN: mlir-opt --allow-unregistered-dialect \ +// RUN: 
--test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \ +// RUN: | FileCheck %s --check-prefix=CHECK-DPP + // CHECK-SUB: gpu.module @kernels { // CHECK-SHFL: gpu.module @kernels { +// CHECK-DPP: gpu.module @kernels { gpu.module @kernels { // CHECK-SUB-LABEL: gpu.func @kernel0( // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<5xf16>) // // CHECK-SHFL-LABEL: gpu.func @kernel0( + // CHECK-DPP-LABEL: gpu.func @kernel0( gpu.func @kernel0(%arg0: vector<5xf16>) kernel { // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16> // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16> @@ -36,6 +42,7 @@ gpu.module @kernels { // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4) // CHECK-SUB: "test.consume" + // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}} %sum2 = gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum2) : (vector<5xf16>) -> () @@ -52,6 +59,8 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>) // // CHECK-SHFL-LABEL: gpu.func @kernel1( + // + // CHECK-DPP-LABEL: gpu.func @kernel1( gpu.func @kernel1(%arg0: vector<1xf32>) kernel { // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 @@ -68,6 +77,8 @@ gpu.module @kernels { // Note stride is dropped because it is == 1. 
// CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32 // CHECK-SUB: "test.consume" + // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}} quad_perm + // CHECK-DPP: amdgpu.dpp {{.+}} row_half_mirror %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum2) : (vector<1xf32>) -> () @@ -131,6 +142,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + // CHECK-DPP-LABEL: gpu.func @kernel3_clustered( + // CHECK-DPP-SAME: %[[ARG0:.+]]: i32) gpu.func @kernel3_clustered(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -144,6 +157,14 @@ gpu.module @kernels { // CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32 // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 // CHECK-SHFL: "test.consume"(%[[A2]]) : (i32) -> () + + // CHECK-DPP: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 + // CHECK-DPP: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 + // CHECK-DPP: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 + // CHECK-DPP: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 + // CHECK-DPP: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 + // CHECK-DPP: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-DPP: "test.consume"(%[[A2]]) : (i32) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -246,6 +267,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) + // CHECK-DPP-LABEL: gpu.func @kernel5_clustered + // CHECK-DPP-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5_clustered(%arg0: i16) kernel { // CHECK-SHFL: 
%[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -257,6 +280,16 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () + + // CHECK-DPPL: %[[VAR0:.+]] =amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 + // CHECK-DPPL: %[[VAR1:.+]] =arith.addi %[[ARG0]], %[[VAR0]] : i16 + // CHECK-DPPL: %[[VAR2:.+]] =amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 + // CHECK-DPPL: %[[VAR3:.+]] =arith.addi %[[VAR1]], %[[VAR2]] : i16 + // CHECK-DPPL: %[[VAR4:.+]] =amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-DPPL: %[[VAR5:.+]] =arith.addi %[[VAR3]], %[[VAR4]] : i16 + // CHECK-DPPL: %[[VAR6:.+]] =amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-DPPL: %[[VAR7:.+]] =arith.addi %[[VAR5]], %[[VAR6]] : i16 + // CHECK-DPPL: "test.consume"(%[[VAR7]]) : (i16) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index a49d304baf5c6..7515e9050240d 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -10,10 +10,13 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/Index/IR/IndexDialect.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include
"mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/PatternMatch.h" @@ -28,8 +31,9 @@ struct TestGpuRewritePass MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestGpuRewritePass) void getDependentDialects(DialectRegistry &registry) const override { - registry.insert(); + registry.insert(); } StringRef getArgument() const final { return "test-gpu-rewrite"; } StringRef getDescription() const final { @@ -54,7 +58,8 @@ struct TestGpuSubgroupReduceLoweringPass : PassWrapper(pass) {} void getDependentDialects(DialectRegistry &registry) const override { - registry.insert(); + registry.insert(); } StringRef getArgument() const final { @@ -70,6 +75,12 @@ struct TestGpuSubgroupReduceLoweringPass llvm::cl::desc("Expand subgroup_reduce ops to shuffle ops."), llvm::cl::init(false)}; + Option target{ + *this, "target", + llvm::cl::desc("Target backend name which will be used to provide " + "compatible lowerings of subgroup reduce."), + llvm::cl::init("")}; + void runOnOperation() override { RewritePatternSet patterns(&getContext()); @@ -77,8 +88,13 @@ struct TestGpuSubgroupReduceLoweringPass // perform fewer failing matches.
populateGpuBreakDownSubgroupReducePatterns(patterns, /*maxShuffleBitwidth=*/32, - PatternBenefit(2)); + PatternBenefit(3)); if (expandToShuffles) { + auto maybeChipset = amdgpu::Chipset::parse(target); + if (!failed(maybeChipset)) { + populateGpuLowerSubgroupReduceToDPPPatterns( + patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2)); + } populateGpuLowerSubgroupReduceToShufflePatterns( patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32); populateGpuLowerClusteredSubgroupReduceToShufflePatterns( From 3392f082d573676ce2c8b87fe726b285875e4ea0 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Mon, 14 Apr 2025 16:28:48 -0500 Subject: [PATCH 15/28] Addressing PR comments Signed-off-by: Muzammiluddin Syed --- .../GPU/Transforms/SubgroupReduceLowering.cpp | 12 +++++++----- mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index a327730851ed4..a01b182501f36 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -376,8 +376,10 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, const int allRows = 0xf; const int allBanks = 0xf; const bool boundCtrl = true; - Value lane31 = b.create(loc, b.getI32Type(), 31); - Value lane63 = b.create(loc, b.getI32Type(), 63); + Value lane31 = + b.create(loc, b.getI32Type(), b.getI32IntegerAttr(31)); + Value lane63 = + b.create(loc, b.getI32Type(), b.getI32IntegerAttr(63)); if (ci.clusterSize >= 2) { auto permArg = b.getI32ArrayAttr({1, 0, 3, 2}); dppResult = b.create(loc, result.getType(), result, result, @@ -417,9 +419,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ 
false); - } else if (chipset.majorVersion == 10) { - Value uIntMaxConst = - b.create(loc, b.getI32Type(), -1); + } else if (chipset.majorVersion >= 10) { + Value uIntMaxConst = b.create(loc, b.getI32Type(), + b.getI32IntegerAttr(-1)); dppResult = b.create( loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, true, false); diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index 7515e9050240d..97f9e33290f35 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -91,7 +91,7 @@ struct TestGpuSubgroupReduceLoweringPass PatternBenefit(3)); if (expandToShuffles) { auto maybeChipset = amdgpu::Chipset::parse(target); - if (!failed(maybeChipset)) { + if (succeeded(maybeChipset)) { populateGpuLowerSubgroupReduceToDPPPatterns( patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2)); } From b59922ab89a2e20948ff07cbf743571045c134c9 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Mon, 14 Apr 2025 17:01:12 -0500 Subject: [PATCH 16/28] removing unnecessary header Signed-off-by: Muzammiluddin Syed --- mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index a01b182501f36..b0803ff050391 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -26,7 +26,6 @@ #include "llvm/Support/MathExtras.h" #include #include -#include using namespace mlir; From 6431293b3bb3741bd2e461acebb264a5ed29f81b Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 16 Apr 2025 00:27:29 -0500 Subject: [PATCH 17/28] Addressing PR comments Signed-off-by: Muzammiluddin Syed --- .../mlir/Dialect/GPU/Transforms/Passes.h | 4 +++ .../GPU/Transforms/SubgroupReduceLowering.cpp | 14 ++++++-- 
.../Dialect/GPU/subgroup-reduce-lowering.mlir | 34 ++++++++++++++----- mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 2 ++ 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index 5b185e262deb0..f113649e0c908 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -71,6 +71,10 @@ void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns, amdgpu::Chipset chipset, PatternBenefit benefit = 1); +void populateGpuLowerClusteredSubgroupReduceToDPPPatterns( + RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset, + PatternBenefit benefit = 1); + /// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns` /// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`. void populateGpuLowerClusteredSubgroupReduceToShufflePatterns( diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index b0803ff050391..0a0dc95b0c0d9 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -372,8 +372,8 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, amdgpu::Chipset chipset) { Value dppResult; Value result = input; - const int allRows = 0xf; - const int allBanks = 0xf; + constexpr int allRows = 0xf; + constexpr int allBanks = 0xf; const bool boundCtrl = true; Value lane31 = b.create(loc, b.getI32Type(), b.getI32IntegerAttr(31)); @@ -504,10 +504,18 @@ void mlir::populateGpuLowerSubgroupReduceToDPPPatterns( RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset, PatternBenefit benefit) { patterns.add(patterns.getContext(), subgroupSize, - /*matchClustered=*/true, chipset, + /*matchClustered=*/false, chipset, benefit); } +void 
mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns( + RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset, + PatternBenefit benefit) { +patterns.add(patterns.getContext(), subgroupSize, + /*matchClustered=*/true, chipset, + benefit); +} + void mlir::populateGpuLowerSubgroupReduceToShufflePatterns( RewritePatternSet &patterns, unsigned subgroupSize, unsigned shuffleBitwidth, PatternBenefit benefit) { diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 8ac1a5561aad6..018ea835ea38c 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -97,6 +97,8 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>) // // CHECK-SHFL-LABEL: gpu.func @kernel2( + // CHECK-DPP-LABEL: gpu.func @kernel2( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel { // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8> // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> () @@ -114,6 +116,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + // CHECK-DPP-LABEL: gpu.func @kernel3( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel3(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -174,6 +178,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + // CHECK-DPP-LABEL: gpu.func @kernel3_clustered_strided( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel3_clustered_strided(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32 @@ -196,6 +202,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4( // 
CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) + // CHECK-DPP-LABEL: gpu.func @kernel4( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel4(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -232,6 +240,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) + // CHECK-DPP-LABEL: gpu.func @kernel4_clustered( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -247,6 +257,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) + // CHECK-DPP-LABEL: gpu.func @kernel5( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel5(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -281,15 +293,15 @@ gpu.module @kernels { // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () - // CHECK-DPPL: %[[VAR0:.+]] =amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 - // CHECK-DPPL: %[[VAR1:.+]] =arith.addi %[[ARG0]], %[[VAR0]] : i16 - // CHECK-DPPL: %[[VAR2:.+]] =amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 - // CHECK-DPPL: %[[VAR3:.+]] =arith.addi %[[VAR1]], %[[VAR2]] : i16 - // CHECK-DPPL: %[[VAR4:.+]] =amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-DPPL: %[[VAR5:.+]] =arith.addi %[[VAR3]], %[[VAR4]] : i16 - // CHECK-DPPL: %[[VAR6:.+]] =amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 - // CHECK-DPPL: %[[VAR7:.+]] =arith.addi %[[VAR5]], %[[VAR6]] : i16 - // CHECK-DPPL: "test.consume"(%[[VAR7]]) : 
(i16) -> () + // CHECK-DPP: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 + // CHECK-DPP: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR3:.+]] = arith.addi %[[VAR1]], %[[VAR2]] : i16 + // CHECK-DPP: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 + // CHECK-DPP: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 + // CHECK-DPP: "test.consume"(%[[VAR7]]) : (i16) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () @@ -299,6 +311,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel6( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) + // CHECK-DPP-LABEL: gpu.func @kernel6( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel6(%arg0: vector<3xi8>) kernel { // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8> // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8> @@ -322,6 +336,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) + // CHECK-DPP-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel_cluster_size_is_subgroup_size(%arg0: vector<3xi8>) kernel { // CHECK-SHFL-COUNT-5: gpu.shuffle xor %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (vector<3xi8>) -> (vector<3xi8>) diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index 
97f9e33290f35..f34b882c1be86 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -94,6 +94,8 @@ struct TestGpuSubgroupReduceLoweringPass if (succeeded(maybeChipset)) { populateGpuLowerSubgroupReduceToDPPPatterns( patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2)); + populateGpuLowerClusteredSubgroupReduceToDPPPatterns( + patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2)); } populateGpuLowerSubgroupReduceToShufflePatterns( patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32); From ae25fa0a55e931f1450120c1d5d298f3f0d27d49 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 16 Apr 2025 01:06:47 -0500 Subject: [PATCH 18/28] moving permlanex16 changes to another commit Signed-off-by: Muzammiluddin Syed --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 +--- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 ------ mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 14 ------------- .../GPU/Transforms/SubgroupReduceLowering.cpp | 21 ++++++++++--------- mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir | 8 ------- 5 files changed, 12 insertions(+), 41 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 17c1162170073..108d7237ff703 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -524,8 +524,7 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm", I32EnumAttrCase<"row_mirror", 8>, I32EnumAttrCase<"row_half_mirror", 9>, I32EnumAttrCase<"row_bcast_15", 10>, - I32EnumAttrCase<"row_bcast_31", 11>, - I32EnumAttrCase<"row_share", 12> + I32EnumAttrCase<"row_bcast_31", 11> ]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::amdgpu"; @@ -558,7 +557,6 @@ def AMDGPU_DPPOp : AMDGPU_Op<"dpp", [SameTypeOperands, AllTypesMatch<["result", - Reverse within a half-row (`row_half_mirror`) - Broadcast the 15th lane of each row to the next row (`row_bcast`) - Broadcast lane 31 
to rows 2 and 3 (`row_bcast`) - - Broadcast a lane [0-15] within row 0 to all lanes of row 0 (`row_share`) }]; let results = (outs AnyType:$result); let assemblyFormat = [{ diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 4d343c8f3200c..5f697bdeef566 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -1293,7 +1293,6 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern { ROW_HALF_MIRROR = 0x141, BCAST15 = 0x142, BCAST31 = 0x143, - ROW_SHARE0 = 0x150 }; auto kind = DppOp.getKind(); @@ -1351,11 +1350,6 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern { case DPPPerm::row_bcast_31: DppCtrl = DppCtrl::BCAST31; break; - case DPPPerm::row_share: - if (auto intAttr = cast(*permArgument)) { - DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHARE0; - } - break; } // Check for row_mask, bank_mask, bound_ctrl if they exist and create diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index af4438f028542..549a4376a4a04 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -468,20 +468,6 @@ LogicalResult DPPOp::verify() { } break; } - - case DPPPerm::row_share: { - if (!permArgument) { - return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) + - "' value not specified"); - } - if (auto intAttr = dyn_cast(permArgument)) { - uint32_t attrValue = intAttr.getInt(); - if (attrValue < 0 || attrValue > 15) { - return emitOpError( - "Attribute value for 'row_share' must be between 0 and 15"); - } - } - } break; } return success(); } diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 0a0dc95b0c0d9..77201f319164f 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ 
b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -366,7 +366,7 @@ struct VectorSubgroupReduceToShuffles final bool matchClustered = false; }; -Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, +std::optional createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, gpu::AllReduceOperation mode, const ClusterInfo &ci, amdgpu::Chipset chipset) { @@ -418,12 +418,8 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, dppResult = b.create( loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false); - } else if (chipset.majorVersion >= 10) { - Value uIntMaxConst = b.create(loc, b.getI32Type(), - b.getI32IntegerAttr(-1)); - dppResult = b.create( - loc, input.getType(), result, result, uIntMaxConst, uIntMaxConst, - true, false); + } else { + return std::nullopt; } result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), result, dppResult); @@ -479,9 +475,14 @@ struct ScalarSubgroupReduceToDPP final op, "value type is not a compatible scalar"); Location loc = op.getLoc(); - rewriter.replaceOp(op, - createSubgroupDPPReduction(rewriter, loc, op.getValue(), - op.getOp(), *ci, chipset)); + std::optional dpp = createSubgroupDPPReduction( + rewriter, loc, op.getValue(), op.getOp(), *ci, chipset); + if (!dpp) + return rewriter.notifyMatchFailure( + op, "Subgroup reduce lowering to DPP not currently supported for " + "this device."); + + rewriter.replaceOp(op, *dpp); return success(); } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir index 64b3328b70ab4..14691e73e62d7 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/dpp.mlir @@ -137,11 +137,3 @@ func.func @row_bcast_update_dpp_f16(%arg0: f16, %arg1: f16) -> f16 { %0 = amdgpu.dpp %arg0 %arg1 row_bcast_15 { bound_ctrl = true } : f16 return %0 : f16 } - -func.func 
@dpp_row_share(%arg0: i32, %arg1: i32) -> i32 { - // CHECK-LABEL: func @dpp_row_share - // CHECK: rocdl.update.dpp %arg0, %arg1 with 351, 15, 15, false : i32 - // CHECK: return %0 : i32 - %0 = amdgpu.dpp %arg0 %arg1 row_share ( 0xf : i32 ) : i32 - return %0 : i32 -} From 97450983127a0ce7ca43d4e120fe84215225ebbd Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 16 Apr 2025 02:27:29 -0500 Subject: [PATCH 19/28] fixing test Signed-off-by: Muzammiluddin Syed --- .../Dialect/GPU/subgroup-reduce-lowering.mlir | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 018ea835ea38c..11db35e31588b 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -32,11 +32,15 @@ gpu.module @kernels { // CHECK-SUB: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16 // CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16> // CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> () + // CHECK-DPP-COUNT-6: amdgpu.dpp + // CHECK-DPP: rocdl.readlane %sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum0) : (vector<5xf16>) -> () // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform // CHECK-SUB: "test.consume" + // CHECK-DPP-COUNT-6: amdgpu.dpp + // CHECK-DPP: rocdl.readlane %sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum1) : (vector<5xf16>) -> () @@ -66,11 +70,15 @@ gpu.module @kernels { // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 // CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32> // CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> () + // CHECK-DPP-COUNT-6: amdgpu.dpp + // CHECK-DPP: rocdl.readlane %sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>) 
"test.consume"(%sum0) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32 // CHECK-SUB: "test.consume" + // CHECK-DPP-COUNT-6: amdgpu.dpp + // CHECK-DPP: rocdl.readlane %sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum1) : (vector<1xf32>) -> () @@ -84,6 +92,7 @@ gpu.module @kernels { // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32 // CHECK-SUB: "test.consume" + // CHECK-DPP-NOT: amdgpu.dpp %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum3) : (vector<1xf32>) -> () @@ -137,6 +146,9 @@ gpu.module @kernels { // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32 // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32 // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> () + + // CHECK-DPP-COUNT-6: amdgpu.dpp + // CHECK-DPP: rocdl.readlane %sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -258,7 +270,6 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) // CHECK-DPP-LABEL: gpu.func @kernel5( - // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel5(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -270,6 +281,8 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () + // CHECK-DPP-COUNT-6: amdgpu.dpp + // CHECK-DPP: rocdl.readlane %sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () From a6c35b3a88cc22eb5f01447cdd69f5b1c017fd4a Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 16 Apr 2025 10:38:42 -0500 Subject: [PATCH 20/28] fixing code formatting 
Signed-off-by: Muzammiluddin Syed --- .../mlir/Dialect/GPU/Transforms/Passes.h | 14 ++++++----- .../GPU/Transforms/SubgroupReduceLowering.cpp | 23 ++++++++++--------- mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 10 ++++---- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index f113649e0c908..a13ad33df29cd 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -63,6 +63,12 @@ void populateGpuLowerSubgroupReduceToShufflePatterns( RewritePatternSet &patterns, unsigned subgroupSize, unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1); +/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns` +/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`. +void populateGpuLowerClusteredSubgroupReduceToShufflePatterns( + RewritePatternSet &patterns, unsigned subgroupSize, + unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1); + /// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp` /// ops over scalar types. Assumes that the subgroup has /// `subgroupSize` lanes. Applicable only to AMD GPUs. @@ -71,16 +77,12 @@ void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns, amdgpu::Chipset chipset, PatternBenefit benefit = 1); +/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns` +/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`. void populateGpuLowerClusteredSubgroupReduceToDPPPatterns( RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset, PatternBenefit benefit = 1); -/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns` -/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`. 
-void populateGpuLowerClusteredSubgroupReduceToShufflePatterns( - RewritePatternSet &patterns, unsigned subgroupSize, - unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1); - /// Collect all patterns to rewrite ops within the GPU dialect. inline void populateGpuRewritePatterns(RewritePatternSet &patterns) { populateGpuAllReducePatterns(patterns); diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 77201f319164f..55176f5b10959 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -10,13 +10,13 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" #include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" -#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/GPU/Utils/GPUUtils.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Location.h" @@ -366,10 +366,11 @@ struct VectorSubgroupReduceToShuffles final bool matchClustered = false; }; -std::optional createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input, - gpu::AllReduceOperation mode, - const ClusterInfo &ci, - amdgpu::Chipset chipset) { +std::optional createSubgroupDPPReduction(OpBuilder &b, Location loc, + Value input, + gpu::AllReduceOperation mode, + const ClusterInfo &ci, + amdgpu::Chipset chipset) { Value dppResult; Value result = input; constexpr int allRows = 0xf; @@ -510,11 +511,11 @@ void mlir::populateGpuLowerSubgroupReduceToDPPPatterns( } void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns( - RewritePatternSet &patterns, unsigned 
subgroupSize, amdgpu::Chipset chipset, - PatternBenefit benefit) { -patterns.add(patterns.getContext(), subgroupSize, - /*matchClustered=*/true, chipset, - benefit); + RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset, + PatternBenefit benefit) { + patterns.add(patterns.getContext(), subgroupSize, + /*matchClustered=*/true, chipset, + benefit); } void mlir::populateGpuLowerSubgroupReduceToShufflePatterns( diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index f34b882c1be86..fe402da4cc105 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -31,9 +31,8 @@ struct TestGpuRewritePass MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestGpuRewritePass) void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry.insert(); } StringRef getArgument() const final { return "test-gpu-rewrite"; } StringRef getDescription() const final { @@ -58,8 +57,9 @@ struct TestGpuSubgroupReduceLoweringPass : PassWrapper(pass) {} void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry + .insert(); } StringRef getArgument() const final { From 8a9cefb8bf364a961639e9b209d9a78f658a9d26 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 16 Apr 2025 15:28:57 -0500 Subject: [PATCH 21/28] Updating implementation to support gfx 10+ Signed-off-by: Muzammiluddin Syed --- .../GPU/Transforms/SubgroupReduceLowering.cpp | 148 +++++++++++------- 1 file changed, 90 insertions(+), 58 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 55176f5b10959..c1dedd9216a14 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -22,6 +22,7 @@ #include "mlir/IR/Location.h" #include "mlir/IR/PatternMatch.h" 
#include "mlir/IR/TypeUtilities.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include @@ -371,72 +372,103 @@ std::optional createSubgroupDPPReduction(OpBuilder &b, Location loc, gpu::AllReduceOperation mode, const ClusterInfo &ci, amdgpu::Chipset chipset) { - Value dppResult; Value result = input; constexpr int allRows = 0xf; constexpr int allBanks = 0xf; const bool boundCtrl = true; - Value lane31 = - b.create(loc, b.getI32Type(), b.getI32IntegerAttr(31)); - Value lane63 = - b.create(loc, b.getI32Type(), b.getI32IntegerAttr(63)); - if (ci.clusterSize >= 2) { - auto permArg = b.getI32ArrayAttr({1, 0, 3, 2}); - dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::quad_perm, permArg, - allRows, allBanks, boundCtrl); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - } - - if (ci.clusterSize >= 4) { - auto permArg = b.getI32ArrayAttr({2, 3, 0, 1}); - dppResult = b.create(loc, result.getType(), result, result, - amdgpu::DPPPerm::quad_perm, permArg, - allRows, allBanks, boundCtrl); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - } - - if (ci.clusterSize >= 8) { - dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_half_mirror, - b.getUnitAttr(), allRows, allBanks, boundCtrl); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - } - - if (ci.clusterSize >= 16) { - dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_mirror, - b.getUnitAttr(), allRows, allBanks, boundCtrl); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - } - - if (ci.clusterSize >= 32) { - if (chipset.majorVersion <= 9) { + Value lane0 = + b.create(loc, b.getI32Type(), b.getI32IntegerAttr(0)); + Value lane32 = + b.create(loc, 
b.getI32Type(), b.getI32IntegerAttr(32)); + + auto dppReduceAcrossLanes = [&](int numLanes, + Value res) -> std::optional { + Value dppResult, laneVal; + + switch (numLanes) { + case 2: + // Perform reduction between all lanes N <-> N+1. + dppResult = b.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, + b.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl); + break; + case 4: + // Perform reduction between all lanes N <-> N+2. dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_15, - b.getUnitAttr(), 0xa, allBanks, /*bound_ctrl*/ false); - } else { + loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, + b.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl); + break; + case 8: + // Perform reduction between all lanes N <-> 7-N, + // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4]. + dppResult = b.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror, + b.getUnitAttr(), allRows, allBanks, boundCtrl); + break; + case 16: + // Perform reduction between all lanes N <-> 15-N, + // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8]. + dppResult = b.create( + loc, result.getType(), res, res, amdgpu::DPPPerm::row_mirror, + b.getUnitAttr(), allRows, allBanks, boundCtrl); + break; + case 32: + if (chipset.majorVersion <= 9) { + // Broadcast last value from each row to next row. + // Use row mask to avoid polluting rows 1 and 3. + dppResult = b.create(loc, res.getType(), res, res, + amdgpu::DPPPerm::row_bcast_15, + b.getUnitAttr(), 0xa, allBanks, + /*bound_ctrl*/ false); + } else if (chipset.majorVersion <= 12) { + // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). 
+ dppResult = b.create(loc, res.getType(), res, res, + -1, -1, /*fi=*/true, + /*bound_ctrl=*/false); + if (ci.subgroupSize == 32) { + dppResult = + b.create(loc, res.getType(), res, lane0); + } + } else { + return std::nullopt; + } + break; + case 64: + if (chipset.majorVersion <= 9) { + // Broadcast 31st lane value to rows 2 and 3. + // Use row mask to avoid polluting rows 0 and 1. + dppResult = b.create(loc, res.getType(), res, res, + amdgpu::DPPPerm::row_bcast_31, + b.getUnitAttr(), 0xc, allBanks, + /*bound_ctrl*/ false); + } else if (chipset.majorVersion <= 12) { + // Assume reduction across 32 lanes has been done. + // Perform final reduction manually by summing values in lane 0 and + // lane 32. + dppResult = + b.create(loc, res.getType(), res, lane32); + laneVal = b.create(loc, res.getType(), res, lane0); + return vector::makeArithReduction( + b, loc, gpu::convertReductionKind(mode), dppResult, laneVal); + } else { + return std::nullopt; + } + break; + default: + // Should never reach here given previous validation of ClusterInfo. 
+ llvm_unreachable("ERROR: Unexpected cluster size."); return std::nullopt; } - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - if (ci.subgroupSize == 32) { - result = - b.create(loc, input.getType(), result, lane31); + return vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), + res, dppResult); + }; + + for (unsigned cs = 2; cs <= ci.clusterSize; cs = cs << 1) { + if (auto dpp = dppReduceAcrossLanes(cs, result)) { + result = *dpp; + continue; } - } - - if (ci.clusterSize == 64) { - dppResult = b.create( - loc, result.getType(), result, result, amdgpu::DPPPerm::row_bcast_31, - b.getUnitAttr(), 0xc, allBanks, /*bound_ctrl*/ false); - result = vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - result, dppResult); - result = b.create(loc, input.getType(), result, lane63); + return std::nullopt; } assert(result.getType() == input.getType()); From c39520333c5ee1b779c324c51805718fbb8de963 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 16 Apr 2025 15:54:08 -0500 Subject: [PATCH 22/28] Small formatting change Signed-off-by: Muzammiluddin Syed --- mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index c1dedd9216a14..28c569ee2bd83 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -463,7 +463,7 @@ std::optional createSubgroupDPPReduction(OpBuilder &b, Location loc, res, dppResult); }; - for (unsigned cs = 2; cs <= ci.clusterSize; cs = cs << 1) { + for (unsigned cs = 2; cs <= ci.clusterSize; cs <<= 1) { if (auto dpp = dppReduceAcrossLanes(cs, result)) { result = *dpp; continue; From ab15c44eea3e0d4fff1cb133a059e459b62229a7 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 16 Apr 2025 
16:50:03 -0500 Subject: [PATCH 23/28] Removing ReadlaneOps from test Signed-off-by: Muzammiluddin Syed --- mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir index 11db35e31588b..139edf6882df6 100644 --- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir +++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir @@ -33,14 +33,12 @@ gpu.module @kernels { // CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16> // CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> () // CHECK-DPP-COUNT-6: amdgpu.dpp - // CHECK-DPP: rocdl.readlane %sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum0) : (vector<5xf16>) -> () // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform // CHECK-SUB: "test.consume" // CHECK-DPP-COUNT-6: amdgpu.dpp - // CHECK-DPP: rocdl.readlane %sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum1) : (vector<5xf16>) -> () @@ -71,14 +69,12 @@ gpu.module @kernels { // CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32> // CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> () // CHECK-DPP-COUNT-6: amdgpu.dpp - // CHECK-DPP: rocdl.readlane %sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum0) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32 // CHECK-SUB: "test.consume" // CHECK-DPP-COUNT-6: amdgpu.dpp - // CHECK-DPP: rocdl.readlane %sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum1) : (vector<1xf32>) -> () @@ -148,7 +144,6 @@ gpu.module @kernels { // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> () // CHECK-DPP-COUNT-6: amdgpu.dpp - // CHECK-DPP: rocdl.readlane %sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> 
i32 "test.consume"(%sum0) : (i32) -> () @@ -282,7 +277,6 @@ gpu.module @kernels { // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () // CHECK-DPP-COUNT-6: amdgpu.dpp - // CHECK-DPP: rocdl.readlane %sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () From 55f442e488acc1bec1b2ecdd50564a884be87738 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 16 Apr 2025 16:51:09 -0500 Subject: [PATCH 24/28] Improve dpp implementation Signed-off-by: Muzammiluddin Syed --- .../GPU/Transforms/SubgroupReduceLowering.cpp | 215 +++++++++--------- 1 file changed, 107 insertions(+), 108 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 28c569ee2bd83..2200754f55938 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -367,112 +367,112 @@ struct VectorSubgroupReduceToShuffles final bool matchClustered = false; }; -std::optional createSubgroupDPPReduction(OpBuilder &b, Location loc, - Value input, - gpu::AllReduceOperation mode, - const ClusterInfo &ci, - amdgpu::Chipset chipset) { - Value result = input; +FailureOr +createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op, + Value input, gpu::AllReduceOperation mode, + const ClusterInfo &ci, amdgpu::Chipset chipset) { + Location loc = op.getLoc(); + Value dpp; + Value res = input; constexpr int allRows = 0xf; constexpr int allBanks = 0xf; const bool boundCtrl = true; - Value lane0 = - b.create(loc, b.getI32Type(), b.getI32IntegerAttr(0)); - Value lane32 = - b.create(loc, b.getI32Type(), b.getI32IntegerAttr(32)); - - auto dppReduceAcrossLanes = [&](int numLanes, - Value res) -> std::optional { - Value dppResult, laneVal; - - switch (numLanes) { - case 2: - // Perform reduction between all lanes N <-> N+1. 
- dppResult = b.create( - loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, - b.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl); - break; - case 4: - // Perform reduction between all lanes N <-> N+2. - dppResult = b.create( - loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, - b.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl); - break; - case 8: - // Perform reduction between all lanes N <-> 7-N, - // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4]. - dppResult = b.create( - loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror, - b.getUnitAttr(), allRows, allBanks, boundCtrl); - break; - case 16: - // Perform reduction between all lanes N <-> 15-N, - // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8]. - dppResult = b.create( - loc, result.getType(), res, res, amdgpu::DPPPerm::row_mirror, - b.getUnitAttr(), allRows, allBanks, boundCtrl); - break; - case 32: - if (chipset.majorVersion <= 9) { - // Broadcast last value from each row to next row. - // Use row mask to avoid polluting rows 1 and 3. - dppResult = b.create(loc, res.getType(), res, res, - amdgpu::DPPPerm::row_bcast_15, - b.getUnitAttr(), 0xa, allBanks, - /*bound_ctrl*/ false); - } else if (chipset.majorVersion <= 12) { - // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). - dppResult = b.create(loc, res.getType(), res, res, - -1, -1, /*fi=*/true, - /*bound_ctrl=*/false); - if (ci.subgroupSize == 32) { - dppResult = - b.create(loc, res.getType(), res, lane0); - } - } else { - return std::nullopt; - } - break; - case 64: - if (chipset.majorVersion <= 9) { - // Broadcast 31st lane value to rows 2 and 3. - // Use row mask to avoid polluting rows 0 and 1. - dppResult = b.create(loc, res.getType(), res, res, - amdgpu::DPPPerm::row_bcast_31, - b.getUnitAttr(), 0xc, allBanks, - /*bound_ctrl*/ false); - } else if (chipset.majorVersion <= 12) { - // Assume reduction across 32 lanes has been done. 
- // Perform final reduction manually by summing values in lane 0 and - // lane 32. - dppResult = - b.create(loc, res.getType(), res, lane32); - laneVal = b.create(loc, res.getType(), res, lane0); - return vector::makeArithReduction( - b, loc, gpu::convertReductionKind(mode), dppResult, laneVal); - } else { - return std::nullopt; + if (ci.clusterSize >= 2) { + // Perform reduction between all lanes N <-> N+1. + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, + rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl); + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); + } + + if (ci.clusterSize >= 4) { + // Perform reduction between all lanes N <-> N+2. + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, + rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl); + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); + } + if (ci.clusterSize >= 8) { + // Perform reduction between all lanes N <-> 7-N, + // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4]. + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror, + rewriter.getUnitAttr(), allRows, allBanks, boundCtrl); + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); + } + if (ci.clusterSize >= 16) { + // Perform reduction between all lanes N <-> 15-N, + // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8]. + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror, + rewriter.getUnitAttr(), allRows, allBanks, boundCtrl); + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); + } + if (ci.clusterSize >= 32) { + if (chipset.majorVersion <= 9) { + // Broadcast last value from each row to next row. + // Use row mask to avoid polluting rows 1 and 3. 
+ dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15, + rewriter.getUnitAttr(), 0xa, allBanks, + /*bound_ctrl*/ false); + res = vector::makeArithReduction( + rewriter, loc, gpu::convertReductionKind(mode), res, dpp); + } else if (chipset.majorVersion <= 12) { + // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). + Value uint32Max = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1)); + dpp = rewriter.create(loc, res.getType(), res, res, + uint32Max, uint32Max, + /*fi=*/true, + /*bound_ctrl=*/false); + res = vector::makeArithReduction( + rewriter, loc, gpu::convertReductionKind(mode), res, dpp); + if (ci.subgroupSize == 32) { + Value lane0 = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); + dpp = + rewriter.create(loc, res.getType(), res, lane0); } - break; - default: - // Should never reach here given previous validation of ClusterInfo. - llvm_unreachable("ERROR: Unexpected cluster size."); - return std::nullopt; + } else { + return rewriter.notifyMatchFailure( + op, "Subgroup reduce lowering to DPP not currently supported for " + "this device."); } - return vector::makeArithReduction(b, loc, gpu::convertReductionKind(mode), - res, dppResult); - }; - - for (unsigned cs = 2; cs <= ci.clusterSize; cs <<= 1) { - if (auto dpp = dppReduceAcrossLanes(cs, result)) { - result = *dpp; - continue; + } + if (ci.clusterSize >= 64) { + if (chipset.majorVersion <= 9) { + // Broadcast 31st lane value to rows 2 and 3. + // Use row mask to avoid polluting rows 0 and 1. + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31, + rewriter.getUnitAttr(), 0xc, allBanks, + /*bound_ctrl*/ false); + + } else if (chipset.majorVersion <= 12) { + // Assume reduction across 32 lanes has been done. + // Perform final reduction manually by summing values in lane 0 and + // lane 32. 
+ Value lane0 = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); + Value lane32 = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32)); + dpp = rewriter.create(loc, res.getType(), res, lane32); + res = rewriter.create(loc, res.getType(), res, lane0); + } else { + return rewriter.notifyMatchFailure( + op, "Subgroup reduce lowering to DPP not currently supported for " + "this device."); } - return std::nullopt; + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); } - - assert(result.getType() == input.getType()); - return result; + assert(res.getType() == input.getType()); + return res; } /// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp` @@ -500,22 +500,21 @@ struct ScalarSubgroupReduceToDPP final return failure(); if (ci->clusterStride != 1) - return failure(); + return rewriter.notifyMatchFailure( + op, "Supgroup reductions using DPP are currently only available for " + "clusters of contiguous lanes."); Type valueTy = op.getType(); if (!valueTy.isIntOrFloat()) return rewriter.notifyMatchFailure( op, "value type is not a compatible scalar"); - Location loc = op.getLoc(); - std::optional dpp = createSubgroupDPPReduction( - rewriter, loc, op.getValue(), op.getOp(), *ci, chipset); - if (!dpp) - return rewriter.notifyMatchFailure( - op, "Subgroup reduce lowering to DPP not currently supported for " - "this device."); + FailureOr dpp = createSubgroupDPPReduction( + rewriter, op, op.getValue(), op.getOp(), *ci, chipset); + if (failed(dpp)) + return failure(); - rewriter.replaceOp(op, *dpp); + rewriter.replaceOp(op, dpp.value()); return success(); } From 644228894d572de1dc4790a0f614e827c5ca8f9a Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Wed, 16 Apr 2025 22:07:58 -0500 Subject: [PATCH 25/28] fixing formatting Signed-off-by: Muzammiluddin Syed --- .../Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 10 +++++----- 1 file changed, 5 
insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 2200754f55938..56fae105c4d45 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -425,7 +425,7 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op, } else if (chipset.majorVersion <= 12) { // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). Value uint32Max = rewriter.create( - loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1)); + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1)); dpp = rewriter.create(loc, res.getType(), res, res, uint32Max, uint32Max, /*fi=*/true, @@ -440,8 +440,8 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op, } } else { return rewriter.notifyMatchFailure( - op, "Subgroup reduce lowering to DPP not currently supported for " - "this device."); + op, "Subgroup reduce lowering to DPP not currently supported for " + "this device."); } } if (ci.clusterSize >= 64) { @@ -465,8 +465,8 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op, res = rewriter.create(loc, res.getType(), res, lane0); } else { return rewriter.notifyMatchFailure( - op, "Subgroup reduce lowering to DPP not currently supported for " - "this device."); + op, "Subgroup reduce lowering to DPP not currently supported for " + "this device."); } res = vector::makeArithReduction(rewriter, loc, gpu::convertReductionKind(mode), res, dpp); From 848c6baa2aad527a98317182e83e96bc06eb9b88 Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Tue, 22 Apr 2025 10:40:47 -0500 Subject: [PATCH 26/28] Fixing implementation of DPP subgroup reduce Signed-off-by: Muzammiluddin Syed --- mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index 56fae105c4d45..f2fc9a4e39bcd 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -368,7 +368,7 @@ struct VectorSubgroupReduceToShuffles final }; FailureOr -createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op, +createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op, Value input, gpu::AllReduceOperation mode, const ClusterInfo &ci, amdgpu::Chipset chipset) { Location loc = op.getLoc(); @@ -435,7 +435,7 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp &op, if (ci.subgroupSize == 32) { Value lane0 = rewriter.create( loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); - dpp = + res = rewriter.create(loc, res.getType(), res, lane0); } } else { From ad77f7a3c5238784dd8913fcf55360d61134279e Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Tue, 22 Apr 2025 22:50:06 -0500 Subject: [PATCH 27/28] [mlir][AMDGPU] Improving DPP implementation of subgroup reduce Signed-off-by: Muzammiluddin Syed --- .../mlir/Dialect/GPU/Transforms/Passes.h | 4 +- .../include/mlir/Dialect/GPU/Utils/GPUUtils.h | 2 + .../mlir/Dialect/GPU/Utils/ReductionUtils.h | 41 +++ mlir/lib/Dialect/GPU/CMakeLists.txt | 1 + .../GPU/Transforms/SubgroupReduceLowering.cpp | 332 ++++++++++-------- mlir/lib/Dialect/GPU/Utils/CMakeLists.txt | 5 +- mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp | 170 +++++++++ mlir/lib/Dialect/GPU/Utils/Utils.cpp | 26 ++ mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 4 +- 9 files changed, 431 insertions(+), 154 deletions(-) create mode 100644 mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h create mode 100644 mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index
a13ad33df29cd..5c63ad5f32b71 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -74,13 +74,15 @@ void populateGpuLowerClusteredSubgroupReduceToShufflePatterns( /// `subgroupSize` lanes. Applicable only to AMD GPUs. void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns, unsigned subgroupSize, + unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit = 1); /// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns` /// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`. void populateGpuLowerClusteredSubgroupReduceToDPPPatterns( - RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset, + RewritePatternSet &patterns, unsigned subgroupSize, + unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit = 1); /// Collect all patterns to rewrite ops within the GPU dialect. diff --git a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h index 073493971e6b7..a55f0e1f09a36 100644 --- a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h +++ b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h @@ -29,6 +29,8 @@ class LaunchOp; /// Returns the matching vector combining kind. vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode); +/// Returns the matching gpu allreduce mode. 
+gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind); } // namespace gpu /// Get a gpu.func created from outlining the region of a gpu.launch op with the diff --git a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h new file mode 100644 index 0000000000000..f766dab8c02df --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h @@ -0,0 +1,41 @@ +//===- ReductionUtils.h - Reduction Utilities -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_ +#define MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_ + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" + +namespace mlir { + +struct ClusterInfo { + unsigned clusterStride; + unsigned clusterSize; + unsigned subgroupSize; +}; + +FailureOr getAndValidateClusterInfo(gpu::SubgroupReduceOp op, + unsigned subgroupSize); + +FailureOr +createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op, + Value input, gpu::AllReduceOperation mode, + const ClusterInfo &ci, amdgpu::Chipset chipset, + function_ref packFn, + function_ref unpackFn); + +} // namespace mlir + +#endif // MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_ diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index 013311ec027da..1074760aa959e 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ 
b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -53,6 +53,7 @@ add_mlir_dialect_library(MLIRGPUTransforms LINK_LIBS PUBLIC MLIRAffineUtils + MLIRAMDGPUDialect MLIRArithDialect MLIRAsyncDialect MLIRBufferizationDialect diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index f2fc9a4e39bcd..57af63cbe5eca 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/GPU/Utils/GPUUtils.h" +#include "mlir/Dialect/GPU/Utils/ReductionUtils.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/BuiltinTypes.h" @@ -148,34 +149,34 @@ struct ScalarizeSingleElementReduce final } }; -struct ClusterInfo { - unsigned clusterStride; - unsigned clusterSize; - unsigned subgroupSize; -}; - -static FailureOr -getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) { - assert(llvm::isPowerOf2_32(subgroupSize)); - - std::optional clusterSize = op.getClusterSize(); - assert(!clusterSize || - llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this. - if (clusterSize && *clusterSize > subgroupSize) - return op.emitOpError() - << "cluster size " << *clusterSize - << " is greater than subgroup size " << subgroupSize; - unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); - - auto clusterStride = op.getClusterStride(); - assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this. 
- if (clusterStride >= subgroupSize) - return op.emitOpError() - << "cluster stride " << clusterStride - << " is not less than subgroup size " << subgroupSize; - - return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize}; -} +// struct ClusterInfo { +// unsigned clusterStride; +// unsigned clusterSize; +// unsigned subgroupSize; +// }; + +// static FailureOr +// getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) { +// assert(llvm::isPowerOf2_32(subgroupSize)); + +// std::optional clusterSize = op.getClusterSize(); +// assert(!clusterSize || +// llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this. +// if (clusterSize && *clusterSize > subgroupSize) +// return op.emitOpError() +// << "cluster size " << *clusterSize +// << " is greater than subgroup size " << subgroupSize; +// unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); + +// auto clusterStride = op.getClusterStride(); +// assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this. +// if (clusterStride >= subgroupSize) +// return op.emitOpError() +// << "cluster stride " << clusterStride +// << " is not less than subgroup size " << subgroupSize; + +// return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize}; +// } /// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn` /// and `unpackFn` to convert to the native shuffle type and to the reduction @@ -367,113 +368,113 @@ struct VectorSubgroupReduceToShuffles final bool matchClustered = false; }; -FailureOr -createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op, - Value input, gpu::AllReduceOperation mode, - const ClusterInfo &ci, amdgpu::Chipset chipset) { - Location loc = op.getLoc(); - Value dpp; - Value res = input; - constexpr int allRows = 0xf; - constexpr int allBanks = 0xf; - const bool boundCtrl = true; - if (ci.clusterSize >= 2) { - // Perform reduction between all lanes N <-> N+1. 
- dpp = rewriter.create( - loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, - rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl); - res = vector::makeArithReduction(rewriter, loc, - gpu::convertReductionKind(mode), res, dpp); - } - - if (ci.clusterSize >= 4) { - // Perform reduction between all lanes N <-> N+2. - dpp = rewriter.create( - loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, - rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl); - res = vector::makeArithReduction(rewriter, loc, - gpu::convertReductionKind(mode), res, dpp); - } - if (ci.clusterSize >= 8) { - // Perform reduction between all lanes N <-> 7-N, - // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4]. - dpp = rewriter.create( - loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror, - rewriter.getUnitAttr(), allRows, allBanks, boundCtrl); - res = vector::makeArithReduction(rewriter, loc, - gpu::convertReductionKind(mode), res, dpp); - } - if (ci.clusterSize >= 16) { - // Perform reduction between all lanes N <-> 15-N, - // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8]. - dpp = rewriter.create( - loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror, - rewriter.getUnitAttr(), allRows, allBanks, boundCtrl); - res = vector::makeArithReduction(rewriter, loc, - gpu::convertReductionKind(mode), res, dpp); - } - if (ci.clusterSize >= 32) { - if (chipset.majorVersion <= 9) { - // Broadcast last value from each row to next row. - // Use row mask to avoid polluting rows 1 and 3. - dpp = rewriter.create( - loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15, - rewriter.getUnitAttr(), 0xa, allBanks, - /*bound_ctrl*/ false); - res = vector::makeArithReduction( - rewriter, loc, gpu::convertReductionKind(mode), res, dpp); - } else if (chipset.majorVersion <= 12) { - // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). 
- Value uint32Max = rewriter.create( - loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1)); - dpp = rewriter.create(loc, res.getType(), res, res, - uint32Max, uint32Max, - /*fi=*/true, - /*bound_ctrl=*/false); - res = vector::makeArithReduction( - rewriter, loc, gpu::convertReductionKind(mode), res, dpp); - if (ci.subgroupSize == 32) { - Value lane0 = rewriter.create( - loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); - res = - rewriter.create(loc, res.getType(), res, lane0); - } - } else { - return rewriter.notifyMatchFailure( - op, "Subgroup reduce lowering to DPP not currently supported for " - "this device."); - } - } - if (ci.clusterSize >= 64) { - if (chipset.majorVersion <= 9) { - // Broadcast 31st lane value to rows 2 and 3. - // Use row mask to avoid polluting rows 0 and 1. - dpp = rewriter.create( - loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31, - rewriter.getUnitAttr(), 0xc, allBanks, - /*bound_ctrl*/ false); - - } else if (chipset.majorVersion <= 12) { - // Assume reduction across 32 lanes has been done. - // Perform final reduction manually by summing values in lane 0 and - // lane 32. 
- Value lane0 = rewriter.create( - loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); - Value lane32 = rewriter.create( - loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32)); - dpp = rewriter.create(loc, res.getType(), res, lane32); - res = rewriter.create(loc, res.getType(), res, lane0); - } else { - return rewriter.notifyMatchFailure( - op, "Subgroup reduce lowering to DPP not currently supported for " - "this device."); - } - res = vector::makeArithReduction(rewriter, loc, - gpu::convertReductionKind(mode), res, dpp); - } - assert(res.getType() == input.getType()); - return res; -} +// FailureOr +// createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op, +// Value input, gpu::AllReduceOperation mode, +// const ClusterInfo &ci, amdgpu::Chipset chipset) { +// Location loc = op.getLoc(); +// Value dpp; +// Value res = input; +// constexpr int allRows = 0xf; +// constexpr int allBanks = 0xf; +// const bool boundCtrl = true; +// if (ci.clusterSize >= 2) { +// // Perform reduction between all lanes N <-> N+1. +// dpp = rewriter.create( +// loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, +// rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl); +// res = vector::makeArithReduction(rewriter, loc, +// gpu::convertReductionKind(mode), res, dpp); +// } + +// if (ci.clusterSize >= 4) { +// // Perform reduction between all lanes N <-> N+2. +// dpp = rewriter.create( +// loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, +// rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl); +// res = vector::makeArithReduction(rewriter, loc, +// gpu::convertReductionKind(mode), res, dpp); +// } +// if (ci.clusterSize >= 8) { +// // Perform reduction between all lanes N <-> 7-N, +// // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4]. 
+// dpp = rewriter.create( +// loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror, +// rewriter.getUnitAttr(), allRows, allBanks, boundCtrl); +// res = vector::makeArithReduction(rewriter, loc, +// gpu::convertReductionKind(mode), res, dpp); +// } +// if (ci.clusterSize >= 16) { +// // Perform reduction between all lanes N <-> 15-N, +// // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8]. +// dpp = rewriter.create( +// loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror, +// rewriter.getUnitAttr(), allRows, allBanks, boundCtrl); +// res = vector::makeArithReduction(rewriter, loc, +// gpu::convertReductionKind(mode), res, dpp); +// } +// if (ci.clusterSize >= 32) { +// if (chipset.majorVersion <= 9) { +// // Broadcast last value from each row to next row. +// // Use row mask to avoid polluting rows 1 and 3. +// dpp = rewriter.create( +// loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15, +// rewriter.getUnitAttr(), 0xa, allBanks, +// /*bound_ctrl*/ false); +// res = vector::makeArithReduction( +// rewriter, loc, gpu::convertReductionKind(mode), res, dpp); +// } else if (chipset.majorVersion <= 12) { +// // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). 
+// Value uint32Max = rewriter.create( +// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1)); +// dpp = rewriter.create(loc, res.getType(), res, res, +// uint32Max, uint32Max, +// /*fi=*/true, +// /*bound_ctrl=*/false); +// res = vector::makeArithReduction( +// rewriter, loc, gpu::convertReductionKind(mode), res, dpp); +// if (ci.subgroupSize == 32) { +// Value lane0 = rewriter.create( +// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); +// res = +// rewriter.create(loc, res.getType(), res, lane0); +// } +// } else { +// return rewriter.notifyMatchFailure( +// op, "Subgroup reduce lowering to DPP not currently supported for " +// "this device."); +// } +// } +// if (ci.clusterSize >= 64) { +// if (chipset.majorVersion <= 9) { +// // Broadcast 31st lane value to rows 2 and 3. +// // Use row mask to avoid polluting rows 0 and 1. +// dpp = rewriter.create( +// loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31, +// rewriter.getUnitAttr(), 0xc, allBanks, +// /*bound_ctrl*/ false); + +// } else if (chipset.majorVersion <= 12) { +// // Assume reduction across 32 lanes has been done. +// // Perform final reduction manually by summing values in lane 0 and +// // lane 32. +// Value lane0 = rewriter.create( +// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); +// Value lane32 = rewriter.create( +// loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32)); +// dpp = rewriter.create(loc, res.getType(), res, lane32); +// res = rewriter.create(loc, res.getType(), res, lane0); +// } else { +// return rewriter.notifyMatchFailure( +// op, "Subgroup reduce lowering to DPP not currently supported for " +// "this device."); +// } +// res = vector::makeArithReduction(rewriter, loc, +// gpu::convertReductionKind(mode), res, dpp); +// } +// assert(res.getType() == input.getType()); +// return res; +// } /// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp` /// ops over scalar types. 
Assumes that the subgroup has @@ -481,9 +482,9 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op, struct ScalarSubgroupReduceToDPP final : OpRewritePattern { ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize, - bool matchClustered, amdgpu::Chipset chipset, - PatternBenefit benefit) - : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), + unsigned shuffleBitwidth, bool matchClustered, + amdgpu::Chipset chipset, PatternBenefit benefit) + : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered), chipset(chipset) {} LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, @@ -505,12 +506,42 @@ struct ScalarSubgroupReduceToDPP final "clusters of contiguous lanes."); Type valueTy = op.getType(); - if (!valueTy.isIntOrFloat()) + unsigned elemBitwidth = + getElementTypeOrSelf(valueTy).getIntOrFloatBitWidth(); + if (!valueTy.isIntOrFloat() || elemBitwidth > shuffleBitwidth) return rewriter.notifyMatchFailure( op, "value type is not a compatible scalar"); + Location loc = op.getLoc(); + // Since this is already a native shuffle scalar, no packing is necessary. 
+ if (elemBitwidth == shuffleBitwidth) { + auto identityFn = [](Value v) { return v; }; + FailureOr dpp = + createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(), + *ci, chipset, identityFn, identityFn); + if (failed(dpp)) + return failure(); + rewriter.replaceOp(op, dpp.value()); + return success(); + } + + auto shuffleIntType = rewriter.getIntegerType(shuffleBitwidth); + auto equivIntType = rewriter.getIntegerType(elemBitwidth); + auto packFn = [loc, &rewriter, equivIntType, + shuffleIntType](Value unpackedVal) -> Value { + auto asInt = + rewriter.create(loc, equivIntType, unpackedVal); + return rewriter.create(loc, shuffleIntType, asInt); + }; + auto unpackFn = [loc, &rewriter, equivIntType, + valueTy](Value packedVal) -> Value { + auto asInt = + rewriter.create(loc, equivIntType, packedVal); + return rewriter.create(loc, valueTy, asInt); + }; + FailureOr dpp = createSubgroupDPPReduction( - rewriter, op, op.getValue(), op.getOp(), *ci, chipset); + rewriter, op, op.getValue(), op.getOp(), *ci, chipset, packFn, unpackFn); if (failed(dpp)) return failure(); @@ -520,6 +551,7 @@ struct ScalarSubgroupReduceToDPP final private: unsigned subgroupSize = 0; + unsigned shuffleBitwidth = 0; bool matchClustered = false; amdgpu::Chipset chipset; }; @@ -534,19 +566,19 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns( } void mlir::populateGpuLowerSubgroupReduceToDPPPatterns( - RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset, - PatternBenefit benefit) { - patterns.add(patterns.getContext(), subgroupSize, - /*matchClustered=*/false, chipset, - benefit); + RewritePatternSet &patterns, unsigned subgroupSize, + unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) { + patterns.add( + patterns.getContext(), subgroupSize, shuffleBitwidth, + /*matchClustered=*/false, chipset, benefit); } void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns( - RewritePatternSet &patterns, unsigned subgroupSize, 
amdgpu::Chipset chipset, - PatternBenefit benefit) { - patterns.add(patterns.getContext(), subgroupSize, - /*matchClustered=*/true, chipset, - benefit); + RewritePatternSet &patterns, unsigned subgroupSize, + unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) { + patterns.add( + patterns.getContext(), subgroupSize, shuffleBitwidth, + /*matchClustered=*/true, chipset, benefit); } void mlir::populateGpuLowerSubgroupReduceToShufflePatterns( diff --git a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt index 69094c518a159..e7489eaac4988 100644 --- a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt @@ -1,14 +1,17 @@ add_mlir_dialect_library(MLIRGPUUtils Utils.cpp DistributionUtils.cpp + ReductionUtils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils LINK_LIBS PUBLIC - MLIRArithDialect MLIRAffineDialect + MLIRArithDialect + MLIRAMDGPUDialect MLIRGPUDialect + MLIRROCDLDialect MLIRSupport MLIRIR ) diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp new file mode 100644 index 0000000000000..255c4152bd5a4 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp @@ -0,0 +1,170 @@ +//===- ReductionUtils.cpp - Distribution tools for GPUOps --------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements distribution utility methods. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" +#include "mlir/Dialect/GPU/Utils/GPUUtils.h" +#include "mlir/Dialect/GPU/Utils/ReductionUtils.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/Value.h" +#include "mlir/Interfaces/FunctionInterfaces.h" + +#include + +using namespace mlir; + +FailureOr mlir::getAndValidateClusterInfo(gpu::SubgroupReduceOp op, + unsigned subgroupSize) { + assert(llvm::isPowerOf2_32(subgroupSize)); + + std::optional clusterSize = op.getClusterSize(); + assert(!clusterSize || + llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this. + if (clusterSize && *clusterSize > subgroupSize) + return op.emitOpError() + << "cluster size " << *clusterSize + << " is greater than subgroup size " << subgroupSize; + unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize); + + auto clusterStride = op.getClusterStride(); + assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this. + if (clusterStride >= subgroupSize) + return op.emitOpError() + << "cluster stride " << clusterStride + << " is not less than subgroup size " << subgroupSize; + + return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize}; +} + +FailureOr mlir::createSubgroupDPPReduction( + PatternRewriter &rewriter, gpu::SubgroupReduceOp op, Value input, + gpu::AllReduceOperation mode, const ClusterInfo &ci, + amdgpu::Chipset chipset, function_ref packFn, + function_ref unpackFn) { + Location loc = op.getLoc(); + Value dpp; + Value res = input; + constexpr int allRows = 0xf; + constexpr int allBanks = 0xf; + const bool boundCtrl = true; + if (ci.clusterSize >= 2) { + // Perform reduction between all lanes N <-> N+1. 
+ res = packFn(res); + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, + rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl); + dpp = unpackFn(dpp); + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); + } + + if (ci.clusterSize >= 4) { + // Perform reduction between all lanes N <-> N+2. + res = packFn(res); + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm, + rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl); + dpp = unpackFn(dpp); + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); + } + if (ci.clusterSize >= 8) { + // Perform reduction between all lanes N <-> 7-N, + // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4]. + res = packFn(res); + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror, + rewriter.getUnitAttr(), allRows, allBanks, boundCtrl); + dpp = unpackFn(dpp); + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); + } + if (ci.clusterSize >= 16) { + // Perform reduction between all lanes N <-> 15-N, + // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8]. + res = packFn(res); + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror, + rewriter.getUnitAttr(), allRows, allBanks, boundCtrl); + dpp = unpackFn(dpp); + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); + } + if (ci.clusterSize >= 32) { + if (chipset.majorVersion <= 9) { + // Broadcast last value from each row to next row. + // Use row mask to avoid polluting rows 1 and 3. 
+ res = packFn(res); + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15, + rewriter.getUnitAttr(), 0xa, allBanks, + /*bound_ctrl*/ false); + dpp = unpackFn(dpp); + res = vector::makeArithReduction( + rewriter, loc, gpu::convertReductionKind(mode), res, dpp); + } else if (chipset.majorVersion <= 12) { + // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). + Value uint32Max = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1)); + res = packFn(res); + dpp = rewriter.create(loc, res.getType(), res, res, + uint32Max, uint32Max, + /*fi=*/true, + /*bound_ctrl=*/false); + dpp = unpackFn(dpp); + res = vector::makeArithReduction( + rewriter, loc, gpu::convertReductionKind(mode), res, dpp); + if (ci.subgroupSize == 32) { + Value lane0 = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); + res = + rewriter.create(loc, res.getType(), res, lane0); + } + } else { + return rewriter.notifyMatchFailure( + op, "Subgroup reduce lowering to DPP not currently supported for " + "this device."); + } + } + if (ci.clusterSize >= 64) { + if (chipset.majorVersion <= 9) { + // Broadcast 31st lane value to rows 2 and 3. + // Use row mask to avoid polluting rows 0 and 1. + res = packFn(res); + dpp = rewriter.create( + loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31, + rewriter.getUnitAttr(), 0xc, allBanks, + /*bound_ctrl*/ false); + dpp = unpackFn(dpp); + + } else if (chipset.majorVersion <= 12) { + // Assume reduction across 32 lanes has been done. + // Perform final reduction manually by summing values in lane 0 and + // lane 32. 
+ Value lane0 = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); + Value lane32 = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32)); + dpp = rewriter.create(loc, res.getType(), res, lane32); + res = rewriter.create(loc, res.getType(), res, lane0); + } else { + return rewriter.notifyMatchFailure( + op, "Subgroup reduce lowering to DPP not currently supported for " + "this device."); + } + res = vector::makeArithReduction(rewriter, loc, + gpu::convertReductionKind(mode), res, dpp); + } + assert(res.getType() == input.getType()); + return res; +} \ No newline at end of file diff --git a/mlir/lib/Dialect/GPU/Utils/Utils.cpp b/mlir/lib/Dialect/GPU/Utils/Utils.cpp index 1f09875b3e273..53b1e0883055c 100644 --- a/mlir/lib/Dialect/GPU/Utils/Utils.cpp +++ b/mlir/lib/Dialect/GPU/Utils/Utils.cpp @@ -41,4 +41,30 @@ vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode) { llvm_unreachable("Vector and GPU reduction kinds should match 1:1"); } +gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind) { + switch (kind) { +#define MAP_CASE(X) \ + case vector::CombiningKind::X: \ + return gpu::AllReduceOperation::X + + MAP_CASE(ADD); + MAP_CASE(MUL); + MAP_CASE(MINUI); + MAP_CASE(MINSI); + MAP_CASE(MINNUMF); + MAP_CASE(MAXSI); + MAP_CASE(MAXUI); + MAP_CASE(MAXNUMF); + MAP_CASE(AND); + MAP_CASE(OR); + MAP_CASE(XOR); + MAP_CASE(MINIMUMF); + MAP_CASE(MAXIMUMF); + +#undef MAP_CASE + } + + llvm_unreachable("Vector and GPU reduction kinds should match 1:1"); +} + } // namespace mlir::gpu diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index fe402da4cc105..4ebcf897fd532 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -93,9 +93,9 @@ struct TestGpuSubgroupReduceLoweringPass auto maybeChipset = amdgpu::Chipset::parse(target); if (succeeded(maybeChipset)) { 
populateGpuLowerSubgroupReduceToDPPPatterns( - patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2)); + patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2)); populateGpuLowerClusteredSubgroupReduceToDPPPatterns( - patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2)); + patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2)); } populateGpuLowerSubgroupReduceToShufflePatterns( patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32); From 8315e1eb7e9aa57db6e3e9a1f58b5fa2c62fe0cf Mon Sep 17 00:00:00 2001 From: Muzammiluddin Syed Date: Tue, 22 Apr 2025 22:56:23 -0500 Subject: [PATCH 28/28] Formatting fix Signed-off-by: Muzammiluddin Syed --- mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp index 255c4152bd5a4..2f50a1ec87cba 100644 --- a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp +++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp @@ -51,6 +51,7 @@ FailureOr mlir::createSubgroupDPPReduction( gpu::AllReduceOperation mode, const ClusterInfo &ci, amdgpu::Chipset chipset, function_ref packFn, function_ref unpackFn) { + Location loc = op.getLoc(); Value dpp; Value res = input; @@ -167,4 +168,4 @@ FailureOr mlir::createSubgroupDPPReduction( } assert(res.getType() == input.getType()); return res; -} \ No newline at end of file +}