diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index 5cc65082a7e56..5c63ad5f32b71 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_ #define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_ +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Utils/GPUUtils.h" #include "mlir/IR/PatternMatch.h" @@ -68,6 +69,22 @@ void populateGpuLowerClusteredSubgroupReduceToShufflePatterns( RewritePatternSet &patterns, unsigned subgroupSize, unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1); +/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp` +/// ops over scalar types. Assumes that the subgroup has +/// `subgroupSize` lanes. Applicable only to AMD GPUs. +void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns, + unsigned subgroupSize, + unsigned shuffleBitwidth, + amdgpu::Chipset chipset, + PatternBenefit benefit = 1); + +/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns` +/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`. +void populateGpuLowerClusteredSubgroupReduceToDPPPatterns( + RewritePatternSet &patterns, unsigned subgroupSize, + unsigned shuffleBitwidth, amdgpu::Chipset chipset, + PatternBenefit benefit = 1); + /// Collect all patterns to rewrite ops within the GPU dialect. inline void populateGpuRewritePatterns(RewritePatternSet &patterns) { populateGpuAllReducePatterns(patterns); diff --git a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h index 073493971e6b7..a55f0e1f09a36 100644 --- a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h +++ b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h @@ -29,6 +29,8 @@ class LaunchOp; /// Returns the matching vector combining kind. vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode); +/// Returns the matching gpu allreduce mode. +gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind); } // namespace gpu /// Get a gpu.func created from outlining the region of a gpu.launch op with the diff --git a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h new file mode 100644 index 0000000000000..f766dab8c02df --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h @@ -0,0 +1,41 @@ +//===- ReductionUtils.h - Reduction Utilities -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
+#define MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+
+struct ClusterInfo {
+  unsigned clusterStride;
+  unsigned clusterSize;
+  unsigned subgroupSize;
+};
+
+FailureOr<ClusterInfo> getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+                                                 unsigned subgroupSize);
+
+FailureOr<Value>
+createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
+                           Value input, gpu::AllReduceOperation mode,
+                           const ClusterInfo &ci, amdgpu::Chipset chipset,
+                           function_ref<Value(Value)> packFn,
+                           function_ref<Value(Value)> unpackFn);
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 013311ec027da..1074760aa959e 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
 
   LINK_LIBS PUBLIC
   MLIRAffineUtils
+  MLIRAMDGPUDialect
   MLIRArithDialect
   MLIRAsyncDialect
   MLIRBufferizationDialect
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 43eff3eddcc49..57af63cbe5eca 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -10,15 +10,20 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MathExtras.h"
 #include <cassert>
@@ -144,34 +149,34 @@ struct ScalarizeSingleElementReduce final
   }
 };
 
-struct ClusterInfo {
-  unsigned clusterStride;
-  unsigned clusterSize;
-  unsigned subgroupSize;
-};
-
-static FailureOr<ClusterInfo>
-getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
-  assert(llvm::isPowerOf2_32(subgroupSize));
-
-  std::optional<uint32_t> clusterSize = op.getClusterSize();
-  assert(!clusterSize ||
-         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
-  if (clusterSize && *clusterSize > subgroupSize)
-    return op.emitOpError()
-           << "cluster size " << *clusterSize
-           << " is greater than subgroup size " << subgroupSize;
-  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
-
-  auto clusterStride = op.getClusterStride();
-  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
-  if (clusterStride >= subgroupSize)
-    return op.emitOpError()
-           << "cluster stride " << clusterStride
-           << " is not less than subgroup size " << subgroupSize;
-
-  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
-}
 
 /// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn`
 /// and `unpackFn` to convert to the native shuffle type and to the reduction
@@ -362,6 +367,194 @@ struct VectorSubgroupReduceToShuffles final
   unsigned shuffleBitwidth = 0;
   bool matchClustered = false;
 };
+
+/// Lowers scalar `gpu.subgroup_reduce` ops to a sequence of `amdgpu.dpp` ops.
+/// Assumes that the subgroup has `subgroupSize` lanes. Applicable only to AMD
+/// GPUs.
+struct ScalarSubgroupReduceToDPP final
+    : OpRewritePattern<gpu::SubgroupReduceOp> {
+  ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
+                            unsigned shuffleBitwidth, bool matchClustered,
+                            amdgpu::Chipset chipset, PatternBenefit benefit)
+      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+        shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered),
+        chipset(chipset) {}
+
+  LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getClusterSize().has_value() != matchClustered) {
+      return rewriter.notifyMatchFailure(
+          op, llvm::formatv("op is {0}clustered but pattern is configured to "
+                            "only match {1}clustered ops",
+                            matchClustered ? "non-" : "",
+                            matchClustered ? "" : "non-"));
+    }
+    auto ci = getAndValidateClusterInfo(op, subgroupSize);
+    if (failed(ci))
+      return failure();
+
+    if (ci->clusterStride != 1)
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reductions using DPP are currently only available for "
+              "clusters of contiguous lanes.");
+
+    Type valueTy = op.getType();
+    unsigned elemBitwidth =
+        getElementTypeOrSelf(valueTy).getIntOrFloatBitWidth();
+    if (!valueTy.isIntOrFloat() || elemBitwidth > shuffleBitwidth)
+      return rewriter.notifyMatchFailure(
+          op, "value type is not a compatible scalar");
+
+    Location loc = op.getLoc();
+    // Since this is already a native shuffle scalar, no packing is necessary.
+    if (elemBitwidth == shuffleBitwidth) {
+      auto identityFn = [](Value v) { return v; };
+      FailureOr<Value> dpp =
+          createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(),
+                                     *ci, chipset, identityFn, identityFn);
+      if (failed(dpp))
+        return failure();
+      rewriter.replaceOp(op, dpp.value());
+      return success();
+    }
+
+    auto shuffleIntType = rewriter.getIntegerType(shuffleBitwidth);
+    auto equivIntType = rewriter.getIntegerType(elemBitwidth);
+    auto packFn = [loc, &rewriter, equivIntType,
+                   shuffleIntType](Value unpackedVal) -> Value {
+      auto asInt =
+          rewriter.create<arith::BitcastOp>(loc, equivIntType, unpackedVal);
+      return rewriter.create<arith::ExtUIOp>(loc, shuffleIntType, asInt);
+    };
+    auto unpackFn = [loc, &rewriter, equivIntType,
+                     valueTy](Value packedVal) -> Value {
+      auto asInt =
+          rewriter.create<arith::TruncIOp>(loc, equivIntType, packedVal);
+      return rewriter.create<arith::BitcastOp>(loc, valueTy, asInt);
+    };
+
+    FailureOr<Value> dpp =
+        createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(),
+                                   *ci, chipset, packFn, unpackFn);
+    if (failed(dpp))
+      return failure();
+
+    rewriter.replaceOp(op, dpp.value());
+    return success();
+  }
+
+private:
+  unsigned subgroupSize = 0;
+  unsigned shuffleBitwidth = 0;
+  bool matchClustered = false;
+  amdgpu::Chipset chipset;
+};
 } // namespace
 
 void mlir::populateGpuBreakDownSubgroupReducePatterns(
@@ -372,6 +565,22 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
   patterns.add<ScalarizeSingleElementReduce>(patterns.getContext(), benefit);
 }
 
+void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/false, chipset, benefit);
+}
+
+void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/true, chipset, benefit);
+}
+
 void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
     RewritePatternSet &patterns, unsigned subgroupSize,
    unsigned shuffleBitwidth, PatternBenefit benefit) {
diff --git a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
index 69094c518a159..e7489eaac4988 100644
--- a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
@@ -1,14 +1,17 @@
 add_mlir_dialect_library(MLIRGPUUtils
   Utils.cpp
   DistributionUtils.cpp
+  ReductionUtils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils
 
   LINK_LIBS PUBLIC
-  MLIRArithDialect
   MLIRAffineDialect
+  MLIRArithDialect
+  MLIRAMDGPUDialect
   MLIRGPUDialect
+  MLIRROCDLDialect
   MLIRSupport
   MLIRIR
   )
diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
new file mode 100644
index 0000000000000..2f50a1ec87cba
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
@@ -0,0 +1,171 @@
+//===- ReductionUtils.cpp - Reduction utilities for GPU ops --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements subgroup reduction utility methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+
+#include <cstdint>
+
+using namespace mlir;
+
+FailureOr<ClusterInfo>
+mlir::getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+                                unsigned subgroupSize) {
+  assert(llvm::isPowerOf2_32(subgroupSize));
+
+  std::optional<uint32_t> clusterSize = op.getClusterSize();
+  assert(!clusterSize ||
+         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+  if (clusterSize && *clusterSize > subgroupSize)
+    return op.emitOpError()
+           << "cluster size " << *clusterSize
+           << " is greater than subgroup size " << subgroupSize;
+  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+  auto clusterStride = op.getClusterStride();
+  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+  if (clusterStride >= subgroupSize)
+    return op.emitOpError()
+           << "cluster stride " << clusterStride
+           << " is not less than subgroup size " << subgroupSize;
+
+  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+FailureOr<Value> mlir::createSubgroupDPPReduction(
+    PatternRewriter &rewriter, gpu::SubgroupReduceOp op, Value input,
+    gpu::AllReduceOperation mode, const ClusterInfo &ci,
+    amdgpu::Chipset chipset, function_ref<Value(Value)> packFn,
+    function_ref<Value(Value)> unpackFn) {
+
+  Location loc = op.getLoc();
+  Value dpp;
+  Value res = input;
+  constexpr int allRows = 0xf;
+  constexpr int allBanks = 0xf;
+  const bool boundCtrl = true;
+  if (ci.clusterSize >= 2) {
+    // Perform reduction between all lanes N <-> N+1.
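+    // quad_perm([1, 0, 3, 2]) swaps lanes 0<->1 and 2<->3 within each group
+    // of four lanes, so every lane combines with its immediate neighbor.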
+    res = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+    dpp = unpackFn(dpp);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+
+  if (ci.clusterSize >= 4) {
+    // Perform reduction between all lanes N <-> N+2.
+    res = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+    dpp = unpackFn(dpp);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  if (ci.clusterSize >= 8) {
+    // Perform reduction between all lanes N <-> 7-N,
+    // e.g. lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+    res = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
+        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+    dpp = unpackFn(dpp);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  if (ci.clusterSize >= 16) {
+    // Perform reduction between all lanes N <-> 15-N,
+    // e.g. lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+    res = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
+        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+    dpp = unpackFn(dpp);
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  if (ci.clusterSize >= 32) {
+    if (chipset.majorVersion <= 9) {
+      // Broadcast last value from each row to next row.
+      // Use row mask to avoid polluting rows 0 and 2.
+      res = packFn(res);
+      dpp = rewriter.create<amdgpu::DPPOp>(
+          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
+          rewriter.getUnitAttr(), 0xa, allBanks,
+          /*bound_ctrl*/ false);
+      dpp = unpackFn(dpp);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+    } else if (chipset.majorVersion <= 12) {
+      // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+      Value uint32Max = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
+      res = packFn(res);
+      dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+                                                  uint32Max, uint32Max,
+                                                  /*fi=*/true,
+                                                  /*bound_ctrl=*/false);
+      dpp = unpackFn(dpp);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
+      if (ci.subgroupSize == 32) {
+        Value lane0 = rewriter.create<arith::ConstantOp>(
+            loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+        res =
+            rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+      }
+    } else {
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reduce lowering to DPP not currently supported for "
+              "this device.");
+    }
+  }
+  if (ci.clusterSize >= 64) {
+    if (chipset.majorVersion <= 9) {
+      // Broadcast 31st lane value to rows 2 and 3.
+      // Use row mask to avoid polluting rows 0 and 1.
+      res = packFn(res);
+      dpp = rewriter.create<amdgpu::DPPOp>(
+          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31,
+          rewriter.getUnitAttr(), 0xc, allBanks,
+          /*bound_ctrl*/ false);
+      dpp = unpackFn(dpp);
+
+    } else if (chipset.majorVersion <= 12) {
+      // Assume reduction across 32 lanes has been done.
+      // Perform final reduction manually by summing values in lane 0 and
+      // lane 32.
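+      // Each readlane below yields a wave-uniform value, so after the final
+      // combine every lane holds the reduction of all 64 lanes.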
+      Value lane0 = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+      Value lane32 = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
+      dpp =
+          rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
+      res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+    } else {
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reduce lowering to DPP not currently supported for "
+              "this device.");
+    }
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  assert(res.getType() == input.getType());
+  return res;
+}
diff --git a/mlir/lib/Dialect/GPU/Utils/Utils.cpp b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
index 1f09875b3e273..53b1e0883055c 100644
--- a/mlir/lib/Dialect/GPU/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
@@ -41,4 +41,30 @@ vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode) {
   llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
 }
 
+gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind) {
+  switch (kind) {
+#define MAP_CASE(X)                                                            \
+  case vector::CombiningKind::X:                                               \
+    return gpu::AllReduceOperation::X
+
+    MAP_CASE(ADD);
+    MAP_CASE(MUL);
+    MAP_CASE(MINUI);
+    MAP_CASE(MINSI);
+    MAP_CASE(MINNUMF);
+    MAP_CASE(MAXSI);
+    MAP_CASE(MAXUI);
+    MAP_CASE(MAXNUMF);
+    MAP_CASE(AND);
+    MAP_CASE(OR);
+    MAP_CASE(XOR);
+    MAP_CASE(MINIMUMF);
+    MAP_CASE(MAXIMUMF);
+
+#undef MAP_CASE
+  }
+
+  llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
+}
+
 } // namespace mlir::gpu
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
index 9f2aa1be52fc3..139edf6882df6 100644
--- a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -6,14 +6,20 @@
 // RUN:   --test-gpu-subgroup-reduce-lowering="expand-to-shuffles" %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-SHFL
 
+// RUN: mlir-opt --allow-unregistered-dialect \
+// RUN:   --test-gpu-subgroup-reduce-lowering="expand-to-shuffles target=gfx942" %s \
+// RUN:   | FileCheck %s --check-prefix=CHECK-DPP
+
 // CHECK-SUB:  gpu.module @kernels {
 // CHECK-SHFL: gpu.module @kernels {
+// CHECK-DPP: gpu.module @kernels {
 gpu.module @kernels {
 
   // CHECK-SUB-LABEL:  gpu.func @kernel0(
   // CHECK-SUB-SAME:     %[[ARG0:.+]]: vector<5xf16>)
   //
   // CHECK-SHFL-LABEL: gpu.func @kernel0(
+  // CHECK-DPP-LABEL: gpu.func @kernel0(
   gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
     // CHECK-SUB: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
     // CHECK-SUB: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
@@ -26,16 +32,19 @@ gpu.module @kernels {
     // CHECK-SUB: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16
     // CHECK-SUB: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16>
     // CHECK-SUB: "test.consume"(%[[V2]]) : (vector<5xf16>) -> ()
+    // CHECK-DPP-COUNT-6: amdgpu.dpp
     %sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>)
     "test.consume"(%sum0) : (vector<5xf16>) -> ()
 
     // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform
     // CHECK-SUB: "test.consume"
+    // CHECK-DPP-COUNT-6: amdgpu.dpp
     %sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>)
     "test.consume"(%sum1) : (vector<5xf16>) -> ()
 
     // CHECK-SUB-COUNT-3: gpu.subgroup_reduce mul {{.+}} cluster(size = 4)
     // CHECK-SUB: "test.consume"
+    // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}}
    %sum2 =
gpu.subgroup_reduce mul %arg0 cluster(size = 4) : (vector<5xf16>) -> (vector<5xf16>) "test.consume"(%sum2) : (vector<5xf16>) -> () @@ -52,27 +61,34 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<1xf32>) // // CHECK-SHFL-LABEL: gpu.func @kernel1( + // + // CHECK-DPP-LABEL: gpu.func @kernel1( gpu.func @kernel1(%arg0: vector<1xf32>) kernel { // CHECK-SUB: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32 // CHECK-SUB: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32> // CHECK-SUB: "test.consume"(%[[V0]]) : (vector<1xf32>) -> () + // CHECK-DPP-COUNT-6: amdgpu.dpp %sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum0) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32 // CHECK-SUB: "test.consume" + // CHECK-DPP-COUNT-6: amdgpu.dpp %sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum1) : (vector<1xf32>) -> () // Note stride is dropped because it is == 1. // CHECK-SUB: gpu.subgroup_reduce add {{.+}} cluster(size = 8) : (f32) -> f32 // CHECK-SUB: "test.consume" + // CHECK-DPP-COUNT-2: amdgpu.dpp {{.+}} quad_perm + // CHECK-DPP: amdgpu.dpp {{.+}} row_half_mirror %sum2 = gpu.subgroup_reduce add %arg0 cluster(size = 8, stride = 1) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum2) : (vector<1xf32>) -> () // CHECK-SUB: gpu.subgroup_reduce add {{.+}} uniform cluster(size = 8, stride = 4) : (f32) -> f32 // CHECK-SUB: "test.consume" + // CHECK-DPP-NOT: amdgpu.dpp %sum3 = gpu.subgroup_reduce add %arg0 uniform cluster(size = 8, stride = 4) : (vector<1xf32>) -> (vector<1xf32>) "test.consume"(%sum3) : (vector<1xf32>) -> () @@ -86,6 +102,8 @@ gpu.module @kernels { // CHECK-SUB-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>) // // CHECK-SHFL-LABEL: gpu.func @kernel2( + // CHECK-DPP-LABEL: gpu.func @kernel2( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel { // CHECK-SUB: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8> // CHECK-SUB: "test.consume"(%[[R0]]) : (vector<3xi8>) -> () @@ -103,6 +121,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + // CHECK-DPP-LABEL: gpu.func @kernel3( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel3(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -122,6 +142,8 @@ gpu.module @kernels { // CHECK-SHFL: %[[S4:.+]], %{{.+}} = gpu.shuffle xor %[[A3]], %[[C16]], %[[C32]] : i32 // CHECK-SHFL: %[[A4:.+]] = arith.addi %[[A3]], %[[S4]] : i32 // CHECK-SHFL: "test.consume"(%[[A4]]) : (i32) -> () + + // CHECK-DPP-COUNT-6: amdgpu.dpp %sum0 = gpu.subgroup_reduce add %arg0 : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -131,6 +153,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + // CHECK-DPP-LABEL: gpu.func @kernel3_clustered( + // CHECK-DPP-SAME: %[[ARG0:.+]]: i32) gpu.func @kernel3_clustered(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -144,6 +168,14 @@ gpu.module @kernels { // CHECK-SHFL: %[[S2:.+]], %{{.+}} = gpu.shuffle xor %[[A1]], %[[C4]], %[[C32]] : i32 // CHECK-SHFL: %[[A2:.+]] = arith.addi %[[A1]], %[[S2]] : i32 // CHECK-SHFL: 
"test.consume"(%[[A2]]) : (i32) -> () + + // CHECK-DPP: %[[D0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i32 + // CHECK-DPP: %[[A0:.+]] = arith.addi %[[ARG0]], %[[D0]] : i32 + // CHECK-DPP: %[[D1:.+]] = amdgpu.dpp %[[A0]] %[[A0]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i32 + // CHECK-DPP: %[[A1:.+]] = arith.addi %[[A0]], %[[D1]] : i32 + // CHECK-DPP: %[[D2:.+]] = amdgpu.dpp %[[A1]] %[[A1]] row_half_mirror(unit) {bound_ctrl = true} : i32 + // CHECK-DPP: %[[A2:.+]] = arith.addi %[[A1]], %[[D2]] : i32 + // CHECK-DPP: "test.consume"(%[[A2]]) : (i32) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 8) : (i32) -> i32 "test.consume"(%sum0) : (i32) -> () @@ -153,6 +185,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel3_clustered_strided( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i32) + // CHECK-DPP-LABEL: gpu.func @kernel3_clustered_strided( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel3_clustered_strided(%arg0: i32) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 4 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 8 : i32 @@ -175,6 +209,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) + // CHECK-DPP-LABEL: gpu.func @kernel4( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel4(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -211,6 +247,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel4_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<2xf16>) + // CHECK-DPP-LABEL: gpu.func @kernel4_clustered( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel4_clustered(%arg0: vector<2xf16>) kernel { // CHECK-SHFL-DAG: %[[C1:.+]] = arith.constant 1 : i32 // CHECK-SHFL-DAG: %[[C2:.+]] = arith.constant 2 : i32 @@ -226,6 +264,7 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) + // CHECK-DPP-LABEL: gpu.func @kernel5( gpu.func @kernel5(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -237,6 +276,7 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () + // CHECK-DPP-COUNT-6: amdgpu.dpp %sum0 = gpu.subgroup_reduce add %arg0 : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () @@ -246,6 +286,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel5_clustered( // CHECK-SHFL-SAME: %[[ARG0:.+]]: i16) + // CHECK-DPP-LABEL: gpu.func @kernel5_clustered + // CHECK-DPP-SAME: %[[ARG0:.+]]: i16) gpu.func @kernel5_clustered(%arg0: i16) kernel { // CHECK-SHFL: %[[E0:.+]] = arith.extui %[[ARG0]] : i16 to i32 // CHECK-SHFL: %[[S0:.+]], %{{.+}} = gpu.shuffle xor %[[E0]], {{.+}} : i32 @@ -257,6 +299,16 @@ gpu.module @kernels { // CHECK-SHFL: arith.trunci {{.+}} : i32 to i16 // CHECK-SHFL: %[[AL:.+]] = arith.addi {{.+}} : i16 // CHECK-SHFL: "test.consume"(%[[AL]]) : (i16) -> () + + // CHECK-DPP: %[[VAR0:.+]] = amdgpu.dpp %[[ARG0]] %[[ARG0]] quad_perm([1 : i32, 0 : i32, 3 : i32, 2 : i32]) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR1:.+]] = arith.addi %[[ARG0]], %[[VAR0]] : i16 + // CHECK-DPP: %[[VAR2:.+]] = amdgpu.dpp %[[VAR1]] %[[VAR1]] quad_perm([2 : i32, 3 : i32, 0 : i32, 1 : i32]) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR3:.+]] = 
arith.addi %[[VAR1]], %[[VAR2]] : i16 + // CHECK-DPP: %[[VAR4:.+]] = amdgpu.dpp %[[VAR3]] %[[VAR3]] row_half_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR5:.+]] = arith.addi %[[VAR3]], %[[VAR4]] : i16 + // CHECK-DPP: %[[VAR6:.+]] = amdgpu.dpp %[[VAR5]] %[[VAR5]] row_mirror(unit) {bound_ctrl = true} : i16 + // CHECK-DPP: %[[VAR7:.+]] = arith.addi %[[VAR5]], %[[VAR6]] : i16 + // CHECK-DPP: "test.consume"(%[[VAR7]]) : (i16) -> () %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 16) : (i16) -> i16 "test.consume"(%sum0) : (i16) -> () @@ -266,6 +318,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel6( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) + // CHECK-DPP-LABEL: gpu.func @kernel6( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel6(%arg0: vector<3xi8>) kernel { // CHECK-SHFL: %[[CZ:.+]] = arith.constant dense<0> : vector<4xi8> // CHECK-SHFL: %[[V0:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CZ]] {offsets = [0], strides = [1]} : vector<3xi8> into vector<4xi8> @@ -289,6 +343,8 @@ gpu.module @kernels { // CHECK-SHFL-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( // CHECK-SHFL-SAME: %[[ARG0:.+]]: vector<3xi8>) + // CHECK-DPP-LABEL: gpu.func @kernel_cluster_size_is_subgroup_size( + // CHECK-DPP-NOT: amdgpu.dpp gpu.func @kernel_cluster_size_is_subgroup_size(%arg0: vector<3xi8>) kernel { // CHECK-SHFL-COUNT-5: gpu.shuffle xor %sum0 = gpu.subgroup_reduce add %arg0 cluster(size = 32) : (vector<3xi8>) -> (vector<3xi8>) diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index a49d304baf5c6..4ebcf897fd532 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -10,10 +10,13 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/Index/IR/IndexDialect.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/PatternMatch.h" @@ -54,7 +57,9 @@ struct TestGpuSubgroupReduceLoweringPass : PassWrapper(pass) {} void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry + .insert(); } StringRef getArgument() const final { @@ -70,6 +75,12 @@ struct TestGpuSubgroupReduceLoweringPass llvm::cl::desc("Expand subgroup_reduce ops to shuffle ops."), llvm::cl::init(false)}; + Option target{ + *this, "target", + llvm::cl::desc("Target backend name which will be used to provide " + "compatible lowerings of subgroup reduce."), + llvm::cl::init("")}; + void runOnOperation() override { RewritePatternSet patterns(&getContext()); @@ -77,8 +88,15 @@ struct TestGpuSubgroupReduceLoweringPass // perform fewer failing matches. 
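+    // The breakdown patterns get benefit 3 so they apply before the DPP
+    // (benefit 2) and shuffle (benefit 1) lowerings registered below.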
populateGpuBreakDownSubgroupReducePatterns(patterns, /*maxShuffleBitwidth=*/32, - PatternBenefit(2)); + PatternBenefit(3)); if (expandToShuffles) { + auto maybeChipset = amdgpu::Chipset::parse(target); + if (succeeded(maybeChipset)) { + populateGpuLowerSubgroupReduceToDPPPatterns( + patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2)); + populateGpuLowerClusteredSubgroupReduceToDPPPatterns( + patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2)); + } populateGpuLowerSubgroupReduceToShufflePatterns( patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32); populateGpuLowerClusteredSubgroupReduceToShufflePatterns(