diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index a13ad33df29cd..5c63ad5f32b71 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -74,13 +74,15 @@ void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
 /// `subgroupSize` lanes. Applicable only to AMD GPUs.
 void populateGpuLowerSubgroupReduceToDPPPatterns(RewritePatternSet &patterns,
                                                  unsigned subgroupSize,
+                                                 unsigned shuffleBitwidth,
                                                  amdgpu::Chipset chipset,
                                                  PatternBenefit benefit = 1);
 
 /// Disjoint counterpart of `populateGpuLowerSubgroupReduceToDPPPatterns`
 /// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`.
 void populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
-    RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset,
     PatternBenefit benefit = 1);
 
 /// Collect all patterns to rewrite ops within the GPU dialect.
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
index 073493971e6b7..a55f0e1f09a36 100644
--- a/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
+++ b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
@@ -29,6 +29,8 @@ class LaunchOp;
 
 /// Returns the matching vector combining kind.
 vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode);
+/// Returns the matching gpu allreduce mode.
+gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind);
 } // namespace gpu
 
 /// Get a gpu.func created from outlining the region of a gpu.launch op with the
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
new file mode 100644
index 0000000000000..f766dab8c02df
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
@@ -0,0 +1,41 @@
+//===- ReductionUtils.h - Reduction Utilities -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
+#define MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+
+/// Describes the cluster of lanes a `gpu.subgroup_reduce` op reduces over.
+struct ClusterInfo {
+  unsigned clusterStride;
+  unsigned clusterSize;
+  unsigned subgroupSize;
+};
+
+/// Validates the cluster attributes of `op` against `subgroupSize` and
+/// returns the effective cluster parameters.
+FailureOr<ClusterInfo> getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+                                                 unsigned subgroupSize);
+
+/// Emits a subgroup reduction over `input` using AMDGPU DPP ops. `packFn` and
+/// `unpackFn` convert between the reduction type and the native shuffle type.
+FailureOr<Value>
+createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
+                           Value input, gpu::AllReduceOperation mode,
+                           const ClusterInfo &ci, amdgpu::Chipset chipset,
+                           function_ref<Value(Value)> packFn,
+                           function_ref<Value(Value)> unpackFn);
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_UTILS_REDUCTIONUTILS_H_
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 013311ec027da..1074760aa959e 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
   LINK_LIBS PUBLIC
   MLIRAffineUtils
+  MLIRAMDGPUDialect
   MLIRArithDialect
   MLIRAsyncDialect
   MLIRBufferizationDialect
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index f2fc9a4e39bcd..57af63cbe5eca 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -148,34 +149,34 @@ struct ScalarizeSingleElementReduce final
   }
 };
 
-struct ClusterInfo {
-  unsigned clusterStride;
-  unsigned clusterSize;
-  unsigned subgroupSize;
-};
-
-static FailureOr<ClusterInfo>
-getAndValidateClusterInfo(gpu::SubgroupReduceOp op, unsigned subgroupSize) {
-  assert(llvm::isPowerOf2_32(subgroupSize));
-
-  std::optional<uint32_t> clusterSize = op.getClusterSize();
-  assert(!clusterSize ||
-         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
-  if (clusterSize && *clusterSize > subgroupSize)
-    return op.emitOpError()
-           << "cluster size " << *clusterSize
-           << " is greater than subgroup size " << subgroupSize;
-  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
-
-  auto clusterStride = op.getClusterStride();
-  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
-  if (clusterStride >= subgroupSize)
-    return op.emitOpError()
-           << "cluster stride " << clusterStride
-           << " is not less than subgroup size " << subgroupSize;
-
-  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
-}
 
 /// Emits a subgroup reduction using a sequence of shuffles. Uses the `packFn`
 /// and `unpackFn` to convert to the native shuffle type and to the reduction
@@ -367,113 +368,113 @@ struct VectorSubgroupReduceToShuffles final
   bool matchClustered = false;
 };
 
-FailureOr<Value>
-createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
-                           Value input, gpu::AllReduceOperation mode,
-                           const ClusterInfo &ci, amdgpu::Chipset chipset) {
-  Location loc = op.getLoc();
-  Value dpp;
-  Value res = input;
-  constexpr int allRows = 0xf;
-  constexpr int allBanks = 0xf;
-  const bool boundCtrl = true;
-  if (ci.clusterSize >= 2) {
-    // Perform reduction between all lanes N <-> N+1.
-    dpp = rewriter.create<amdgpu::DPPOp>(
-        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-        rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-
-  if (ci.clusterSize >= 4) {
-    // Perform reduction between all lanes N <-> N+2.
-    dpp = rewriter.create<amdgpu::DPPOp>(
-        loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-        rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-  if (ci.clusterSize >= 8) {
-    // Perform reduction between all lanes N <-> 7-N,
-    // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
-    dpp = rewriter.create<amdgpu::DPPOp>(
-        loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
-        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-  if (ci.clusterSize >= 16) {
-    // Perform reduction between all lanes N <-> 15-N,
-    // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
-    dpp = rewriter.create<amdgpu::DPPOp>(
-        loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
-        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-  if (ci.clusterSize >= 32) {
-    if (chipset.majorVersion <= 9) {
-      // Broadcast last value from each row to next row.
-      // Use row mask to avoid polluting rows 1 and 3.
-      dpp = rewriter.create<amdgpu::DPPOp>(
-          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_15,
-          rewriter.getUnitAttr(), 0xa, allBanks,
-          /*bound_ctrl*/ false);
-      res = vector::makeArithReduction(
-          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
-    } else if (chipset.majorVersion <= 12) {
-      // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
-      Value uint32Max = rewriter.create<arith::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
-      dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
-                                                  uint32Max, uint32Max,
-                                                  /*fi=*/true,
-                                                  /*bound_ctrl=*/false);
-      res = vector::makeArithReduction(
-          rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
-      if (ci.subgroupSize == 32) {
-        Value lane0 = rewriter.create<arith::ConstantOp>(
-            loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
-        res =
-            rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
-      }
-    } else {
-      return rewriter.notifyMatchFailure(
-          op, "Subgroup reduce lowering to DPP not currently supported for "
-              "this device.");
-    }
-  }
-  if (ci.clusterSize >= 64) {
-    if (chipset.majorVersion <= 9) {
-      // Broadcast 31st lane value to rows 2 and 3.
-      // Use row mask to avoid polluting rows 0 and 1.
-      dpp = rewriter.create<amdgpu::DPPOp>(
-          loc, res.getType(), res, res, amdgpu::DPPPerm::row_bcast_31,
-          rewriter.getUnitAttr(), 0xc, allBanks,
-          /*bound_ctrl*/ false);
-
-    } else if (chipset.majorVersion <= 12) {
-      // Assume reduction across 32 lanes has been done.
-      // Perform final reduction manually by summing values in lane 0 and
-      // lane 32.
-      Value lane0 = rewriter.create<arith::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
-      Value lane32 = rewriter.create<arith::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
-      dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
-      res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
-    } else {
-      return rewriter.notifyMatchFailure(
-          op, "Subgroup reduce lowering to DPP not currently supported for "
-              "this device.");
-    }
-    res = vector::makeArithReduction(rewriter, loc,
-                                     gpu::convertReductionKind(mode), res, dpp);
-  }
-  assert(res.getType() == input.getType());
-  return res;
-}
 
 /// Collect a set of patterns to lower `gpu.subgroup_reduce` into `amdgpu.dpp`
 /// ops over scalar types. Assumes that the subgroup has
@@ -481,9 +482,9 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
 struct ScalarSubgroupReduceToDPP final
     : OpRewritePattern<gpu::SubgroupReduceOp> {
   ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
-                            bool matchClustered, amdgpu::Chipset chipset,
-                            PatternBenefit benefit)
+                            unsigned shuffleBitwidth, bool matchClustered,
+                            amdgpu::Chipset chipset, PatternBenefit benefit)
       : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
-        matchClustered(matchClustered), chipset(chipset) {}
+        shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered),
+        chipset(chipset) {}
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
@@ -505,12 +506,42 @@ struct ScalarSubgroupReduceToDPP final
                  "clusters of contiguous lanes.");
 
     Type valueTy = op.getType();
     if (!valueTy.isIntOrFloat())
       return rewriter.notifyMatchFailure(
           op, "value type is not a compatible scalar");
+    unsigned elemBitwidth = valueTy.getIntOrFloatBitWidth();
+    if (elemBitwidth > shuffleBitwidth)
+      return rewriter.notifyMatchFailure(
+          op, "value type is wider than the supported shuffle bitwidth");
+
+    Location loc = op.getLoc();
+    // Since this is already a native shuffle scalar, no packing is necessary.
+    if (elemBitwidth == shuffleBitwidth) {
+      auto identityFn = [](Value v) { return v; };
+      FailureOr<Value> dpp =
+          createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(),
+                                     *ci, chipset, identityFn, identityFn);
+      if (failed(dpp))
+        return failure();
+      rewriter.replaceOp(op, dpp.value());
+      return success();
+    }
+
+    auto shuffleIntType = rewriter.getIntegerType(shuffleBitwidth);
+    auto equivIntType = rewriter.getIntegerType(elemBitwidth);
+    auto packFn = [loc, &rewriter, equivIntType,
+                   shuffleIntType](Value unpackedVal) -> Value {
+      auto asInt =
+          rewriter.create<arith::BitcastOp>(loc, equivIntType, unpackedVal);
+      return rewriter.create<arith::ExtUIOp>(loc, shuffleIntType, asInt);
+    };
+    auto unpackFn = [loc, &rewriter, equivIntType,
+                     valueTy](Value packedVal) -> Value {
+      auto asInt =
+          rewriter.create<arith::TruncIOp>(loc, equivIntType, packedVal);
+      return rewriter.create<arith::BitcastOp>(loc, valueTy, asInt);
+    };
 
     FailureOr<Value> dpp = createSubgroupDPPReduction(
-        rewriter, op, op.getValue(), op.getOp(), *ci, chipset);
+        rewriter, op, op.getValue(), op.getOp(), *ci, chipset, packFn,
+        unpackFn);
     if (failed(dpp))
       return failure();
@@ -520,6 +551,7 @@ struct ScalarSubgroupReduceToDPP final
 private:
   unsigned subgroupSize = 0;
+  unsigned shuffleBitwidth = 0;
   bool matchClustered = false;
   amdgpu::Chipset chipset;
 };
@@ -534,19 +566,19 @@ void mlir::populateGpuBreakDownSubgroupReducePatterns(
 }
 
 void mlir::populateGpuLowerSubgroupReduceToDPPPatterns(
-    RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
-    PatternBenefit benefit) {
-  patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
-                                          /*matchClustered=*/false, chipset,
-                                          benefit);
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/false, chipset, benefit);
 }
 
 void mlir::populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
-    RewritePatternSet &patterns, unsigned subgroupSize, amdgpu::Chipset chipset,
-    PatternBenefit benefit) {
-  patterns.add<ScalarSubgroupReduceToDPP>(patterns.getContext(), subgroupSize,
-                                          /*matchClustered=*/true, chipset,
-                                          benefit);
+    RewritePatternSet &patterns, unsigned subgroupSize,
+    unsigned shuffleBitwidth, amdgpu::Chipset chipset, PatternBenefit benefit) {
+  patterns.add<ScalarSubgroupReduceToDPP>(
+      patterns.getContext(), subgroupSize, shuffleBitwidth,
+      /*matchClustered=*/true, chipset, benefit);
 }
 
 void mlir::populateGpuLowerSubgroupReduceToShufflePatterns(
diff --git a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
index 69094c518a159..e7489eaac4988 100644
--- a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
@@ -1,14 +1,17 @@
 add_mlir_dialect_library(MLIRGPUUtils
   Utils.cpp
   DistributionUtils.cpp
+  ReductionUtils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils
 
   LINK_LIBS PUBLIC
-  MLIRArithDialect
   MLIRAffineDialect
+  MLIRArithDialect
+  MLIRAMDGPUDialect
   MLIRGPUDialect
+  MLIRROCDLDialect
   MLIRSupport
   MLIRIR
   )
diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
new file mode 100644
index 0000000000000..2f50a1ec87cba
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
@@ -0,0 +1,171 @@
+//===- ReductionUtils.cpp - Subgroup Reduction Utilities ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities for lowering `gpu.subgroup_reduce` ops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
+#include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+
+#include <cstdint>
+
+using namespace mlir;
+
+FailureOr<ClusterInfo>
+mlir::getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
+                                unsigned subgroupSize) {
+  assert(llvm::isPowerOf2_32(subgroupSize));
+
+  std::optional<uint32_t> clusterSize = op.getClusterSize();
+  assert(!clusterSize ||
+         llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+  if (clusterSize && *clusterSize > subgroupSize)
+    return op.emitOpError()
+           << "cluster size " << *clusterSize
+           << " is greater than subgroup size " << subgroupSize;
+  unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
+
+  auto clusterStride = op.getClusterStride();
+  assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
+  if (clusterStride >= subgroupSize)
+    return op.emitOpError()
+           << "cluster stride " << clusterStride
+           << " is not less than subgroup size " << subgroupSize;
+
+  return ClusterInfo{clusterStride, effectiveClusterSize, subgroupSize};
+}
+
+FailureOr<Value> mlir::createSubgroupDPPReduction(
+    PatternRewriter &rewriter, gpu::SubgroupReduceOp op, Value input,
+    gpu::AllReduceOperation mode, const ClusterInfo &ci,
+    amdgpu::Chipset chipset, function_ref<Value(Value)> packFn,
+    function_ref<Value(Value)> unpackFn) {
+  Location loc = op.getLoc();
+  Value dpp;
+  Value res = input;
+  constexpr int allRows = 0xf;
+  constexpr int allBanks = 0xf;
+  const bool boundCtrl = true;
+  // `res` stays in the original reduction type; only the value routed through
+  // the DPP/permlane/readlane ops is packed into the shuffle type and unpacked
+  // again before combining.
+  if (ci.clusterSize >= 2) {
+    // Perform reduction between all lanes N <-> N+1.
+    Value packed = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, packed.getType(), packed, packed, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(
+        rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+  }
+
+  if (ci.clusterSize >= 4) {
+    // Perform reduction between all lanes N <-> N+2.
+    Value packed = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, packed.getType(), packed, packed, amdgpu::DPPPerm::quad_perm,
+        rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(
+        rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+  }
+  if (ci.clusterSize >= 8) {
+    // Perform reduction between all lanes N <-> 7-N,
+    // e.g lane[0] <-> lane[7], lane[1] <-> lane[6]..., lane[3] <-> lane[4].
+    Value packed = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, packed.getType(), packed, packed,
+        amdgpu::DPPPerm::row_half_mirror, rewriter.getUnitAttr(), allRows,
+        allBanks, boundCtrl);
+    res = vector::makeArithReduction(
+        rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+  }
+  if (ci.clusterSize >= 16) {
+    // Perform reduction between all lanes N <-> 15-N,
+    // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
+    Value packed = packFn(res);
+    dpp = rewriter.create<amdgpu::DPPOp>(
+        loc, packed.getType(), packed, packed, amdgpu::DPPPerm::row_mirror,
+        rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
+    res = vector::makeArithReduction(
+        rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+  }
+  if (ci.clusterSize >= 32) {
+    if (chipset.majorVersion <= 9) {
+      // Broadcast last value from each row to next row.
+      // Use row mask to avoid polluting rows 1 and 3.
+      Value packed = packFn(res);
+      dpp = rewriter.create<amdgpu::DPPOp>(
+          loc, packed.getType(), packed, packed, amdgpu::DPPPerm::row_bcast_15,
+          rewriter.getUnitAttr(), 0xa, allBanks,
+          /*bound_ctrl*/ false);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+    } else if (chipset.majorVersion <= 12) {
+      // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
+      Value uint32Max = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
+      Value packed = packFn(res);
+      dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, packed.getType(),
+                                                  packed, packed, uint32Max,
+                                                  uint32Max,
+                                                  /*fi=*/true,
+                                                  /*bound_ctrl=*/false);
+      res = vector::makeArithReduction(
+          rewriter, loc, gpu::convertReductionKind(mode), res, unpackFn(dpp));
+      if (ci.subgroupSize == 32) {
+        Value lane0 = rewriter.create<arith::ConstantOp>(
+            loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+        packed = packFn(res);
+        res = unpackFn(rewriter.create<ROCDL::ReadlaneOp>(
+            loc, packed.getType(), packed, lane0));
+      }
+    } else {
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reduce lowering to DPP not currently supported for "
+              "this device.");
+    }
+  }
+  if (ci.clusterSize >= 64) {
+    if (chipset.majorVersion <= 9) {
+      // Broadcast 31st lane value to rows 2 and 3.
+      // Use row mask to avoid polluting rows 0 and 1.
+      Value packed = packFn(res);
+      dpp = rewriter.create<amdgpu::DPPOp>(
+          loc, packed.getType(), packed, packed, amdgpu::DPPPerm::row_bcast_31,
+          rewriter.getUnitAttr(), 0xc, allBanks,
+          /*bound_ctrl*/ false);
+      dpp = unpackFn(dpp);
+    } else if (chipset.majorVersion <= 12) {
+      // Assume reduction across 32 lanes has been done.
+      // Perform final reduction manually by summing values in lane 0 and
+      // lane 32.
+      Value lane0 = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
+      Value lane32 = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
+      Value packed = packFn(res);
+      dpp = unpackFn(rewriter.create<ROCDL::ReadlaneOp>(loc, packed.getType(),
+                                                        packed, lane32));
+      res = unpackFn(rewriter.create<ROCDL::ReadlaneOp>(loc, packed.getType(),
+                                                        packed, lane0));
+    } else {
+      return rewriter.notifyMatchFailure(
+          op, "Subgroup reduce lowering to DPP not currently supported for "
+              "this device.");
+    }
+    res = vector::makeArithReduction(rewriter, loc,
+                                     gpu::convertReductionKind(mode), res, dpp);
+  }
+  assert(res.getType() == input.getType());
+  return res;
+}
diff --git a/mlir/lib/Dialect/GPU/Utils/Utils.cpp b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
index 1f09875b3e273..53b1e0883055c 100644
--- a/mlir/lib/Dialect/GPU/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
@@ -41,4 +41,30 @@ vector::CombiningKind convertReductionKind(gpu::AllReduceOperation mode) {
   llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
 }
 
+gpu::AllReduceOperation convertReductionMode(vector::CombiningKind kind) {
+  switch (kind) {
+#define MAP_CASE(X)                                                            \
+  case vector::CombiningKind::X:                                               \
+    return gpu::AllReduceOperation::X
+
+    MAP_CASE(ADD);
+    MAP_CASE(MUL);
+    MAP_CASE(MINUI);
+    MAP_CASE(MINSI);
+    MAP_CASE(MINNUMF);
+    MAP_CASE(MAXSI);
+    MAP_CASE(MAXUI);
+    MAP_CASE(MAXNUMF);
+    MAP_CASE(AND);
+    MAP_CASE(OR);
+    MAP_CASE(XOR);
+    MAP_CASE(MINIMUMF);
+    MAP_CASE(MAXIMUMF);
+
+#undef MAP_CASE
+  }
+
+  llvm_unreachable("Vector and GPU reduction kinds should match 1:1");
+}
+
 } // namespace mlir::gpu
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index fe402da4cc105..4ebcf897fd532 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -93,9 +93,9 @@ struct TestGpuSubgroupReduceLoweringPass
     auto maybeChipset = amdgpu::Chipset::parse(target);
     if (succeeded(maybeChipset)) {
       populateGpuLowerSubgroupReduceToDPPPatterns(
-          patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+          patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset,
+          PatternBenefit(2));
       populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
-          patterns, /*subgroupSize=*/64, *maybeChipset, PatternBenefit(2));
+          patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset,
+          PatternBenefit(2));
     }
     populateGpuLowerSubgroupReduceToShufflePatterns(
         patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
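Example registration of the widened entry points (illustrative sketch only, not part of the patch; it mirrors the TestGpuRewrite.cpp change above, and the enclosing helper name is hypothetical):

// Sketch: how a downstream pass might register the DPP lowering patterns with
// the new shuffleBitwidth parameter. Assumes an AMD gfx9/gfx10/gfx11/gfx12
// target string such as "gfx942"; `populateMyReducePatterns` is illustrative.
#include "llvm/ADT/StringRef.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

static void populateMyReducePatterns(RewritePatternSet &patterns,
                                     llvm::StringRef chipsetName) {
  FailureOr<amdgpu::Chipset> chipset = amdgpu::Chipset::parse(chipsetName);
  if (failed(chipset))
    return;
  // Scalars narrower than 32 bits (e.g. f16) are packed into the 32-bit
  // shuffle type by the pattern's packFn/unpackFn around each DPP step.
  populateGpuLowerSubgroupReduceToDPPPatterns(patterns, /*subgroupSize=*/64,
                                              /*shuffleBitwidth=*/32, *chipset,
                                              PatternBenefit(2));
  populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
      patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *chipset,
      PatternBenefit(2));
}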