From de4364f23cb912a66505fb86c5de9539b2af3d7a Mon Sep 17 00:00:00 2001 From: Alan Li Date: Fri, 25 Apr 2025 12:37:59 -0400 Subject: [PATCH 1/4] [MLIR][ROCDL] Lower `gpu.subgroup_size` to `wavefrontsize` --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 2 + .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 37 ++++++++++++++++++- .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 10 +++-- mlir/test/Target/LLVMIR/rocdl.mlir | 4 ++ 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 186a4f53f93cb..93e59e0e7e6be 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -216,6 +216,8 @@ def ROCDL_BlockIdXOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.x">; def ROCDL_BlockIdYOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.y">; def ROCDL_BlockIdZOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.z">; +def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">; + //===----------------------------------------------------------------------===// // Thread range and Block range //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index e6dd6f135884e..d17fb4716d331 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -52,6 +52,25 @@ namespace mlir { using namespace mlir; +// Truncate or extend the result depending on the index bitwidth specified // by the LLVMTypeConverter options. 
+static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter, + Location loc, Value value, + const LLVMTypeConverter &converter) { + int64_t intWidth = cast(value.getType()).getWidth(); + int64_t indexBitwidth = converter.getIndexTypeBitwidth(); + auto indexBitwidthType = + IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth()); + // TODO: use <=> in C++20. + if (indexBitwidth > intWidth) { + return rewriter.create(loc, indexBitwidthType, value); + } + if (indexBitwidth < intWidth) { + return rewriter.create(loc, indexBitwidthType, value); + } + return value; +} + /// Returns true if the given `gpu.func` can be safely called using the bare /// pointer calling convention. static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) { @@ -113,6 +132,20 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { } }; +struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + LogicalResult + matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value wavefrontOp = rewriter.create( + op.getLoc(), IntegerType::get(rewriter.getContext(), 32)); + wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp, + *getTypeConverter()); + rewriter.replaceOp(op, {wavefrontOp}); + return success(); + } +}; + struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -405,7 +438,9 @@ void mlir::populateGpuToROCDLConversionPatterns( // TODO: Add alignment for workgroup memory patterns.add(converter); - patterns.add(converter); + patterns + .add( + converter); populateMathToROCDLConversionPatterns(converter, patterns); } diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 071cae9d5789f..5e3cad0cf26b0 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ 
b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -11,7 +11,7 @@ gpu.module @test_module { func.func @gpu_index_ops() -> (index, index, index, index, index, index, index, index, index, index, index, index, - index) { + index, index) { // CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64 // CHECK: rocdl.workitem.id.x : i32 @@ -59,12 +59,16 @@ gpu.module @test_module { // CHECK: = llvm.sext %{{.*}} : i32 to i64 %laneId = gpu.lane_id + // CHECK: = rocdl.wavefrontsize : i32 + // CHECK: = llvm.sext %{{.*}} : i32 to i64 + %subgroupSize = gpu.subgroup_size : index + func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ, %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ, - %laneId + %laneId, %subgroupSize : index, index, index, index, index, index, index, index, index, index, index, index, - index + index, index } } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 3db1f7b2b6427..3a0d3943fe207 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -32,6 +32,10 @@ llvm.func @rocdl_special_regs() -> i32 { // CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0) %14 = rocdl.workgroup.dim.x range : i64 + + // CHECK: call i32 $llvm.amdgcn.wavefrontsize() + %15 = rocdl.wavefrontsize : i32 + llvm.return %1 : i32 } From 1c75a81472535e0b27706df898d6799de32734a9 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Fri, 25 Apr 2025 16:06:04 -0400 Subject: [PATCH 2/4] updates --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 5 ++++- .../Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 2 +- mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 10 +++++++--- mlir/test/Target/LLVMIR/rocdl.mlir | 5 ++++- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 93e59e0e7e6be..3511f71b32866 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -216,7 
+216,10 @@ def ROCDL_BlockIdXOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.x">; def ROCDL_BlockIdYOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.y">; def ROCDL_BlockIdZOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.z">; -def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">; +def ROCDL_WavefrontSizeOp : ROCDL_IntrPure1Op<"wavefrontsize">, + Arguments<(ins OptionalAttr:$upper_bound)> { + let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($res)"; +} //===----------------------------------------------------------------------===// // Thread range and Block range diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index d17fb4716d331..e196aa17d61c2 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -138,7 +138,7 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern { matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value wavefrontOp = rewriter.create( - op.getLoc(), IntegerType::get(rewriter.getContext(), 32)); + op.getLoc(), rewriter.getI32Type(), op.getUpperBoundAttr()); wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp, *getTypeConverter()); rewriter.replaceOp(op, {wavefrontOp}); diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 5e3cad0cf26b0..3ed291ce11c4e 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -11,7 +11,7 @@ gpu.module @test_module { func.func @gpu_index_ops() -> (index, index, index, index, index, index, index, index, index, index, index, index, - index, index) { + index, index, index) { // CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64 // CHECK: rocdl.workitem.id.x : i32 @@ -63,12 +63,16 @@ gpu.module @test_module 
{ // CHECK: = llvm.sext %{{.*}} : i32 to i64 %subgroupSize = gpu.subgroup_size : index + // CHECK: = rocdl.wavefrontsize upper_bound 64 : i32 + // CHECK: = llvm.sext %{{.*}} : i32 to i64 + %subgroupSize2 = gpu.subgroup_size upper_bound 64 : index + func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ, %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ, - %laneId, %subgroupSize + %laneId, %subgroupSize, %subgroupSize2 : index, index, index, index, index, index, index, index, index, index, index, index, - index, index + index, index, index } } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 3a0d3943fe207..66be3dea66630 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -33,9 +33,12 @@ llvm.func @rocdl_special_regs() -> i32 { // CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0) %14 = rocdl.workgroup.dim.x range : i64 - // CHECK: call i32 $llvm.amdgcn.wavefrontsize() + // CHECK: call i32 @llvm.amdgcn.wavefrontsize() %15 = rocdl.wavefrontsize : i32 + // CHECK: call i32 @llvm.amdgcn.wavefrontsize() + %16 = rocdl.wavefrontsize upper_bound 32 : i32 + llvm.return %1 : i32 } From a29668da34fd9a5a0546c615c062ba277276d951 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Fri, 25 Apr 2025 16:20:07 -0400 Subject: [PATCH 3/4] Use range instead of single upper_bound --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 5 +---- mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 8 +++++++- mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 2 +- mlir/test/Target/LLVMIR/rocdl.mlir | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 3511f71b32866..93e59e0e7e6be 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -216,10 +216,7 @@ def ROCDL_BlockIdXOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.x">; def ROCDL_BlockIdYOp : 
ROCDL_SpecialIdRegisterOp<"workgroup.id.y">; def ROCDL_BlockIdZOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.z">; -def ROCDL_WavefrontSizeOp : ROCDL_IntrPure1Op<"wavefrontsize">, - Arguments<(ins OptionalAttr:$upper_bound)> { - let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($res)"; -} +def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">; //===----------------------------------------------------------------------===// // Thread range and Block range diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index e196aa17d61c2..c328ff96feb4e 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -137,8 +137,14 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern { LogicalResult matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { + LLVM::ConstantRangeAttr bounds = nullptr; + if (auto upperBoundAttr = op.getUpperBoundAttr()) { + bounds = rewriter.getAttr( + /*bitWidth=*/32, /*lower=*/32, + /*upper=*/op.getUpperBoundAttr().getInt()); + } Value wavefrontOp = rewriter.create( - op.getLoc(), rewriter.getI32Type(), op.getUpperBoundAttr()); + op.getLoc(), rewriter.getI32Type(), bounds); wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp, *getTypeConverter()); rewriter.replaceOp(op, {wavefrontOp}); diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 3ed291ce11c4e..640df84dcba8a 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -63,7 +63,7 @@ gpu.module @test_module { // CHECK: = llvm.sext %{{.*}} : i32 to i64 %subgroupSize = gpu.subgroup_size : index - // CHECK: = rocdl.wavefrontsize upper_bound 64 : i32 + // CHECK: = rocdl.wavefrontsize 
range : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 %subgroupSize2 = gpu.subgroup_size upper_bound 64 : index diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 66be3dea66630..663f115a7c5ce 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -36,8 +36,8 @@ llvm.func @rocdl_special_regs() -> i32 { // CHECK: call i32 @llvm.amdgcn.wavefrontsize() %15 = rocdl.wavefrontsize : i32 - // CHECK: call i32 @llvm.amdgcn.wavefrontsize() - %16 = rocdl.wavefrontsize upper_bound 32 : i32 + // CHECK: call range(i32 32, 64) i32 @llvm.amdgcn.wavefrontsize() + %16 = rocdl.wavefrontsize range : i32 llvm.return %1 : i32 } From 936d9e15d73aaf85fc09d7634ca1a1a0486669a4 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Fri, 25 Apr 2025 18:47:09 -0400 Subject: [PATCH 4/4] Another update --- .../Conversion/GPUToROCDL/GPUToROCDLPass.h | 7 ++++++- .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 19 +++++++++++++++---- .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 2 +- mlir/test/Target/LLVMIR/rocdl.mlir | 4 ++-- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h index 1a917932a9a84..291b809071ce9 100644 --- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h +++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -20,6 +20,10 @@ class RewritePatternSet; template class OperationPass; +namespace amdgpu { +struct Chipset; +} // namespace amdgpu + namespace gpu { class GPUModuleOp; } // namespace gpu @@ -32,7 +36,8 @@ class GPUModuleOp; /// The resulting pattern set should be run over a gpu.module op void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter, RewritePatternSet &patterns, - gpu::amd::Runtime runtime); + gpu::amd::Runtime runtime, + amdgpu::Chipset chipset); /// Configure target to convert from the GPU dialect to ROCDL. 
void configureGpuToROCDLConversionLegality(ConversionTarget &target); diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index c328ff96feb4e..6b180860ff4eb 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -134,14 +134,21 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter, + amdgpu::Chipset chipset) + : ConvertOpToLLVMPattern(converter), + chipset(chipset) {} + LogicalResult matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { LLVM::ConstantRangeAttr bounds = nullptr; + bool isBeforeGfx10 = chipset.majorVersion < 10; if (auto upperBoundAttr = op.getUpperBoundAttr()) { bounds = rewriter.getAttr( - /*bitWidth=*/32, /*lower=*/32, - /*upper=*/op.getUpperBoundAttr().getInt()); + /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 
64 : 32, + /*upper=*/op.getUpperBoundAttr().getInt() + 1); } Value wavefrontOp = rewriter.create( op.getLoc(), rewriter.getI32Type(), bounds); @@ -150,6 +157,8 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern { rewriter.replaceOp(op, {wavefrontOp}); return success(); } + + const amdgpu::Chipset chipset; }; struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { @@ -358,7 +367,8 @@ struct LowerGpuOpsToROCDLOpsPass final populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns, *maybeChipset); - populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime); + populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime, + *maybeChipset); configureGpuToROCDLConversionLegality(target); if (failed(applyPartialConversion(m, target, std::move(llvmPatterns)))) signalPassFailure(); @@ -406,7 +416,7 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) { void mlir::populateGpuToROCDLConversionPatterns( const LLVMTypeConverter &converter, RewritePatternSet &patterns, - mlir::gpu::amd::Runtime runtime) { + mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) { using gpu::index_lowering::IndexKind; using gpu::index_lowering::IntrType; using mlir::gpu::amd::Runtime; @@ -447,6 +457,7 @@ void mlir::populateGpuToROCDLConversionPatterns( patterns .add( converter); + patterns.add(converter, chipset); populateMathToROCDLConversionPatterns(converter, patterns); } diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 640df84dcba8a..4cb35a458fcfa 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -63,7 +63,7 @@ gpu.module @test_module { // CHECK: = llvm.sext %{{.*}} : i32 to i64 %subgroupSize = gpu.subgroup_size : index - // CHECK: = rocdl.wavefrontsize range : i32 + // CHECK: = rocdl.wavefrontsize range : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 %subgroupSize2 = 
gpu.subgroup_size upper_bound 64 : index diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 663f115a7c5ce..af47582dd0bfb 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -36,8 +36,8 @@ llvm.func @rocdl_special_regs() -> i32 { // CHECK: call i32 @llvm.amdgcn.wavefrontsize() %15 = rocdl.wavefrontsize : i32 - // CHECK: call range(i32 32, 64) i32 @llvm.amdgcn.wavefrontsize() - %16 = rocdl.wavefrontsize range : i32 + // CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wavefrontsize() + %16 = rocdl.wavefrontsize range : i32 llvm.return %1 : i32 }