From b4855f9eff2846ef3194f815f83a28a9c492bbe3 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Wed, 30 Jul 2025 21:16:37 +0000 Subject: [PATCH] [mlir][ROCDL] Annotate lane ID functions with noundef, ranges Now that we have general support for setting argument and result attributes on LLVM intrinsics, extend the definitions of mbcnt.lo and mbcnt.hi to carry such attributes. With that, update the construction of the mbcnt.lo/mbcnt.hi calls used to get the lane ID to be `noundef` (since the lane ID is always defined) and to be annotated with the correct ranges (so that generic LLVM passes can correctly optimize based on the fact that there are never more than 32/64 lanes). (Also, handle a pattern that wasn't using getLaneId() and get rid of a dead argument) --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 24 ++++++---- .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 48 +++++++++++-------- .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 6 +-- mlir/test/Target/LLVMIR/rocdl.mlir | 8 ++-- 4 files changed, 51 insertions(+), 35 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index a2354e22e2745..d58ec7ef75e1f 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -93,19 +93,22 @@ class ROCDL_IntrPure1Op : class ROCDL_IntrOp overloadedResults, list overloadedOperands, list traits, int numResults, - int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list immArgPositions = [], + int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, + int requiresArgAndResultAttrs = 0, + list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOpBase; + requiresAliasAnalysis, 0, requiresArgAndResultAttrs, 0, + immArgPositions, immArgAttrNames>; // Subclass to save typing and ease readibility when there aren't overloaded // operands or memory accesses. 
class ROCDL_ConcreteNonMemIntrOp traits, int numResults, list immArgPositions = [], list immArgNames = []> - : ROCDL_IntrOp; //===----------------------------------------------------------------------===// // ROCDL special register op definitions @@ -148,8 +151,11 @@ class ROCDL_DimGetterFunctionOp : - ROCDL_IntrPure1Op<"mbcnt." # mnemonic>, - Arguments<(ins I32:$in0, I32:$in1)> { + ROCDL_IntrOp<"mbcnt." # mnemonic, [], [], [Pure], 1, + 0, 0, /*requiresArgAndResultAttrs=*/1> { + dag args = (ins I32:$in0, I32:$in1); + let arguments = !con(args, baseArgs); + let results = (outs I32:$res); let assemblyFormat = [{ $in0 `,` $in1 attr-dict `:` `(` type($in0) `,` type($in1) `)` `->` type($res) }]; @@ -501,7 +507,7 @@ def ROCDL_ds_read_tr16_b64 : ROCDL_LDS_Read_Tr_IntrOp<"ds.read.tr16.b64">; //===---------------------------------------------------------------------===// def ROCDL_LoadToLDSOp : - ROCDL_IntrOp<"load.to.lds", [], [0], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> { + ROCDL_IntrOp<"load.to.lds", [], [0], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> { dag args = (ins Arg:$globalPtr, Arg:$ldsPtr, I32Attr:$size, @@ -520,7 +526,7 @@ def ROCDL_LoadToLDSOp : } def ROCDL_GlobalLoadLDSOp : - ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> { + ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> { dag args = (ins Arg:$globalPtr, Arg:$ldsPtr, I32Attr:$size, @@ -734,7 +740,7 @@ def ROCDL_RawBufferAtomicUMinOp : // DPP Update intrinsic def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0], - [AllTypesMatch<["res", "src", "old"]>], 1, 0, 0, + [AllTypesMatch<["res", "src", "old"]>], 1, 0, 0, 0, [2, 3, 4, 5], ["dppCtrl", "rowMask", "bankMask", "boundCtrl"]>, Arguments<(ins LLVM_Type:$old, LLVM_Type:$src, I32Attr:$dppCtrl, I32Attr:$rowMask, I32Attr:$bankMask, I1Attr:$boundCtrl)> { @@ -746,7 +752,7 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0], // PermLaneX16 
intrinsic operation def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0], - [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0, + [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0, 0, [4, 5], ["fi", "boundControl"]>, Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2, I1Attr:$fi, I1Attr:$boundControl)> { diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index d22364e1ef441..e6fbcf98950a4 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -79,17 +79,30 @@ static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) { return canBeBare; } -static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, - const unsigned indexBitwidth) { +static Value getLaneId(RewriterBase &rewriter, Location loc) { auto int32Type = IntegerType::get(rewriter.getContext(), 32); Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32); Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32); - Value mbcntLo = ROCDL::MbcntLoOp::create(rewriter, loc, int32Type, - ValueRange{minus1, zero}); - Value laneId = ROCDL::MbcntHiOp::create(rewriter, loc, int32Type, - ValueRange{minus1, mbcntLo}); + NamedAttribute noundef = rewriter.getNamedAttr( + LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.getUnitAttr()); + NamedAttribute lowRange = rewriter.getNamedAttr( + LLVM::LLVMDialect::getRangeAttrName(), + LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32), + APInt(32, 32))); + NamedAttribute highRange = rewriter.getNamedAttr( + LLVM::LLVMDialect::getRangeAttrName(), + LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32), + APInt(32, 64))); + Value mbcntLo = ROCDL::MbcntLoOp::create( + rewriter, loc, int32Type, minus1, zero, /*arg_attrs=*/{}, + /*res_attrs=*/ + 
rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, lowRange}))); + Value laneId = ROCDL::MbcntHiOp::create( + rewriter, loc, int32Type, minus1, mbcntLo, /*arg_attrs=*/{}, + rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, highRange}))); return laneId; } + static constexpr StringLiteral amdgcnDataLayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:" @@ -104,18 +117,16 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { LogicalResult matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto loc = op->getLoc(); + Location loc = op.getLoc(); MLIRContext *context = rewriter.getContext(); - // convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0) - // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo) - - Type intTy = IntegerType::get(context, 32); - Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32); - Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32); - Value mbcntLo = ROCDL::MbcntLoOp::create(rewriter, loc, intTy, - ValueRange{minus1, zero}); - Value laneId = ROCDL::MbcntHiOp::create(rewriter, loc, intTy, - ValueRange{minus1, mbcntLo}); + // convert to: + // %mlo = call noundef range(i32 0, 32) + // @llvm.amdgcn.mbcnt.lo(-1, 0) + // followed by: + // %lid = call noundef range(i32 0, 64) + // @llvm.amdgcn.mbcnt.hi(-1, %mlo) + + Value laneId = getLaneId(rewriter, loc); // Truncate or extend the result depending on the index bitwidth specified // by the LLVMTypeConverter options. 
const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); @@ -185,8 +196,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { Location loc = op->getLoc(); Value initShflValue = adaptor.getValue(); - const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); - Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth); + Value srcLaneId = getLaneId(rewriter, loc); auto int32Type = IntegerType::get(rewriter.getContext(), 32); Value width = adaptor.getWidth(); diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 2b6adffc81f72..fa4a9749f6a9b 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -54,8 +54,8 @@ gpu.module @test_module { // CHECK: = llvm.sext %{{.*}} : i32 to i64 %gDimZ = gpu.grid_dim z - // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} : (i32, i32) -> i32 - // CHECK: = rocdl.mbcnt.hi %{{.*}}, %{{.*}} : (i32, i32) -> i32 + // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range}]} : (i32, i32) -> i32 + // CHECK: = rocdl.mbcnt.hi %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range}]} : (i32, i32) -> i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 %laneId = gpu.lane_id @@ -701,7 +701,7 @@ gpu.module @test_module { // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32 // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32 // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32 - %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 + %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 // *** UP mode shuffle *** // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32 diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 
740990a6e589b..c101d071875f4 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -86,12 +86,12 @@ llvm.func @kernel_func_unsafe_fp_atomics() } llvm.func @rocdl.lane_id() -> i32 { - // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) - // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]]) + // CHECK: [[mbcntlo:%.+]] = call noundef range(i32 0, 32) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + // CHECK-NEXT: call noundef range(i32 0, 64) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]]) %0 = llvm.mlir.constant(-1 : i32) : i32 %1 = llvm.mlir.constant(0 : i32) : i32 - %2 = rocdl.mbcnt.lo %0, %1 : (i32, i32) -> i32 - %3 = rocdl.mbcnt.hi %0, %2 : (i32, i32) -> i32 + %2 = rocdl.mbcnt.lo %0, %1 {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range}]} : (i32, i32) -> i32 + %3 = rocdl.mbcnt.hi %0, %2 {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range}]} : (i32, i32) -> i32 llvm.return %3 : i32 }