diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index a2354e22e2745..d58ec7ef75e1f 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -93,19 +93,22 @@ class ROCDL_IntrPure1Op : class ROCDL_IntrOp overloadedResults, list overloadedOperands, list traits, int numResults, - int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list immArgPositions = [], + int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, + int requiresArgAndResultAttrs = 0, + list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOpBase; + requiresAliasAnalysis, 0, requiresArgAndResultAttrs, 0, + immArgPositions, immArgAttrNames>; // Subclass to save typing and ease readibility when there aren't overloaded // operands or memory accesses. class ROCDL_ConcreteNonMemIntrOp traits, int numResults, list immArgPositions = [], list immArgNames = []> - : ROCDL_IntrOp; //===----------------------------------------------------------------------===// // ROCDL special register op definitions @@ -148,8 +151,11 @@ class ROCDL_DimGetterFunctionOp : - ROCDL_IntrPure1Op<"mbcnt." # mnemonic>, - Arguments<(ins I32:$in0, I32:$in1)> { + ROCDL_IntrOp<"mbcnt." 
# mnemonic, [], [], [Pure], 1, + 0, 0, /*requiresArgAndResultAttrs=*/1> { + dag args = (ins I32:$in0, I32:$in1); + let arguments = !con(args, baseArgs); + let results = (outs I32:$res); let assemblyFormat = [{ $in0 `,` $in1 attr-dict `:` `(` type($in0) `,` type($in1) `)` `->` type($res) }]; @@ -501,7 +507,7 @@ def ROCDL_ds_read_tr16_b64 : ROCDL_LDS_Read_Tr_IntrOp<"ds.read.tr16.b64">; //===---------------------------------------------------------------------===// def ROCDL_LoadToLDSOp : - ROCDL_IntrOp<"load.to.lds", [], [0], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> { + ROCDL_IntrOp<"load.to.lds", [], [0], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> { dag args = (ins Arg:$globalPtr, Arg:$ldsPtr, I32Attr:$size, @@ -520,7 +526,7 @@ def ROCDL_LoadToLDSOp : } def ROCDL_GlobalLoadLDSOp : - ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> { + ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> { dag args = (ins Arg:$globalPtr, Arg:$ldsPtr, I32Attr:$size, @@ -734,7 +740,7 @@ def ROCDL_RawBufferAtomicUMinOp : // DPP Update intrinsic def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0], - [AllTypesMatch<["res", "src", "old"]>], 1, 0, 0, + [AllTypesMatch<["res", "src", "old"]>], 1, 0, 0, 0, [2, 3, 4, 5], ["dppCtrl", "rowMask", "bankMask", "boundCtrl"]>, Arguments<(ins LLVM_Type:$old, LLVM_Type:$src, I32Attr:$dppCtrl, I32Attr:$rowMask, I32Attr:$bankMask, I1Attr:$boundCtrl)> { @@ -746,7 +752,7 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0], // PermLaneX16 intrinsic operation def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0], - [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0, + [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0, 0, [4, 5], ["fi", "boundControl"]>, Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2, I1Attr:$fi, I1Attr:$boundControl)> { 
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index d22364e1ef441..e6fbcf98950a4 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -79,17 +79,30 @@ static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) { return canBeBare; } -static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, - const unsigned indexBitwidth) { +static Value getLaneId(RewriterBase &rewriter, Location loc) { auto int32Type = IntegerType::get(rewriter.getContext(), 32); Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32); Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32); - Value mbcntLo = ROCDL::MbcntLoOp::create(rewriter, loc, int32Type, - ValueRange{minus1, zero}); - Value laneId = ROCDL::MbcntHiOp::create(rewriter, loc, int32Type, - ValueRange{minus1, mbcntLo}); + NamedAttribute noundef = rewriter.getNamedAttr( + LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.getUnitAttr()); + NamedAttribute lowRange = rewriter.getNamedAttr( + LLVM::LLVMDialect::getRangeAttrName(), + LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32), + APInt(32, 33))); /* wave64 can yield 32 */ + NamedAttribute highRange = rewriter.getNamedAttr( + LLVM::LLVMDialect::getRangeAttrName(), + LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32), + APInt(32, 64))); + Value mbcntLo = ROCDL::MbcntLoOp::create( + rewriter, loc, int32Type, minus1, zero, /*arg_attrs=*/{}, + /*res_attrs=*/ + rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, lowRange}))); + Value laneId = ROCDL::MbcntHiOp::create( + rewriter, loc, int32Type, minus1, mbcntLo, /*arg_attrs=*/{}, + rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, highRange}))); return laneId; } + static constexpr StringLiteral amdgcnDataLayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" 
"-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:" @@ -104,18 +117,16 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { LogicalResult matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto loc = op->getLoc(); + Location loc = op.getLoc(); MLIRContext *context = rewriter.getContext(); - // convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0) - // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo) - - Type intTy = IntegerType::get(context, 32); - Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32); - Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32); - Value mbcntLo = ROCDL::MbcntLoOp::create(rewriter, loc, intTy, - ValueRange{minus1, zero}); - Value laneId = ROCDL::MbcntHiOp::create(rewriter, loc, intTy, - ValueRange{minus1, mbcntLo}); + // convert to (mbcnt.lo reaches 32 on lanes 32-63 of a wave64): + // %mlo = call noundef range(i32 0, 33) + // @llvm.amdgcn.mbcnt.lo(-1, 0) + // followed by: + // %lid = call noundef range(i32 0, 64) + // @llvm.amdgcn.mbcnt.hi(-1, %mlo) + + Value laneId = getLaneId(rewriter, loc); // Truncate or extend the result depending on the index bitwidth specified // by the LLVMTypeConverter options. 
const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); @@ -185,8 +196,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { Location loc = op->getLoc(); Value initShflValue = adaptor.getValue(); - const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); - Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth); + Value srcLaneId = getLaneId(rewriter, loc); auto int32Type = IntegerType::get(rewriter.getContext(), 32); Value width = adaptor.getWidth(); diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 2b6adffc81f72..fa4a9749f6a9b 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -54,8 +54,8 @@ gpu.module @test_module { // CHECK: = llvm.sext %{{.*}} : i32 to i64 %gDimZ = gpu.grid_dim z - // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} : (i32, i32) -> i32 - // CHECK: = rocdl.mbcnt.hi %{{.*}}, %{{.*}} : (i32, i32) -> i32 + // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range}]} : (i32, i32) -> i32 + // CHECK: = rocdl.mbcnt.hi %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range}]} : (i32, i32) -> i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 %laneId = gpu.lane_id @@ -701,7 +701,7 @@ gpu.module @test_module { // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32 // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32 // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32 - %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 + %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 // *** UP mode shuffle *** // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32 diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 
740990a6e589b..c101d071875f4 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -86,12 +86,12 @@ llvm.func @kernel_func_unsafe_fp_atomics() } llvm.func @rocdl.lane_id() -> i32 { - // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) - // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]]) + // CHECK: [[mbcntlo:%.+]] = call noundef range(i32 0, 32) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + // CHECK-NEXT: call noundef range(i32 0, 64) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]]) %0 = llvm.mlir.constant(-1 : i32) : i32 %1 = llvm.mlir.constant(0 : i32) : i32 - %2 = rocdl.mbcnt.lo %0, %1 : (i32, i32) -> i32 - %3 = rocdl.mbcnt.hi %0, %2 : (i32, i32) -> i32 + %2 = rocdl.mbcnt.lo %0, %1 {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range}]} : (i32, i32) -> i32 + %3 = rocdl.mbcnt.hi %0, %2 {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range}]} : (i32, i32) -> i32 llvm.return %3 : i32 }