- 
                Notifications
    
You must be signed in to change notification settings  - Fork 15.1k
 
[mlir][ROCDL] Annotate lane ID functions with noundef, ranges #151396
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Now that we have general support for setting argument and result attributes on LLVM intrinsics, extend the definitions of mbcnt.lo and mbcnt.hi to carry such attributes. With that, update the construction of the mbcnt.lo/mbcnt.hi calls used to get the lane ID to be `noundef` (since the lane ID is always defined) and to be annotated with the correct ranges (so that generic LLVM passes can correctly optimized based on the fact that there are never more than 32/64 lanes). (Also, handle a pattern that wasn't using getLaneId() and get rid of a dead argument)
| 
          
 @llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Krzysztof Drewniak (krzysz00) ChangesNow that we have general support for setting argument and result attributes on LLVM intrinsics, extend the definitions of mbcnt.lo and mbcnt.hi to carry such attributes. With that, update the construction of the mbcnt.lo/mbcnt.hi calls used to get the lane ID to be  (Also, handle a pattern that wasn't using getLaneId() and get rid of a dead argument) Full diff: https://github.com/llvm/llvm-project/pull/151396.diff 4 Files Affected: 
 diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index a2354e22e2745..d58ec7ef75e1f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -93,19 +93,22 @@ class ROCDL_IntrPure1Op<string mnemonic> :
 
 class ROCDL_IntrOp<string mnemonic, list<int> overloadedResults,
   list<int> overloadedOperands, list<Trait> traits, int numResults,
-  int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list<int> immArgPositions = [],
+  int requiresAccessGroup = 0, int requiresAliasAnalysis = 0,
+  int requiresArgAndResultAttrs = 0,
+  list<int> immArgPositions = [],
   list<string> immArgAttrNames = []> :
   LLVM_IntrOpBase<ROCDL_Dialect,  mnemonic,
     "amdgcn_" # !subst(".", "_", mnemonic), overloadedResults,
     overloadedOperands, traits, numResults, requiresAccessGroup,
-    requiresAliasAnalysis, 0, 0, 0, immArgPositions, immArgAttrNames>;
+    requiresAliasAnalysis, 0, requiresArgAndResultAttrs, 0,
+    immArgPositions, immArgAttrNames>;
 
 // Subclass to save typing and ease readibility when there aren't overloaded
 // operands or memory accesses.
 class ROCDL_ConcreteNonMemIntrOp<string mnemonic, list<Trait> traits,
     int numResults, list<int> immArgPositions = [],
     list<string> immArgNames = []>
-  : ROCDL_IntrOp<mnemonic, [], [], traits, numResults, 0, 0,
+  : ROCDL_IntrOp<mnemonic, [], [], traits, numResults, 0, 0, 0,
       immArgPositions, immArgNames>;
 //===----------------------------------------------------------------------===//
 // ROCDL special register op definitions
@@ -148,8 +151,11 @@ class ROCDL_DimGetterFunctionOp<string mnemonic, string device_function,
 //===----------------------------------------------------------------------===//
 
 class ROCDL_MbcntOp<string mnemonic> :
-    ROCDL_IntrPure1Op<"mbcnt." # mnemonic>,
-  Arguments<(ins I32:$in0, I32:$in1)> {
+    ROCDL_IntrOp<"mbcnt." # mnemonic, [], [], [Pure], 1,
+    0, 0, /*requiresArgAndResultAttrs=*/1> {
+  dag args = (ins I32:$in0, I32:$in1);
+  let arguments = !con(args, baseArgs);
+  let results = (outs I32:$res);
   let assemblyFormat = [{
     $in0 `,` $in1  attr-dict `:` `(` type($in0) `,` type($in1) `)` `->` type($res)
    }];
@@ -501,7 +507,7 @@ def ROCDL_ds_read_tr16_b64 : ROCDL_LDS_Read_Tr_IntrOp<"ds.read.tr16.b64">;
 //===---------------------------------------------------------------------===//
 
 def ROCDL_LoadToLDSOp :
-  ROCDL_IntrOp<"load.to.lds", [], [0], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> {
+  ROCDL_IntrOp<"load.to.lds", [], [0], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
   dag args = (ins Arg<LLVM_AnyPointer, "", [MemRead]>:$globalPtr,
                  Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
                  I32Attr:$size,
@@ -520,7 +526,7 @@ def ROCDL_LoadToLDSOp :
 }
 
 def ROCDL_GlobalLoadLDSOp :
-  ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> {
+  ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
   dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
                  Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
                  I32Attr:$size,
@@ -734,7 +740,7 @@ def ROCDL_RawBufferAtomicUMinOp :
 
 // DPP Update intrinsic
 def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
-    [AllTypesMatch<["res", "src", "old"]>], 1, 0, 0,
+    [AllTypesMatch<["res", "src", "old"]>], 1, 0, 0, 0,
       [2, 3, 4, 5], ["dppCtrl", "rowMask", "bankMask", "boundCtrl"]>,
   Arguments<(ins LLVM_Type:$old, LLVM_Type:$src, I32Attr:$dppCtrl, I32Attr:$rowMask,
       I32Attr:$bankMask, I1Attr:$boundCtrl)> {
@@ -746,7 +752,7 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
 
 // PermLaneX16 intrinsic operation
 def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
-    [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0,
+    [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0, 0,
     [4, 5], ["fi", "boundControl"]>,
   Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2,
              I1Attr:$fi, I1Attr:$boundControl)> {
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d22364e1ef441..e6fbcf98950a4 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -79,17 +79,30 @@ static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
   return canBeBare;
 }
 
-static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
-                       const unsigned indexBitwidth) {
+static Value getLaneId(RewriterBase &rewriter, Location loc) {
   auto int32Type = IntegerType::get(rewriter.getContext(), 32);
   Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32);
   Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32);
-  Value mbcntLo = ROCDL::MbcntLoOp::create(rewriter, loc, int32Type,
-                                           ValueRange{minus1, zero});
-  Value laneId = ROCDL::MbcntHiOp::create(rewriter, loc, int32Type,
-                                          ValueRange{minus1, mbcntLo});
+  NamedAttribute noundef = rewriter.getNamedAttr(
+      LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.getUnitAttr());
+  NamedAttribute lowRange = rewriter.getNamedAttr(
+      LLVM::LLVMDialect::getRangeAttrName(),
+      LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
+                                   APInt(32, 32)));
+  NamedAttribute highRange = rewriter.getNamedAttr(
+      LLVM::LLVMDialect::getRangeAttrName(),
+      LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
+                                   APInt(32, 64)));
+  Value mbcntLo = ROCDL::MbcntLoOp::create(
+      rewriter, loc, int32Type, minus1, zero, /*arg_attrs=*/{},
+      /*res_attrs=*/
+      rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, lowRange})));
+  Value laneId = ROCDL::MbcntHiOp::create(
+      rewriter, loc, int32Type, minus1, mbcntLo, /*arg_attrs=*/{},
+      rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, highRange})));
   return laneId;
 }
+
 static constexpr StringLiteral amdgcnDataLayout =
     "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
     "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
@@ -104,18 +117,16 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
   LogicalResult
   matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto loc = op->getLoc();
+    Location loc = op.getLoc();
     MLIRContext *context = rewriter.getContext();
-    // convert to:  %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
-    // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)
-
-    Type intTy = IntegerType::get(context, 32);
-    Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32);
-    Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32);
-    Value mbcntLo = ROCDL::MbcntLoOp::create(rewriter, loc, intTy,
-                                             ValueRange{minus1, zero});
-    Value laneId = ROCDL::MbcntHiOp::create(rewriter, loc, intTy,
-                                            ValueRange{minus1, mbcntLo});
+    // convert to:
+    //   %mlo = call noundef range(i32 0, 32)
+    //     @llvm.amdgcn.mbcnt.lo(-1, 0)
+    // followed by:
+    //   %lid = call noundef range(i32 0, 64)
+    //     @llvm.amdgcn.mbcnt.hi(-1, %mlo)
+
+    Value laneId = getLaneId(rewriter, loc);
     // Truncate or extend the result depending on the index bitwidth specified
     // by the LLVMTypeConverter options.
     const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
@@ -185,8 +196,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
     Location loc = op->getLoc();
     Value initShflValue = adaptor.getValue();
 
-    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
-    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);
+    Value srcLaneId = getLaneId(rewriter, loc);
 
     auto int32Type = IntegerType::get(rewriter.getContext(), 32);
     Value width = adaptor.getWidth();
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 2b6adffc81f72..fa4a9749f6a9b 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -54,8 +54,8 @@ gpu.module @test_module {
     // CHECK: = llvm.sext %{{.*}} : i32 to i64
     %gDimZ = gpu.grid_dim z
 
-    // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} : (i32, i32) -> i32
-    // CHECK: = rocdl.mbcnt.hi %{{.*}}, %{{.*}} : (i32, i32) -> i32
+    // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 32>}]} : (i32, i32) -> i32
+    // CHECK: = rocdl.mbcnt.hi %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 64>}]} : (i32, i32) -> i32
     // CHECK: = llvm.sext %{{.*}} : i32 to i64
     %laneId = gpu.lane_id
 
@@ -701,7 +701,7 @@ gpu.module @test_module {
     // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
     // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
     // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
-    %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 
+    %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
     // *** UP mode shuffle ***
     // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
     // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 740990a6e589b..c101d071875f4 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -86,12 +86,12 @@ llvm.func @kernel_func_unsafe_fp_atomics()
 }
 
 llvm.func @rocdl.lane_id() -> i32 {
-  // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-  // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
+  // CHECK: [[mbcntlo:%.+]] = call noundef range(i32 0, 32) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  // CHECK-NEXT: call noundef range(i32 0, 64) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
   %0 = llvm.mlir.constant(-1 : i32) : i32
   %1 = llvm.mlir.constant(0 : i32) : i32
-  %2 = rocdl.mbcnt.lo %0, %1 : (i32, i32) -> i32
-  %3 = rocdl.mbcnt.hi %0, %2 : (i32, i32) -> i32
+  %2 = rocdl.mbcnt.lo %0, %1 {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 32>}]} : (i32, i32) -> i32
+  %3 = rocdl.mbcnt.hi %0, %2 {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 64>}]} : (i32, i32) -> i32
   llvm.return %3 : i32
 }
 
 | 
    
| 
           Ping  | 
    
Now that we have general support for setting argument and result attributes on LLVM intrinsics, extend the definitions of mbcnt.lo and mbcnt.hi to carry such attributes. With that, update the construction of the mbcnt.lo/mbcnt.hi calls used to get the lane ID to be
noundef(since the lane ID is always defined) and to be annotated with the correct ranges (so that generic LLVM passes can correctly optimized based on the fact that there are never more than 32/64 lanes).(Also, handle a pattern that wasn't using getLaneId() and get rid of a dead argument)