Skip to content

Commit 7bfb83f

Browse files
committed
Update clang and MLIR to know about the overload
Also fix MLIR to represent immargs properly
1 parent 976aa3b commit 7bfb83f

File tree

9 files changed

+38
-42
lines changed

9 files changed

+38
-42
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
257257
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
258258
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
259259
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
260-
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
260+
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
261261

262262
//===----------------------------------------------------------------------===//
263263
// Deep learning builtins.

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
574574
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
575575
return Builder.CreateCall(F, {Addr});
576576
}
577+
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
578+
return emitBuiltinWithOneOverloadedOperand<5>(
579+
*this, E, Intrinsic::amdgcn_global_load_lds);
580+
}
577581
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
578582
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
579583
{llvm::Type::getInt64Ty(getLLVMContext())});

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1774,7 +1774,7 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
17741774
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
17751775
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
17761776
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
1777-
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
1777+
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
17781778
// CHECK-NEXT: ret void
17791779
//
17801780
void test_global_load_lds_96(global void* src, local void *dst) {
@@ -1789,7 +1789,7 @@ void test_global_load_lds_96(global void* src, local void *dst) {
17891789
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
17901790
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
17911791
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
1792-
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
1792+
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
17931793
// CHECK-NEXT: ret void
17941794
//
17951795
void test_global_load_lds_128(global void* src, local void *dst) {

clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ typedef unsigned char u8;
1818
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
1919
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
2020
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
21-
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
21+
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
2222
// CHECK-NEXT: ret void
2323
//
2424
void test_global_load_lds_u32(global u32* src, local u32 *dst) {
@@ -35,7 +35,7 @@ void test_global_load_lds_u32(global u32* src, local u32 *dst) {
3535
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
3636
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
3737
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
38-
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
38+
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
3939
// CHECK-NEXT: ret void
4040
//
4141
void test_global_load_lds_u16(global u16* src, local u16 *dst) {
@@ -52,7 +52,7 @@ void test_global_load_lds_u16(global u16* src, local u16 *dst) {
5252
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
5353
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
5454
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
55-
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
55+
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
5656
// CHECK-NEXT: ret void
5757
//
5858
void test_global_load_lds_u8(global u8* src, local u8 *dst) {

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -447,14 +447,17 @@ def ROCDL_ds_read_tr16_b64 : ROCDL_LDS_Read_Tr_IntrOp<"ds.read.tr16.b64">;
447447
// Global load to LDS intrinsic (available in GFX950)
448448

449449
def ROCDL_GlobalLoadLDSOp :
450-
ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1> {
451-
dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
450+
ROCDL_IntrOp<"global.load.lds", [], [0], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> {
451+
dag args = (ins Arg<LLVM_AnyPointer, "", [MemRead]>:$globalPtr,
452452
Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
453-
I32:$size,
454-
I32:$offset,
455-
I32:$aux);
453+
I32Attr:$size,
454+
I32Attr:$offset,
455+
I32Attr:$aux);
456456
let arguments = !con(args, aliasAttrs);
457-
let assemblyFormat = "operands attr-dict";
457+
let assemblyFormat = [{
458+
$globalPtr `,` $ldsPtr `,` $size `,` $offset `,` $aux
459+
attr-dict `:` type($globalPtr)
460+
}];
458461
let extraClassDefinition = [{
459462
::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
460463
return {getGlobalPtr(), getLdsPtr()};

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,9 +1050,9 @@ struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
10501050
(adaptor.getDstIndices()), rewriter);
10511051

10521052
rewriter.replaceOpWithNewOp<ROCDL::GlobalLoadLDSOp>(
1053-
op, srcPtr, dstPtr, createI32Constant(rewriter, loc, loadWidth),
1054-
createI32Constant(rewriter, loc, 0),
1055-
createI32Constant(rewriter, loc, 0), ArrayAttr{}, ArrayAttr{},
1053+
op, srcPtr, dstPtr, rewriter.getI32IntegerAttr(loadWidth),
1054+
/*offset=*/rewriter.getI32IntegerAttr(0),
1055+
/*aux=*/rewriter.getI32IntegerAttr(0), ArrayAttr{}, ArrayAttr{},
10561056
ArrayAttr{});
10571057

10581058
return success();

mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add
2121

2222
// CHECK: %[[ALLOC:.*]] = memref.alloc()
2323
// CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast
24-
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
25-
24+
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
25+
2626
// CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
2727
// CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
2828
// CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
@@ -35,8 +35,7 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add
3535
// CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
3636

3737
// CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
38-
// CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
39-
// CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]]
38+
// CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
4039
amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0]
4140
: f32, memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace>
4241
func.return
@@ -56,8 +55,8 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs
5655

5756
// CHECK: %[[ALLOC:.*]] = memref.alloc()
5857
// CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
59-
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
60-
58+
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
59+
6160
// CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
6261
// CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
6362
// CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
@@ -70,8 +69,7 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs
7069
// CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
7170

7271
// CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
73-
// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32
74-
// CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C1]]
72+
// CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 1
7573
%c0 = arith.constant 0 : index
7674
%c12 = arith.constant 12 : index
7775
%c32 = arith.constant 32 : index
@@ -85,7 +83,7 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs
8583
// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi16, 1>)
8684
func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_addrspace>) {
8785
// CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
88-
86+
8987
// CHECK: %[[C0:.*]] = arith.constant 0 : index
9088
// CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64
9189
// CHECK: %[[C12:.*]] = arith.constant 12 : index
@@ -95,8 +93,8 @@ func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_add
9593

9694
// CHECK: %[[ALLOC:.*]] = memref.alloc()
9795
// CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
98-
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
99-
96+
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
97+
10098
// CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
10199
// CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
102100
// CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
@@ -109,8 +107,7 @@ func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_add
109107
// CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
110108

111109
// CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
112-
// CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
113-
// CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]]
110+
// CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
114111
%c0 = arith.constant 0 : index
115112
%c12 = arith.constant 12 : index
116113
%c32 = arith.constant 32 : index
@@ -129,12 +126,11 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g
129126
// CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
130127
// CHECK: %[[ALLOC:.*]] = memref.alloc()
131128
// CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
132-
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
129+
// CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
133130
// CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
134131
// CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
135132
// CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX_CAST]]]
136-
// CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32
137-
// CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]]
133+
// CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
138134
%alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
139135
%c0 = arith.constant 0 : index
140136
amdgpu.gather_to_lds %global[%src_idx], %alloc[%dst_idx, %c0]

mlir/test/Dialect/LLVMIR/rocdl.mlir

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -637,12 +637,8 @@ llvm.func @rocdl.ds.read.tr(%ptr : !llvm.ptr<3>) -> vector<4xf16> {
637637
}
638638

639639
llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
640-
%aux = llvm.mlir.constant(0 : i32) : i32
641-
%offset = llvm.mlir.constant(0 : i32) : i32
642-
%size = llvm.mlir.constant(10 : i32) : i32
643-
644-
//CHECK: rocdl.global.load.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
645-
rocdl.global.load.lds %src, %dst, %size, %offset, %aux
640+
//CHECK: rocdl.global.load.lds %{{.*}}, %{{.*}}, 4, 0, 0 : <1>
641+
rocdl.global.load.lds %src, %dst, 4, 0, 0 : <1>
646642

647643
llvm.return
648644
}

mlir/test/Target/LLVMIR/rocdl.mlir

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -842,11 +842,8 @@ llvm.func @rocdl.ds.read.tr(%ptr : !llvm.ptr<3>) -> vector<4xf16> {
842842
}
843843

844844
llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
845-
%aux = llvm.mlir.constant(0 : i32) : i32
846-
%offset = llvm.mlir.constant(0 : i32) : i32
847-
%size = llvm.mlir.constant(10 : i32) : i32
848-
//CHECK: call void @llvm.amdgcn.global.load.lds
849-
rocdl.global.load.lds %src, %dst, %size, %offset, %aux
845+
//CHECK: call void @llvm.amdgcn.global.load.lds.p1
846+
rocdl.global.load.lds %src, %dst, 4, 0, 0 : !llvm.ptr<1>
850847
llvm.return
851848
}
852849

0 commit comments

Comments
 (0)