Skip to content

Commit d7a38a9

Browse files
authored
[AMDGPU] Support builtin/intrinsics for load monitors on gfx1250 (#150540)
1 parent af98a24 commit d7a38a9

File tree

14 files changed

+430
-0
lines changed

14 files changed

+430
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,13 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
645645
TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
646646
TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")
647647

648+
TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b32, "ii*1Ii", "nc", "gfx1250-insts")
649+
TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b64, "V2iV2i*1Ii", "nc", "gfx1250-insts")
650+
TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "gfx1250-insts")
651+
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts")
652+
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts")
653+
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts")
654+
648655
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
649656
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
650657
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,41 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
633633
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
634634
return Builder.CreateCall(F, {Addr});
635635
}
636+
// Lower the six load-monitor builtins (global/flat, each in b32/b64/b128
// variants) to a call of the intrinsic with the matching name.
case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
637+
case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
638+
case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
639+
case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
640+
case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
641+
case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128: {
642+
643+
// IID has no initializer; the inner switch below handles exactly the same
// six builtin IDs as the enclosing case labels, so it is always assigned
// before use.
Intrinsic::ID IID;
644+
switch (BuiltinID) {
645+
case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b32:
646+
IID = Intrinsic::amdgcn_global_load_monitor_b32;
647+
break;
648+
case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b64:
649+
IID = Intrinsic::amdgcn_global_load_monitor_b64;
650+
break;
651+
case AMDGPU::BI__builtin_amdgcn_global_load_monitor_b128:
652+
IID = Intrinsic::amdgcn_global_load_monitor_b128;
653+
break;
654+
case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b32:
655+
IID = Intrinsic::amdgcn_flat_load_monitor_b32;
656+
break;
657+
case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b64:
658+
IID = Intrinsic::amdgcn_flat_load_monitor_b64;
659+
break;
660+
case AMDGPU::BI__builtin_amdgcn_flat_load_monitor_b128:
661+
IID = Intrinsic::amdgcn_flat_load_monitor_b128;
662+
break;
663+
}
664+
665+
// The builtin call's result type is the single overloaded type used to
// instantiate the intrinsic declaration.
llvm::Type *LoadTy = ConvertType(E->getType());
666+
// Argument 0 is emitted as the address operand and argument 1 is forwarded
// unchanged as the second intrinsic operand.
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
667+
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
668+
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
669+
return Builder.CreateCall(F, {Addr, Val});
670+
}
636671
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
637672
// Should this have asan instrumentation?
638673
return emitBuiltinWithOneOverloadedType<5>(*this, E,
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
4+
5+
typedef int v2i __attribute__((ext_vector_type(2)));
6+
typedef int v4i __attribute__((ext_vector_type(4)));
7+
8+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_monitor_b32(
9+
// CHECK-GFX1250-NEXT: entry:
10+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) [[INPTR:%.*]], i32 1)
11+
// CHECK-GFX1250-NEXT: ret i32 [[TMP0]]
12+
//
13+
int test_amdgcn_global_load_monitor_b32(global int* inptr)
14+
{
15+
return __builtin_amdgcn_global_load_monitor_b32(inptr, 1);
16+
}
17+
18+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_monitor_b64(
19+
// CHECK-GFX1250-NEXT: entry:
20+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]], i32 10)
21+
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
22+
//
23+
v2i test_amdgcn_global_load_monitor_b64(global v2i* inptr)
24+
{
25+
return __builtin_amdgcn_global_load_monitor_b64(inptr, 10);
26+
}
27+
28+
// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_monitor_b128(
29+
// CHECK-GFX1250-NEXT: entry:
30+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1) [[INPTR:%.*]], i32 22)
31+
// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]]
32+
//
33+
v4i test_amdgcn_global_load_monitor_b128(global v4i* inptr)
34+
{
35+
return __builtin_amdgcn_global_load_monitor_b128(inptr, 22);
36+
}
37+
38+
// CHECK-GFX1250-LABEL: @test_amdgcn_flat_load_monitor_b32(
39+
// CHECK-GFX1250-NEXT: entry:
40+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.flat.load.monitor.b32.i32(ptr [[INPTR:%.*]], i32 27)
41+
// CHECK-GFX1250-NEXT: ret i32 [[TMP0]]
42+
//
43+
int test_amdgcn_flat_load_monitor_b32(int* inptr)
44+
{
45+
return __builtin_amdgcn_flat_load_monitor_b32(inptr, 27);
46+
}
47+
48+
// CHECK-GFX1250-LABEL: @test_amdgcn_flat_load_monitor_b64(
49+
// CHECK-GFX1250-NEXT: entry:
50+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.flat.load.monitor.b64.v2i32(ptr [[INPTR:%.*]], i32 1)
51+
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
52+
//
53+
v2i test_amdgcn_flat_load_monitor_b64(v2i* inptr)
54+
{
55+
return __builtin_amdgcn_flat_load_monitor_b64(inptr, 1);
56+
}
57+
58+
// CHECK-GFX1250-LABEL: @test_amdgcn_flat_load_monitor_b128(
59+
// CHECK-GFX1250-NEXT: entry:
60+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr [[INPTR:%.*]], i32 0)
61+
// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]]
62+
//
63+
v4i test_amdgcn_flat_load_monitor_b128(v4i* inptr)
64+
{
65+
return __builtin_amdgcn_flat_load_monitor_b128(inptr, 0);
66+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// REQUIRES: amdgpu-registered-target
22
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
33

4+
typedef int v2i __attribute__((ext_vector_type(2)));
45
typedef int v4i __attribute__((ext_vector_type(4)));
56
typedef int v8i __attribute__((ext_vector_type(8)));
67

@@ -28,6 +29,17 @@ void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
2829
__builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
2930
}
3031

32+
void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global v4i* b128gaddr, int *b32faddr, v2i* b64faddr, v4i *b128faddr,
33+
global int* b32out, global v2i* b64out, global v4i* b128out, int cpol)
34+
{
35+
*b32out = __builtin_amdgcn_global_load_monitor_b32(b32gaddr, cpol); // expected-error {{'__builtin_amdgcn_global_load_monitor_b32' must be a constant integer}}
36+
*b64out = __builtin_amdgcn_global_load_monitor_b64(b64gaddr, cpol); // expected-error {{'__builtin_amdgcn_global_load_monitor_b64' must be a constant integer}}
37+
*b128out = __builtin_amdgcn_global_load_monitor_b128(b128gaddr, cpol); // expected-error {{'__builtin_amdgcn_global_load_monitor_b128' must be a constant integer}}
38+
*b32out = __builtin_amdgcn_flat_load_monitor_b32(b32faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b32' must be a constant integer}}
39+
*b64out = __builtin_amdgcn_flat_load_monitor_b64(b64faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b64' must be a constant integer}}
40+
*b128out = __builtin_amdgcn_flat_load_monitor_b128(b128faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b128' must be a constant integer}}
41+
}
42+
3143
void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
3244
{
3345
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
def flat_ptr_ty : LLVMQualPointerType<0>;
1314
def global_ptr_ty : LLVMQualPointerType<1>;
1415
def local_ptr_ty : LLVMQualPointerType<3>;
1516

@@ -3846,6 +3847,26 @@ def int_amdgcn_tensor_load_to_lds_d2 :
38463847
def int_amdgcn_tensor_store_from_lds_d2 :
38473848
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
38483849

3850+
// Common shape of the load-monitor intrinsics, parameterized on the pointer
// type (and therefore the address space) of the address operand.
//   result    : overloaded (llvm_any_ty)
//   operand 0 : pointer of type ptr_ty; marked read-only and not captured
//   operand 1 : i32 cache policy; must be an immediate (ImmArg)
// The intrinsic only reads memory reachable through its arguments and is
// marked convergent. SDNPMemOperand is set as the SelectionDAG node property.
class AMDGPULoadMonitor<LLVMType ptr_ty>:
3851+
Intrinsic<
3852+
[llvm_any_ty],
3853+
[ptr_ty,
3854+
llvm_i32_ty], // gfx12+ cachepolicy:
3855+
// bits [0-2] = th
3856+
// bits [3-4] = scope
3857+
[IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
3858+
IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
3859+
"",
3860+
[SDNPMemOperand]
3861+
>;
3862+
3863+
// Load-monitor intrinsics: b32/b64/b128 variants taking a flat pointer,
// followed by the same three variants taking a global pointer. All six share
// the AMDGPULoadMonitor signature and differ only in name and pointer type.
def int_amdgcn_flat_load_monitor_b32 : AMDGPULoadMonitor<flat_ptr_ty>;
3864+
def int_amdgcn_flat_load_monitor_b64 : AMDGPULoadMonitor<flat_ptr_ty>;
3865+
def int_amdgcn_flat_load_monitor_b128 : AMDGPULoadMonitor<flat_ptr_ty>;
3866+
def int_amdgcn_global_load_monitor_b32 : AMDGPULoadMonitor<global_ptr_ty>;
3867+
def int_amdgcn_global_load_monitor_b64 : AMDGPULoadMonitor<global_ptr_ty>;
3868+
def int_amdgcn_global_load_monitor_b128 : AMDGPULoadMonitor<global_ptr_ty>;
3869+
38493870
/// Emit an addrspacecast without null pointer checking.
38503871
/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
38513872
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,9 @@ def gi_global_offset :
137137
def gi_global_saddr :
138138
GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
139139
GIComplexPatternEquiv<GlobalSAddr>;
140+
def gi_global_saddr_cpol :
141+
GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
142+
GIComplexPatternEquiv<GlobalSAddrCPol>;
140143
def gi_global_saddr_glc :
141144
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
142145
GIComplexPatternEquiv<GlobalSAddrGLC>;

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2020,6 +2020,22 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
20202020
return true;
20212021
}
20222022

2023+
// Variant of SelectGlobalSAddr that takes the cache policy from the node being
// selected instead of starting from zero.
//
// Address matching is delegated to the SelectGlobalSAddr overload that reports
// ScaleOffset; if that fails, this returns false and CPol is not written.
// On success CPol is a target constant made of the node's last operand with
// the SCAL bit cleared, plus SCAL when ScaleOffset is set.
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2024+
SDValue &SAddr, SDValue &VOffset,
2025+
SDValue &Offset,
2026+
SDValue &CPol) const {
2027+
bool ScaleOffset;
2028+
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2029+
return false;
2030+
2031+
// We are assuming CPol is always the last operand of the intrinsic.
// Any SCAL bit supplied by the caller is masked off here; SCAL is decided
// solely by the ScaleOffset result of the address match.
2032+
auto PassedCPol =
2033+
N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2034+
CPol = CurDAG->getTargetConstant(
2035+
(ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2036+
return true;
2037+
}
2038+
20232039
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
20242040
SDValue &SAddr, SDValue &VOffset,
20252041
SDValue &Offset,

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
168168
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
169169
SDValue &VOffset, SDValue &Offset,
170170
SDValue &CPol) const;
171+
bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
172+
SDValue &VOffset, SDValue &Offset,
173+
SDValue &CPol) const;
171174
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
172175
SDValue &VOffset, SDValue &Offset,
173176
SDValue &CPol) const;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5773,6 +5773,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
57735773
return selectGlobalSAddr(Root, 0);
57745774
}
57755775

5776+
// GlobalISel counterpart of the cache-policy-aware global SADDR matcher.
// Reads the immediate in the last operand of the instruction that owns Root,
// clears its SCAL bit, and forwards the remaining bits as the second argument
// of selectGlobalSAddr(Root, ...), whose result is returned unchanged.
// NOTE(review): presumably the two-argument selectGlobalSAddr sets SCAL itself
// when the offset is scaled -- that overload is not visible here; confirm.
InstructionSelector::ComplexRendererFns
5777+
AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5778+
const MachineInstr &I = *Root.getParent();
5779+
5780+
// We are assuming CPol is always the last operand of the intrinsic.
// getImm() is used directly, so that operand is expected to be an immediate.
5781+
auto PassedCPol =
5782+
I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5783+
return selectGlobalSAddr(Root, PassedCPol);
5784+
}
5785+
57765786
InstructionSelector::ComplexRendererFns
57775787
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
57785788
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
261261
InstructionSelector::ComplexRendererFns
262262
selectGlobalSAddr(MachineOperand &Root) const;
263263
InstructionSelector::ComplexRendererFns
264+
selectGlobalSAddrCPol(MachineOperand &Root) const;
265+
InstructionSelector::ComplexRendererFns
264266
selectGlobalSAddrGLC(MachineOperand &Root) const;
265267

266268
InstructionSelector::ComplexRendererFns

0 commit comments

Comments
 (0)