Skip to content

Commit 3dd9bde

Browse files
authored
Add builtin/intrinsic global_(load|store)_b128 (llvm#4555)
2 parents eaf53db + dce84f8 commit 3dd9bde

27 files changed

+3883
-9
lines changed

clang/docs/LanguageExtensions.rst

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4901,6 +4901,52 @@ for a concrete target, and shall reflect the latter's identity and features.
49014901
Thus, it is possible to author high-level code, in e.g. HIP, that is target
49024902
adaptive in a dynamic fashion, contrary to macro based mechanisms.
49034903
4904+
__builtin_amdgcn_global_load_b128 and __builtin_amdgcn_global_store_b128
4905+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4906+
4907+
Signature:
4908+
4909+
.. code-block:: c
4910+
4911+
typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u;
4912+
typedef v4u __attribute__((address_space(1))) *global_ptr_to_v4u;
4913+
4914+
v4u __builtin_amdgcn_global_load_b128(
4915+
v4u __attribute__((address_space(1))) *src,
4916+
const char *scope);
4917+
4918+
void __builtin_amdgcn_global_store_b128(
4919+
v4u __attribute__((address_space(1))) *dst,
4920+
v4u data,
4921+
const char *scope);
4922+
4923+
Load or store a vector of 4 unsigned integers from or to global memory with
4924+
cache behavior specified by ``scope``, which must be a string literal.
4925+
4926+
Valid values for ``scope`` are:
4927+
4928+
===================== ==========================================================
4929+
scope architecture name
4930+
===================== ==========================================================
4931+
``"wavefront"`` wave
4932+
4933+
``"workgroup"`` group
4934+
4935+
``"agent"`` device
4936+
4937+
``""`` (empty string) system
4938+
===================== ==========================================================
4939+
4940+
These builtins are only supported on gfx942 and gfx950 devices.
4941+
4942+
For semantics on gfx942, see Tables 47 and 48 in section 9.1.10 "Memory Scope
4943+
and Temporal Controls" of the "AMD Instinct MI300" Instruction Set Architecture
4944+
Reference.
4945+
4946+
For semantics on gfx950, see Tables 49 and 50 in section 9.1.10 "Memory Scope
4947+
and Temporal Controls" of the CDNA4 Instruction Set Architecture Reference.
4948+
4949+
49044950
ARM/AArch64 Language Extensions
49054951
-------------------------------
49064952

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,9 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
165165

166166
TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts")
167167

168+
TARGET_BUILTIN(__builtin_amdgcn_global_load_b128, "V4UiV4Ui*1cC*", "n", "gfx940-insts")
169+
TARGET_BUILTIN(__builtin_amdgcn_global_store_b128, "vV4Ui*1V4UicC*", "n", "gfx940-insts")
170+
168171
//===----------------------------------------------------------------------===//
169172
// Ballot builtins.
170173
//===----------------------------------------------------------------------===//

clang/include/clang/Sema/SemaAMDGPU.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class SemaAMDGPU : public SemaBase {
3030

3131
bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
3232

33+
bool checkScopedMemAccessFunctionCall(CallExpr *TheCall);
34+
3335
bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
3436
unsigned NumDataArgs);
3537

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20116,6 +20116,26 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
2011620116
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
2011720117
return Builder.CreateCall(F, {Addr});
2011820118
}
20119+
case AMDGPU::BI__builtin_amdgcn_global_load_b128:
20120+
case AMDGPU::BI__builtin_amdgcn_global_store_b128: {
20121+
const bool IsStore =
20122+
BuiltinID == AMDGPU::BI__builtin_amdgcn_global_store_b128;
20123+
LLVMContext &Ctx = CGM.getLLVMContext();
20124+
SmallVector<Value *, 5> Args = {EmitScalarExpr(E->getArg(0))}; // addr
20125+
if (IsStore)
20126+
Args.push_back(EmitScalarExpr(E->getArg(1))); // data
20127+
const unsigned ScopeIdx = E->getNumArgs() - 1;
20128+
StringRef ScopeLit =
20129+
cast<StringLiteral>(E->getArg(ScopeIdx)->IgnoreParenCasts())
20130+
->getString();
20131+
llvm::MDNode *MD =
20132+
llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, ScopeLit)});
20133+
Args.push_back(llvm::MetadataAsValue::get(Ctx, MD)); // scope
20134+
llvm::Function *F =
20135+
CGM.getIntrinsic(IsStore ? Intrinsic::amdgcn_global_store_b128
20136+
: Intrinsic::amdgcn_global_load_b128);
20137+
return Builder.CreateCall(F, Args);
20138+
}
2011920139
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
2012020140
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
2012120141
{llvm::Type::getInt64Ty(getLLVMContext())});

clang/lib/Sema/SemaAMDGPU.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
8484
case AMDGPU::BI__builtin_amdgcn_update_dpp: {
8585
return checkMovDPPFunctionCall(TheCall, 6, 2);
8686
}
87+
case AMDGPU::BI__builtin_amdgcn_global_load_b128:
88+
case AMDGPU::BI__builtin_amdgcn_global_store_b128:
89+
return checkScopedMemAccessFunctionCall(TheCall);
8790
default:
8891
return false;
8992
}
@@ -129,6 +132,19 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
129132
return false;
130133
}
131134

135+
/// Check a call to one of the scoped global memory access builtins
/// (__builtin_amdgcn_global_load_b128 / __builtin_amdgcn_global_store_b128).
///
/// The last argument of these builtins is the scope and must be a string
/// literal (parentheses and casts around it are ignored).
///
/// \param TheCall the builtin call expression being checked.
/// \returns true if an error was diagnosed, false if the call is acceptable.
bool SemaAMDGPU::checkScopedMemAccessFunctionCall(CallExpr *TheCall) {
  // The scope is always the trailing argument, for both the load and the
  // store form of the builtin.
  Expr *Arg = TheCall->getArg(TheCall->getNumArgs() - 1);
  if (isa<StringLiteral>(Arg->IgnoreParenCasts()))
    return false;

  // Anchor the diagnostic at the offending argument rather than at the start
  // of the call so the caret points at the expression that must be changed.
  Diag(Arg->getBeginLoc(), diag::err_expr_not_string_literal)
      << Arg->getSourceRange();
  return true;
}
147+
132148
bool SemaAMDGPU::checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
133149
unsigned NumDataArgs) {
134150
assert(NumDataArgs <= 2);

clang/lib/Sema/SemaExpr.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6247,7 +6247,8 @@ static FunctionDecl *rewriteBuiltinFunctionDecl(Sema *Sema, ASTContext &Context,
62476247
return nullptr;
62486248
Expr *Arg = ArgRes.get();
62496249
QualType ArgType = Arg->getType();
6250-
if (!ParamType->isPointerType() || ParamType.hasAddressSpace() ||
6250+
if (!ParamType->isPointerType() ||
6251+
ParamType->getPointeeType().hasAddressSpace() ||
62516252
!ArgType->isPointerType() ||
62526253
!ArgType->getPointeeType().hasAddressSpace() ||
62536254
isPtrSizeAddressSpace(ArgType->getPointeeType().getAddressSpace())) {
@@ -6256,9 +6257,6 @@ static FunctionDecl *rewriteBuiltinFunctionDecl(Sema *Sema, ASTContext &Context,
62566257
}
62576258

62586259
QualType PointeeType = ParamType->getPointeeType();
6259-
if (PointeeType.hasAddressSpace())
6260-
continue;
6261-
62626260
NeedsNewDecl = true;
62636261
LangAS AS = ArgType->getPointeeType().getAddressSpace();
62646262

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals smart
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s -check-prefixes=GFX,GFX950
4+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-4-generic -emit-llvm -o - %s | FileCheck %s -check-prefixes=GFX,GFX9_4_GENERIC
5+
6+
typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
7+
typedef v4u32 __global *global_ptr_to_v4u32;
8+
9+
//------------------------------------------------------------------------------
10+
// Store
11+
//------------------------------------------------------------------------------
12+
// GFX-LABEL: @test_amdgcn_global_store_b128_00(
13+
// GFX-NEXT: entry:
14+
// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META4:![0-9]+]])
15+
// GFX-NEXT: ret void
16+
//
17+
void test_amdgcn_global_store_b128_00(global_ptr_to_v4u32 ptr, v4u32 data) {
18+
__builtin_amdgcn_global_store_b128(ptr, data, "wavefront");
19+
}
20+
21+
// GFX-LABEL: @test_amdgcn_global_store_b128_01(
22+
// GFX-NEXT: entry:
23+
// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META5:![0-9]+]])
24+
// GFX-NEXT: ret void
25+
//
26+
void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data) {
27+
__builtin_amdgcn_global_store_b128(ptr, data, "workgroup");
28+
}
29+
30+
// GFX-LABEL: @test_amdgcn_global_store_b128_10(
31+
// GFX-NEXT: entry:
32+
// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META6:![0-9]+]])
33+
// GFX-NEXT: ret void
34+
//
35+
void test_amdgcn_global_store_b128_10(global_ptr_to_v4u32 ptr, v4u32 data) {
36+
__builtin_amdgcn_global_store_b128(ptr, data, "agent");
37+
}
38+
39+
// GFX-LABEL: @test_amdgcn_global_store_b128_11(
40+
// GFX-NEXT: entry:
41+
// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7:![0-9]+]])
42+
// GFX-NEXT: ret void
43+
//
44+
void test_amdgcn_global_store_b128_11(global_ptr_to_v4u32 ptr, v4u32 data) {
45+
__builtin_amdgcn_global_store_b128(ptr, data, "");
46+
}
47+
48+
//------------------------------------------------------------------------------
49+
// Load
50+
//------------------------------------------------------------------------------
51+
// GFX-LABEL: @test_amdgcn_global_load_b128_00(
52+
// GFX-NEXT: entry:
53+
// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META4]])
54+
// GFX-NEXT: ret <4 x i32> [[TMP0]]
55+
//
56+
v4u32 test_amdgcn_global_load_b128_00(global_ptr_to_v4u32 ptr) {
57+
return __builtin_amdgcn_global_load_b128(ptr, "wavefront");
58+
}
59+
60+
// GFX-LABEL: @test_amdgcn_global_load_b128_01(
61+
// GFX-NEXT: entry:
62+
// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META5]])
63+
// GFX-NEXT: ret <4 x i32> [[TMP0]]
64+
//
65+
v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr) {
66+
return __builtin_amdgcn_global_load_b128(ptr, "workgroup");
67+
}
68+
69+
// GFX-LABEL: @test_amdgcn_global_load_b128_10(
70+
// GFX-NEXT: entry:
71+
// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META6]])
72+
// GFX-NEXT: ret <4 x i32> [[TMP0]]
73+
//
74+
v4u32 test_amdgcn_global_load_b128_10(global_ptr_to_v4u32 ptr) {
75+
return __builtin_amdgcn_global_load_b128(ptr, "agent");
76+
}
77+
78+
// GFX-LABEL: @test_amdgcn_global_load_b128_11(
79+
// GFX-NEXT: entry:
80+
// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META7]])
81+
// GFX-NEXT: ret <4 x i32> [[TMP0]]
82+
//
83+
v4u32 test_amdgcn_global_load_b128_11(global_ptr_to_v4u32 ptr) {
84+
return __builtin_amdgcn_global_load_b128(ptr, "");
85+
}
86+
//.
87+
// GFX950: [[META4]] = !{!"wavefront"}
88+
// GFX950: [[META5]] = !{!"workgroup"}
89+
// GFX950: [[META6]] = !{!"agent"}
90+
// GFX950: [[META7]] = !{!""}
91+
//.
92+
// GFX9_4_GENERIC: [[META4]] = !{!"wavefront"}
93+
// GFX9_4_GENERIC: [[META5]] = !{!"workgroup"}
94+
// GFX9_4_GENERIC: [[META6]] = !{!"agent"}
95+
// GFX9_4_GENERIC: [[META7]] = !{!""}
96+
//.
97+
//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
98+
// GFX950: {{.*}}
99+
// GFX9_4_GENERIC: {{.*}}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify -o - %s
2+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-4-generic -S -verify -o - %s
3+
// REQUIRES: amdgpu-registered-target
4+
5+
typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
6+
typedef v4u32 __global *global_ptr_to_v4u32;
7+
8+
void test_amdgcn_global_store_b128_00(v4u32 *ptr, v4u32 data, const char* scope) {
9+
__builtin_amdgcn_global_store_b128(ptr, data, ""); //expected-error{{passing '__private v4u32 *__private' to parameter of type '__attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int __global *' changes address space of pointer}}
10+
}
11+
12+
void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data, const char* scope) {
13+
__builtin_amdgcn_global_store_b128(ptr, data, scope); //expected-error{{expression is not a string literal}}
14+
}
15+
16+
v4u32 test_amdgcn_global_load_b128_00(v4u32 *ptr, const char* scope) {
17+
return __builtin_amdgcn_global_load_b128(ptr, ""); //expected-error{{passing '__private v4u32 *__private' to parameter of type '__attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int __global *' changes address space of pointer}}
18+
}
19+
20+
v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr, const char* scope) {
21+
return __builtin_amdgcn_global_load_b128(ptr, scope); //expected-error{{expression is not a string literal}}
22+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// We test loads and stores separately because clang appears to stop after
2+
// the first 'target feature' error.
3+
4+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-generic -DTEST_LOAD -S -verify -o - %s
5+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-1-generic -DTEST_LOAD -S -verify -o - %s
6+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-3-generic -DTEST_LOAD -S -verify -o - %s
7+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx11-generic -DTEST_LOAD -S -verify -o - %s
8+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx12-generic -DTEST_LOAD -S -verify -o - %s
9+
10+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-generic -DTEST_STORE -S -verify -o - %s
11+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-1-generic -DTEST_STORE -S -verify -o - %s
12+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-3-generic -DTEST_STORE -S -verify -o - %s
13+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx11-generic -DTEST_STORE -S -verify -o - %s
14+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx12-generic -DTEST_STORE -S -verify -o - %s
15+
// REQUIRES: amdgpu-registered-target
16+
17+
typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
18+
typedef v4u32 __global *global_ptr_to_v4u32;
19+
20+
#ifdef TEST_LOAD
21+
v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr, const char* scope) {
22+
return __builtin_amdgcn_global_load_b128(ptr, ""); // expected-error{{'__builtin_amdgcn_global_load_b128' needs target feature gfx940-insts}}
23+
}
24+
#endif
25+
26+
#ifdef TEST_STORE
27+
void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data, const char* scope) {
28+
__builtin_amdgcn_global_store_b128(ptr, data, ""); // expected-error{{'__builtin_amdgcn_global_store_b128' needs target feature gfx940-insts}}
29+
}
30+
#endif

0 commit comments

Comments
 (0)