Skip to content

Commit 7b800a1

Browse files
authored
Add builtin/intrinsic global_(load|store)_b128 (llvm#4455) (#1004)
2 parents aaa1220 + 144d547 commit 7b800a1

23 files changed

+3856
-3
lines changed

clang/docs/LanguageExtensions.rst

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5324,6 +5324,53 @@ returns the bit at the position of the current lane. It is almost equivalent to
53245324
``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if
53255325
the given mask has the same value for all active lanes of the current wave.
53265326
5327+
5328+
__builtin_amdgcn_global_load_b128 and __builtin_amdgcn_global_store_b128
5329+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
5330+
5331+
Signature:
5332+
5333+
.. code-block:: c
5334+
5335+
typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u;
5336+
typedef v4u __attribute__((address_space(1))) *global_ptr_to_v4u;
5337+
5338+
v4u __builtin_amdgcn_global_load_b128(
5339+
v4u __attribute__((address_space(1))) *src,
5340+
const char *scope);
5341+
5342+
void __builtin_amdgcn_global_store_b128(
5343+
v4u __attribute__((address_space(1))) *dst,
5344+
v4u data,
5345+
const char *scope);
5346+
5347+
Load or store a vector of 4 unsigned integers from or to global memory with
5348+
cache behavior specified by ``scope``, which must be a string literal.
5349+
5350+
Valid values for ``scope`` are:
5351+
5352+
===================== ==========================================================
5353+
scope architecture name
5354+
===================== ==========================================================
5355+
``"wavefront"`` wave
5356+
5357+
``"workgroup"`` group
5358+
5359+
``"agent"`` device
5360+
5361+
``""`` (empty string) system
5362+
===================== ==========================================================
5363+
5364+
These builtins are only supported on gfx942 and gfx950 devices.
5365+
5366+
For semantics on gfx942, see Tables 47 and 48 in section 9.1.10 "Memory Scope
5367+
and Temporal Controls" of the "AMD Instinct MI300" Instruction Set Architecture
5368+
Reference.
5369+
5370+
For semantics on gfx950, see Tables 49 and 50 in section 9.1.10 "Memory Scope
5371+
and Temporal Controls" of the CDNA4 Instruction Set Architecture Reference.
5372+
5373+
53275374
ARM/AArch64 Language Extensions
53285375
-------------------------------
53295376

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,9 @@ TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64, "ddQbiiIi", "t",
190190
TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts")
191191
TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi", "t", "vmem-to-lds-load-insts")
192192

193+
TARGET_BUILTIN(__builtin_amdgcn_global_load_b128, "V4UiV4Ui*1cC*", "n", "gfx940-insts")
194+
TARGET_BUILTIN(__builtin_amdgcn_global_store_b128, "vV4Ui*1V4UicC*", "n", "gfx940-insts")
195+
193196
//===----------------------------------------------------------------------===//
194197
// Ballot builtins.
195198
//===----------------------------------------------------------------------===//

clang/include/clang/Sema/SemaAMDGPU.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ class SemaAMDGPU : public SemaBase {
3232

3333
bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore);
3434

35+
bool checkScopedMemAccessFunctionCall(CallExpr *TheCall);
36+
3537
bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
3638
unsigned NumDataArgs);
3739

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,26 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
830830
llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
831831
return Builder.CreateCall(F, {Args});
832832
}
833+
case AMDGPU::BI__builtin_amdgcn_global_load_b128:
834+
case AMDGPU::BI__builtin_amdgcn_global_store_b128: {
835+
const bool IsStore =
836+
BuiltinID == AMDGPU::BI__builtin_amdgcn_global_store_b128;
837+
LLVMContext &Ctx = CGM.getLLVMContext();
838+
SmallVector<Value *, 5> Args = {EmitScalarExpr(E->getArg(0))}; // addr
839+
if (IsStore)
840+
Args.push_back(EmitScalarExpr(E->getArg(1))); // data
841+
const unsigned ScopeIdx = E->getNumArgs() - 1;
842+
StringRef ScopeLit =
843+
cast<StringLiteral>(E->getArg(ScopeIdx)->IgnoreParenCasts())
844+
->getString();
845+
llvm::MDNode *MD =
846+
llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, ScopeLit)});
847+
Args.push_back(llvm::MetadataAsValue::get(Ctx, MD)); // scope
848+
llvm::Function *F =
849+
CGM.getIntrinsic(IsStore ? Intrinsic::amdgcn_global_store_b128
850+
: Intrinsic::amdgcn_global_load_b128);
851+
return Builder.CreateCall(F, Args);
852+
}
833853
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
834854
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
835855
{llvm::Type::getInt64Ty(getLLVMContext())});

clang/lib/Sema/SemaAMDGPU.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,9 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
112112
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
113113
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
114114
return checkCoopAtomicFunctionCall(TheCall, /*IsStore=*/true);
115+
case AMDGPU::BI__builtin_amdgcn_global_load_b128:
116+
case AMDGPU::BI__builtin_amdgcn_global_store_b128:
117+
return checkScopedMemAccessFunctionCall(TheCall);
115118
default:
116119
return false;
117120
}
@@ -201,6 +204,19 @@ bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
201204
return Fail;
202205
}
203206

207+
// Semantic check for builtin calls whose final argument is a scope string.
// Verifies that the last argument of TheCall is a string literal; if it is
// not, emits err_expr_not_string_literal and returns true. Returns false when
// the argument is a string literal. Only the form of the argument is checked
// here; the contents of the string are not inspected.
bool SemaAMDGPU::checkScopedMemAccessFunctionCall(CallExpr *TheCall) {
208+
bool Fail = false;
209+
// The scope is always the last argument, whatever the total argument count.
210+
Expr *Arg = TheCall->getArg(TheCall->getNumArgs() - 1);
211+
// Look through parentheses and casts before testing for a string literal.
auto Scope = dyn_cast<StringLiteral>(Arg->IgnoreParenCasts());
212+
if (!Scope) {
213+
Fail = true;
214+
// Reported at the start of the call, with the offending argument's source
// range attached to the diagnostic.
Diag(TheCall->getBeginLoc(), diag::err_expr_not_string_literal)
215+
<< Arg->getSourceRange();
216+
}
217+
return Fail;
218+
}
219+
204220
bool SemaAMDGPU::checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
205221
unsigned NumDataArgs) {
206222
assert(NumDataArgs <= 2);
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals smart
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s -check-prefixes=GFX,GFX950
4+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-4-generic -emit-llvm -o - %s | FileCheck %s -check-prefixes=GFX,GFX9_4_GENERIC
5+
6+
typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
7+
typedef v4u32 __global *global_ptr_to_v4u32;
8+
9+
//------------------------------------------------------------------------------
10+
// Store
11+
//------------------------------------------------------------------------------
12+
// GFX-LABEL: @test_amdgcn_global_store_b128_00(
13+
// GFX-NEXT: entry:
14+
// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META4:![0-9]+]])
15+
// GFX-NEXT: ret void
16+
//
17+
void test_amdgcn_global_store_b128_00(global_ptr_to_v4u32 ptr, v4u32 data) {
18+
__builtin_amdgcn_global_store_b128(ptr, data, "wavefront");
19+
}
20+
21+
// GFX-LABEL: @test_amdgcn_global_store_b128_01(
22+
// GFX-NEXT: entry:
23+
// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META5:![0-9]+]])
24+
// GFX-NEXT: ret void
25+
//
26+
void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data) {
27+
__builtin_amdgcn_global_store_b128(ptr, data, "workgroup");
28+
}
29+
30+
// GFX-LABEL: @test_amdgcn_global_store_b128_10(
31+
// GFX-NEXT: entry:
32+
// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META6:![0-9]+]])
33+
// GFX-NEXT: ret void
34+
//
35+
void test_amdgcn_global_store_b128_10(global_ptr_to_v4u32 ptr, v4u32 data) {
36+
__builtin_amdgcn_global_store_b128(ptr, data, "agent");
37+
}
38+
39+
// GFX-LABEL: @test_amdgcn_global_store_b128_11(
40+
// GFX-NEXT: entry:
41+
// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7:![0-9]+]])
42+
// GFX-NEXT: ret void
43+
//
44+
void test_amdgcn_global_store_b128_11(global_ptr_to_v4u32 ptr, v4u32 data) {
45+
__builtin_amdgcn_global_store_b128(ptr, data, "");
46+
}
47+
48+
//------------------------------------------------------------------------------
49+
// Load
50+
//------------------------------------------------------------------------------
51+
// GFX-LABEL: @test_amdgcn_global_load_b128_00(
52+
// GFX-NEXT: entry:
53+
// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META4]])
54+
// GFX-NEXT: ret <4 x i32> [[TMP0]]
55+
//
56+
v4u32 test_amdgcn_global_load_b128_00(global_ptr_to_v4u32 ptr) {
57+
return __builtin_amdgcn_global_load_b128(ptr, "wavefront");
58+
}
59+
60+
// GFX-LABEL: @test_amdgcn_global_load_b128_01(
61+
// GFX-NEXT: entry:
62+
// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META5]])
63+
// GFX-NEXT: ret <4 x i32> [[TMP0]]
64+
//
65+
v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr) {
66+
return __builtin_amdgcn_global_load_b128(ptr, "workgroup");
67+
}
68+
69+
// GFX-LABEL: @test_amdgcn_global_load_b128_10(
70+
// GFX-NEXT: entry:
71+
// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META6]])
72+
// GFX-NEXT: ret <4 x i32> [[TMP0]]
73+
//
74+
v4u32 test_amdgcn_global_load_b128_10(global_ptr_to_v4u32 ptr) {
75+
return __builtin_amdgcn_global_load_b128(ptr, "agent");
76+
}
77+
78+
// GFX-LABEL: @test_amdgcn_global_load_b128_11(
79+
// GFX-NEXT: entry:
80+
// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META7]])
81+
// GFX-NEXT: ret <4 x i32> [[TMP0]]
82+
//
83+
v4u32 test_amdgcn_global_load_b128_11(global_ptr_to_v4u32 ptr) {
84+
return __builtin_amdgcn_global_load_b128(ptr, "");
85+
}
86+
//.
87+
// GFX950: [[META4]] = !{!"wavefront"}
88+
// GFX950: [[META5]] = !{!"workgroup"}
89+
// GFX950: [[META6]] = !{!"agent"}
90+
// GFX950: [[META7]] = !{!""}
91+
//.
92+
// GFX9_4_GENERIC: [[META4]] = !{!"wavefront"}
93+
// GFX9_4_GENERIC: [[META5]] = !{!"workgroup"}
94+
// GFX9_4_GENERIC: [[META6]] = !{!"agent"}
95+
// GFX9_4_GENERIC: [[META7]] = !{!""}
96+
//.
97+
//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
98+
// GFX950: {{.*}}
99+
// GFX9_4_GENERIC: {{.*}}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify -o - %s
2+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-4-generic -S -verify -o - %s
3+
// REQUIRES: amdgpu-registered-target
4+
5+
typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
6+
typedef v4u32 __global *global_ptr_to_v4u32;
7+
8+
void test_amdgcn_global_store_b128_00(v4u32 *ptr, v4u32 data, const char* scope) {
9+
__builtin_amdgcn_global_store_b128(ptr, data, ""); //expected-error{{passing '__private v4u32 *__private' to parameter of type '__attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int __global *' changes address space of pointer}}
10+
}
11+
12+
void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data, const char* scope) {
13+
__builtin_amdgcn_global_store_b128(ptr, data, scope); //expected-error{{expression is not a string literal}}
14+
}
15+
16+
v4u32 test_amdgcn_global_load_b128_00(v4u32 *ptr, const char* scope) {
17+
return __builtin_amdgcn_global_load_b128(ptr, ""); //expected-error{{passing '__private v4u32 *__private' to parameter of type '__attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int __global *' changes address space of pointer}}
18+
}
19+
20+
v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr, const char* scope) {
21+
return __builtin_amdgcn_global_load_b128(ptr, scope); //expected-error{{expression is not a string literal}}
22+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// We test loads and stores separately because clang appears to stop after
2+
// the first 'target feature' error.
3+
4+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-generic -DTEST_LOAD -S -verify -o - %s
5+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-1-generic -DTEST_LOAD -S -verify -o - %s
6+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-3-generic -DTEST_LOAD -S -verify -o - %s
7+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx11-generic -DTEST_LOAD -S -verify -o - %s
8+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx12-generic -DTEST_LOAD -S -verify -o - %s
9+
10+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-generic -DTEST_STORE -S -verify -o - %s
11+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-1-generic -DTEST_STORE -S -verify -o - %s
12+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx10-3-generic -DTEST_STORE -S -verify -o - %s
13+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx11-generic -DTEST_STORE -S -verify -o - %s
14+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx12-generic -DTEST_STORE -S -verify -o - %s
15+
// REQUIRES: amdgpu-registered-target
16+
17+
typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32;
18+
typedef v4u32 __global *global_ptr_to_v4u32;
19+
20+
#ifdef TEST_LOAD
21+
v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr, const char* scope) {
22+
return __builtin_amdgcn_global_load_b128(ptr, ""); // expected-error{{'__builtin_amdgcn_global_load_b128' needs target feature gfx940-insts}}
23+
}
24+
#endif
25+
26+
#ifdef TEST_STORE
27+
void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data, const char* scope) {
28+
__builtin_amdgcn_global_store_b128(ptr, data, ""); // expected-error{{'__builtin_amdgcn_global_store_b128' needs target feature gfx940-insts}}
29+
}
30+
#endif

llvm/docs/AMDGPUUsage.rst

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,6 +1513,86 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
15131513
* 1 - Data cache.
15141514

15151515
Instruction cache prefetches are unsafe on invalid address.
1516+
1517+
llvm.amdgcn.global.load.b128 This intrinsic is supported on gfx942, gfx950.
1518+
1519+
Signature:
1520+
1521+
.. code-block:: llvm
1522+
1523+
<4 x i32> @llvm.amdgcn.global.load.b128(
1524+
ptr addrspace(1), ; source
1525+
metadata) ; scope - e.g. '!0' where '!0 = !{!"wavefront"}'
1526+
1527+
Reads the value from the source address with cache behavior
1528+
specified by the scope.
1529+
1530+
For gfx942 and gfx950 devices, this emits a
1531+
``global_load_dwordx4`` instruction with the appropriate
1532+
``SC0`` and ``SC1`` bits set.
1533+
1534+
Valid values for scope are
1535+
1536+
===================== =============================================================
1537+
scope architecture name
1538+
===================== =============================================================
1539+
``"wavefront"`` wave
1540+
1541+
``"workgroup"`` group
1542+
1543+
``"agent"`` device
1544+
1545+
``""`` (empty string) system
1546+
===================== =============================================================
1547+
1548+
For semantics on gfx942, see Table 47 in section 9.1.10
1549+
"Memory Scope and Temporal Controls" of the "AMD Instinct
1550+
MI300" Instruction Set Architecture Reference.
1551+
1552+
For semantics on gfx950, see Table 49 in section 9.1.10
1553+
"Memory Scope and Temporal Controls" of the CDNA4
1554+
Instruction Set Architecture Reference.
1555+
1556+
llvm.amdgcn.global.store.b128 This intrinsic is supported on gfx942, gfx950.
1557+
1558+
Signature:
1559+
1560+
.. code-block:: llvm
1561+
1562+
void @llvm.amdgcn.global.store.b128(
1563+
ptr addrspace(1), ; destination
1564+
<4 x i32>, ; value
1565+
metadata) ; scope - e.g. '!0' where '!0 = !{!"wavefront"}'
1566+
1567+
Writes the value to the destination address with cache
1568+
behavior specified by the scope.
1569+
1570+
For gfx942 and gfx950 devices, this emits a
1571+
``global_store_dwordx4`` instruction with the appropriate
1572+
``SC0`` and ``SC1`` bits set.
1573+
1574+
Valid values for scope are
1575+
1576+
===================== =============================================================
1577+
scope architecture name
1578+
===================== =============================================================
1579+
``"wavefront"`` wave
1580+
1581+
``"workgroup"`` group
1582+
1583+
``"agent"`` device
1584+
1585+
``""`` (empty string) system
1586+
===================== =============================================================
1587+
1588+
For semantics on gfx942, see Table 48 in section 9.1.10
1589+
"Memory Scope and Temporal Controls" of the "AMD Instinct
1590+
MI300" Instruction Set Architecture Reference.
1591+
1592+
For semantics on gfx950, see Table 50 in section 9.1.10
1593+
"Memory Scope and Temporal Controls" of the CDNA4
1594+
Instruction Set Architecture Reference.
1595+
15161596
============================================== ==========================================================
15171597

15181598
.. TODO::

0 commit comments

Comments
 (0)