Skip to content

Commit b52a04c

Browse files
[SPIRV] GPU intrinsics
1 parent 09a36c8 commit b52a04c

File tree

18 files changed

+2272
-1
lines changed

18 files changed

+2272
-1
lines changed

clang/include/clang/Basic/Builtins.td

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4770,6 +4770,35 @@ def GetDeviceSideMangledName : LangBuiltin<"CUDA_LANG"> {
47704770
let Prototype = "char const*(...)";
47714771
}
47724772

4773+
// GPU intrinsics
4774+
class GPUBuiltin<string prototype> : Builtin {
4775+
let Spellings = ["__builtin_" # NAME];
4776+
let Prototype = prototype;
4777+
let Attributes = [NoThrow];
4778+
}
4779+
4780+
multiclass GPUGridBuiltin<string prototype> {
4781+
def _x : GPUBuiltin<prototype>;
4782+
def _y : GPUBuiltin<prototype>;
4783+
def _z : GPUBuiltin<prototype>;
4784+
}
4785+
4786+
defm gpu_num_blocks : GPUGridBuiltin<"uint32_t()">;
4787+
defm gpu_block_id : GPUGridBuiltin<"uint32_t()">;
4788+
defm gpu_num_threads : GPUGridBuiltin<"uint32_t()">;
4789+
defm gpu_thread_id : GPUGridBuiltin<"uint32_t()">;
4790+
4791+
def gpu_ballot : GPUBuiltin<"uint64_t(uint64_t, bool)">;
4792+
def gpu_exit : GPUBuiltin<"void()">;
4793+
def gpu_lane_id : GPUBuiltin<"uint32_t()">;
4794+
def gpu_lane_mask : GPUBuiltin<"uint64_t()">;
4795+
def gpu_num_lanes : GPUBuiltin<"uint32_t()">;
4796+
def gpu_read_first_lane_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t)">;
4797+
def gpu_shuffle_idx_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t, uint32_t, uint32_t)">;
4798+
def gpu_sync_lane : GPUBuiltin<"void(uint64_t)">;
4799+
def gpu_sync_threads : GPUBuiltin<"void()">;
4800+
def gpu_thread_suspend : GPUBuiltin<"void()">;
4801+
47734802
// HLSL
47744803
def HLSLAddUint64: LangBuiltin<"HLSL_LANG"> {
47754804
let Spellings = ["__builtin_hlsl_adduint64"];

clang/lib/Headers/amdgpuintrin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
1+
//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.

clang/lib/Headers/gpuintrin.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
6060
#include <nvptxintrin.h>
6161
#elif defined(__AMDGPU__)
6262
#include <amdgpuintrin.h>
63+
#elif defined(__SPIRV64__)
64+
#include <spirvintrin.h>
6365
#elif !defined(_OPENMP)
6466
#error "This header is only meant to be used on GPU architectures."
6567
#endif

clang/lib/Headers/spirvintrin.h

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
//===-- spirvintrin.h - SPIRV intrinsic functions ------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef __SPIRVINTRIN_H
10+
#define __SPIRVINTRIN_H
11+
12+
#ifndef __SPIRV64__
13+
// 32 bit SPIRV is currently a stretch goal
14+
#error "This file is intended for SPIRV64 targets or offloading to SPIRV64"
15+
#endif
16+
17+
#ifndef __GPUINTRIN_H
18+
#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
19+
#endif
20+
21+
// This is the skeleton of the spirv implementation for gpuintrin
22+
// Address spaces and kernel attribute are not yet implemented
23+
24+
#if defined(_OPENMP)
25+
#error "OpenMP is not yet available on SPIRV through the gpuintrin header"
26+
#endif
27+
28+
// Type aliases to the address spaces used by the SPIRV backend.
29+
#define __gpu_private
30+
#define __gpu_constant
31+
#define __gpu_local
32+
#define __gpu_global
33+
#define __gpu_generic
34+
35+
// Attribute to declare a function as a kernel.
36+
#define __gpu_kernel
37+
38+
// Note, because the builtin_gpu intrinsics lower to amdgcn or nvptx on request
39+
// the following implementations of these functions would work equally well
40+
// in the amdgcnintrin.h or nvptxintrin.h headers, i.e. we could move this
41+
// definition of __gpu_num_blocks_x et al into gpuintrin.h and remove them
42+
// from the three target intrin.h headers.
43+
44+
// Returns the number of workgroups in the 'x' dimension of the grid.
45+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
46+
return __builtin_gpu_num_blocks_x();
47+
}
48+
49+
// Returns the number of workgroups in the 'y' dimension of the grid.
50+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
51+
return __builtin_gpu_num_blocks_y();
52+
}
53+
54+
// Returns the number of workgroups in the 'z' dimension of the grid.
55+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
56+
return __builtin_gpu_num_blocks_z();
57+
}
58+
59+
// Returns the 'x' dimension of the current workgroup's id.
60+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
61+
return __builtin_gpu_block_id_x();
62+
}
63+
64+
// Returns the 'y' dimension of the current workgroup's id.
65+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
66+
return __builtin_gpu_block_id_y();
67+
}
68+
69+
// Returns the 'z' dimension of the current workgroup's id.
70+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
71+
return __builtin_gpu_block_id_z();
72+
}
73+
74+
// Returns the number of workitems in the 'x' dimension.
75+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
76+
return __builtin_gpu_num_threads_x();
77+
}
78+
79+
// Returns the number of workitems in the 'y' dimension.
80+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
81+
return __builtin_gpu_num_threads_y();
82+
}
83+
84+
// Returns the number of workitems in the 'z' dimension.
85+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
86+
return __builtin_gpu_num_threads_z();
87+
}
88+
89+
// Returns the 'x' dimension id of the workitem in the current workgroup.
90+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
91+
return __builtin_gpu_thread_id_x();
92+
}
93+
94+
// Returns the 'y' dimension id of the workitem in the current workgroup.
95+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
96+
return __builtin_gpu_thread_id_y();
97+
}
98+
99+
// Returns the 'z' dimension id of the workitem in the current workgroup.
100+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
101+
return __builtin_gpu_thread_id_z();
102+
}
103+
104+
// Returns the size of the wave.
105+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
106+
return __builtin_gpu_num_lanes();
107+
}
108+
109+
// Returns the id of the thread inside of a wave executing together.
110+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
111+
return __builtin_gpu_lane_id();
112+
}
113+
114+
// Returns the bit-mask of active threads in the current wave.
115+
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
116+
return __builtin_gpu_lane_mask();
117+
}
118+
119+
// Copies the value from the first active thread in the wave to the rest.
120+
_DEFAULT_FN_ATTRS static __inline__ uint32_t
121+
__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
122+
return __builtin_gpu_read_first_lane_u32(__lane_mask, __x);
123+
}
124+
125+
// Returns a bitmask of threads in the current lane for which \p x is true.
126+
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
127+
bool __x) {
128+
return __builtin_gpu_ballot(__lane_mask, __x);
129+
}
130+
131+
// Waits for all the threads in the block to converge and issues a fence.
132+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
133+
return __builtin_gpu_sync_threads();
134+
}
135+
136+
// Wait for all threads in the wave to converge
137+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
138+
return __builtin_gpu_sync_lane(__lane_mask);
139+
}
140+
141+
// Shuffles the lanes inside the wave according to the given index.
142+
_DEFAULT_FN_ATTRS static __inline__ uint32_t
143+
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
144+
uint32_t __width) {
145+
return __builtin_gpu_shuffle_idx_u32(__lane_mask, __idx, __x, __width);
146+
}
147+
148+
// Returns a bitmask marking all lanes that have the same value of __x.
149+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
150+
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
151+
return __gpu_match_any_u32_impl(__lane_mask, __x);
152+
}
153+
154+
// Returns a bitmask marking all lanes that have the same value of __x.
155+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
156+
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
157+
return __gpu_match_any_u64_impl(__lane_mask, __x);
158+
}
159+
160+
// Returns the current lane mask if every lane contains __x.
161+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
162+
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
163+
return __gpu_match_all_u32_impl(__lane_mask, __x);
164+
}
165+
166+
// Returns the current lane mask if every lane contains __x.
167+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
168+
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
169+
return __gpu_match_all_u64_impl(__lane_mask, __x);
170+
}
171+
172+
// Terminates execution of the associated wave.
173+
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
174+
return __builtin_gpu_exit();
175+
}
176+
177+
// Suspend the thread briefly to assist the scheduler during busy loops.
178+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
179+
return __builtin_gpu_thread_suspend();
180+
}
181+
182+
#endif // __SPIRVINTRIN_H
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s -o - | FileCheck %s
4+
5+
#include <stdint.h>
6+
7+
// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_x(
8+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
9+
// CHECK-NEXT: [[ENTRY:.*:]]
10+
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.x()
11+
// CHECK-NEXT: ret i32 [[TMP0]]
12+
//
13+
uint32_t workgroup_id_x(void)
14+
{
15+
return __builtin_amdgcn_workgroup_id_x();
16+
}
17+
18+
// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_y(
19+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
20+
// CHECK-NEXT: [[ENTRY:.*:]]
21+
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.y()
22+
// CHECK-NEXT: ret i32 [[TMP0]]
23+
//
24+
uint32_t workgroup_id_y(void)
25+
{
26+
return __builtin_amdgcn_workgroup_id_y();
27+
}
28+
29+
// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_z(
30+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] {
31+
// CHECK-NEXT: [[ENTRY:.*:]]
32+
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.z()
33+
// CHECK-NEXT: ret i32 [[TMP0]]
34+
//
35+
uint32_t workgroup_id_z(void)
36+
{
37+
return __builtin_amdgcn_workgroup_id_z();
38+
}
39+
40+
// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_x(
41+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] {
42+
// CHECK-NEXT: [[ENTRY:.*:]]
43+
// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
44+
// CHECK-NEXT: ret i32 [[TMP0]]
45+
//
46+
uint32_t workitem_id_x(void)
47+
{
48+
return __builtin_amdgcn_workitem_id_x();
49+
}
50+
51+
// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_y(
52+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR5:[0-9]+]] {
53+
// CHECK-NEXT: [[ENTRY:.*:]]
54+
// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
55+
// CHECK-NEXT: ret i32 [[TMP0]]
56+
//
57+
uint32_t workitem_id_y(void)
58+
{
59+
return __builtin_amdgcn_workitem_id_y();
60+
}
61+
62+
// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_z(
63+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR6:[0-9]+]] {
64+
// CHECK-NEXT: [[ENTRY:.*:]]
65+
// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
66+
// CHECK-NEXT: ret i32 [[TMP0]]
67+
//
68+
uint32_t workitem_id_z(void)
69+
{
70+
return __builtin_amdgcn_workitem_id_z();
71+
}
72+
73+
// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_x(
74+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] {
75+
// CHECK-NEXT: [[ENTRY:.*:]]
76+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
77+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12
78+
// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]], !noundef [[META4]]
79+
// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
80+
// CHECK-NEXT: ret i32 [[CONV]]
81+
//
82+
uint32_t workgroup_size_x(void)
83+
{
84+
return __builtin_amdgcn_workgroup_size_x();
85+
}
86+
87+
// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_y(
88+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] {
89+
// CHECK-NEXT: [[ENTRY:.*:]]
90+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
91+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 14
92+
// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
93+
// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
94+
// CHECK-NEXT: ret i32 [[CONV]]
95+
//
96+
uint32_t workgroup_size_y(void)
97+
{
98+
return __builtin_amdgcn_workgroup_size_y();
99+
}
100+
101+
// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_z(
102+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] {
103+
// CHECK-NEXT: [[ENTRY:.*:]]
104+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
105+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16
106+
// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 8, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
107+
// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
108+
// CHECK-NEXT: ret i32 [[CONV]]
109+
//
110+
uint32_t workgroup_size_z(void)
111+
{
112+
return __builtin_amdgcn_workgroup_size_z();
113+
}
114+
115+
// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_x(
116+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] {
117+
// CHECK-NEXT: [[ENTRY:.*:]]
118+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
119+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12
120+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5:![0-9]+]], !invariant.load [[META4]]
121+
// CHECK-NEXT: ret i32 [[TMP2]]
122+
//
123+
uint32_t grid_size_x(void)
124+
{
125+
return __builtin_amdgcn_grid_size_x();
126+
}
127+
128+
// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_y(
129+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] {
130+
// CHECK-NEXT: [[ENTRY:.*:]]
131+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
132+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16
133+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]]
134+
// CHECK-NEXT: ret i32 [[TMP2]]
135+
//
136+
uint32_t grid_size_y(void)
137+
{
138+
return __builtin_amdgcn_grid_size_y();
139+
}
140+
141+
// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_z(
142+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] {
143+
// CHECK-NEXT: [[ENTRY:.*:]]
144+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
145+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 20
146+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]]
147+
// CHECK-NEXT: ret i32 [[TMP2]]
148+
//
149+
uint32_t grid_size_z(void)
150+
{
151+
return __builtin_amdgcn_grid_size_z();
152+
}
153+
154+
//.
155+
// CHECK: [[RNG3]] = !{i16 1, i16 1025}
156+
// CHECK: [[META4]] = !{}
157+
// CHECK: [[RNG5]] = !{i32 1, i32 0}
158+
//.

0 commit comments

Comments
 (0)