Skip to content

Commit b52a04c

Browse files
[SPIRV] GPU intrinsics
1 parent 09a36c8 commit b52a04c

File tree

18 files changed

+2272
-1
lines changed

18 files changed

+2272
-1
lines changed

clang/include/clang/Basic/Builtins.td

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4770,6 +4770,35 @@ def GetDeviceSideMangledName : LangBuiltin<"CUDA_LANG"> {
47704770
let Prototype = "char const*(...)";
47714771
}
47724772

4773+
// GPU intrinsics
4774+
class GPUBuiltin<string prototype> : Builtin {
4775+
let Spellings = ["__builtin_" # NAME];
4776+
let Prototype = prototype;
4777+
let Attributes = [NoThrow];
4778+
}
4779+
4780+
multiclass GPUGridBuiltin<string prototype> {
4781+
def _x : GPUBuiltin<prototype>;
4782+
def _y : GPUBuiltin<prototype>;
4783+
def _z : GPUBuiltin<prototype>;
4784+
}
4785+
4786+
defm gpu_num_blocks : GPUGridBuiltin<"uint32_t()">;
4787+
defm gpu_block_id : GPUGridBuiltin<"uint32_t()">;
4788+
defm gpu_num_threads : GPUGridBuiltin<"uint32_t()">;
4789+
defm gpu_thread_id : GPUGridBuiltin<"uint32_t()">;
4790+
4791+
def gpu_ballot : GPUBuiltin<"uint64_t(uint64_t, bool)">;
4792+
def gpu_exit : GPUBuiltin<"void()">;
4793+
def gpu_lane_id : GPUBuiltin<"uint32_t()">;
4794+
def gpu_lane_mask : GPUBuiltin<"uint64_t()">;
4795+
def gpu_num_lanes : GPUBuiltin<"uint32_t()">;
4796+
def gpu_read_first_lane_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t)">;
4797+
def gpu_shuffle_idx_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t, uint32_t, uint32_t)">;
4798+
def gpu_sync_lane : GPUBuiltin<"void(uint64_t)">;
4799+
def gpu_sync_threads : GPUBuiltin<"void()">;
4800+
def gpu_thread_suspend : GPUBuiltin<"void()">;
4801+
47734802
// HLSL
47744803
def HLSLAddUint64: LangBuiltin<"HLSL_LANG"> {
47754804
let Spellings = ["__builtin_hlsl_adduint64"];

clang/lib/Headers/amdgpuintrin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
1+
//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.

clang/lib/Headers/gpuintrin.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
6060
#include <nvptxintrin.h>
6161
#elif defined(__AMDGPU__)
6262
#include <amdgpuintrin.h>
63+
#elif defined(__SPIRV64__)
64+
#include <spirvintrin.h>
6365
#elif !defined(_OPENMP)
6466
#error "This header is only meant to be used on GPU architectures."
6567
#endif

clang/lib/Headers/spirvintrin.h

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
//===-- spirvintrin.h - SPIRV intrinsic functions ------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef __SPIRVINTRIN_H
10+
#define __SPIRVINTRIN_H
11+
12+
#ifndef __SPIRV64__
13+
// 32 bit SPIRV is currently a stretch goal
14+
#error "This file is intended for SPIRV64 targets or offloading to SPIRV64"
15+
#endif
16+
17+
#ifndef __GPUINTRIN_H
18+
#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
19+
#endif
20+
21+
// This is the skeleton of the spirv implementation for gpuintrin
22+
// Address spaces and kernel attribute are not yet implemented
23+
24+
#if defined(_OPENMP)
25+
#error "OpenMP is not yet available on SPIRV through the gpuintrin header"
26+
#endif
27+
28+
// Type aliases to the address spaces used by the SPIRV backend.
29+
#define __gpu_private
30+
#define __gpu_constant
31+
#define __gpu_local
32+
#define __gpu_global
33+
#define __gpu_generic
34+
35+
// Attribute to declare a function as a kernel.
36+
#define __gpu_kernel
37+
38+
// Note, because the builtin_gpu intrinsics lower to amdgcn or nvptx on request
39+
// the following implementations of these functions would work equally well
40+
// in the amdgcnintrin.h or nvptxintrin.h headers, i.e. we could move this
41+
// definition of __gpu_num_blocks_x et al into gpuintrin.h and remove them
42+
// from the three target intrin.h headers.
43+
44+
// Returns the number of workgroups in the 'x' dimension of the grid.
45+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
46+
return __builtin_gpu_num_blocks_x();
47+
}
48+
49+
// Returns the number of workgroups in the 'y' dimension of the grid.
50+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
51+
return __builtin_gpu_num_blocks_y();
52+
}
53+
54+
// Returns the number of workgroups in the 'z' dimension of the grid.
55+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
56+
return __builtin_gpu_num_blocks_z();
57+
}
58+
59+
// Returns the 'x' dimension of the current workgroup's id.
60+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
61+
return __builtin_gpu_block_id_x();
62+
}
63+
64+
// Returns the 'y' dimension of the current workgroup's id.
65+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
66+
return __builtin_gpu_block_id_y();
67+
}
68+
69+
// Returns the 'z' dimension of the current workgroup's id.
70+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
71+
return __builtin_gpu_block_id_z();
72+
}
73+
74+
// Returns the number of workitems in the 'x' dimension.
75+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
76+
return __builtin_gpu_num_threads_x();
77+
}
78+
79+
// Returns the number of workitems in the 'y' dimension.
80+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
81+
return __builtin_gpu_num_threads_y();
82+
}
83+
84+
// Returns the number of workitems in the 'z' dimension.
85+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
86+
return __builtin_gpu_num_threads_z();
87+
}
88+
89+
// Returns the 'x' dimension id of the workitem in the current workgroup.
90+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
91+
return __builtin_gpu_thread_id_x();
92+
}
93+
94+
// Returns the 'y' dimension id of the workitem in the current workgroup.
95+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
96+
return __builtin_gpu_thread_id_y();
97+
}
98+
99+
// Returns the 'z' dimension id of the workitem in the current workgroup.
100+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
101+
return __builtin_gpu_thread_id_z();
102+
}
103+
104+
// Returns the size of the wave.
105+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
106+
return __builtin_gpu_num_lanes();
107+
}
108+
109+
// Returns the id of the thread inside of a wave executing together.
110+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
111+
return __builtin_gpu_lane_id();
112+
}
113+
114+
// Returns the bit-mask of active threads in the current wave.
115+
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
116+
return __builtin_gpu_lane_mask();
117+
}
118+
119+
// Copies the value from the first active thread in the wave to the rest.
120+
_DEFAULT_FN_ATTRS static __inline__ uint32_t
121+
__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
122+
return __builtin_gpu_read_first_lane_u32(__lane_mask, __x);
123+
}
124+
125+
// Returns a bitmask of threads in the current lane for which \p x is true.
126+
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
127+
bool __x) {
128+
return __builtin_gpu_ballot(__lane_mask, __x);
129+
}
130+
131+
// Waits for all the threads in the block to converge and issues a fence.
132+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
133+
return __builtin_gpu_sync_threads();
134+
}
135+
136+
// Wait for all threads in the wave to converge
137+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
138+
return __builtin_gpu_sync_lane(__lane_mask);
139+
}
140+
141+
// Shuffles the lanes inside the wave according to the given index.
142+
_DEFAULT_FN_ATTRS static __inline__ uint32_t
143+
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
144+
uint32_t __width) {
145+
return __builtin_gpu_shuffle_idx_u32(__lane_mask, __idx, __x, __width);
146+
}
147+
148+
// Returns a bitmask marking all lanes that have the same value of __x.
149+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
150+
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
151+
return __gpu_match_any_u32_impl(__lane_mask, __x);
152+
}
153+
154+
// Returns a bitmask marking all lanes that have the same value of __x.
155+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
156+
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
157+
return __gpu_match_any_u64_impl(__lane_mask, __x);
158+
}
159+
160+
// Returns the current lane mask if every lane contains __x.
161+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
162+
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
163+
return __gpu_match_all_u32_impl(__lane_mask, __x);
164+
}
165+
166+
// Returns the current lane mask if every lane contains __x.
167+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
168+
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
169+
return __gpu_match_all_u64_impl(__lane_mask, __x);
170+
}
171+
172+
// Terminates execution of the associated wave.
173+
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
174+
return __builtin_gpu_exit();
175+
}
176+
177+
// Suspend the thread briefly to assist the scheduler during busy loops.
178+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
179+
return __builtin_gpu_thread_suspend();
180+
}
181+
182+
#endif // __SPIRVINTRIN_H
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s -o - | FileCheck %s
4+
5+
#include <stdint.h>
6+
7+
// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_x(
8+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
9+
// CHECK-NEXT: [[ENTRY:.*:]]
10+
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.x()
11+
// CHECK-NEXT: ret i32 [[TMP0]]
12+
//
13+
uint32_t workgroup_id_x(void)
14+
{
15+
return __builtin_amdgcn_workgroup_id_x();
16+
}
17+
18+
// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_y(
19+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
20+
// CHECK-NEXT: [[ENTRY:.*:]]
21+
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.y()
22+
// CHECK-NEXT: ret i32 [[TMP0]]
23+
//
24+
uint32_t workgroup_id_y(void)
25+
{
26+
return __builtin_amdgcn_workgroup_id_y();
27+
}
28+
29+
// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_z(
30+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] {
31+
// CHECK-NEXT: [[ENTRY:.*:]]
32+
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.z()
33+
// CHECK-NEXT: ret i32 [[TMP0]]
34+
//
35+
uint32_t workgroup_id_z(void)
36+
{
37+
return __builtin_amdgcn_workgroup_id_z();
38+
}
39+
40+
// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_x(
41+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] {
42+
// CHECK-NEXT: [[ENTRY:.*:]]
43+
// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
44+
// CHECK-NEXT: ret i32 [[TMP0]]
45+
//
46+
uint32_t workitem_id_x(void)
47+
{
48+
return __builtin_amdgcn_workitem_id_x();
49+
}
50+
51+
// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_y(
52+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR5:[0-9]+]] {
53+
// CHECK-NEXT: [[ENTRY:.*:]]
54+
// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
55+
// CHECK-NEXT: ret i32 [[TMP0]]
56+
//
57+
uint32_t workitem_id_y(void)
58+
{
59+
return __builtin_amdgcn_workitem_id_y();
60+
}
61+
62+
// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_z(
63+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR6:[0-9]+]] {
64+
// CHECK-NEXT: [[ENTRY:.*:]]
65+
// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
66+
// CHECK-NEXT: ret i32 [[TMP0]]
67+
//
68+
uint32_t workitem_id_z(void)
69+
{
70+
return __builtin_amdgcn_workitem_id_z();
71+
}
72+
73+
// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_x(
74+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] {
75+
// CHECK-NEXT: [[ENTRY:.*:]]
76+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
77+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12
78+
// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]], !noundef [[META4]]
79+
// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
80+
// CHECK-NEXT: ret i32 [[CONV]]
81+
//
82+
uint32_t workgroup_size_x(void)
83+
{
84+
return __builtin_amdgcn_workgroup_size_x();
85+
}
86+
87+
// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_y(
88+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] {
89+
// CHECK-NEXT: [[ENTRY:.*:]]
90+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
91+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 14
92+
// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
93+
// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
94+
// CHECK-NEXT: ret i32 [[CONV]]
95+
//
96+
uint32_t workgroup_size_y(void)
97+
{
98+
return __builtin_amdgcn_workgroup_size_y();
99+
}
100+
101+
// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_z(
102+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] {
103+
// CHECK-NEXT: [[ENTRY:.*:]]
104+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
105+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16
106+
// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 8, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]]
107+
// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32
108+
// CHECK-NEXT: ret i32 [[CONV]]
109+
//
110+
uint32_t workgroup_size_z(void)
111+
{
112+
return __builtin_amdgcn_workgroup_size_z();
113+
}
114+
115+
// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_x(
116+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] {
117+
// CHECK-NEXT: [[ENTRY:.*:]]
118+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
119+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12
120+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5:![0-9]+]], !invariant.load [[META4]]
121+
// CHECK-NEXT: ret i32 [[TMP2]]
122+
//
123+
uint32_t grid_size_x(void)
124+
{
125+
return __builtin_amdgcn_grid_size_x();
126+
}
127+
128+
// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_y(
129+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] {
130+
// CHECK-NEXT: [[ENTRY:.*:]]
131+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
132+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16
133+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]]
134+
// CHECK-NEXT: ret i32 [[TMP2]]
135+
//
136+
uint32_t grid_size_y(void)
137+
{
138+
return __builtin_amdgcn_grid_size_y();
139+
}
140+
141+
// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_z(
142+
// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] {
143+
// CHECK-NEXT: [[ENTRY:.*:]]
144+
// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
145+
// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 20
146+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]]
147+
// CHECK-NEXT: ret i32 [[TMP2]]
148+
//
149+
uint32_t grid_size_z(void)
150+
{
151+
return __builtin_amdgcn_grid_size_z();
152+
}
153+
154+
//.
155+
// CHECK: [[RNG3]] = !{i16 1, i16 1025}
156+
// CHECK: [[META4]] = !{}
157+
// CHECK: [[RNG5]] = !{i32 1, i32 0}
158+
//.

0 commit comments

Comments
 (0)