Skip to content

Commit a9f43d5

Browse files
committed
Add intrinsic for dynamic group-shared memory on GPUs
Group-shared memory is a memory region that is shared between all threads in a work-group on GPUs. Dynamic group-shared memory is in that memory region, though the allocated size is specified late, when launching a kernel, instead of early at compile-time. Group-shared memory in amdgpu and nvptx lives in address space 3. Dynamic group-shared memory is implemented by creating an external global variable in address space 3. The global is declared with size 0, as the actual size is only known at runtime. It is defined behavior in LLVM to access an external global outside the defined size. As far as I know, there is no similar way to get the allocated size of dynamic shared memory on amdgpu and nvptx, so users have to pass this out-of-band or rely on target-specific ways.
1 parent 292be5c commit a9f43d5

File tree

10 files changed

+146
-6
lines changed

10 files changed

+146
-6
lines changed

compiler/rustc_abi/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1713,6 +1713,9 @@ pub struct AddressSpace(pub u32);
17131713
impl AddressSpace {
17141714
/// LLVM's `0` address space.
17151715
pub const ZERO: Self = AddressSpace(0);
1716+
/// The address space for work-group shared memory on nvptx and amdgpu.
1717+
/// See e.g. the `gpu_dynamic_groupshared_mem` intrinsic for details.
1718+
pub const GPU_SHARED: Self = AddressSpace(3);
17161719
}
17171720

17181721
/// The way we represent values to the backend

compiler/rustc_codegen_llvm/src/declare.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
use std::borrow::Borrow;
1515

1616
use itertools::Itertools;
17+
use rustc_abi::AddressSpace;
1718
use rustc_codegen_ssa::traits::TypeMembershipCodegenMethods;
1819
use rustc_data_structures::fx::FxIndexSet;
1920
use rustc_middle::ty::{Instance, Ty};
@@ -97,6 +98,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
9798
)
9899
}
99100
}
101+
102+
/// Declare a global value in a specific address space.
103+
///
104+
/// If there’s a value with the same name already declared, the function will
105+
/// return its Value instead.
106+
pub(crate) fn declare_global_in_addrspace(
107+
&self,
108+
name: &str,
109+
ty: &'ll Type,
110+
addr_space: AddressSpace,
111+
) -> &'ll Value {
112+
debug!("declare_global(name={name:?}, addrspace={addr_space:?})");
113+
unsafe {
114+
llvm::LLVMRustGetOrInsertGlobalInAddrspace(
115+
(**self).borrow().llmod,
116+
name.as_c_char_ptr(),
117+
name.len(),
118+
ty,
119+
addr_space.0,
120+
)
121+
}
122+
}
100123
}
101124

102125
impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {

compiler/rustc_codegen_llvm/src/intrinsic.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
use std::assert_matches::assert_matches;
22
use std::cmp::Ordering;
33

4-
use rustc_abi::{Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size};
4+
use rustc_abi::{
5+
AddressSpace, Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size,
6+
};
57
use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh};
68
use rustc_codegen_ssa::codegen_attrs::autodiff_attrs;
79
use rustc_codegen_ssa::common::{IntPredicate, TypeKind};
@@ -539,6 +541,31 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
539541
return Ok(());
540542
}
541543

544+
sym::gpu_dynamic_groupshared_mem => {
545+
// The name of the global variable is not relevant; the important properties are:
546+
// 1. The global is in the shared address space
547+
// 2. It is an extern global
548+
// All instances of extern addrspace(shared) globals are merged in the LLVM backend.
549+
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared
550+
let global = self.declare_global_in_addrspace(
551+
"gpu_dynamic_groupshared_mem",
552+
self.type_array(self.type_i8(), 0),
553+
AddressSpace::GPU_SHARED,
554+
);
555+
let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() };
556+
// The alignment of the global is used to specify the *minimum* alignment that
557+
// must be obeyed by the GPU runtime.
558+
// When multiple of these global variables are merged, the maximum alignment is taken.
559+
// See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821
560+
let alignment = self.align_of(*inner_ty).bytes() as u32;
561+
unsafe {
562+
if alignment > llvm::LLVMGetAlignment(global) {
563+
llvm::LLVMSetAlignment(global, alignment);
564+
}
565+
}
566+
self.cx().const_pointercast(global, self.type_ptr())
567+
}
568+
542569
_ if name.as_str().starts_with("simd_") => {
543570
// Unpack non-power-of-2 #[repr(packed, simd)] arguments.
544571
// This gives them the expected layout of a regular #[repr(simd)] vector.

compiler/rustc_codegen_llvm/src/llvm/ffi.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,6 +1973,13 @@ unsafe extern "C" {
19731973
NameLen: size_t,
19741974
T: &'a Type,
19751975
) -> &'a Value;
1976+
pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>(
1977+
M: &'a Module,
1978+
Name: *const c_char,
1979+
NameLen: size_t,
1980+
T: &'a Type,
1981+
AddressSpace: c_uint,
1982+
) -> &'a Value;
19761983
pub(crate) fn LLVMRustGetNamedValue(
19771984
M: &Module,
19781985
Name: *const c_char,

compiler/rustc_codegen_ssa/src/mir/intrinsic.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
110110
sym::abort
111111
| sym::unreachable
112112
| sym::cold_path
113+
| sym::gpu_dynamic_groupshared_mem
113114
| sym::breakpoint
114115
| sym::assert_zero_valid
115116
| sym::assert_mem_uninitialized_valid

compiler/rustc_hir_analysis/src/check/intrinsic.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
132132
| sym::forget
133133
| sym::frem_algebraic
134134
| sym::fsub_algebraic
135+
| sym::gpu_dynamic_groupshared_mem
135136
| sym::is_val_statically_known
136137
| sym::log2f16
137138
| sym::log2f32
@@ -289,6 +290,7 @@ pub(crate) fn check_intrinsic_type(
289290
}
290291
sym::rustc_peek => (1, 0, vec![param(0)], param(0)),
291292
sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()),
293+
sym::gpu_dynamic_groupshared_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))),
292294
sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => {
293295
(1, 0, vec![], tcx.types.unit)
294296
}

compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -181,10 +181,10 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M,
181181
.getCallee());
182182
}
183183

184-
extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
185-
const char *Name,
186-
size_t NameLen,
187-
LLVMTypeRef Ty) {
184+
extern "C" LLVMValueRef
185+
LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name,
186+
size_t NameLen, LLVMTypeRef Ty,
187+
unsigned AddressSpace) {
188188
Module *Mod = unwrap(M);
189189
auto NameRef = StringRef(Name, NameLen);
190190

@@ -195,10 +195,21 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
195195
GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true);
196196
if (!GV)
197197
GV = new GlobalVariable(*Mod, unwrap(Ty), false,
198-
GlobalValue::ExternalLinkage, nullptr, NameRef);
198+
GlobalValue::ExternalLinkage, nullptr, NameRef,
199+
nullptr, GlobalValue::NotThreadLocal, AddressSpace);
199200
return wrap(GV);
200201
}
201202

203+
extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
204+
const char *Name,
205+
size_t NameLen,
206+
LLVMTypeRef Ty) {
207+
Module *Mod = unwrap(M);
208+
unsigned AddressSpace = Mod->getDataLayout().getDefaultGlobalsAddressSpace();
209+
return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty,
210+
AddressSpace);
211+
}
212+
202213
// Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`.
203214
enum class LLVMRustAttributeKind {
204215
AlwaysInline = 0,

compiler/rustc_span/src/symbol.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1140,6 +1140,7 @@ symbols! {
11401140
global_asm,
11411141
global_registration,
11421142
globs,
1143+
gpu_dynamic_groupshared_mem,
11431144
gt,
11441145
guard_patterns,
11451146
half_open_range_patterns,

library/core/src/intrinsics/mod.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3300,6 +3300,44 @@ pub(crate) const fn miri_promise_symbolic_alignment(ptr: *const (), align: usize
33003300
)
33013301
}
33023302

3303+
/// Returns the pointer to dynamic group-shared memory on GPUs.
3304+
///
3305+
/// Group-shared memory is a memory region that is shared between all threads in
3306+
/// the same work-group. It is faster to access than other memory, but pointers do not
3307+
/// work outside the work-group where they were obtained.
3308+
/// Dynamic group-shared memory is in the group-shared memory region; the allocated
3309+
/// size is specified late, after compilation, when launching a gpu-kernel.
3310+
/// The size can differ between launches of a gpu-kernel; therefore it is called dynamic.
3311+
///
3312+
/// The returned pointer is the start of the dynamic group-shared memory region.
3313+
/// All calls to `gpu_dynamic_groupshared_mem` in a work-group, independent of the
3314+
/// generic type, return the same address, so alias the same memory.
3315+
/// The returned pointer is aligned by at least the alignment of `T`.
3316+
///
3317+
/// # Safety
3318+
///
3319+
/// The pointer is safe to dereference from the start (the returned pointer) up to the
3320+
/// size of dynamic group-shared memory that was specified when launching the current
3321+
/// gpu-kernel.
3322+
///
3323+
/// The user must take care of synchronizing access to group-shared memory between
3324+
/// threads in a work-group. It is undefined behavior if one thread makes a non-atomic
3325+
/// write to a group-shared memory location and another thread simultaneously accesses
3326+
/// the same location.
3327+
///
3328+
/// # Other APIs
3329+
///
3330+
/// CUDA and HIP call this shared memory, shared between threads in a block.
3331+
/// OpenCL and SYCL call this local memory, shared between threads in a work-group.
3332+
/// GLSL calls this shared memory, shared between invocations in a work group.
3333+
/// DirectX calls this groupshared memory, shared between threads in a thread-group.
3334+
#[must_use = "returns a pointer that does nothing unless used"]
3335+
#[rustc_intrinsic]
3336+
#[rustc_nounwind]
3337+
#[unstable(feature = "gpu_dynamic_groupshared_mem", issue = "135513")]
3338+
#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))]
3339+
pub fn gpu_dynamic_groupshared_mem<T>() -> *mut T;
3340+
33033341
/// Copies the current location of arglist `src` to the arglist `dst`.
33043342
///
33053343
/// FIXME: document safety requirements
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Checks that the GPU dynamic group-shared memory intrinsic works.
2+
3+
//@ revisions: amdgpu nvptx
4+
//@ compile-flags: --crate-type=rlib
5+
//
6+
//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900
7+
//@ [amdgpu] needs-llvm-components: amdgpu
8+
//@ [nvptx] compile-flags: --target nvptx64-nvidia-cuda
9+
//@ [nvptx] needs-llvm-components: nvptx
10+
//@ add-core-stubs
11+
#![feature(intrinsics, no_core, rustc_attrs)]
12+
#![no_core]
13+
14+
extern crate minicore;
15+
16+
#[rustc_intrinsic]
17+
#[rustc_nounwind]
18+
fn gpu_dynamic_groupshared_mem<T>() -> *mut T;
19+
20+
// CHECK: @gpu_dynamic_groupshared_mem = external addrspace(3) global [0 x i8], align 8
21+
// CHECK: ret ptr addrspacecast (ptr addrspace(3) @gpu_dynamic_groupshared_mem to ptr)
22+
#[unsafe(no_mangle)]
23+
pub fn fun() -> *mut i32 {
24+
let res = gpu_dynamic_groupshared_mem::<i32>();
25+
gpu_dynamic_groupshared_mem::<f64>(); // Increase alignment to 8
26+
res
27+
}

0 commit comments

Comments
 (0)