From a9f43d59232d92907b6f0bc1f77aa11f4568b941 Mon Sep 17 00:00:00 2001 From: Flakebi Date: Thu, 4 Sep 2025 00:50:22 +0200 Subject: [PATCH] Add intrinsic for dynamic group-shared memory on GPUs Group-shared memory is a memory region that is shared between all threads in a work-group on GPUs. Dynamic group-shared memory is in that memory region, though the allocated size is specified late, when launching a kernel, instead of early at compile-time. Group-shared memory in amdgpu and nvptx lives in address space 3. Dynamic group-shared memory is implemented by creating an external global variable in address space 3. The global is declared with size 0, as the actual size is only known at runtime. It is defined behavior in LLVM to access an external global outside the defined size. As far as I know, there is no similar way to get the allocated size of dynamic shared memory on amdgpu and nvptx, so users have to pass this out-of-band or rely on target specific ways. --- compiler/rustc_abi/src/lib.rs | 3 ++ compiler/rustc_codegen_llvm/src/declare.rs | 23 +++++++++++ compiler/rustc_codegen_llvm/src/intrinsic.rs | 29 +++++++++++++- compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 7 ++++ .../rustc_codegen_ssa/src/mir/intrinsic.rs | 1 + .../rustc_hir_analysis/src/check/intrinsic.rs | 2 + .../rustc_llvm/llvm-wrapper/RustWrapper.cpp | 21 +++++++--- compiler/rustc_span/src/symbol.rs | 1 + library/core/src/intrinsics/mod.rs | 38 +++++++++++++++++++ .../gpu-dynamic-groupshared-memory.rs | 27 +++++++++++++ 10 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 tests/codegen-llvm/gpu-dynamic-groupshared-memory.rs diff --git a/compiler/rustc_abi/src/lib.rs b/compiler/rustc_abi/src/lib.rs index de44c8755a078..6bcb24b57c1a4 100644 --- a/compiler/rustc_abi/src/lib.rs +++ b/compiler/rustc_abi/src/lib.rs @@ -1713,6 +1713,9 @@ pub struct AddressSpace(pub u32); impl AddressSpace { /// LLVM's `0` address space. 
pub const ZERO: Self = AddressSpace(0); + /// The address space for work-group shared memory on nvptx and amdgpu. + /// See e.g. the `gpu_dynamic_groupshared_mem` intrinsic for details. + pub const GPU_SHARED: Self = AddressSpace(3); } /// The way we represent values to the backend diff --git a/compiler/rustc_codegen_llvm/src/declare.rs b/compiler/rustc_codegen_llvm/src/declare.rs index 8f69f176138cf..8c5fcd36fa69b 100644 --- a/compiler/rustc_codegen_llvm/src/declare.rs +++ b/compiler/rustc_codegen_llvm/src/declare.rs @@ -14,6 +14,7 @@ use std::borrow::Borrow; use itertools::Itertools; +use rustc_abi::AddressSpace; use rustc_codegen_ssa::traits::TypeMembershipCodegenMethods; use rustc_data_structures::fx::FxIndexSet; use rustc_middle::ty::{Instance, Ty}; @@ -97,6 +98,28 @@ impl<'ll, CX: Borrow>> GenericCx<'ll, CX> { ) } } + + /// Declare a global value in a specific address space. + /// + /// If there’s a value with the same name already declared, the function will + /// return its Value instead. 
+ pub(crate) fn declare_global_in_addrspace( + &self, + name: &str, + ty: &'ll Type, + addr_space: AddressSpace, + ) -> &'ll Value { + debug!("declare_global_in_addrspace(name={name:?}, addrspace={addr_space:?})"); + unsafe { + llvm::LLVMRustGetOrInsertGlobalInAddrspace( + (**self).borrow().llmod, + name.as_c_char_ptr(), + name.len(), + ty, + addr_space.0, + ) + } + } } impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> { diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 14b3f3626efe9..389f9f9050814 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -1,7 +1,9 @@ use std::assert_matches::assert_matches; use std::cmp::Ordering; -use rustc_abi::{Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size}; +use rustc_abi::{ + AddressSpace, Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size, +}; use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh}; use rustc_codegen_ssa::codegen_attrs::autodiff_attrs; use rustc_codegen_ssa::common::{IntPredicate, TypeKind}; @@ -539,6 +541,31 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } + sym::gpu_dynamic_groupshared_mem => { + // The name of the global variable is not relevant, the important properties are: + // 1. The global is in the shared address space + // 2. It is an extern global + // All instances of extern addrspace(shared) globals are merged in the LLVM backend. + // See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared + let global = self.declare_global_in_addrspace( + "gpu_dynamic_groupshared_mem", + self.type_array(self.type_i8(), 0), + AddressSpace::GPU_SHARED, + ); + let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() }; + // The alignment of the global is used to specify the *minimum* alignment that + // must be obeyed by the GPU runtime. 
+ // When multiple of these global variables are merged, the maximum alignment is taken. + // See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821 + let alignment = self.align_of(*inner_ty).bytes() as u32; + unsafe { + if alignment > llvm::LLVMGetAlignment(global) { + llvm::LLVMSetAlignment(global, alignment); + } + } + self.cx().const_pointercast(global, self.type_ptr()) + } + _ if name.as_str().starts_with("simd_") => { // Unpack non-power-of-2 #[repr(packed, simd)] arguments. // This gives them the expected layout of a regular #[repr(simd)] vector. diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 53f0f9ff9d01b..a323967dbaafc 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1973,6 +1973,13 @@ unsafe extern "C" { NameLen: size_t, T: &'a Type, ) -> &'a Value; + pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>( + M: &'a Module, + Name: *const c_char, + NameLen: size_t, + T: &'a Type, + AddressSpace: c_uint, + ) -> &'a Value; pub(crate) fn LLVMRustGetNamedValue( M: &Module, Name: *const c_char, diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs index cc3316c7f8cc0..aa9309c313ca9 100644 --- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs @@ -110,6 +110,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { sym::abort | sym::unreachable | sym::cold_path + | sym::gpu_dynamic_groupshared_mem | sym::breakpoint | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index a6659912e3fb9..2589bfe96d2fb 100644 --- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ 
b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -132,6 +132,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi | sym::forget | sym::frem_algebraic | sym::fsub_algebraic + | sym::gpu_dynamic_groupshared_mem | sym::is_val_statically_known | sym::log2f16 | sym::log2f32 @@ -289,6 +290,7 @@ pub(crate) fn check_intrinsic_type( } sym::rustc_peek => (1, 0, vec![param(0)], param(0)), sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()), + sym::gpu_dynamic_groupshared_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))), sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => { (1, 0, vec![], tcx.types.unit) } diff --git a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp index ad459986826a5..1acd29825b7e8 100644 --- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp +++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp @@ -181,10 +181,10 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M, .getCallee()); } -extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, - const char *Name, - size_t NameLen, - LLVMTypeRef Ty) { +extern "C" LLVMValueRef +LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name, + size_t NameLen, LLVMTypeRef Ty, + unsigned AddressSpace) { Module *Mod = unwrap(M); auto NameRef = StringRef(Name, NameLen); @@ -195,10 +195,21 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true); if (!GV) GV = new GlobalVariable(*Mod, unwrap(Ty), false, - GlobalValue::ExternalLinkage, nullptr, NameRef); + GlobalValue::ExternalLinkage, nullptr, NameRef, + nullptr, GlobalValue::NotThreadLocal, AddressSpace); return wrap(GV); } +extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, + const char *Name, + size_t NameLen, + LLVMTypeRef Ty) { + Module *Mod = unwrap(M); + unsigned AddressSpace = 
Mod->getDataLayout().getDefaultGlobalsAddressSpace(); + return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty, + AddressSpace); +} + // Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`. enum class LLVMRustAttributeKind { AlwaysInline = 0, diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 223d818a2949b..ddc15dbd48824 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1140,6 +1140,7 @@ symbols! { global_asm, global_registration, globs, + gpu_dynamic_groupshared_mem, gt, guard_patterns, half_open_range_patterns, diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs index 4cee77fda4fba..27572c27e381e 100644 --- a/library/core/src/intrinsics/mod.rs +++ b/library/core/src/intrinsics/mod.rs @@ -3300,6 +3300,44 @@ pub(crate) const fn miri_promise_symbolic_alignment(ptr: *const (), align: usize ) } +/// Returns the pointer to dynamic group-shared memory on GPUs. +/// +/// Group-shared memory is a memory region that is shared between all threads in +/// the same work-group. It is faster to access than other memory but pointers do not +/// work outside the work-group where they were obtained. +/// Dynamic group-shared memory is in the group-shared memory region, the allocated +/// size is specified late, after compilation, when launching a gpu-kernel. +/// The size can differ between launches of a gpu-kernel, therefore it is called dynamic. +/// +/// The returned pointer is the start of the dynamic group-shared memory region. +/// All calls to `gpu_dynamic_groupshared_mem` in a work-group, independent of the +/// generic type, return the same address, so alias the same memory. +/// The returned pointer is aligned by at least the alignment of `T`. 
+/// +/// # Safety +/// +/// The pointer is safe to dereference from the start (the returned pointer) up to the +/// size of dynamic group-shared memory that was specified when launching the current +/// gpu-kernel. +/// +/// The user must take care of synchronizing access to group-shared memory between +/// threads in a work-group. It is undefined behavior if one thread makes a non-atomic +/// write to a group-shared memory location and another thread simultaneously accesses +/// the same location. +/// +/// # Other APIs +/// +/// CUDA and HIP call this shared memory, shared between threads in a block. +/// OpenCL and SYCL call this local memory, shared between threads in a work-group. +/// GLSL calls this shared memory, shared between invocations in a work group. +/// DirectX calls this groupshared memory, shared between threads in a thread-group. +#[must_use = "returns a pointer that does nothing unless used"] +#[rustc_intrinsic] +#[rustc_nounwind] +#[unstable(feature = "gpu_dynamic_groupshared_mem", issue = "135513")] +#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))] +pub fn gpu_dynamic_groupshared_mem() -> *mut T; + /// Copies the current location of arglist `src` to the arglist `dst`. /// /// FIXME: document safety requirements diff --git a/tests/codegen-llvm/gpu-dynamic-groupshared-memory.rs b/tests/codegen-llvm/gpu-dynamic-groupshared-memory.rs new file mode 100644 index 0000000000000..e22c68b0ef984 --- /dev/null +++ b/tests/codegen-llvm/gpu-dynamic-groupshared-memory.rs @@ -0,0 +1,27 @@ +// Checks that the GPU dynamic group-shared memory intrinsic works. 
+ +//@ revisions: amdgpu nvptx +//@ compile-flags: --crate-type=rlib +// +//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 +//@ [amdgpu] needs-llvm-components: amdgpu +//@ [nvptx] compile-flags: --target nvptx64-nvidia-cuda +//@ [nvptx] needs-llvm-components: nvptx +//@ add-core-stubs +#![feature(intrinsics, no_core, rustc_attrs)] +#![no_core] + +extern crate minicore; + +#[rustc_intrinsic] +#[rustc_nounwind] +fn gpu_dynamic_groupshared_mem() -> *mut T; + +// CHECK: @gpu_dynamic_groupshared_mem = external addrspace(3) global [0 x i8], align 8 +// CHECK: ret ptr addrspacecast (ptr addrspace(3) @gpu_dynamic_groupshared_mem to ptr) +#[unsafe(no_mangle)] +pub fn fun() -> *mut i32 { + let res = gpu_dynamic_groupshared_mem::(); + gpu_dynamic_groupshared_mem::(); // Increase alignment to 8 + res +}