Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions compiler/rustc_abi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1713,6 +1713,9 @@ pub struct AddressSpace(pub u32);
impl AddressSpace {
    /// LLVM's `0` address space.
    pub const ZERO: Self = AddressSpace(0);
    /// The address space for work-group shared memory on nvptx and amdgpu.
    /// Both targets use LLVM address space `3` for this region (see the
    /// accompanying codegen test checking `addrspace(3)` on both).
    /// See e.g. the `gpu_dynamic_groupshared_mem` intrinsic for details.
    pub const GPU_SHARED: Self = AddressSpace(3);
}

/// The way we represent values to the backend
Expand Down
23 changes: 23 additions & 0 deletions compiler/rustc_codegen_llvm/src/declare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
use std::borrow::Borrow;

use itertools::Itertools;
use rustc_abi::AddressSpace;
use rustc_codegen_ssa::traits::TypeMembershipCodegenMethods;
use rustc_data_structures::fx::FxIndexSet;
use rustc_middle::ty::{Instance, Ty};
Expand Down Expand Up @@ -97,6 +98,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
)
}
}

/// Declare a global value in a specific address space.
///
/// If there's a value with the same name already declared, the function will
/// return its Value instead. Note that the existing value is returned as-is:
/// its type and address space are not checked against the requested ones.
pub(crate) fn declare_global_in_addrspace(
    &self,
    name: &str,
    ty: &'ll Type,
    addr_space: AddressSpace,
) -> &'ll Value {
    // Log under this function's own name (not `declare_global`, the
    // sibling it was derived from) so traces point at the right caller.
    debug!("declare_global_in_addrspace(name={name:?}, addrspace={addr_space:?})");
    // SAFETY: `name` is passed as a valid (pointer, length) pair, and the
    // module `llmod` outlives the `'ll` value returned by LLVM.
    unsafe {
        llvm::LLVMRustGetOrInsertGlobalInAddrspace(
            (**self).borrow().llmod,
            name.as_c_char_ptr(),
            name.len(),
            ty,
            addr_space.0,
        )
    }
}
}

impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
Expand Down
29 changes: 28 additions & 1 deletion compiler/rustc_codegen_llvm/src/intrinsic.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use std::assert_matches::assert_matches;
use std::cmp::Ordering;

use rustc_abi::{Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size};
use rustc_abi::{
AddressSpace, Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size,
};
use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh};
use rustc_codegen_ssa::codegen_attrs::autodiff_attrs;
use rustc_codegen_ssa::common::{IntPredicate, TypeKind};
Expand Down Expand Up @@ -539,6 +541,31 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
return Ok(());
}

sym::gpu_dynamic_groupshared_mem => {
// The name of the global variable is not relevant, the important properties are:
// 1. The global is in the shared address space
// 2. It is an extern global
// All instances of extern addrspace(shared) globals are merged in the LLVM backend.
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared
let global = self.declare_global_in_addrspace(
"gpu_dynamic_groupshared_mem",
self.type_array(self.type_i8(), 0),
AddressSpace::GPU_SHARED,
);
let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() };
// The alignment of the global is used to specify the *minimum* alignment that
// must be obeyed by the GPU runtime.
// When multiple of these global variables are merged, the maximum alignment is taken.
// See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821
let alignment = self.align_of(*inner_ty).bytes() as u32;
unsafe {
if alignment > llvm::LLVMGetAlignment(global) {
llvm::LLVMSetAlignment(global, alignment);
}
}
self.cx().const_pointercast(global, self.type_ptr())
}

_ if name.as_str().starts_with("simd_") => {
// Unpack non-power-of-2 #[repr(packed, simd)] arguments.
// This gives them the expected layout of a regular #[repr(simd)] vector.
Expand Down
7 changes: 7 additions & 0 deletions compiler/rustc_codegen_llvm/src/llvm/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1973,6 +1973,13 @@ unsafe extern "C" {
NameLen: size_t,
T: &'a Type,
) -> &'a Value;
/// Like `LLVMRustGetOrInsertGlobal`, but if the global does not already
/// exist it is created in the given LLVM address space instead of the
/// module's default globals address space.
pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>(
    M: &'a Module,
    Name: *const c_char,
    NameLen: size_t,
    T: &'a Type,
    AddressSpace: c_uint,
) -> &'a Value;
pub(crate) fn LLVMRustGetNamedValue(
M: &Module,
Name: *const c_char,
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
sym::abort
| sym::unreachable
| sym::cold_path
| sym::gpu_dynamic_groupshared_mem
| sym::breakpoint
| sym::assert_zero_valid
| sym::assert_mem_uninitialized_valid
Expand Down
2 changes: 2 additions & 0 deletions compiler/rustc_hir_analysis/src/check/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
| sym::forget
| sym::frem_algebraic
| sym::fsub_algebraic
| sym::gpu_dynamic_groupshared_mem
| sym::is_val_statically_known
| sym::log2f16
| sym::log2f32
Expand Down Expand Up @@ -289,6 +290,7 @@ pub(crate) fn check_intrinsic_type(
}
sym::rustc_peek => (1, 0, vec![param(0)], param(0)),
sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()),
sym::gpu_dynamic_groupshared_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))),
sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => {
(1, 0, vec![], tcx.types.unit)
}
Expand Down
21 changes: 16 additions & 5 deletions compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M,
.getCallee());
}

extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
const char *Name,
size_t NameLen,
LLVMTypeRef Ty) {
extern "C" LLVMValueRef
LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name,
size_t NameLen, LLVMTypeRef Ty,
unsigned AddressSpace) {
Module *Mod = unwrap(M);
auto NameRef = StringRef(Name, NameLen);

Expand All @@ -195,10 +195,21 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true);
if (!GV)
GV = new GlobalVariable(*Mod, unwrap(Ty), false,
GlobalValue::ExternalLinkage, nullptr, NameRef);
GlobalValue::ExternalLinkage, nullptr, NameRef,
nullptr, GlobalValue::NotThreadLocal, AddressSpace);
return wrap(GV);
}

// Convenience wrapper: look up / create a global in the module's *default*
// globals address space, delegating to the addrspace-aware variant.
extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
                                                  const char *Name,
                                                  size_t NameLen,
                                                  LLVMTypeRef Ty) {
  unsigned DefaultAS =
      unwrap(M)->getDataLayout().getDefaultGlobalsAddressSpace();
  return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty, DefaultAS);
}

// Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`.
enum class LLVMRustAttributeKind {
AlwaysInline = 0,
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_span/src/symbol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1140,6 +1140,7 @@ symbols! {
global_asm,
global_registration,
globs,
gpu_dynamic_groupshared_mem,
gt,
guard_patterns,
half_open_range_patterns,
Expand Down
38 changes: 38 additions & 0 deletions library/core/src/intrinsics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3300,6 +3300,44 @@ pub(crate) const fn miri_promise_symbolic_alignment(ptr: *const (), align: usize
)
}

/// Returns the pointer to dynamic group-shared memory on GPUs.
///
/// Group-shared memory is a memory region that is shared between all threads in
/// the same work-group. It is faster to access than other memory but pointers do not
/// work outside the work-group where they were obtained.
/// Dynamic group-shared memory is in the group-shared memory region, the allocated
/// size is specified late, after compilation, when launching a gpu-kernel.
/// The size can differ between launches of a gpu-kernel, therefore it is called dynamic.
///
/// The returned pointer is the start of the dynamic group-shared memory region.
/// All calls to `gpu_dynamic_groupshared_mem` in a work-group, independent of the
/// generic type, return the same address, so they alias the same memory.
/// The returned pointer is aligned by at least the alignment of `T`.
Copy link
Member

@RalfJung RalfJung Sep 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Speaking of safety requirements... how does one use this pointer? I get that it is aligned, but does it point to enough memory to store a T? If it's always the same address, doesn't everyone overwrite each other's data all the time? This API looks very odd for a non-GPU person, and it's not clear to me whether that is resolved by having more magic behavior (which should be documented or at least referenced here), or whether there's higher-level APIs built on top that deal with this (but this intrinsic provides so few guarantees, I can't see how that should be possible).

Typically, intrinsic documentations should be detailed enough that I can read and write code using the intrinsic and know exactly whether the code is correct and what it will do in all circumstances. I don't know if there's any hope of achieving that with GPU intrinsics, but if not then we need to have a bit of a wider discussion -- we have had bad experience with just importing "externally defined" semantics into Rust without considering all the interactions (in general, it is not logically coherent to have semantics externally defined).

The current docs would let me implement this intrinsic by just always returning 1024, and emitting a compile error if T has alignment bigger than 1024. I doubt that's a legal implementation. But that means the docs are not precise enough to describe what the implementation must do.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there some prior discussion of the design decision to determine the alignment by giving a type parameter? I could also be a const generic parameter, for instance. I don't have an opinion on the matter since I am an outsider to the GPU world, but as a compiler team member it'd be good to know if this is something you thought about for 5 minutes or whether there's some sort of larger design by a team that has a vision of how all these things will fit together.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is some discussion in #135516. I don’t mind either way, I thought (for 5 minutes ;)) that specifying the type of the returned pointer makes sense.
I’m not much of a GPU programmer, but I think in most cases, you would store an array in dynamic shared memory, or maybe a struct followed by a dynamically sized array (or maybe two/n arrays of different types).

For just a struct, static shared memory would make more sense, though we don’t support that yet (there’s some discussion in the tracking issue, but I think that’s more complicated to design and implement).

///
/// # Safety
///
/// The pointer is safe to dereference from the start (the returned pointer) up to the
/// size of dynamic group-shared memory that was specified when launching the current
/// gpu-kernel.
///
/// The user must take care of synchronizing access to group-shared memory between
/// threads in a work-group. It is undefined behavior if one thread makes a non-atomic
/// write to a group-shared memory location and another thread simultaneously accesses
/// the same location.
///
/// # Other APIs
///
/// CUDA and HIP call this shared memory, shared between threads in a block.
/// OpenCL and SYCL call this local memory, shared between threads in a work-group.
/// GLSL calls this shared memory, shared between invocations in a work group.
/// DirectX calls this groupshared memory, shared between threads in a thread-group.
#[must_use = "returns a pointer that does nothing unless used"]
#[rustc_intrinsic]
#[rustc_nounwind]
#[unstable(feature = "gpu_dynamic_groupshared_mem", issue = "135513")]
#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))]
pub fn gpu_dynamic_groupshared_mem<T>() -> *mut T;

/// Copies the current location of arglist `src` to the arglist `dst`.
///
/// FIXME: document safety requirements
Expand Down
27 changes: 27 additions & 0 deletions tests/codegen-llvm/gpu-dynamic-groupshared-memory.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Checks that the GPU dynamic group-shared memory intrinsic works.

//@ revisions: amdgpu nvptx
//@ compile-flags: --crate-type=rlib
//
//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900
//@ [amdgpu] needs-llvm-components: amdgpu
//@ [nvptx] compile-flags: --target nvptx64-nvidia-cuda
//@ [nvptx] needs-llvm-components: nvptx
//@ add-core-stubs
#![feature(intrinsics, no_core, rustc_attrs)]
#![no_core]

extern crate minicore;

// Minimal re-declaration of the intrinsic for this `no_core` test; the real
// declaration lives in `library/core/src/intrinsics/mod.rs`.
#[rustc_intrinsic]
#[rustc_nounwind]
fn gpu_dynamic_groupshared_mem<T>() -> *mut T;

// The global must be `external` in addrspace(3) (shared memory on both
// targets), and the intrinsic must return it addrspace-cast to a flat pointer.
// CHECK: @gpu_dynamic_groupshared_mem = external addrspace(3) global [0 x i8], align 8
// CHECK: ret ptr addrspacecast (ptr addrspace(3) @gpu_dynamic_groupshared_mem to ptr)
#[unsafe(no_mangle)]
pub fn fun() -> *mut i32 {
    let res = gpu_dynamic_groupshared_mem::<i32>();
    gpu_dynamic_groupshared_mem::<f64>(); // Increase alignment to 8
    res
}
Loading