diff --git a/compiler/rustc_abi/src/lib.rs b/compiler/rustc_abi/src/lib.rs index de44c8755a078..6bcb24b57c1a4 100644 --- a/compiler/rustc_abi/src/lib.rs +++ b/compiler/rustc_abi/src/lib.rs @@ -1713,6 +1713,9 @@ pub struct AddressSpace(pub u32); impl AddressSpace { /// LLVM's `0` address space. pub const ZERO: Self = AddressSpace(0); + /// The address space for work-group shared memory on nvptx and amdgpu. + /// See e.g. the `gpu_dynamic_groupshared_mem` intrinsic for details. + pub const GPU_SHARED: Self = AddressSpace(3); } /// The way we represent values to the backend diff --git a/compiler/rustc_codegen_llvm/src/declare.rs b/compiler/rustc_codegen_llvm/src/declare.rs index 8f69f176138cf..8c5fcd36fa69b 100644 --- a/compiler/rustc_codegen_llvm/src/declare.rs +++ b/compiler/rustc_codegen_llvm/src/declare.rs @@ -14,6 +14,7 @@ use std::borrow::Borrow; use itertools::Itertools; +use rustc_abi::AddressSpace; use rustc_codegen_ssa::traits::TypeMembershipCodegenMethods; use rustc_data_structures::fx::FxIndexSet; use rustc_middle::ty::{Instance, Ty}; @@ -97,6 +98,28 @@ impl<'ll, CX: Borrow>> GenericCx<'ll, CX> { ) } } + + /// Declare a global value in a specific address space. + /// + /// If there’s a value with the same name already declared, the function will + /// return its Value instead. 
+ pub(crate) fn declare_global_in_addrspace( + &self, + name: &str, + ty: &'ll Type, + addr_space: AddressSpace, + ) -> &'ll Value { + debug!("declare_global(name={name:?}, addrspace={addr_space:?})"); + unsafe { + llvm::LLVMRustGetOrInsertGlobalInAddrspace( + (**self).borrow().llmod, + name.as_c_char_ptr(), + name.len(), + ty, + addr_space.0, + ) + } + } } impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> { diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 14b3f3626efe9..389f9f9050814 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -1,7 +1,9 @@ use std::assert_matches::assert_matches; use std::cmp::Ordering; -use rustc_abi::{Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size}; +use rustc_abi::{ + AddressSpace, Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size, +}; use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh}; use rustc_codegen_ssa::codegen_attrs::autodiff_attrs; use rustc_codegen_ssa::common::{IntPredicate, TypeKind}; @@ -539,6 +541,31 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } + sym::gpu_dynamic_groupshared_mem => { + // The name of the global variable is not relevant, the important properties are: + // 1. The global is in the shared address space + // 2. It is an extern global + // All instances of extern addrspace(shared) globals are merged in the LLVM backend. + // See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared + let global = self.declare_global_in_addrspace( + "gpu_dynamic_groupshared_mem", + self.type_array(self.type_i8(), 0), + AddressSpace::GPU_SHARED, + ); + let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() }; + // The alignment of the global is used to specify the *minimum* alignment that + // must be obeyed by the GPU runtime. 
+ // When multiple of these global variables are merged, the maximum alignment is taken. + // See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821 + let alignment = self.align_of(*inner_ty).bytes() as u32; + unsafe { + if alignment > llvm::LLVMGetAlignment(global) { + llvm::LLVMSetAlignment(global, alignment); + } + } + self.cx().const_pointercast(global, self.type_ptr()) + } + _ if name.as_str().starts_with("simd_") => { // Unpack non-power-of-2 #[repr(packed, simd)] arguments. // This gives them the expected layout of a regular #[repr(simd)] vector. diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index 53f0f9ff9d01b..a323967dbaafc 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1973,6 +1973,13 @@ unsafe extern "C" { NameLen: size_t, T: &'a Type, ) -> &'a Value; + pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>( + M: &'a Module, + Name: *const c_char, + NameLen: size_t, + T: &'a Type, + AddressSpace: c_uint, + ) -> &'a Value; pub(crate) fn LLVMRustGetNamedValue( M: &Module, Name: *const c_char, diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs index cc3316c7f8cc0..aa9309c313ca9 100644 --- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs @@ -110,6 +110,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { sym::abort | sym::unreachable | sym::cold_path + | sym::gpu_dynamic_groupshared_mem | sym::breakpoint | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index a6659912e3fb9..2589bfe96d2fb 100644 --- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ 
b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -132,6 +132,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi | sym::forget | sym::frem_algebraic | sym::fsub_algebraic + | sym::gpu_dynamic_groupshared_mem | sym::is_val_statically_known | sym::log2f16 | sym::log2f32 @@ -289,6 +290,7 @@ pub(crate) fn check_intrinsic_type( } sym::rustc_peek => (1, 0, vec![param(0)], param(0)), sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()), + sym::gpu_dynamic_groupshared_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))), sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => { (1, 0, vec![], tcx.types.unit) } diff --git a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp index ad459986826a5..1acd29825b7e8 100644 --- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp +++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp @@ -181,10 +181,10 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M, .getCallee()); } -extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, - const char *Name, - size_t NameLen, - LLVMTypeRef Ty) { +extern "C" LLVMValueRef +LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name, + size_t NameLen, LLVMTypeRef Ty, + unsigned AddressSpace) { Module *Mod = unwrap(M); auto NameRef = StringRef(Name, NameLen); @@ -195,10 +195,21 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true); if (!GV) GV = new GlobalVariable(*Mod, unwrap(Ty), false, - GlobalValue::ExternalLinkage, nullptr, NameRef); + GlobalValue::ExternalLinkage, nullptr, NameRef, + nullptr, GlobalValue::NotThreadLocal, AddressSpace); return wrap(GV); } +extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, + const char *Name, + size_t NameLen, + LLVMTypeRef Ty) { + Module *Mod = unwrap(M); + unsigned AddressSpace = 
Mod->getDataLayout().getDefaultGlobalsAddressSpace(); + return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty, + AddressSpace); +} + // Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`. enum class LLVMRustAttributeKind { AlwaysInline = 0, diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 223d818a2949b..ddc15dbd48824 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1140,6 +1140,7 @@ symbols! { global_asm, global_registration, globs, + gpu_dynamic_groupshared_mem, gt, guard_patterns, half_open_range_patterns, diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs index 4cee77fda4fba..27572c27e381e 100644 --- a/library/core/src/intrinsics/mod.rs +++ b/library/core/src/intrinsics/mod.rs @@ -3300,6 +3300,44 @@ pub(crate) const fn miri_promise_symbolic_alignment(ptr: *const (), align: usize ) } +/// Returns the pointer to dynamic group-shared memory on GPUs. +/// +/// Group-shared memory is a memory region that is shared between all threads in +/// the same work-group. It is faster to access than other memory but pointers do not +/// work outside the work-group where they were obtained. +/// Dynamic group-shared memory is in the group-shared memory region; the allocated +/// size is specified late, after compilation, when launching a gpu-kernel. +/// The size can differ between launches of a gpu-kernel, therefore it is called dynamic. +/// +/// The returned pointer is the start of the dynamic group-shared memory region. +/// All calls to `gpu_dynamic_groupshared_mem` in a work-group, independent of the +/// generic type, return the same address, so alias the same memory. +/// The returned pointer is aligned to at least the alignment of `T`. 
+/// +/// # Safety +/// +/// The pointer is safe to dereference from the start (the returned pointer) up to the +/// size of dynamic group-shared memory that was specified when launching the current +/// gpu-kernel. +/// +/// The user must take care of synchronizing access to group-shared memory between +/// threads in a work-group. It is undefined behavior if one thread makes a non-atomic +/// write to a group-shared memory location and another thread simultaneously accesses +/// the same location. +/// +/// # Other APIs +/// +/// CUDA and HIP call this shared memory, shared between threads in a block. +/// OpenCL and SYCL call this local memory, shared between threads in a work-group. +/// GLSL calls this shared memory, shared between invocations in a work group. +/// DirectX calls this groupshared memory, shared between threads in a thread-group. +#[must_use = "returns a pointer that does nothing unless used"] +#[rustc_intrinsic] +#[rustc_nounwind] +#[unstable(feature = "gpu_dynamic_groupshared_mem", issue = "135513")] +#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))] +pub fn gpu_dynamic_groupshared_mem() -> *mut T; + /// Copies the current location of arglist `src` to the arglist `dst`. /// /// FIXME: document safety requirements diff --git a/tests/codegen-llvm/gpu-dynamic-groupshared-memory.rs b/tests/codegen-llvm/gpu-dynamic-groupshared-memory.rs new file mode 100644 index 0000000000000..e22c68b0ef984 --- /dev/null +++ b/tests/codegen-llvm/gpu-dynamic-groupshared-memory.rs @@ -0,0 +1,27 @@ +// Checks that the GPU dynamic group-shared memory intrinsic works. 
+ +//@ revisions: amdgpu nvptx +//@ compile-flags: --crate-type=rlib +// +//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 +//@ [amdgpu] needs-llvm-components: amdgpu +//@ [nvptx] compile-flags: --target nvptx64-nvidia-cuda +//@ [nvptx] needs-llvm-components: nvptx +//@ add-core-stubs +#![feature(intrinsics, no_core, rustc_attrs)] +#![no_core] + +extern crate minicore; + +#[rustc_intrinsic] +#[rustc_nounwind] +fn gpu_dynamic_groupshared_mem() -> *mut T; + +// CHECK: @gpu_dynamic_groupshared_mem = external addrspace(3) global [0 x i8], align 8 +// CHECK: ret ptr addrspacecast (ptr addrspace(3) @gpu_dynamic_groupshared_mem to ptr) +#[unsafe(no_mangle)] +pub fn fun() -> *mut i32 { + let res = gpu_dynamic_groupshared_mem::(); + gpu_dynamic_groupshared_mem::(); // Increase alignment to 8 + res +}