|
1 |
| -//! Shared memory handling. Currently only macros. |
| 1 | +//! Static and dynamic shared memory handling. |
| 2 | +
|
| 3 | +use crate::gpu_only; |
2 | 4 |
|
3 | 5 | /// Statically allocates a buffer large enough for `len` elements of `array_type`, yielding
|
4 | 6 | /// a `*mut array_type` that points to uninitialized shared memory. `len` must be a constant expression.
|
|
42 | 44 | #[macro_export]
|
43 | 45 | macro_rules! shared_array {
|
44 | 46 | ($array_type:ty; $len:expr) => {{
|
45 |
| - // the initializer is discarded when declaring shared globals, so it is unimportant. |
46 |
| - #[$crate::address_space(shared)] |
47 |
| - static mut SHARED: MaybeUninit<[$array_type; $len]> = MaybeUninit::uninit(); |
48 |
| - SHARED.as_mut_ptr() as *mut $array_type |
| 47 | + #[$crate::gpu_only] |
| 48 | + #[inline(always)] |
| 49 | + fn shared_array() -> *mut $array_type { |
| 50 | + use ::core::{cell::UnsafeCell, mem::MaybeUninit}; |
| 51 | + struct SyncWrapper(UnsafeCell<MaybeUninit<[$array_type; $len]>>); |
| 52 | + // SAFETY: it is up to the user to verify sound shared memory usage; we cannot |
| 53 | + // fundamentally check it for soundness. |
| 54 | + unsafe impl Send for SyncWrapper {} |
| 55 | + // SAFETY: same reasoning as for `Send` above. `Sync` is what lets `SyncWrapper` be the type of the non-`mut` static below. |
| 56 | + unsafe impl Sync for SyncWrapper {} |
| 57 | + |
| 58 | + // The initializer is discarded when declaring shared globals, so its value is unimportant. |
| 59 | + #[$crate::address_space(shared)] |
| 60 | + static SHARED: SyncWrapper = SyncWrapper(UnsafeCell::new(MaybeUninit::uninit())); |
| 61 | + |
| 62 | + SHARED.0.get() as *mut $array_type |
| 63 | + } |
| 64 | + shared_array() |
49 | 65 | }};
|
50 | 66 | }
|
| 67 | + |
| 68 | +/// Gets a pointer to the dynamic shared memory that was allocated by the caller of the kernel. The |
| 69 | +/// data is left uninitialized. The returned pointer is the address of a single extern static cast to `*mut T`, so the pointee type is chosen entirely by the caller. |
| 70 | +/// |
| 71 | +/// **Calling this function multiple times will yield the same pointer**, since every call refers to the same extern static. |
| 72 | +#[gpu_only] |
| 73 | +pub fn dynamic_shared_mem<T>() -> *mut T { |
| 74 | + // It is unclear whether an alignment of 16 is actually required for correctness; however, |
| 75 | + // nvcc seems to always generate the global with .align 16 no matter the type, so we just copy |
| 76 | + // nvcc's behavior for now. NOTE(review): presumably the `u128` in the declaration below is what requests that alignment - confirm for the target. |
| 77 | + extern "C" { |
| 78 | + // We need to use nvvm_internal and not address_space because address_space only parses |
| 79 | + // static definitions, not extern static definitions. |
| 80 | + #[nvvm_internal(addrspace(3))] |
| 81 | + #[allow(improper_ctypes)] |
| 82 | + // Mangle the symbol name a bit so that other code is unlikely to define the same symbol. |
| 83 | + #[link_name = "_Zcuda_std_dyn_shared"] |
| 84 | + static DYN_SHARED: ::core::cell::UnsafeCell<u128>; |
| 85 | + } |
| 86 | + |
| 87 | + // SAFETY: an extern static is how dynamic shared mem is done in CUDA. This will turn into |
| 88 | + // an extern variable decl in PTX, which is the same thing nvcc does if you dump the PTX from a CUDA file. |
| 89 | + unsafe { DYN_SHARED.get() as *mut T } |
| 90 | +} |
0 commit comments