Skip to content

Commit b6c0d81

Browse files
committed
Feat: add cuda_std::ptr with address space functions
1 parent c40f1f3 commit b6c0d81

File tree

4 files changed

+134
-3
lines changed

4 files changed

+134
-3
lines changed

crates/cuda_std/CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,9 @@ Notable changes to this project will be documented in this file.
66

77
- Added `#[externally_visible]` in conjunction with cg_nvvm dead code elimination changes to mark that
88
a function is externally visible.
9+
- Added `#[address_space(...)]` in conjunction with cg_nvvm address space changes. Only meant for internal use
10+
and advanced users.
11+
- Added `cuda_std::ptr`.
12+
- Added `is_in_address_space`
13+
- Added `convert_generic_to_specific_address_space`
14+
- Added `convert_specific_address_space_to_generic`

crates/cuda_std/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ pub mod mem;
3838
pub mod misc;
3939
// WIP
4040
// pub mod rt;
41+
pub mod ptr;
4142
pub mod thread;
4243
pub mod warp;
4344

crates/cuda_std/src/ptr.rs

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
//! CUDA-specific pointer handling logic.
2+
3+
use crate::gpu_only;
4+
5+
/// The distinct regions of GPU memory that a pointer may refer to.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum AddressSpace {
    /// Device-wide memory that can be both read and written.
    Global,
    /// Read/write memory local to a block and visible to every thread in that block.
    Shared,
    /// Device-wide memory that can only be read.
    Constant,
    /// Read/write memory private to a single thread.
    Local,
}
17+
18+
/// Determines whether a pointer is in a specific address space.
19+
///
20+
/// # Safety
21+
///
22+
/// The pointer must be valid for an instance of `T`, otherwise Undefined Behavior is exhibited.
23+
// TODO(RDambrosio016): Investigate subpar codegen for this function. It seems nvcc implements this not using
24+
// inline asm, but instead with some sort of compiler intrinsic, because its able to optimize away the function
25+
// a lot of the time.
26+
#[gpu_only]
27+
pub unsafe fn is_in_address_space<T>(ptr: *const T, address_space: AddressSpace) -> bool {
28+
let ret: u32;
29+
// create a predicate register to store the result of the isspacep into.
30+
asm!(".reg .pred p;");
31+
32+
// perform the actual isspacep operation, and store the result in the predicate register we made.
33+
match address_space {
34+
AddressSpace::Global => asm!("isspacep.global p, {}", in(reg64) ptr),
35+
AddressSpace::Shared => asm!("isspacep.shared p, {}", in(reg64) ptr),
36+
AddressSpace::Constant => asm!("isspacep.const p, {}", in(reg64) ptr),
37+
AddressSpace::Local => asm!("isspacep.local p, {}", in(reg64) ptr),
38+
}
39+
40+
// finally, use the predicate register to write out a value.
41+
asm!("selp.u32 {}, 1, 0, p;", out(reg32) ret);
42+
43+
ret != 0
44+
}
45+
46+
/// Converts a pointer from a generic address space, to a specific address space.
47+
/// This maps directly to the [`cvta`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta) PTX instruction.
48+
///
49+
/// # Safety
50+
///
51+
/// The pointer must be valid for an instance of `T`, and the pointer must fall in the specific address space in memory,
52+
/// otherwise Undefined Behavior is exhibited.
53+
#[gpu_only]
54+
pub unsafe fn convert_generic_to_specific_address_space<T>(
55+
ptr: *const T,
56+
address_space: AddressSpace,
57+
) -> *const T {
58+
let ret: *const T;
59+
60+
match address_space {
61+
AddressSpace::Global => asm!(
62+
"cvta.to.global.u64 {}, {}",
63+
out(reg64) ret,
64+
in(reg64), ptr
65+
),
66+
AddressSpace::Shared => asm!(
67+
"cvta.to.shared.u64 {}, {}",
68+
out(reg64) ret,
69+
in(reg64), ptr
70+
),
71+
AddressSpace::Constant => asm!(
72+
"cvta.to.const.u64 {}, {}",
73+
out(reg64) ret,
74+
in(reg64), ptr
75+
),
76+
AddressSpace::Local => asm!(
77+
"cvta.to.local.u64 {}, {}",
78+
out(reg64) ret,
79+
in(reg64), ptr
80+
),
81+
}
82+
83+
ret
84+
}
85+
86+
/// Converts a pointer in a specific address space, to a generic address space.
87+
/// This maps directly to the [`cvta`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta) PTX instruction.
88+
///
89+
/// # Safety
90+
///
91+
/// The pointer must be valid for an instance of `T`, and the pointer must fall in the specific address space in memory,
92+
/// otherwise Undefined Behavior is exhibited.
93+
#[gpu_only]
94+
pub unsafe fn convert_specific_address_space_to_generic<T>(
95+
ptr: *const T,
96+
address_space: AddressSpace,
97+
) -> *const T {
98+
let ret: *const T;
99+
100+
match address_space {
101+
AddressSpace::Global => asm!(
102+
"cvta.global.u64 {}, {}",
103+
out(reg64) ret,
104+
in(reg64), ptr
105+
),
106+
AddressSpace::Shared => asm!(
107+
"cvta.shared.u64 {}, {}",
108+
out(reg64) ret,
109+
in(reg64), ptr
110+
),
111+
AddressSpace::Constant => asm!(
112+
"cvta.const.u64 {}, {}",
113+
out(reg64) ret,
114+
in(reg64), ptr
115+
),
116+
AddressSpace::Local => asm!(
117+
"cvta.local.u64 {}, {}",
118+
out(reg64) ret,
119+
in(reg64), ptr
120+
),
121+
}
122+
123+
ret
124+
}

guide/src/features.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ on things used by the wide majority of users.
6565
| Feature Name | Support Level | Notes |
6666
| ------------ | ------------- | ----- |
6767
| Function Execution Space Specifiers ||
68-
| Variable Memory Space Specifiers | | Handled Implicitly |
68+
| Variable Memory Space Specifiers | ✔️ | Handled implicitly, but can be stated explicitly for statics with `#[address_space(...)]` |
6969
| Built-in Vector Types || Use linear algebra libraries like vek or glam |
7070
| Built-in Variables | ✔️ |
7171
| Memory Fence Instructions | ✔️ |
@@ -78,8 +78,8 @@ on things used by the wide majority of users.
7878
| Store Functions Using Cache Hints ||
7979
| Time Function | ✔️ |
8080
| Atomic Functions ||
81-
| Address Space Predicate Functions | | Address Spaces are implicitly handled, but they may be added for exotic interop with CUDA C/C++ |
82-
| Address Space Conversion Functions | |
81+
| Address Space Predicate Functions | ✔️ | Address Spaces are implicitly handled, but they may be added for exotic interop with CUDA C/C++ |
82+
| Address Space Conversion Functions | ✔️ |
8383
| Alloca Function ||
8484
| Compiler Optimization Hint Functions || Existing `core` hints work |
8585
| Warp Vote Functions ||

0 commit comments

Comments
 (0)