Feat: hint about scalar range in thread intrinsics

RDambrosio016 · RDambrosio016 · commit 4728fe0bd420 · 2022-01-03T15:19:05.000-05:00
diff --git a/crates/cuda_std/CHANGELOG.md b/crates/cuda_std/CHANGELOG.md
@@ -4,6 +4,8 @@ Notable changes to this project will be documented in this file.
 
 ## Unreleased
 
+- Thread/Block/Grid index/dim intrinsics now hint to LLVM that their values lie within the bounds declared by CUDA, hopefully allowing for more optimizations.
+
 ## 0.2.1 - 12/8/21
 
 - Fixed `shared_array!` not using fully qualified MaybeUninit.
diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
@@ -49,76 +49,121 @@ extern "C" {
     fn __nvvm_system_fence();
 }
 
+/// Calls a zero-argument intrinsic and hints the range of its result to LLVM.
+///
+/// Upper bounds are *exclusive*, following the half-open `[lo, hi)` convention of LLVM
+/// `!range` metadata: `inbounds!(f, hi)` asserts `f() < hi`, and `inbounds!(f, lo, hi)`
+/// asserts `lo <= f() < hi`. A value outside the range is undefined behavior.
+#[cfg(target_os = "cuda")]
+macro_rules! inbounds {
+    // The bounds come mostly from the CUDA C++ programming guide and were cross-checked
+    // against the scalar range metadata in the LLVM IR emitted by CUDA clang.
+    ($func_name:ident, $bound:expr) => {{
+        let val = unsafe { $func_name() };
+        if val >= $bound {
+            // SAFETY: this condition is declared unreachable by compute capability max bound
+            // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
+            // we do this to potentially allow for better optimizations by LLVM
+            unsafe { core::hint::unreachable_unchecked() }
+        } else {
+            val
+        }
+    }};
+    ($func_name:ident, $lower_bound:expr, $upper_bound:expr) => {{
+        let val = unsafe { $func_name() };
+        if val < $lower_bound || val >= $upper_bound {
+            // SAFETY: this condition is declared unreachable by compute capability max bound
+            // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
+            // we do this to potentially allow for better optimizations by LLVM
+            unsafe { core::hint::unreachable_unchecked() }
+        } else {
+            val
+        }
+    }};
+}
+
+/// Reads the `x` thread index via `__nvvm_thread_idx_x`; LLVM is told it lies in `0..1024`.
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_x() -> u32 {
-    unsafe { __nvvm_thread_idx_x() }
+    inbounds!(__nvvm_thread_idx_x, 1024)
 }
 
+/// Reads the `y` thread index via `__nvvm_thread_idx_y`; LLVM is told it lies in `0..1024`.
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_y() -> u32 {
-    unsafe { __nvvm_thread_idx_y() }
+    inbounds!(__nvvm_thread_idx_y, 1024)
 }
 
+/// Reads the `z` thread index via `__nvvm_thread_idx_z`; LLVM is told it lies in `0..64`.
 #[gpu_only]
 #[inline(always)]
 pub fn thread_idx_z() -> u32 {
-    unsafe { __nvvm_thread_idx_z() }
+    inbounds!(__nvvm_thread_idx_z, 64)
 }
 
+/// Reads the `x` block index via `__nvvm_block_idx_x`; LLVM is told it lies in `0..2147483647`.
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_x() -> u32 {
-    unsafe { __nvvm_block_idx_x() }
+    inbounds!(__nvvm_block_idx_x, 2147483647)
 }
 
+/// Reads the `y` block index via `__nvvm_block_idx_y`; LLVM is told it lies in `0..65535`.
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_y() -> u32 {
-    unsafe { __nvvm_block_idx_y() }
+    inbounds!(__nvvm_block_idx_y, 65535)
 }
 
+/// Reads the `z` block index via `__nvvm_block_idx_z`; LLVM is told it lies in `0..65535`.
 #[gpu_only]
 #[inline(always)]
 pub fn block_idx_z() -> u32 {
-    unsafe { __nvvm_block_idx_z() }
+    inbounds!(__nvvm_block_idx_z, 65535)
 }
 
+/// Reads the `x` block dimension via `__nvvm_block_dim_x`; LLVM is told it lies in `1..=1024`.
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_x() -> u32 {
-    unsafe { __nvvm_block_dim_x() }
+    inbounds!(__nvvm_block_dim_x, 1, 1025)
 }
 
+/// Reads the `y` block dimension via `__nvvm_block_dim_y`; LLVM is told it lies in `1..=1024`.
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_y() -> u32 {
-    unsafe { __nvvm_block_dim_y() }
+    inbounds!(__nvvm_block_dim_y, 1, 1025)
 }
 
+/// Reads the `z` block dimension via `__nvvm_block_dim_z`; LLVM is told it lies in `1..=64`.
 #[gpu_only]
 #[inline(always)]
 pub fn block_dim_z() -> u32 {
-    unsafe { __nvvm_block_dim_z() }
+    inbounds!(__nvvm_block_dim_z, 1, 65)
 }
 
+/// Reads the `x` grid dimension via `__nvvm_grid_dim_x`; LLVM is told it lies in `1..=2147483647`.
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_x() -> u32 {
-    unsafe { __nvvm_grid_dim_x() }
+    inbounds!(__nvvm_grid_dim_x, 1, 2147483648)
 }
 
+/// Reads the `y` grid dimension via `__nvvm_grid_dim_y`; LLVM is told it lies in `1..=65535`.
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_y() -> u32 {
-    unsafe { __nvvm_grid_dim_y() }
+    inbounds!(__nvvm_grid_dim_y, 1, 65536)
 }
 
+/// Reads the `z` grid dimension via `__nvvm_grid_dim_z`; LLVM is told it lies in `1..=65535`.
 #[gpu_only]
 #[inline(always)]
 pub fn grid_dim_z() -> u32 {
-    unsafe { __nvvm_grid_dim_z() }
+    inbounds!(__nvvm_grid_dim_z, 1, 65536)
 }
 
 /// Gets the 3d index of the thread currently executing the kernel.