NVIDIA · davebayer · Jan 5, 2026 · Jan 5, 2026 · bernhardmgruber · Jan 5, 2026
@@ -20,11 +20,6 @@
 #  pragma system_header
 #endif // no system header
 
-// CUDA headers might not be present when using NVRTC, see NVIDIA/cccl#2095 for detail
-#if !_CCCL_COMPILER(NVRTC)
-#  include <cuda_runtime_api.h>
-#endif // !_CCCL_COMPILER(NVRTC)
-
 #ifdef _CCCL_DOXYGEN_INVOKED // Only parse this during doxygen passes:
 //! Defined if RDC is enabled and CUB_DISABLE_CDP is not defined.
 //! Deprecated [Since 3.2]
@@ -40,9 +35,9 @@
 #  define CUB_RUNTIME_FUNCTION
 #else // Non-doxygen pass:
 
-#  if _CCCL_HAS_RDC()
+#  if _CCCL_HAS_CDP()
 #    define CUB_RDC_ENABLED
-#  endif // _CCCL_HAS_RDC()
+#  endif // _CCCL_HAS_CDP()
 
 #  ifndef CUB_RUNTIME_FUNCTION
 #    define CUB_RUNTIME_FUNCTION _CCCL_CDP_API

@@ -94,19 +94,26 @@ _CCCL_END_NAMESPACE_CUDA
 #    define _CCCL_BEFORE_NVTX_RANGE_SCOPE(name)
 #  endif // !CCCL_DETAIL_BEFORE_NVTX_RANGE_SCOPE
 
+#  if _CCCL_HOST_COMPILATION()
 // Conditionally inserts a NVTX range starting here until the end of the current function scope in host code. Does
 // nothing in device code.
 // The optional is needed to defer the construction of an NVTX range (host-only code) and message string registration
 // into a dispatch region running only on the host, while preserving the semantic scope where the range is declared.
-#  define _CCCL_NVTX_RANGE_SCOPE_IF(condition, name)                                                               \
-    _CCCL_BEFORE_NVTX_RANGE_SCOPE(name)                                                                            \
-    ::cuda::std::optional<::nvtx3::v1::scoped_range_in<::cuda::detail::NVTXCCCLDomain>> __cuda_nvtx3_range;        \
-    NV_IF_TARGET(                                                                                                  \
-      NV_IS_HOST,                                                                                                  \
-      static const ::nvtx3::v1::registered_string_in<::cuda::detail::NVTXCCCLDomain> __cuda_nvtx3_func_name{name}; \
-      static const ::nvtx3::v1::event_attributes __cuda_nvtx3_func_attr{__cuda_nvtx3_func_name};                   \
-      if (condition) __cuda_nvtx3_range.emplace(__cuda_nvtx3_func_attr);                                           \
-      (void) __cuda_nvtx3_range;)
+#    define _CCCL_NVTX_RANGE_SCOPE_IF(condition, name)                                                                 \
+      _CCCL_BEFORE_NVTX_RANGE_SCOPE(name)                                                                              \
+      ::cuda::std::optional<::nvtx3::v1::scoped_range_in<::cuda::detail::NVTXCCCLDomain>> __cuda_nvtx3_range;          \
+      NV_IF_TARGET(                                                                                                    \
+        NV_IS_HOST, ({                                                                                                 \
+          static const ::nvtx3::v1::registered_string_in<::cuda::detail::NVTXCCCLDomain> __cuda_nvtx3_func_name{name}; \
+          static const ::nvtx3::v1::event_attributes __cuda_nvtx3_func_attr{__cuda_nvtx3_func_name};                   \
+          if (condition)                                                                                               \
+          {                                                                                                            \
+            __cuda_nvtx3_range.emplace(__cuda_nvtx3_func_attr);                                                        \
+          }                                                                                                            \
+        }))
+#  else // ^^^ _CCCL_HOST_COMPILATION() ^^^ / vvv !_CCCL_HOST_COMPILATION() vvv
+#    define _CCCL_NVTX_RANGE_SCOPE_IF(condition, name)
+#  endif // ^^^ !_CCCL_HOST_COMPILATION() ^^^
 
 #  define _CCCL_NVTX_RANGE_SCOPE(name) _CCCL_NVTX_RANGE_SCOPE_IF(true, name)
 

@@ -48,4 +48,45 @@
 #  define _CCCL_PDL_TRIGGER_NEXT_LAUNCH()
 #endif // _CCCL_HAS_PDL()
 
+// Check whether relocatable device code (RDC) is being generated.
+#if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__) || defined(_NVHPC_RDC)
+#  define _CCCL_HAS_RDC() 1
+#else // ^^^ has RDC ^^^ / vvv no RDC vvv
+#  define _CCCL_HAS_RDC() 0
+#endif // ^^^ no RDC ^^^
+
+// Check whether the code is being compiled in extensible whole program (EWP) mode.
+#if defined(__CUDACC_EWP__)
+#  define _CCCL_HAS_EWP() 1
+#else // ^^^ has EWP ^^^ / vvv no EWP vvv
+#  define _CCCL_HAS_EWP() 0
+#endif // ^^^ no EWP ^^^
+
+// Controls whether device runtime APIs may be used; they require libcudadevrt to be linked. Enabled by default when
+// RDC or EWP is enabled. Can be disabled by defining CCCL_DISABLE_DEVICE_RUNTIME.
+#if (_CCCL_HAS_RDC() || _CCCL_HAS_EWP()) && !defined(CCCL_DISABLE_DEVICE_RUNTIME)
+#  define _CCCL_HAS_DEVICE_RUNTIME() 1
+#else // ^^^ has device runtime ^^^ / vvv no device runtime vvv
+#  define _CCCL_HAS_DEVICE_RUNTIME() 0
+#endif // ^^^ no device runtime ^^^
+
+// Some functions can be called from host or device code and launch kernels inside. Thus, they use CUDA Dynamic
+// Parallelism (CDP) and require compiling with Relocatable Device Code (RDC) or extensible whole program (EWP) and
+// linking with the device runtime library. CDP is unsupported with clang-cuda below 22.
+// TODO(bgruber): remove CUB_DISABLE_CDP in CCCL 4.0
+#if _CCCL_HAS_DEVICE_RUNTIME() && !defined(CCCL_DISABLE_CDP) && !defined(CUB_DISABLE_CDP) \
+  && !_CCCL_CUDA_COMPILER(CLANG, <, 22)
+// We have CDP, so host and device APIs can call kernels
+#  define _CCCL_HAS_CDP() 1
+#else // ^^^ has CDP ^^^ / vvv no CDP vvv
+// We don't have CDP, only host APIs can call kernels
+#  define _CCCL_HAS_CDP() 0
+#endif // ^^^ no CDP ^^^
+
+#if _CCCL_HAS_CDP()
+#  ifdef CUDA_FORCE_CDP1_IF_SUPPORTED
+#    error "CUDA Dynamic Parallelism 1 is no longer supported. Please undefine CUDA_FORCE_CDP1_IF_SUPPORTED."
+#  endif // CUDA_FORCE_CDP1_IF_SUPPORTED
+#endif // _CCCL_HAS_CDP()
+
 #endif // __CCCL_CUDA_CAPABILITIES
@@ -28,6 +28,7 @@
 #endif // no system header
 
 #include <cuda/std/__cccl/attributes.h>
+#include <cuda/std/__cccl/cuda_capabilities.h>
 #include <cuda/std/__cccl/execution_space.h>
 
 // For unknown reasons, nvc++ need to selectively disable this warning
@@ -143,24 +144,13 @@
 #  define CCCL_DISABLE_CDP
 #endif // _CCCL_DOXYGEN_INVOKED
 
-// Some functions can be called from host or device code and launch kernels inside. Thus, they use CUDA Dynamic
-// Parallelism (CDP) and require compiling with Relocatable Device Code (RDC).
-// TODO(bgruber): remove CUB_DISABLE_CDP in CCCL 4.0
-#if defined(__CUDACC_RDC__) && !defined(CCCL_DISABLE_CDP) && !defined(CUB_DISABLE_CDP)
-#  define _CCCL_HAS_RDC() 1
-// We have RDC, so host and device APIs can call kernels
+#if _CCCL_HAS_CDP()
+// We have CDP, so host and device APIs can call kernels
 #  define _CCCL_CDP_API _CCCL_API
-#else // defined(__CUDACC_RDC__) && !defined(CCCL_DISABLE_CDP) && !defined(CUB_DISABLE_CDP)
-#  define _CCCL_HAS_RDC() 0
-// We don't have RDC, only host APIs can call kernels
-#  define _CCCL_CDP_API   _CCCL_HOST_API
-#endif // defined(__CUDACC_RDC__) && !defined(CCCL_DISABLE_CDP) && !defined(CUB_DISABLE_CDP)
-
-#if _CCCL_HAS_RDC()
-#  ifdef CUDA_FORCE_CDP1_IF_SUPPORTED
-#    error "CUDA Dynamic Parallelism 1 is no longer supported. Please undefine CUDA_FORCE_CDP1_IF_SUPPORTED."
-#  endif // CUDA_FORCE_CDP1_IF_SUPPORTED
-#endif // _CCCL_HAS_RDC()
+#else // ^^^ _CCCL_HAS_CDP() ^^^ / vvv !_CCCL_HAS_CDP() vvv
+// We don't have CDP, only host APIs can call kernels
+#  define _CCCL_CDP_API _CCCL_HOST_API
+#endif // ^^^ !_CCCL_HAS_CDP() ^^^
 
 //! _LIBCUDACXX_HIDE_FROM_ABI is for backwards compatibility for external projects.
 //! _CCCL_API and its variants are the preferred way to declare functions