NVIDIA
diff --git a/cudax/examples/simple_p2p.cu b/cudax/examples/simple_p2p.cu
Lines changed: 1 addition & 1 deletion
diff --git a/cudax/examples/vector_add.cu b/cudax/examples/vector_add.cu
Lines changed: 3 additions & 2 deletions
diff --git a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh
Lines changed: 2 additions & 2 deletions
diff --git a/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh b/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh
Lines changed: 2 additions & 2 deletions
diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh
Lines changed: 15 additions & 15 deletions
diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu
Lines changed: 5 additions & 5 deletions
diff --git a/examples/cudax/vector_add/vector_add.cu b/examples/cudax/vector_add/vector_add.cu
Lines changed: 3 additions & 2 deletions
diff --git a/libcudacxx/include/cuda/__hierarchy/block_level.h b/libcudacxx/include/cuda/__hierarchy/block_level.h
Lines changed: 7 additions & 5 deletions
@@ -52,7 +52,7 @@ struct simple_kernel
   __device__ void operator()(Configuration config, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
   {
     // Just a dummy kernel, doing enough for us to verify that everything worked
-    const auto idx = config.dims.rank(cuda::gpu_thread);
+    const auto idx = config.hierarchy().rank(cuda::gpu_thread);
     dst[idx]       = src[idx] * 2.0f;
   }
 };
 
@@ -95,8 +95,9 @@ try
   auto config                   = cuda::distribute<threadsPerBlock>(numElements);
 
   // Launch the vectorAdd kernel
-  printf(
-    "CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cuda::block, cuda::grid), threadsPerBlock);
+  printf("CUDA kernel launch with %d blocks of %d threads\n",
+         config.hierarchy().count(cuda::block, cuda::grid),
+         threadsPerBlock);
   cudax::launch(stream, config, vectorAdd, in(A), in(B), out(C));
 
   printf("waiting for the stream to finish\n");
 
@@ -112,7 +112,7 @@ _CCCL_API constexpr auto __with_cuda_error(_Completions __completions) noexcept
 }
 
 template <class _Config>
-using __dims_of_t = decltype(_Config::dims);
+using __dims_of_t = typename _Config::hierarchy_type;
 
 // This kernel forwards the results from the child sender to the receiver of the parent
 // sender. The receiver is where most algorithms do their work, so we want the receiver to
@@ -269,7 +269,7 @@ private:
     // the completion kernel, we will be completing the parent's receiver, so we must let
     // the receiver tell us how to launch the kernel.
     auto const __launch_config    = get_launch_config(execution::get_env(__state.__state_.__rcvr_));
-    using __launch_dims_t         = decltype(__launch_config.dims);
+    using __launch_dims_t         = typename decltype(__launch_config)::hierarchy_type;
     constexpr int __block_threads = __launch_dims_t::static_count(gpu_thread, block);
 
     // Start the child operation state. This will launch kernels for all the predecessors
 
@@ -133,7 +133,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
       // Read the launch configuration passed to us by the parent operation. When we launch
       // the completion kernel, we will be completing the parent's receiver, so we must let
       // the receiver tell us how to launch the kernel.
-      auto const __launch_dims      = get_launch_config(execution::get_env(__rcvr_)).dims;
+      auto const __launch_dims      = get_launch_config(execution::get_env(__rcvr_)).hierarchy();
       constexpr int __block_threads = decltype(__launch_dims)::static_count(cuda::gpu_thread, cuda::block);
       int const __grid_blocks       = __launch_dims.count(cuda::block, cuda::grid);
       static_assert(__block_threads != ::cuda::std::dynamic_extent);
@@ -152,7 +152,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
     // TODO: untested
     _CCCL_DEVICE_API void __device_start() noexcept
     {
-      using __launch_dims_t         = decltype(get_launch_config(execution::get_env(__rcvr_)).dims);
+      using __launch_dims_t         = typename decltype(get_launch_config(execution::get_env(__rcvr_)))::hierarchy_type;
       constexpr int __block_threads = __launch_dims_t::static_count(cuda::gpu_thread, cuda::block);
 
       // without the following, the kernel in __host_start will fail to launch with
 
@@ -106,10 +106,10 @@ namespace cuda::experimental
 template <typename... _ExpTypes, typename _Dst, typename _Config>
 _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __kernel, _ExpTypes... __args)
 {
-  static_assert(!::cuda::std::is_same_v<decltype(__conf.dims), no_init_t>,
+  static_assert(!::cuda::std::is_same_v<decltype(__conf.hierarchy()), no_init_t>,
                 "Can't launch a configuration without hierarchy dimensions");
   ::CUlaunchConfig __config{};
-  constexpr bool __has_cluster_level = has_level_v<cluster_level, decltype(__conf.dims)>;
+  constexpr bool __has_cluster_level = has_level_v<cluster_level, decltype(__conf.hierarchy())>;
   constexpr unsigned int __num_attrs_needed =
     ::cuda::__detail::kernel_config_count_attr_space(__conf) + __has_cluster_level;
   ::CUlaunchAttribute __attrs[__num_attrs_needed == 0 ? 1 : __num_attrs_needed];
@@ -122,20 +122,20 @@ _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __k
     ::cuda::__throw_cuda_error(__status, "Failed to prepare a launch configuration");
   }
 
-  __config.gridDimX  = static_cast<unsigned>(__conf.dims.extents(block, grid).x);
-  __config.gridDimY  = static_cast<unsigned>(__conf.dims.extents(block, grid).y);
-  __config.gridDimZ  = static_cast<unsigned>(__conf.dims.extents(block, grid).z);
-  __config.blockDimX = static_cast<unsigned>(__conf.dims.extents(gpu_thread, block).x);
-  __config.blockDimY = static_cast<unsigned>(__conf.dims.extents(gpu_thread, block).y);
-  __config.blockDimZ = static_cast<unsigned>(__conf.dims.extents(gpu_thread, block).z);
+  __config.gridDimX  = static_cast<unsigned>(__conf.hierarchy().extents(block, grid).x);
+  __config.gridDimY  = static_cast<unsigned>(__conf.hierarchy().extents(block, grid).y);
+  __config.gridDimZ  = static_cast<unsigned>(__conf.hierarchy().extents(block, grid).z);
+  __config.blockDimX = static_cast<unsigned>(__conf.hierarchy().extents(gpu_thread, block).x);
+  __config.blockDimY = static_cast<unsigned>(__conf.hierarchy().extents(gpu_thread, block).y);
+  __config.blockDimZ = static_cast<unsigned>(__conf.hierarchy().extents(gpu_thread, block).z);
 
   if constexpr (__has_cluster_level)
   {
     ::CUlaunchAttribute __cluster_dims_attr{};
     __cluster_dims_attr.id                 = ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
-    __cluster_dims_attr.value.clusterDim.x = static_cast<unsigned>(__conf.dims.extents(block, cluster).x);
-    __cluster_dims_attr.value.clusterDim.y = static_cast<unsigned>(__conf.dims.extents(block, cluster).y);
-    __cluster_dims_attr.value.clusterDim.z = static_cast<unsigned>(__conf.dims.extents(block, cluster).z);
+    __cluster_dims_attr.value.clusterDim.x = static_cast<unsigned>(__conf.hierarchy().extents(block, cluster).x);
+    __cluster_dims_attr.value.clusterDim.y = static_cast<unsigned>(__conf.hierarchy().extents(block, cluster).y);
+    __cluster_dims_attr.value.clusterDim.z = static_cast<unsigned>(__conf.hierarchy().extents(block, cluster).z);
     __config.attrs[__config.numAttrs++]    = __cluster_dims_attr;
   }
 
@@ -162,7 +162,7 @@ _CCCL_CONCEPT work_submitter =
 //!     template <typename Configuration>
 //!     __device__ void operator()(Configuration conf, unsigned int
 //!     thread_to_print) {
-//!         if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) {
+//!         if (conf.hierarchy().rank(cudax::thread, cudax::grid) == thread_to_print) {
 //!             printf("Hello from the GPU\n");
 //!         }
 //!     }
@@ -294,7 +294,7 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
 //!
 //! template <typename Configuration>
 //! __global__ void kernel(Configuration conf, unsigned int thread_to_print) {
-//!     if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) {
+//!     if (conf.hierarchy().rank(cudax::thread, cudax::grid) == thread_to_print) {
 //!         printf("Hello from the GPU\n");
 //!     }
 //! }
@@ -351,7 +351,7 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
 //!
 //! template <typename Configuration>
 //! __global__ void kernel(Configuration conf, unsigned int thread_to_print) {
-//!     if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) {
+//!     if (conf.hierarchy().rank(cudax::thread, cudax::grid) == thread_to_print) {
 //!         printf("Hello from the GPU\n");
 //!     }
 //! }
@@ -404,7 +404,7 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
 //!
 //! template <typename Configuration>
 //! __global__ void kernel(Configuration conf, unsigned int thread_to_print) {
-//!     if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) {
+//!     if (conf.hierarchy().rank(cudax::thread, cudax::grid) == thread_to_print) {
 //!         printf("Hello from the GPU\n");
 //!     }
 //! }
 
@@ -55,8 +55,8 @@ struct functor_taking_config
   template <typename Config>
   __device__ void operator()(Config config, int grid_size)
   {
-    static_assert(config.dims.static_count(cuda::gpu_thread, cuda::block) == BlockSize);
-    CUDAX_REQUIRE(config.dims.count(cuda::block, cuda::grid) == grid_size);
+    static_assert(config.hierarchy().static_count(cuda::gpu_thread, cuda::block) == BlockSize);
+    CUDAX_REQUIRE(config.hierarchy().count(cuda::block, cuda::grid) == grid_size);
     kernel_run_proof = true;
   }
 };
@@ -248,7 +248,7 @@ void launch_smoke_test(StreamOrPathBuilder& dst)
   // Lambda
   {
     cudax::launch(dst, cuda::block_dims<256>() & cuda::grid_dims(1), [] __device__(auto config) {
-      if (config.dims.rank(cuda::gpu_thread, cuda::block) == 0)
+      if (config.hierarchy().rank(cuda::gpu_thread, cuda::block) == 0)
       {
         printf("Hello from the GPU\n");
         kernel_run_proof = true;
@@ -354,8 +354,8 @@ void test_default_config()
   auto block = cuda::block_dims<256>;
 
   auto verify_lambda = [] __device__(auto config) {
-    static_assert(config.dims.count(cuda::gpu_thread, cuda::block) == 256);
-    CUDAX_REQUIRE(config.dims.count(cuda::block) == 4);
+    static_assert(config.hierarchy().count(cuda::gpu_thread, cuda::block) == 256);
+    CUDAX_REQUIRE(config.hierarchy().count(cuda::block) == 4);
     cooperative_groups::this_grid().sync();
   };
 
 
@@ -95,8 +95,9 @@ try
   auto config                   = cuda::distribute<threadsPerBlock>(numElements);
 
   // Launch the vectorAdd kernel
-  printf(
-    "CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cuda::block, cuda::grid), threadsPerBlock);
+  printf("CUDA kernel launch with %d blocks of %d threads\n",
+         config.hierarchy().count(cuda::block, cuda::grid),
+         threadsPerBlock);
   cudax::launch(stream, config, vectorAdd, in(A), in(B), out(C));
 
   printf("waiting for the stream to finish\n");
 
@@ -124,14 +124,16 @@ struct block_level : __native_hierarchy_level_base<block_level>
   // interactions with grid level in hierarchy
 
   _CCCL_TEMPLATE(class _Tp, class _Hierarchy)
-  _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_v<_Hierarchy>)
+  _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_or_has_hierarchy_member_v<_Hierarchy>)
   [[nodiscard]] _CCCL_DEVICE_API static _Tp rank_as(const grid_level& __level, const _Hierarchy& __hier) noexcept
   {
-    static_assert(has_unit_or_level_v<block_level, _Hierarchy>, "_Hierarchy doesn't contain block level");
-    static_assert(has_level_v<grid_level, _Hierarchy>, "_Hierarchy doesn't contain grid level");
+    auto& __hier_unpacked    = ::cuda::__unpack_hierarchy_if_needed(__hier);
+    using _HierarchyUnpacked = ::cuda::std::remove_cvref_t<decltype(__hier_unpacked)>;
+    static_assert(has_unit_or_level_v<block_level, _HierarchyUnpacked>, "_Hierarchy doesn't contain block level");
+    static_assert(has_level_v<grid_level, _HierarchyUnpacked>, "_Hierarchy doesn't contain grid level");
 
-    const auto __dims = dims_as<_Tp>(__level, __hier);
-    const auto __idx  = index_as<_Tp>(__level, __hier);
+    const auto __dims = dims_as<_Tp>(__level, __hier_unpacked);
+    const auto __idx  = index_as<_Tp>(__level, __hier_unpacked);
     return static_cast<_Tp>((__idx.z * __dims.y + __idx.y) * __dims.x + __idx.x);
   }
 #  endif // _CCCL_CUDA_COMPILATION()
Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,7 @@ struct simple_kernel`
`52`	`52`	`__device__ void operator()(Configuration config, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)`
`53`	`53`	`{`
`54`	`54`	`// Just a dummy kernel, doing enough for us to verify that everything worked`
`55`		`- const auto idx = config.dims.rank(cuda::gpu_thread);`
	`55`	`+ const auto idx = config.hierarchy().rank(cuda::gpu_thread);`
`56`	`56`	`dst[idx] = src[idx] * 2.0f;`
`57`	`57`	`}`
`58`	`58`	`};`