Skip to content

Commit 8911e9e

Browse files
pciolkoszdavebayer
andauthored
[libcu++] Make kernel_config member private and allow it in hierarchy queries (#7034)
* Make kernel_config member private and allow it in hierarchy queries * Update libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h * Update libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h * Merge fix * Fix cudax * format fix --------- Co-authored-by: David Bayer <[email protected]>
1 parent 8986447 commit 8911e9e

File tree

16 files changed

+234
-129
lines changed

16 files changed

+234
-129
lines changed

cudax/examples/simple_p2p.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ struct simple_kernel
5252
__device__ void operator()(Configuration config, ::cuda::std::span<const float> src, ::cuda::std::span<float> dst)
5353
{
5454
// Just a dummy kernel, doing enough for us to verify that everything worked
55-
const auto idx = config.dims.rank(cuda::gpu_thread);
55+
const auto idx = config.hierarchy().rank(cuda::gpu_thread);
5656
dst[idx] = src[idx] * 2.0f;
5757
}
5858
};

cudax/examples/vector_add.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,9 @@ try
9595
auto config = cuda::distribute<threadsPerBlock>(numElements);
9696

9797
// Launch the vectorAdd kernel
98-
printf(
99-
"CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cuda::block, cuda::grid), threadsPerBlock);
98+
printf("CUDA kernel launch with %d blocks of %d threads\n",
99+
config.hierarchy().count(cuda::block, cuda::grid),
100+
threadsPerBlock);
100101
cudax::launch(stream, config, vectorAdd, in(A), in(B), out(C));
101102

102103
printf("waiting for the stream to finish\n");

cudax/include/cuda/experimental/__execution/stream/adaptor.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ _CCCL_API constexpr auto __with_cuda_error(_Completions __completions) noexcept
112112
}
113113

114114
template <class _Config>
115-
using __dims_of_t = decltype(_Config::dims);
115+
using __dims_of_t = typename _Config::hierarchy_type;
116116

117117
// This kernel forwards the results from the child sender to the receiver of the parent
118118
// sender. The receiver is where most algorithms do their work, so we want the receiver to
@@ -269,7 +269,7 @@ private:
269269
// the completion kernel, we will be completing the parent's receiver, so we must let
270270
// the receiver tell us how to launch the kernel.
271271
auto const __launch_config = get_launch_config(execution::get_env(__state.__state_.__rcvr_));
272-
using __launch_dims_t = decltype(__launch_config.dims);
272+
using __launch_dims_t = typename decltype(__launch_config)::hierarchy_type;
273273
constexpr int __block_threads = __launch_dims_t::static_count(gpu_thread, block);
274274

275275
// Start the child operation state. This will launch kernels for all the predecessors

cudax/include/cuda/experimental/__execution/stream/scheduler.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
133133
// Read the launch configuration passed to us by the parent operation. When we launch
134134
// the completion kernel, we will be completing the parent's receiver, so we must let
135135
// the receiver tell us how to launch the kernel.
136-
auto const __launch_dims = get_launch_config(execution::get_env(__rcvr_)).dims;
136+
auto const __launch_dims = get_launch_config(execution::get_env(__rcvr_)).hierarchy();
137137
constexpr int __block_threads = decltype(__launch_dims)::static_count(cuda::gpu_thread, cuda::block);
138138
int const __grid_blocks = __launch_dims.count(cuda::block, cuda::grid);
139139
static_assert(__block_threads != ::cuda::std::dynamic_extent);
@@ -152,7 +152,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
152152
// TODO: untested
153153
_CCCL_DEVICE_API void __device_start() noexcept
154154
{
155-
using __launch_dims_t = decltype(get_launch_config(execution::get_env(__rcvr_)).dims);
155+
using __launch_dims_t = typename decltype(get_launch_config(execution::get_env(__rcvr_)))::hierarchy_type;
156156
constexpr int __block_threads = __launch_dims_t::static_count(cuda::gpu_thread, cuda::block);
157157

158158
// without the following, the kernel in __host_start will fail to launch with

cudax/include/cuda/experimental/__launch/launch.cuh

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,10 @@ namespace cuda::experimental
106106
template <typename... _ExpTypes, typename _Dst, typename _Config>
107107
_CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __kernel, _ExpTypes... __args)
108108
{
109-
static_assert(!::cuda::std::is_same_v<decltype(__conf.dims), no_init_t>,
109+
static_assert(!::cuda::std::is_same_v<decltype(__conf.hierarchy()), no_init_t>,
110110
"Can't launch a configuration without hierarchy dimensions");
111111
::CUlaunchConfig __config{};
112-
constexpr bool __has_cluster_level = has_level_v<cluster_level, decltype(__conf.dims)>;
112+
constexpr bool __has_cluster_level = has_level_v<cluster_level, decltype(__conf.hierarchy())>;
113113
constexpr unsigned int __num_attrs_needed =
114114
::cuda::__detail::kernel_config_count_attr_space(__conf) + __has_cluster_level;
115115
::CUlaunchAttribute __attrs[__num_attrs_needed == 0 ? 1 : __num_attrs_needed];
@@ -122,20 +122,20 @@ _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __k
122122
::cuda::__throw_cuda_error(__status, "Failed to prepare a launch configuration");
123123
}
124124

125-
__config.gridDimX = static_cast<unsigned>(__conf.dims.extents(block, grid).x);
126-
__config.gridDimY = static_cast<unsigned>(__conf.dims.extents(block, grid).y);
127-
__config.gridDimZ = static_cast<unsigned>(__conf.dims.extents(block, grid).z);
128-
__config.blockDimX = static_cast<unsigned>(__conf.dims.extents(gpu_thread, block).x);
129-
__config.blockDimY = static_cast<unsigned>(__conf.dims.extents(gpu_thread, block).y);
130-
__config.blockDimZ = static_cast<unsigned>(__conf.dims.extents(gpu_thread, block).z);
125+
__config.gridDimX = static_cast<unsigned>(__conf.hierarchy().extents(block, grid).x);
126+
__config.gridDimY = static_cast<unsigned>(__conf.hierarchy().extents(block, grid).y);
127+
__config.gridDimZ = static_cast<unsigned>(__conf.hierarchy().extents(block, grid).z);
128+
__config.blockDimX = static_cast<unsigned>(__conf.hierarchy().extents(gpu_thread, block).x);
129+
__config.blockDimY = static_cast<unsigned>(__conf.hierarchy().extents(gpu_thread, block).y);
130+
__config.blockDimZ = static_cast<unsigned>(__conf.hierarchy().extents(gpu_thread, block).z);
131131

132132
if constexpr (__has_cluster_level)
133133
{
134134
::CUlaunchAttribute __cluster_dims_attr{};
135135
__cluster_dims_attr.id = ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
136-
__cluster_dims_attr.value.clusterDim.x = static_cast<unsigned>(__conf.dims.extents(block, cluster).x);
137-
__cluster_dims_attr.value.clusterDim.y = static_cast<unsigned>(__conf.dims.extents(block, cluster).y);
138-
__cluster_dims_attr.value.clusterDim.z = static_cast<unsigned>(__conf.dims.extents(block, cluster).z);
136+
__cluster_dims_attr.value.clusterDim.x = static_cast<unsigned>(__conf.hierarchy().extents(block, cluster).x);
137+
__cluster_dims_attr.value.clusterDim.y = static_cast<unsigned>(__conf.hierarchy().extents(block, cluster).y);
138+
__cluster_dims_attr.value.clusterDim.z = static_cast<unsigned>(__conf.hierarchy().extents(block, cluster).z);
139139
__config.attrs[__config.numAttrs++] = __cluster_dims_attr;
140140
}
141141

@@ -162,7 +162,7 @@ _CCCL_CONCEPT work_submitter =
162162
//! template <typename Configuration>
163163
//! __device__ void operator()(Configuration conf, unsigned int
164164
//! thread_to_print) {
165-
//! if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) {
165+
//! if (conf.hierarchy().rank(cudax::thread, cudax::grid) == thread_to_print) {
166166
//! printf("Hello from the GPU\n");
167167
//! }
168168
//! }
@@ -294,7 +294,7 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
294294
//!
295295
//! template <typename Configuration>
296296
//! __global__ void kernel(Configuration conf, unsigned int thread_to_print) {
297-
//! if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) {
297+
//! if (conf.hierarchy().rank(cudax::thread, cudax::grid) == thread_to_print) {
298298
//! printf("Hello from the GPU\n");
299299
//! }
300300
//! }
@@ -351,7 +351,7 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
351351
//!
352352
//! template <typename Configuration>
353353
//! __global__ void kernel(Configuration conf, unsigned int thread_to_print) {
354-
//! if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) {
354+
//! if (conf.hierarchy().rank(cudax::thread, cudax::grid) == thread_to_print) {
355355
//! printf("Hello from the GPU\n");
356356
//! }
357357
//! }
@@ -404,7 +404,7 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
404404
//!
405405
//! template <typename Configuration>
406406
//! __global__ void kernel(Configuration conf, unsigned int thread_to_print) {
407-
//! if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) {
407+
//! if (conf.hierarchy().rank(cudax::thread, cudax::grid) == thread_to_print) {
408408
//! printf("Hello from the GPU\n");
409409
//! }
410410
//! }

cudax/test/launch/launch_smoke.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ struct functor_taking_config
5555
template <typename Config>
5656
__device__ void operator()(Config config, int grid_size)
5757
{
58-
static_assert(config.dims.static_count(cuda::gpu_thread, cuda::block) == BlockSize);
59-
CUDAX_REQUIRE(config.dims.count(cuda::block, cuda::grid) == grid_size);
58+
static_assert(config.hierarchy().static_count(cuda::gpu_thread, cuda::block) == BlockSize);
59+
CUDAX_REQUIRE(config.hierarchy().count(cuda::block, cuda::grid) == grid_size);
6060
kernel_run_proof = true;
6161
}
6262
};
@@ -248,7 +248,7 @@ void launch_smoke_test(StreamOrPathBuilder& dst)
248248
// Lambda
249249
{
250250
cudax::launch(dst, cuda::block_dims<256>() & cuda::grid_dims(1), [] __device__(auto config) {
251-
if (config.dims.rank(cuda::gpu_thread, cuda::block) == 0)
251+
if (config.hierarchy().rank(cuda::gpu_thread, cuda::block) == 0)
252252
{
253253
printf("Hello from the GPU\n");
254254
kernel_run_proof = true;
@@ -354,8 +354,8 @@ void test_default_config()
354354
auto block = cuda::block_dims<256>;
355355

356356
auto verify_lambda = [] __device__(auto config) {
357-
static_assert(config.dims.count(cuda::gpu_thread, cuda::block) == 256);
358-
CUDAX_REQUIRE(config.dims.count(cuda::block) == 4);
357+
static_assert(config.hierarchy().count(cuda::gpu_thread, cuda::block) == 256);
358+
CUDAX_REQUIRE(config.hierarchy().count(cuda::block) == 4);
359359
cooperative_groups::this_grid().sync();
360360
};
361361

examples/cudax/vector_add/vector_add.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,9 @@ try
9595
auto config = cuda::distribute<threadsPerBlock>(numElements);
9696

9797
// Launch the vectorAdd kernel
98-
printf(
99-
"CUDA kernel launch with %d blocks of %d threads\n", config.dims.count(cuda::block, cuda::grid), threadsPerBlock);
98+
printf("CUDA kernel launch with %d blocks of %d threads\n",
99+
config.hierarchy().count(cuda::block, cuda::grid),
100+
threadsPerBlock);
100101
cudax::launch(stream, config, vectorAdd, in(A), in(B), out(C));
101102

102103
printf("waiting for the stream to finish\n");

libcudacxx/include/cuda/__hierarchy/block_level.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -124,14 +124,16 @@ struct block_level : __native_hierarchy_level_base<block_level>
124124
// interactions with grid level in hierarchy
125125

126126
_CCCL_TEMPLATE(class _Tp, class _Hierarchy)
127-
_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_v<_Hierarchy>)
127+
_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_or_has_hierarchy_member_v<_Hierarchy>)
128128
[[nodiscard]] _CCCL_DEVICE_API static _Tp rank_as(const grid_level& __level, const _Hierarchy& __hier) noexcept
129129
{
130-
static_assert(has_unit_or_level_v<block_level, _Hierarchy>, "_Hierarchy doesn't contain block level");
131-
static_assert(has_level_v<grid_level, _Hierarchy>, "_Hierarchy doesn't contain grid level");
130+
auto& __hier_unpacked = ::cuda::__unpack_hierarchy_if_needed(__hier);
131+
using _HierarchyUnpacked = ::cuda::std::remove_cvref_t<decltype(__hier_unpacked)>;
132+
static_assert(has_unit_or_level_v<block_level, _HierarchyUnpacked>, "_Hierarchy doesn't contain block level");
133+
static_assert(has_level_v<grid_level, _HierarchyUnpacked>, "_Hierarchy doesn't contain grid level");
132134

133-
const auto __dims = dims_as<_Tp>(__level, __hier);
134-
const auto __idx = index_as<_Tp>(__level, __hier);
135+
const auto __dims = dims_as<_Tp>(__level, __hier_unpacked);
136+
const auto __idx = index_as<_Tp>(__level, __hier_unpacked);
135137
return static_cast<_Tp>((__idx.z * __dims.y + __idx.y) * __dims.x + __idx.x);
136138
}
137139
# endif // _CCCL_CUDA_COMPILATION()

0 commit comments

Comments
 (0)