Skip to content

Commit 68092b4

Browse files
committed
add rank threshold
1 parent 4f981e5 commit 68092b4

File tree

5 files changed

+41
-18
lines changed

5 files changed

+41
-18
lines changed

cudax/include/cuda/experimental/__copy/copy_shared_memory.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ _CCCL_HOST_API void __launch_copy_shared_mem_kernel(
314314
const auto __config = ::cuda::make_config(
315315
::cuda::block_dims(__thread_block_size),
316316
::cuda::grid_dims(__grid_size),
317-
::cuda::dynamic_shared_memory<__value_type[]>(__tile_total_size, ::cuda::non_portable));
317+
::cuda::dynamic_shared_memory<__value_type[]>(__tile_total_size));
318318
const auto __kernel = cudax::__copy_shared_mem_kernel<
319319
decltype(__config),
320320
_MaxRank,

cudax/include/cuda/experimental/__copy/copy_shared_memory_utils.cuh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,13 @@ __num_contiguous_dimensions(const __raw_tensor<_ExtentT, _StrideT, _Tp, _MaxRank
7777
return ::cuda::devices[__dev_id];
7878
}
7979

80-
inline constexpr ::cuda::std::size_t __max_tile_size = 32; // warp-size
80+
//! Maximum extent of a single tile dimension, set to the warp size so that the innermost tile dimension maps to a full
81+
//! warp of coalesced accesses.
82+
inline constexpr ::cuda::std::size_t __max_tile_size = 32;
83+
84+
//! Maximum tensor rank for which the shared-memory transpose kernel is instantiated. Higher ranks cause excessive
85+
//! register pressure (many rank-sized arrays and fully-unrolled loops).
86+
inline constexpr ::cuda::std::size_t __max_shared_mem_kernel_rank = 8;
8187

8288
//! @brief Decide whether the shared-memory tiled transpose kernel is profitable.
8389
//!
@@ -108,6 +114,7 @@ __use_shared_mem_kernel(const __raw_tensor<_ExtentT, _StrideTIn, _TpIn, _MaxRank
108114
{
109115
return false;
110116
}
117+
111118
// * source has at least one dimension with extent not equal to 1 -> otherwise, shared memory makes no sense
112119
const auto __ext_begin = __src.__extents.cbegin();
113120
const bool __has_non_one_extent = ::cuda::std::any_of(__ext_begin, __ext_begin + __src.__rank, [](auto __extent) {
@@ -117,6 +124,7 @@ __use_shared_mem_kernel(const __raw_tensor<_ExtentT, _StrideTIn, _TpIn, _MaxRank
117124
{
118125
return false;
119126
}
127+
120128
// * there are at least two contiguous destination dimensions -> otherwise, direct copy is better
121129
// * the tile is large enough to benefit from coalesced memory accesses
122130
const auto __current_dev = ::cuda::experimental::__current_device();
@@ -136,6 +144,7 @@ __use_shared_mem_kernel(const __raw_tensor<_ExtentT, _StrideTIn, _TpIn, _MaxRank
136144
{
137145
return false;
138146
}
147+
139148
// * there are enough tiles to keep the GPU busy (at least one full wave across all SMs)
140149
size_t __num_tiles = 1;
141150
for (__rank_t __r = 0; __r < __dst.__rank; ++__r)

cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -211,23 +211,24 @@ _CCCL_HOST_API void copy(::cuda::device_mdspan<_TpIn, _ExtentsIn, _LayoutPolicyI
211211
return;
212212
}
213213
}
214-
// (4) transpose case
215-
if (cudax::__use_shared_mem_kernel(__src_normalized, __dst_normalized))
214+
// (4) transpose case (rank capped to avoid excessive register pressure in the kernel)
215+
if constexpr (__max_rank <= cudax::__max_shared_mem_kernel_rank)
216216
{
217-
cudax::__launch_copy_shared_mem_kernel(
218-
__src_normalized, __dst_normalized, __stream, __src.accessor(), __dst.accessor());
217+
if (cudax::__use_shared_mem_kernel(__src_normalized, __dst_normalized))
218+
{
219+
cudax::__launch_copy_shared_mem_kernel(
220+
__src_normalized, __dst_normalized, __stream, __src.accessor(), __dst.accessor());
221+
return;
222+
}
219223
}
220224
// (5) generic case (fallback)
221-
else
222-
{
223-
cudax::__copy_optimized(
224-
__src_normalized,
225-
__dst_normalized,
226-
cudax::__total_size(__src_normalized),
227-
__stream,
228-
__src.accessor(),
229-
__dst.accessor());
230-
}
225+
cudax::__copy_optimized(
226+
__src_normalized,
227+
__dst_normalized,
228+
cudax::__total_size(__src_normalized),
229+
__stream,
230+
__src.accessor(),
231+
__dst.accessor());
231232
}
232233
}
233234
} // namespace cuda::experimental

cudax/test/copy/copy_nvmath.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ TEST_CASE("copy d2d nvmath flatten_common", "[copy][d2d][nvmath][flatten]")
170170
alloc, 0, shape, make_flatten_common_src_strides(), alloc, 0, make_flatten_common_dst_strides());
171171
}
172172

173-
#if 0
174173
// src: (4,2,...,2):(5,2^4,...,2^22), alloc=2^23
175174
// dst: (4,2,...,2):(2^19,2^18,...,1), alloc=2^21
176175
TEST_CASE("copy d2d nvmath flatten_one", "[copy][d2d][nvmath][flatten]")
@@ -186,7 +185,7 @@ TEST_CASE("copy d2d nvmath flatten_one", "[copy][d2d][nvmath][flatten]")
186185
1 << 9, 1 << 8, 1 << 7, 1 << 6, 1 << 5, 1 << 4, 1 << 3, 1 << 2, 1 << 1, 1 << 0};
187186
test_copy_stride_relaxed<data_t>(src_alloc, 0, shape, src_strides, dst_alloc, 0, dst_strides);
188187
}
189-
#endif
188+
190189
/***********************************************************************************************************************
191190
* nvmath vectorize test cases (device-to-device)
192191
**********************************************************************************************************************/

cudax/test/copy/copy_nvmath_transpose.cu

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,17 @@ TEST_CASE("copy d2d nvmath transpose_inbalanced", "[copy][d2d][nvmath][transpose
158158
cuda::std::array<int, 2> dst_strides{1000033, 1};
159159
test_copy_stride_relaxed<data_t>(alloc, 0, shape, src_strides, alloc, 0, dst_strides);
160160
}
161+
162+
// src: (4,4,4,4,4,4,16,8):(1,4,...,65536), column-major
163+
// dst: (4,4,4,4,4,4,16,8):(131072,...,8,1), row-major
164+
// Rank 8 == __max_shared_mem_kernel_rank, verifying the shared-memory kernel is still instantiated at the maximum
165+
// allowed rank. The first 6 dimensions fit in one tile (4^6 = 4096 elements); the last 2 dimensions (16x8 = 128 tiles)
166+
// provide sufficient grid utilization.
167+
TEST_CASE("copy d2d nvmath transpose_max_shared_mem_rank", "[copy][d2d][nvmath][transpose]")
168+
{
169+
constexpr int alloc = 4 * 4 * 4 * 4 * 4 * 4 * 16 * 8; // 524288
170+
cuda::std::array<int, 8> shape{4, 4, 4, 4, 4, 4, 16, 8};
171+
cuda::std::array<int, 8> src_strides{1, 4, 16, 64, 256, 1024, 4096, 65536};
172+
cuda::std::array<int, 8> dst_strides{131072, 32768, 8192, 2048, 512, 128, 8, 1};
173+
test_copy_stride_relaxed<data_t>(alloc, 0, shape, src_strides, alloc, 0, dst_strides);
174+
}

0 commit comments

Comments
 (0)