Skip to content

Commit 68092b4

Browse files
committed
add rank threshold
1 parent 4f981e5 commit 68092b4

File tree

5 files changed

+41
-18
lines changed

5 files changed

+41
-18
lines changed

cudax/include/cuda/experimental/__copy/copy_shared_memory.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ _CCCL_HOST_API void __launch_copy_shared_mem_kernel(
314314
const auto __config = ::cuda::make_config(
315315
::cuda::block_dims(__thread_block_size),
316316
::cuda::grid_dims(__grid_size),
317-
::cuda::dynamic_shared_memory<__value_type[]>(__tile_total_size, ::cuda::non_portable));
317+
::cuda::dynamic_shared_memory<__value_type[]>(__tile_total_size));
318318
const auto __kernel = cudax::__copy_shared_mem_kernel<
319319
decltype(__config),
320320
_MaxRank,

cudax/include/cuda/experimental/__copy/copy_shared_memory_utils.cuh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,13 @@ __num_contiguous_dimensions(const __raw_tensor<_ExtentT, _StrideT, _Tp, _MaxRank
7777
return ::cuda::devices[__dev_id];
7878
}
7979

80-
inline constexpr ::cuda::std::size_t __max_tile_size = 32; // warp-size
80+
//! Maximum extent of a single tile dimension, set to the warp size so that the innermost tile dimension maps to a full
81+
//! warp of coalesced accesses.
82+
inline constexpr ::cuda::std::size_t __max_tile_size = 32;
83+
84+
//! Maximum tensor rank for which the shared-memory transpose kernel is instantiated. Higher ranks cause excessive
85+
//! register pressure (many rank-sized arrays and fully-unrolled loops).
86+
inline constexpr ::cuda::std::size_t __max_shared_mem_kernel_rank = 8;
8187

8288
//! @brief Decide whether the shared-memory tiled transpose kernel is profitable.
8389
//!
@@ -108,6 +114,7 @@ __use_shared_mem_kernel(const __raw_tensor<_ExtentT, _StrideTIn, _TpIn, _MaxRank
108114
{
109115
return false;
110116
}
117+
111118
// * source has at least one dimension with extent not equal to 1 -> otherwise, shared memory makes no sense
112119
const auto __ext_begin = __src.__extents.cbegin();
113120
const bool __has_non_one_extent = ::cuda::std::any_of(__ext_begin, __ext_begin + __src.__rank, [](auto __extent) {
@@ -117,6 +124,7 @@ __use_shared_mem_kernel(const __raw_tensor<_ExtentT, _StrideTIn, _TpIn, _MaxRank
117124
{
118125
return false;
119126
}
127+
120128
// * there are at least two contiguous destination dimensions -> otherwise, direct copy is better
121129
// * the tile is large enough to benefit from coalesced memory accesses
122130
const auto __current_dev = ::cuda::experimental::__current_device();
@@ -136,6 +144,7 @@ __use_shared_mem_kernel(const __raw_tensor<_ExtentT, _StrideTIn, _TpIn, _MaxRank
136144
{
137145
return false;
138146
}
147+
139148
// * there are enough tiles to keep the GPU busy (at least one full wave across all SMs)
140149
size_t __num_tiles = 1;
141150
for (__rank_t __r = 0; __r < __dst.__rank; ++__r)

cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -211,23 +211,24 @@ _CCCL_HOST_API void copy(::cuda::device_mdspan<_TpIn, _ExtentsIn, _LayoutPolicyI
211211
return;
212212
}
213213
}
214-
// (4) transpose case
215-
if (cudax::__use_shared_mem_kernel(__src_normalized, __dst_normalized))
214+
// (4) transpose case (rank capped to avoid excessive register pressure in the kernel)
215+
if constexpr (__max_rank <= cudax::__max_shared_mem_kernel_rank)
216216
{
217-
cudax::__launch_copy_shared_mem_kernel(
218-
__src_normalized, __dst_normalized, __stream, __src.accessor(), __dst.accessor());
217+
if (cudax::__use_shared_mem_kernel(__src_normalized, __dst_normalized))
218+
{
219+
cudax::__launch_copy_shared_mem_kernel(
220+
__src_normalized, __dst_normalized, __stream, __src.accessor(), __dst.accessor());
221+
return;
222+
}
219223
}
220224
// (5) generic case (fallback)
221-
else
222-
{
223-
cudax::__copy_optimized(
224-
__src_normalized,
225-
__dst_normalized,
226-
cudax::__total_size(__src_normalized),
227-
__stream,
228-
__src.accessor(),
229-
__dst.accessor());
230-
}
225+
cudax::__copy_optimized(
226+
__src_normalized,
227+
__dst_normalized,
228+
cudax::__total_size(__src_normalized),
229+
__stream,
230+
__src.accessor(),
231+
__dst.accessor());
231232
}
232233
}
233234
} // namespace cuda::experimental

cudax/test/copy/copy_nvmath.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ TEST_CASE("copy d2d nvmath flatten_common", "[copy][d2d][nvmath][flatten]")
170170
alloc, 0, shape, make_flatten_common_src_strides(), alloc, 0, make_flatten_common_dst_strides());
171171
}
172172

173-
#if 0
174173
// src: (4,2,...,2):(5,2^4,...,2^22), alloc=2^23
175174
// dst: (4,2,...,2):(2^19,2^18,...,1), alloc=2^21
176175
TEST_CASE("copy d2d nvmath flatten_one", "[copy][d2d][nvmath][flatten]")
@@ -186,7 +185,7 @@ TEST_CASE("copy d2d nvmath flatten_one", "[copy][d2d][nvmath][flatten]")
186185
1 << 9, 1 << 8, 1 << 7, 1 << 6, 1 << 5, 1 << 4, 1 << 3, 1 << 2, 1 << 1, 1 << 0};
187186
test_copy_stride_relaxed<data_t>(src_alloc, 0, shape, src_strides, dst_alloc, 0, dst_strides);
188187
}
189-
#endif
188+
190189
/***********************************************************************************************************************
191190
* nvmath vectorize test cases (device-to-device)
192191
**********************************************************************************************************************/

cudax/test/copy/copy_nvmath_transpose.cu

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,17 @@ TEST_CASE("copy d2d nvmath transpose_inbalanced", "[copy][d2d][nvmath][transpose
158158
cuda::std::array<int, 2> dst_strides{1000033, 1};
159159
test_copy_stride_relaxed<data_t>(alloc, 0, shape, src_strides, alloc, 0, dst_strides);
160160
}
161+
162+
// src: (4,4,4,4,4,4,16,8):(1,4,...,65536), column-major
163+
// dst: (4,4,4,4,4,4,16,8):(131072,...,8,1), row-major
164+
// Rank 8 == __max_shared_mem_kernel_rank, verifying the shared-memory kernel is still instantiated at the maximum
165+
// allowed rank. The first 6 dimensions fit in one tile (4^6 = 4096 elements); the last 2 dimensions (16x8 = 128 tiles)
166+
// provide sufficient grid utilization.
167+
TEST_CASE("copy d2d nvmath transpose_max_shared_mem_rank", "[copy][d2d][nvmath][transpose]")
168+
{
169+
constexpr int alloc = 4 * 4 * 4 * 4 * 4 * 4 * 16 * 8; // 524288
170+
cuda::std::array<int, 8> shape{4, 4, 4, 4, 4, 4, 16, 8};
171+
cuda::std::array<int, 8> src_strides{1, 4, 16, 64, 256, 1024, 4096, 65536};
172+
cuda::std::array<int, 8> dst_strides{131072, 32768, 8192, 2048, 512, 128, 8, 1};
173+
test_copy_stride_relaxed<data_t>(alloc, 0, shape, src_strides, alloc, 0, dst_strides);
174+
}

0 commit comments

Comments
 (0)