Merge branch 'branch/3.2.x' into backport-7034-to-branch/3.2.x

davebayer · web-flow · commit 6af5a464731a · 2026-01-05T20:51:43.000+01:00
diff --git a/libcudacxx/include/cuda/__tma/make_tma_descriptor.h b/libcudacxx/include/cuda/__tma/make_tma_descriptor.h
@@ -29,6 +29,7 @@
 #  include <cuda/std/__algorithm/min.h>
 #  include <cuda/std/__cstddef/types.h>
 #  include <cuda/std/__limits/numeric_limits.h>
+#  include <cuda/std/__utility/unreachable.h>
 #  include <cuda/std/array>
 #  include <cuda/std/cstdint>
 #  include <cuda/std/span>
@@ -94,7 +95,7 @@ enum class tma_swizzle
     case tma_oob_fill::nan:
       return ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA;
     default:
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
   }
 }
 
@@ -112,7 +113,7 @@ __to_cutensor_map(tma_l2_fetch_size __l2_fetch_size) noexcept
     case tma_l2_fetch_size::bytes256:
       return ::CU_TENSOR_MAP_L2_PROMOTION_L2_256B;
     default:
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
   }
 }
 
@@ -128,7 +129,7 @@ __to_cutensor_map(tma_interleave_layout __interleave_layout) noexcept
     case tma_interleave_layout::bytes32:
       return ::CU_TENSOR_MAP_INTERLEAVE_32B;
     default:
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
   }
 }
 
@@ -153,7 +154,7 @@ __to_cutensor_map(tma_interleave_layout __interleave_layout) noexcept
       return ::CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B;
 #  endif // _CCCL_CTK_AT_LEAST(12, 8)
     default:
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
   }
 }
 
@@ -366,11 +367,11 @@ __get_tensor_sizes(const ::DLTensor& __tensor, int __rank, ::CUtensorMapDataType
 {
   using ::cuda::std::int64_t;
   __tma_strides_array_t __output_strides{1}; // inner stride is implicit = 1
-  const auto __input_strides                = __tensor.strides;
-  const auto __input_sizes                  = __tensor.shape;
-  const auto __alignment                    = (__interleave_layout == tma_interleave_layout::bytes32) ? 32 : 16;
-  constexpr auto __max_allowed_stride_bytes = int64_t{1} << 40; // 2^40
-  int64_t __cumulative_size                 = 1;
+  const auto __input_strides                 = __tensor.strides;
+  const auto __input_sizes                   = __tensor.shape;
+  const auto __alignment                     = (__interleave_layout == tma_interleave_layout::bytes32) ? 32 : 16;
+  constexpr auto __max_allowed_stride_bytes  = int64_t{1} << 40; // 2^40
+  [[maybe_unused]] int64_t __cumulative_size = 1;
   if (__input_strides == nullptr)
   {
     for (int __i = 0; __i < __rank - 1; ++__i)
diff --git a/libcudacxx/test/libcudacxx/cuda/tma/make_tma_descriptor.pass.cpp b/libcudacxx/test/libcudacxx/cuda/tma/make_tma_descriptor.pass.cpp
@@ -232,7 +232,7 @@ bool test_enums()
         cuda::tma_swizzle swizzle,
         cuda::tma_l2_fetch_size l2_fetch_size,
         cuda::tma_oob_fill oobfill) {
-      tensor.dtype.bits    = bits;
+      tensor.dtype.bits    = static_cast<uint8_t>(bits);
       box_sizes_storage[0] = /*min_align=*/16 * /*bits=*/8 / tensor.dtype.bits;
       box_sizes_storage[1] = /*min_align=*/16 * /*bits=*/8 / tensor.dtype.bits;
       box_sizes_storage[2] = /*min_align=*/16 * /*bits=*/8 / tensor.dtype.bits;
@@ -284,7 +284,7 @@ bool test_enums()
                   kDLFloat8_e5m2fnuz,
                   kDLFloat8_e8m0fnu})
             {
-              tensor.dtype.code  = code;
+              tensor.dtype.code  = static_cast<uint8_t>(code);
               constexpr int bits = 8;
               exec_make_tma_descriptor(bits, no_interleave, swizzle, l2_fetch_size, oobfill);
             }

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@`
`29`	`29`	`# include <cuda/std/__algorithm/min.h>`
`30`	`30`	`# include <cuda/std/__cstddef/types.h>`
`31`	`31`	`# include <cuda/std/__limits/numeric_limits.h>`
	`32`	`+# include <cuda/std/__utility/unreachable.h>`
`32`	`33`	`# include <cuda/std/array>`
`33`	`34`	`# include <cuda/std/cstdint>`
`34`	`35`	`# include <cuda/std/span>`
`@@ -94,7 +95,7 @@ enum class tma_swizzle`
`94`	`95`	`case tma_oob_fill::nan:`
`95`	`96`	`return ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA;`
`96`	`97`	`default:`
`97`		`- _CCCL_UNREACHABLE();`
	`98`	`+ ::cuda::std::unreachable();`
`98`	`99`	`}`
`99`	`100`	`}`
`100`	`101`
`@@ -112,7 +113,7 @@ __to_cutensor_map(tma_l2_fetch_size __l2_fetch_size) noexcept`
`112`	`113`	`case tma_l2_fetch_size::bytes256:`
`113`	`114`	`return ::CU_TENSOR_MAP_L2_PROMOTION_L2_256B;`
`114`	`115`	`default:`
`115`		`- _CCCL_UNREACHABLE();`
	`116`	`+ ::cuda::std::unreachable();`
`116`	`117`	`}`
`117`	`118`	`}`
`118`	`119`
`@@ -128,7 +129,7 @@ __to_cutensor_map(tma_interleave_layout __interleave_layout) noexcept`
`128`	`129`	`case tma_interleave_layout::bytes32:`
`129`	`130`	`return ::CU_TENSOR_MAP_INTERLEAVE_32B;`
`130`	`131`	`default:`
`131`		`- _CCCL_UNREACHABLE();`
	`132`	`+ ::cuda::std::unreachable();`
`132`	`133`	`}`
`133`	`134`	`}`
`134`	`135`
`@@ -153,7 +154,7 @@ __to_cutensor_map(tma_interleave_layout __interleave_layout) noexcept`
`153`	`154`	`return ::CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B;`
`154`	`155`	`# endif // _CCCL_CTK_AT_LEAST(12, 8)`
`155`	`156`	`default:`
`156`		`- _CCCL_UNREACHABLE();`
	`157`	`+ ::cuda::std::unreachable();`
`157`	`158`	`}`
`158`	`159`	`}`
`159`	`160`
`@@ -366,11 +367,11 @@ __get_tensor_sizes(const ::DLTensor& __tensor, int __rank, ::CUtensorMapDataType`
`366`	`367`	`{`
`367`	`368`	`using ::cuda::std::int64_t;`
`368`	`369`	`__tma_strides_array_t __output_strides{1}; // inner stride is implicit = 1`
`369`		`- const auto __input_strides = __tensor.strides;`
`370`		`- const auto __input_sizes = __tensor.shape;`
`371`		`- const auto __alignment = (__interleave_layout == tma_interleave_layout::bytes32) ? 32 : 16;`
`372`		`- constexpr auto __max_allowed_stride_bytes = int64_t{1} << 40; // 2^40`
`373`		`- int64_t __cumulative_size = 1;`
	`370`	`+ const auto __input_strides = __tensor.strides;`
	`371`	`+ const auto __input_sizes = __tensor.shape;`
	`372`	`+ const auto __alignment = (__interleave_layout == tma_interleave_layout::bytes32) ? 32 : 16;`
	`373`	`+ constexpr auto __max_allowed_stride_bytes = int64_t{1} << 40; // 2^40`
	`374`	`+ [[maybe_unused]] int64_t __cumulative_size = 1;`
`374`	`375`	`if (__input_strides == nullptr)`
`375`	`376`	`{`
`376`	`377`	`for (int __i = 0; __i < __rank - 1; ++__i)`