Merge branch 'branch/3.2.x' into backport-3.2-tensormap-bugfix

fbusato · web-flow · commit e06947e37071 · 2026-01-05T12:28:45.000-08:00
diff --git a/libcudacxx/include/cuda/__tma/make_tma_descriptor.h b/libcudacxx/include/cuda/__tma/make_tma_descriptor.h
@@ -94,7 +94,7 @@ enum class tma_swizzle
     case tma_oob_fill::nan:
       return ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA;
     default:
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
   }
 }
 
@@ -112,7 +112,7 @@ __to_cutensor_map(tma_l2_fetch_size __l2_fetch_size) noexcept
     case tma_l2_fetch_size::bytes256:
       return ::CU_TENSOR_MAP_L2_PROMOTION_L2_256B;
     default:
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
   }
 }
 
@@ -128,7 +128,7 @@ __to_cutensor_map(tma_interleave_layout __interleave_layout) noexcept
     case tma_interleave_layout::bytes32:
       return ::CU_TENSOR_MAP_INTERLEAVE_32B;
     default:
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
   }
 }
 
@@ -153,7 +153,7 @@ __to_cutensor_map(tma_interleave_layout __interleave_layout) noexcept
       return ::CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B;
 #  endif // _CCCL_CTK_AT_LEAST(12, 8)
     default:
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
   }
 }
 
@@ -405,11 +405,11 @@ __get_tensor_sizes(const ::DLTensor& __tensor, int __rank, ::CUtensorMapDataType
 {
   using ::cuda::std::int64_t;
   __tma_strides_array_t __output_strides{1}; // inner stride is implicit = 1
-  const auto __input_strides                = __tensor.strides;
-  const auto __input_sizes                  = __tensor.shape;
-  const auto __alignment                    = (__interleave_layout == tma_interleave_layout::bytes32) ? 32 : 16;
-  constexpr auto __max_allowed_stride_bytes = int64_t{1} << 40; // 2^40
-  int64_t __cumulative_size                 = 1;
+  const auto __input_strides                 = __tensor.strides;
+  const auto __input_sizes                   = __tensor.shape;
+  const auto __alignment                     = (__interleave_layout == tma_interleave_layout::bytes32) ? 32 : 16;
+  constexpr auto __max_allowed_stride_bytes  = int64_t{1} << 40; // 2^40
+  [[maybe_unused]] int64_t __cumulative_size = 1;
   if (__input_strides == nullptr)
   {
     for (int __i = 0; __i < __rank - 1; ++__i)
diff --git a/libcudacxx/test/libcudacxx/cuda/tma/make_tma_descriptor.pass.cpp b/libcudacxx/test/libcudacxx/cuda/tma/make_tma_descriptor.pass.cpp
@@ -232,7 +232,7 @@ bool test_enums()
         cuda::tma_swizzle swizzle,
         cuda::tma_l2_fetch_size l2_fetch_size,
         cuda::tma_oob_fill oobfill) {
-      tensor.dtype.bits    = bits;
+      tensor.dtype.bits    = static_cast<uint8_t>(bits);
       box_sizes_storage[0] = /*min_align=*/16 * /*bits=*/8 / tensor.dtype.bits;
       box_sizes_storage[1] = /*min_align=*/16 * /*bits=*/8 / tensor.dtype.bits;
       box_sizes_storage[2] = /*min_align=*/16 * /*bits=*/8 / tensor.dtype.bits;
@@ -284,7 +284,7 @@ bool test_enums()
                   kDLFloat8_e5m2fnuz,
                   kDLFloat8_e8m0fnu})
             {
-              tensor.dtype.code  = code;
+              tensor.dtype.code  = static_cast<uint8_t>(code);
               constexpr int bits = 8;
               exec_make_tma_descriptor(bits, no_interleave, swizzle, l2_fetch_size, oobfill);
             }

Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ enum class tma_swizzle`
`94`	`94`	`case tma_oob_fill::nan:`
`95`	`95`	`return ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA;`
`96`	`96`	`default:`
`97`		`- _CCCL_UNREACHABLE();`
	`97`	`+ ::cuda::std::unreachable();`
`98`	`98`	`}`
`99`	`99`	`}`
`100`	`100`
`@@ -112,7 +112,7 @@ __to_cutensor_map(tma_l2_fetch_size __l2_fetch_size) noexcept`
`112`	`112`	`case tma_l2_fetch_size::bytes256:`
`113`	`113`	`return ::CU_TENSOR_MAP_L2_PROMOTION_L2_256B;`
`114`	`114`	`default:`
`115`		`- _CCCL_UNREACHABLE();`
	`115`	`+ ::cuda::std::unreachable();`
`116`	`116`	`}`
`117`	`117`	`}`
`118`	`118`
`@@ -128,7 +128,7 @@ __to_cutensor_map(tma_interleave_layout __interleave_layout) noexcept`
`128`	`128`	`case tma_interleave_layout::bytes32:`
`129`	`129`	`return ::CU_TENSOR_MAP_INTERLEAVE_32B;`
`130`	`130`	`default:`
`131`		`- _CCCL_UNREACHABLE();`
	`131`	`+ ::cuda::std::unreachable();`
`132`	`132`	`}`
`133`	`133`	`}`
`134`	`134`
`@@ -153,7 +153,7 @@ __to_cutensor_map(tma_interleave_layout __interleave_layout) noexcept`
`153`	`153`	`return ::CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B;`
`154`	`154`	`# endif // _CCCL_CTK_AT_LEAST(12, 8)`
`155`	`155`	`default:`
`156`		`- _CCCL_UNREACHABLE();`
	`156`	`+ ::cuda::std::unreachable();`
`157`	`157`	`}`
`158`	`158`	`}`
`159`	`159`
`@@ -405,11 +405,11 @@ __get_tensor_sizes(const ::DLTensor& __tensor, int __rank, ::CUtensorMapDataType`
`405`	`405`	`{`
`406`	`406`	`using ::cuda::std::int64_t;`
`407`	`407`	`__tma_strides_array_t __output_strides{1}; // inner stride is implicit = 1`
`408`		`- const auto __input_strides = __tensor.strides;`
`409`		`- const auto __input_sizes = __tensor.shape;`
`410`		`- const auto __alignment = (__interleave_layout == tma_interleave_layout::bytes32) ? 32 : 16;`
`411`		`- constexpr auto __max_allowed_stride_bytes = int64_t{1} << 40; // 2^40`
`412`		`- int64_t __cumulative_size = 1;`
	`408`	`+ const auto __input_strides = __tensor.strides;`
	`409`	`+ const auto __input_sizes = __tensor.shape;`
	`410`	`+ const auto __alignment = (__interleave_layout == tma_interleave_layout::bytes32) ? 32 : 16;`
	`411`	`+ constexpr auto __max_allowed_stride_bytes = int64_t{1} << 40; // 2^40`
	`412`	`+ [[maybe_unused]] int64_t __cumulative_size = 1;`
`413`	`413`	`if (__input_strides == nullptr)`
`414`	`414`	`{`
`415`	`415`	`for (int __i = 0; __i < __rank - 1; ++__i)`