
Commit e0b24f6

q10 authored and facebook-github-bot committed
Migrate old CUB code to be compatible with CUB 3 (#4835)
Summary:
X-link: facebookresearch/FBGEMM#1863

- Migrate old CUB code to be compatible with CUB 3 (introduced in CUDA 13). CUB 3 removes the debug_synchronous flag from the radix sort API entirely; the flag had already been unused as far back as CUB 2.2.0 (part of CUDA 12.3):
  https://github.com/NVIDIA/cccl/blob/v2.2.0/cub/cub/device/device_radix_sort.cuh
  https://docs.nvidia.com/cuda/archive/12.3.0/cuda-toolkit-release-notes/index.html
  However, the flag appears to still be in use in the ROCm equivalent:
  https://github.com/ROCm/rocm-libraries/blob/main/projects/rocprim/rocprim/include/rocprim/device/device_radix_sort.hpp
- Add ROCm compatibility with CUB Min and Max by using
  https://github.com/ROCm/rocm-libraries/blob/main/projects/rocthrust/thrust/functional.h

Pull Request resolved: #4835

Reviewed By: cthi

Differential Revision: D81960727

Pulled By: q10

fbshipit-source-id: 0a282bfd3cef3d78078df23d93a8c5ab85b64cb8
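
For illustration only, here is a minimal sketch of what dropping debug_synchronous means at a call site. This snippet is not taken from the diff; the function, buffer, and variable names are hypothetical.

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Hypothetical call site, for illustration only; names are not from this commit.
// Pre-CUB-3 code could pass a trailing debug_synchronous argument after the
// stream; with CUB 3 (CUDA 13) the argument list simply ends at the stream.
void sort_pairs_sketch(
    const int* keys_in, int* keys_out,
    const float* vals_in, float* vals_out,
    int num_items, cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // First call only computes the required temporary storage size.
  cub::DeviceRadixSort::SortPairs(
      d_temp_storage, temp_storage_bytes,
      keys_in, keys_out, vals_in, vals_out,
      num_items, 0, int(sizeof(int) * 8), stream);
  cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream);
  // Second call performs the sort; note there is no trailing `false` flag.
  cub::DeviceRadixSort::SortPairs(
      d_temp_storage, temp_storage_bytes,
      keys_in, keys_out, vals_in, vals_out,
      num_items, 0, int(sizeof(int) * 8), stream);
  cudaFreeAsync(d_temp_storage, stream);
}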
1 parent 08ae0af · commit e0b24f6

9 files changed: +75 −31 lines

fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu

Lines changed: 4 additions & 3 deletions
@@ -38,6 +38,7 @@
 
 #include "fbgemm_gpu/utils/cuda_block_count.h"
 #include "fbgemm_gpu/utils/cuda_prelude.cuh"
+#include "fbgemm_gpu/utils/device_sort.cuh"
 #include "fbgemm_gpu/utils/stochastic_rounding.cuh"
 
 #if !( \
@@ -54,7 +55,6 @@
 #ifndef USE_ROCM
 #include <mma.h>
 #endif
-#include <cub/cub.cuh>
 
 #include <torch/torch.h>
 
@@ -1702,7 +1702,8 @@ __device__ float compute_max_block(
 
   __shared__ typename BlockReduce::TempStorage temp_storage[THREAD_Y];
 
-  float amax = BlockReduce(temp_storage[threadIdx.y]).Reduce(xabs, cub::Max());
+  float amax =
+      BlockReduce(temp_storage[threadIdx.y]).Reduce(xabs, Max<float>());
 
   __shared__ float amax_smem[THREAD_Y];
   if (threadIdx.x == 0)
@@ -1724,7 +1725,7 @@ __device__ float compute_max_warp(
   typedef cub::WarpReduce<float> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp_storage[THREAD_Y];
 
-  float amax = WarpReduce(temp_storage[threadIdx.y]).Reduce(xabs, cub::Max());
+  float amax = WarpReduce(temp_storage[threadIdx.y]).Reduce(xabs, Max<float>());
   amax = __shfl_sync(0xffffffff, amax, 0);
   return amax;
 }
fbgemm_gpu/include/fbgemm_gpu/utils/device_sort.cuh

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cub/cub.cuh>
+
+#ifdef USE_ROCM
+#include <thrust/functional.h>
+#else
+#include <cuda/functional>
+#endif
+
+// clang-format off
+#include "fbgemm_gpu/utils/cub_namespace_prefix.cuh"
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_scan.cuh>
+#include "fbgemm_gpu/utils/cub_namespace_postfix.cuh"
+// clang-format on
+
+namespace fbgemm_gpu {
+
+#ifdef USE_ROCM
+template <typename T>
+using Max = thrust::maximum<T>;
+#else
+#if CUDA_VERSION >= 13000
+template <typename T>
+using Max = cuda::maximum<T>;
+#else
+template <typename T>
+using Max = cub::Max;
+#endif
+#endif
+
+#ifdef USE_ROCM
+template <typename T>
+using Min = thrust::minimum<T>;
+#else
+#if CUDA_VERSION >= 13000
+template <typename T>
+using Min = cuda::minimum<T>;
+#else
+template <typename T>
+using Min = cub::Min;
+#endif
+#endif
+
+} // namespace fbgemm_gpu
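
For reference, a minimal usage sketch of the new Max<T> alias in a CUB block reduction. This kernel is hypothetical and not part of the commit; kBlockSize and the buffer layout are illustrative.

#include <cub/cub.cuh>
#include "fbgemm_gpu/utils/device_sort.cuh"

constexpr int kBlockSize = 128;

// One block reduces kBlockSize contiguous elements to a single maximum.
__global__ void block_max_kernel(const float* in, float* out) {
  using BlockReduce = cub::BlockReduce<float, kBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  const float v = in[blockIdx.x * kBlockSize + threadIdx.x];
  // fbgemm_gpu::Max<float> resolves to thrust::maximum<float> on ROCm,
  // cuda::maximum<float> on CUDA 13+, and cub::Max on older CUDA toolkits.
  const float block_max =
      BlockReduce(temp_storage).Reduce(v, fbgemm_gpu::Max<float>());
  if (threadIdx.x == 0) {
    out[blockIdx.x] = block_max;
  }
}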

fbgemm_gpu/src/jagged_tensor_ops/common.cuh

Lines changed: 1 addition & 6 deletions
@@ -16,13 +16,8 @@
 #include <torch/csrc/autograd/custom_function.h>
 #include <torch/library.h>
 #include <ATen/cuda/Atomic.cuh>
-#include <cub/cub.cuh>
 
-// clang-format off
-#include "fbgemm_gpu/utils/cub_namespace_prefix.cuh"
-#include <cub/device/device_scan.cuh>
-#include "fbgemm_gpu/utils/cub_namespace_postfix.cuh"
-// clang-format on
+#include "fbgemm_gpu/utils/device_sort.cuh"
 
 #include "common.h"
 #include "fbgemm_gpu/sparse_ops.h"

fbgemm_gpu/src/jagged_tensor_ops/jagged_softmax_forward.cu

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_kernel(
 
       // Collectively compute the block-wide max reduction
      scalar_t block_max_value =
-          BlockReduceT(temp_storage).Reduce(thread_val, cub::Max());
+          BlockReduceT(temp_storage).Reduce(thread_val, Max<index_t>());
      __syncthreads();
 
      if (tid == 0) {

fbgemm_gpu/src/jagged_tensor_ops/jagged_unique_indices.cu

Lines changed: 6 additions & 3 deletions
@@ -121,8 +121,10 @@ __global__ __launch_bounds__(kMaxThreads) void unique_indices_length_kernel(
     t_min = (value < t_min) ? value : t_min;
   }
 
-  index_t block_max = BlockReduce(temp_storage_max).Reduce(t_max, cub::Max());
-  index_t block_min = BlockReduce(temp_storage_min).Reduce(t_min, cub::Min());
+  index_t block_max =
+      BlockReduce(temp_storage_max).Reduce(t_max, Max<index_t>());
+  index_t block_min =
+      BlockReduce(temp_storage_min).Reduce(t_min, Min<index_t>());
   if (tid == 0) {
     block_results[0] = block_max;
     block_results[1] = block_min;
@@ -240,7 +242,8 @@ __global__ __launch_bounds__(kMaxThreads) void compute_hash_size_kernel(
     t_max = (value > t_max) ? value : t_max;
   }
 
-  index_t block_max = BlockReduce(temp_storage_max).Reduce(t_max, cub::Max());
+  index_t block_max =
+      BlockReduce(temp_storage_max).Reduce(t_max, Max<index_t>());
   if (tid == 0) {
     hash_size[bid] = block_max + 1;
   }

fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu

Lines changed: 2 additions & 4 deletions
@@ -152,8 +152,7 @@ std::pair<Tensor, Tensor> lfu_cache_find_uncached_cuda(
           N,
           0,
           int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits,
-          at::cuda::getCurrentCUDAStream(),
-          false));
+          at::cuda::getCurrentCUDAStream()));
       auto temp_storage = at::empty(
           {static_cast<int64_t>(temp_storage_bytes)},
           unique_indices.options().dtype(at::kByte));
@@ -167,8 +166,7 @@ std::pair<Tensor, Tensor> lfu_cache_find_uncached_cuda(
           N,
           0,
           int(log2(float(lxu_cache_state.size(0) + 1)) + 1) + kLFUCounterBits,
-          at::cuda::getCurrentCUDAStream(),
-          false));
+          at::cuda::getCurrentCUDAStream()));
     });
   return {sorted_cache_sets, cache_set_sorted_unique_indices};
 }

fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu

Lines changed: 4 additions & 8 deletions
@@ -232,8 +232,7 @@ get_unique_indices_cuda_impl(
       N, \
       0, \
       int(log2(float(max_indices + 1)) + 1), \
-      at::cuda::getCurrentCUDAStream(), \
-      false))
+      at::cuda::getCurrentCUDAStream()))
 
 #define INVOKE_CUB_SORT_KEYS(TEMP_STORAGE_PTR) \
   AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortKeys( \
@@ -244,8 +243,7 @@ get_unique_indices_cuda_impl(
       N, \
       0, \
       int(log2(float(max_indices + 1)) + 1), \
-      at::cuda::getCurrentCUDAStream(), \
-      false))
+      at::cuda::getCurrentCUDAStream()))
 
 #define INVOKE_CUB_ENCODE(TEMP_STORAGE_PTR) \
   AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRunLengthEncode::Encode( \
@@ -256,8 +254,7 @@ get_unique_indices_cuda_impl(
       unique_indices_count->data_ptr<int32_t>(), \
       unique_indices_length.data_ptr<int32_t>(), \
       N, \
-      at::cuda::getCurrentCUDAStream(), \
-      false))
+      at::cuda::getCurrentCUDAStream()))
 
 #define INVOKE_CUB_UNIQUE(TEMP_STORAGE_PTR) \
   AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceSelect::Unique( \
@@ -267,8 +264,7 @@ get_unique_indices_cuda_impl(
       unique_indices.data_ptr<index_t>(), \
       unique_indices_length.data_ptr<int32_t>(), \
       N, \
-      at::cuda::getCurrentCUDAStream(), \
-      false))
+      at::cuda::getCurrentCUDAStream()))
 
   AT_DISPATCH_INDEX_TYPES(
       linear_indices.scalar_type(), "get_unique_indices_cuda", [&] {

fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu

Lines changed: 1 addition & 2 deletions
@@ -206,8 +206,7 @@ lru_cache_find_uncached_cuda(
       N, \
       0, \
       int(log2(float(lxu_cache_state.size(0) + 1)) + 1), \
-      at::cuda::getCurrentCUDAStream(), \
-      false))
+      at::cuda::getCurrentCUDAStream()))
 
   AT_DISPATCH_INDEX_TYPES(
       unique_indices.scalar_type(), "lru_cache_find_uncached_cuda", [&] {

fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu

Lines changed: 2 additions & 4 deletions
@@ -313,8 +313,7 @@ transpose_embedding_input(
         linear_indices.numel(),
         0,
         total_hash_size_bits,
-        at::cuda::getCurrentCUDAStream(),
-        false));
+        at::cuda::getCurrentCUDAStream()));
     auto temp_storage = at::empty(
         {static_cast<int64_t>(temp_storage_bytes)},
         indices.options().dtype(at::kByte));
@@ -329,8 +328,7 @@ transpose_embedding_input(
         linear_indices.numel(),
         0,
         total_hash_size_bits,
-        at::cuda::getCurrentCUDAStream(),
-        false));
+        at::cuda::getCurrentCUDAStream()));
 #else
     using config = rocprim::radix_sort_config<
         rocprim::default_config,
