@@ -636,14 +636,19 @@ BENCHMARK(sorting_with_openmp)
  * @see NVCC Identification Macros docs:
  * https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#nvcc-identification-macro
  */
-#define _LESS_SLOW_WITH_CUDA 0
+#if !defined(USE_NVIDIA_CCCL)
 #if defined(__has_include)
 #if __has_include(<cuda_runtime.h>)
-#define _LESS_SLOW_WITH_CUDA 1
-#endif
-#endif
+#define USE_NVIDIA_CCCL 1
+#else
+#define USE_NVIDIA_CCCL 0
+#endif // __has_include(<cuda_runtime.h>)
+#else
+#define USE_NVIDIA_CCCL 0
+#endif // defined(__has_include)
+#endif // !defined(USE_NVIDIA_CCCL)
 
-#if _LESS_SLOW_WITH_CUDA
+#if USE_NVIDIA_CCCL
 
 /**
  * Unlike STL, Thrust provides some very handy abstractions for sorting
@@ -2050,8 +2055,9 @@ BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kerne
 
 #pragma region GPGPU Programming
 
-#if _LESS_SLOW_WITH_CUDA
+#if USE_NVIDIA_CCCL
 #include <cuda.h>
+#include <cuda_runtime.h>
 
 /**
  * Different generations of matrix multiplication instructions on GPUs use
@@ -3018,7 +3024,7 @@ BENCHMARK(eigen_tops<_Float16>)->RangeMultiplier(2)->Range(8, 16384)->Complexity
  * enough.
  */
 
-#if _LESS_SLOW_WITH_CUDA
+#if USE_NVIDIA_CCCL
 #include <cublas_v2.h>
 
 /**
@@ -3327,7 +3333,7 @@ BENCHMARK(cublaslt_tops<fp8_e4m3_t, float>)->RangeMultiplier(2)->Range(256, 1638
  * - 10 P for `i8` matrix multiplications into `i32`.
  */
 
-#endif // _LESS_SLOW_WITH_CUDA
+#endif // USE_NVIDIA_CCCL
 
 #pragma endregion // Memory Bound Linear Algebra
 
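For context, a minimal standalone sketch of how the `USE_NVIDIA_CCCL` detection guard introduced in this diff behaves in a consuming translation unit. The `main` function, the `-DUSE_NVIDIA_CCCL` build-flag override, and the `cudaGetDeviceCount` probe are illustrative assumptions here, not part of the commit:

// A minimal sketch of the detection guard above, assuming it is pasted into a
// consuming translation unit. The build system may pre-define the macro, e.g.
// `-DUSE_NVIDIA_CCCL=0`, to override the auto-detection.
#if !defined(USE_NVIDIA_CCCL)
#if defined(__has_include)
#if __has_include(<cuda_runtime.h>)
#define USE_NVIDIA_CCCL 1
#else
#define USE_NVIDIA_CCCL 0
#endif
#else
#define USE_NVIDIA_CCCL 0
#endif
#endif

#include <cstdio>
#if USE_NVIDIA_CCCL
#include <cuda_runtime.h>
#endif

int main() {
#if USE_NVIDIA_CCCL
    // Hypothetical probe: count visible CUDA devices via the Runtime API.
    int devices = 0;
    if (cudaGetDeviceCount(&devices) != cudaSuccess) devices = 0;
    std::printf("CUDA detected, %d device(s) visible\n", devices);
#else
    std::printf("Building without CUDA / CCCL support\n");
#endif
    return 0;
}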