Skip to content

Commit 21dfdf3

Browse files
committed
Improve: Detecting CUDA availability
1 parent 8f32d65 commit 21dfdf3

File tree

1 file changed

+14
-8
lines changed

1 file changed

+14
-8
lines changed

less_slow.cpp

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -636,14 +636,19 @@ BENCHMARK(sorting_with_openmp)
636636
* @see NVCC Identification Macros docs:
637637
* https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#nvcc-identification-macro
638638
*/
639-
#define _LESS_SLOW_WITH_CUDA 0
639+
#if !defined(USE_NVIDIA_CCCL)
640640
#if defined(__has_include)
641641
#if __has_include(<cuda_runtime.h>)
642-
#define _LESS_SLOW_WITH_CUDA 1
643-
#endif
644-
#endif
642+
#define USE_NVIDIA_CCCL 1
643+
#else
644+
#define USE_NVIDIA_CCCL 0
645+
#endif // __has_include(<cuda_runtime.h>)
646+
#else
647+
#define USE_NVIDIA_CCCL 0
648+
#endif // defined(__has_include)
649+
#endif // !defined(USE_NVIDIA_CCCL)
645650

646-
#if _LESS_SLOW_WITH_CUDA
651+
#if USE_NVIDIA_CCCL
647652

648653
/**
649654
* Unlike STL, Thrust provides some very handy abstractions for sorting
@@ -2050,8 +2055,9 @@ BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kerne
20502055

20512056
#pragma region GPGPU Programming
20522057

2053-
#if _LESS_SLOW_WITH_CUDA
2058+
#if USE_NVIDIA_CCCL
20542059
#include <cuda.h>
2060+
#include <cuda_runtime.h>
20552061

20562062
/**
20572063
* Different generations of matrix multiplication instructions on GPUs use
@@ -3018,7 +3024,7 @@ BENCHMARK(eigen_tops<_Float16>)->RangeMultiplier(2)->Range(8, 16384)->Complexity
30183024
* enough.
30193025
*/
30203026

3021-
#if _LESS_SLOW_WITH_CUDA
3027+
#if USE_NVIDIA_CCCL
30223028
#include <cublas_v2.h>
30233029

30243030
/**
@@ -3327,7 +3333,7 @@ BENCHMARK(cublaslt_tops<fp8_e4m3_t, float>)->RangeMultiplier(2)->Range(256, 1638
33273333
* - 10 P for `i8` matrix multiplications into `i32`.
33283334
*/
33293335

3330-
#endif // _LESS_SLOW_WITH_CUDA
3336+
#endif // USE_NVIDIA_CCCL
33313337

33323338
#pragma endregion // Memory Bound Linear Algebra
33333339

0 commit comments

Comments
 (0)