Skip to content

Commit 27ed1f9

Browse files
[Offload][Conformance] Add support for CUDA Math and HIP Math providers (#152362)
This patch extends the conformance testing infrastructure to support two new providers of math function implementations for GPUs: CUDA Math (`cuda-math`) and HIP Math (`hip-math`).
1 parent 4698631 commit 27ed1f9

File tree

8 files changed

+517
-16
lines changed

8 files changed

+517
-16
lines changed

offload/unittests/CMakeLists.txt

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ endif ()
1818
set(OFFLOAD_UNITTESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR})
1919

2020
function(add_offload_test_device_code test_filename test_name)
21+
cmake_parse_arguments(
22+
"OFFLOAD_TESTS" "WITH_DEVICE_MATH_LIBS" "" "" ${ARGN})
23+
2124
set(SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${test_filename})
2225
set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
2326

@@ -37,13 +40,25 @@ function(add_offload_test_device_code test_filename test_name)
3740
endif()
3841

3942
if(nvptx_arch AND CUDAToolkit_FOUND)
43+
set(nvptx_compile_flags ${OFFLOAD_TESTS_UNPARSED_ARGUMENTS})
44+
45+
if(OFFLOAD_TESTS_WITH_DEVICE_MATH_LIBS)
46+
file(GLOB libdevice_paths "${CUDAToolkit_LIBRARY_ROOT}/nvvm/libdevice/libdevice.*.bc")
47+
if(libdevice_paths)
48+
list(GET libdevice_paths 0 libdevice_path)
49+
list(APPEND nvptx_compile_flags "-Xclang" "-mlink-builtin-bitcode")
50+
list(APPEND nvptx_compile_flags "-Xclang" "${libdevice_path}")
51+
list(APPEND nvptx_compile_flags "-DCUDA_MATH_FOUND=1")
52+
endif()
53+
endif()
54+
4055
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.nvptx64.bin")
4156
add_custom_command(
4257
OUTPUT ${output_file}
4358
COMMAND ${CMAKE_CXX_COMPILER}
4459
-I${OFFLOAD_UNITTESTS_DIR}
4560
--target=nvptx64-nvidia-cuda -march=${nvptx_arch}
46-
-nogpulib --cuda-path=${cuda_path} -flto ${ARGN}
61+
-nogpulib --cuda-path=${cuda_path} -flto ${nvptx_compile_flags}
4762
${SRC_PATH} -o ${output_file}
4863
DEPENDS ${SRC_PATH}
4964
)
@@ -62,13 +77,25 @@ function(add_offload_test_device_code test_filename test_name)
6277
endif()
6378

6479
if(amdgpu_arch)
80+
set(amdgpu_compile_flags ${OFFLOAD_TESTS_UNPARSED_ARGUMENTS})
81+
82+
if(OFFLOAD_TESTS_WITH_DEVICE_MATH_LIBS)
83+
find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
84+
if(AMDDeviceLibs_FOUND)
85+
get_target_property(ocml_path ocml IMPORTED_LOCATION)
86+
list(APPEND amdgpu_compile_flags "-Xclang" "-mlink-builtin-bitcode")
87+
list(APPEND amdgpu_compile_flags "-Xclang" "${ocml_path}")
88+
list(APPEND amdgpu_compile_flags "-DHIP_MATH_FOUND=1")
89+
endif()
90+
endif()
91+
6592
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.amdgpu.bin")
6693
add_custom_command(
6794
OUTPUT ${output_file}
6895
COMMAND ${CMAKE_CXX_COMPILER}
6996
-I${OFFLOAD_UNITTESTS_DIR}
7097
--target=amdgcn-amd-amdhsa -mcpu=${amdgpu_arch}
71-
-nogpulib -flto ${ARGN} ${SRC_PATH} -o ${output_file}
98+
-nogpulib -flto ${amdgpu_compile_flags} ${SRC_PATH} -o ${output_file}
7299
DEPENDS ${SRC_PATH}
73100
)
74101
add_custom_target(${test_name}.amdgpu DEPENDS ${output_file})
# Device-side test binaries for the conformance suite. The cuda-math and
# hip-math providers need the vendor bitcode math libraries linked in, so they
# pass WITH_DEVICE_MATH_LIBS; llvm-libm builds against LLVM's own libm only.
add_offload_test_device_code(CUDAMath.cpp cuda-math WITH_DEVICE_MATH_LIBS -O3 -stdlib -fno-builtin)
add_offload_test_device_code(HIPMath.cpp hip-math WITH_DEVICE_MATH_LIBS -O3 -stdlib -fno-builtin)
add_offload_test_device_code(LLVMLibm.cpp llvm-libm -O3 -stdlib -fno-builtin)

# Aggregate target so the host-side tests can depend on every provider's
# device binary with a single dependency edge.
add_custom_target(conformance_device_binaries DEPENDS
  cuda-math.bin
  hip-math.bin
  llvm-libm.bin
)

# Exported to the parent scope so the host test harness knows where to load
# the device binaries from.
set(OFFLOAD_CONFORMANCE_DEVICE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains the implementation of the device kernels that wrap the
/// math functions from the cuda-math provider.
///
//===----------------------------------------------------------------------===//

// Only compiled when the build found libdevice and defined CUDA_MATH_FOUND
// (see the WITH_DEVICE_MATH_LIBS handling in offload/unittests/CMakeLists.txt).
#ifdef CUDA_MATH_FOUND

#include "Conformance/device_code/DeviceAPIs.hpp"
#include "Conformance/device_code/KernelRunner.hpp"

#include <gpuintrin.h>
#include <stddef.h>

using namespace kernels;

//===----------------------------------------------------------------------===//
// Helpers
//===----------------------------------------------------------------------===//

// __nv_sincosf returns both results through out-parameters; the kernel runner
// expects a unary float -> float function, so wrap each component separately.
static inline float sincosfSin(float X) {
  float SinX, CosX;
  __nv_sincosf(X, &SinX, &CosX);
  return SinX;
}

static inline float sincosfCos(float X) {
  float SinX, CosX;
  __nv_sincosf(X, &SinX, &CosX);
  return CosX;
}

//===----------------------------------------------------------------------===//
// Kernels
//===----------------------------------------------------------------------===//

// One kernel per tested libdevice entry point. Each kernel applies the wrapped
// function element-wise: Out[i] = f(X[i]) for i in [0, NumElements).
extern "C" {

__gpu_kernel void acosfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_acosf>(NumElements, Out, X);
}

__gpu_kernel void acoshfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_acoshf>(NumElements, Out, X);
}

__gpu_kernel void asinfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_asinf>(NumElements, Out, X);
}

__gpu_kernel void asinhfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_asinhf>(NumElements, Out, X);
}

__gpu_kernel void atanfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_atanf>(NumElements, Out, X);
}

__gpu_kernel void atanhfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_atanhf>(NumElements, Out, X);
}

__gpu_kernel void cbrtfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_cbrtf>(NumElements, Out, X);
}

__gpu_kernel void cosfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_cosf>(NumElements, Out, X);
}

__gpu_kernel void coshfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_coshf>(NumElements, Out, X);
}

__gpu_kernel void cospifKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_cospif>(NumElements, Out, X);
}

__gpu_kernel void erffKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_erff>(NumElements, Out, X);
}

__gpu_kernel void expfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_expf>(NumElements, Out, X);
}

__gpu_kernel void exp10fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_exp10f>(NumElements, Out, X);
}

__gpu_kernel void exp2fKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_exp2f>(NumElements, Out, X);
}

__gpu_kernel void expm1fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_expm1f>(NumElements, Out, X);
}

__gpu_kernel void logfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_logf>(NumElements, Out, X);
}

__gpu_kernel void log10fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_log10f>(NumElements, Out, X);
}

__gpu_kernel void log1pfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_log1pf>(NumElements, Out, X);
}

__gpu_kernel void log2fKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_log2f>(NumElements, Out, X);
}

__gpu_kernel void sinfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_sinf>(NumElements, Out, X);
}

__gpu_kernel void sincosfSinKernel(const float *X, float *Out,
                                   size_t NumElements) noexcept {
  runKernelBody<sincosfSin>(NumElements, Out, X);
}

__gpu_kernel void sincosfCosKernel(const float *X, float *Out,
                                   size_t NumElements) noexcept {
  runKernelBody<sincosfCos>(NumElements, Out, X);
}

__gpu_kernel void sinhfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_sinhf>(NumElements, Out, X);
}

__gpu_kernel void sinpifKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_sinpif>(NumElements, Out, X);
}

__gpu_kernel void tanfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_tanf>(NumElements, Out, X);
}

__gpu_kernel void tanhfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_tanhf>(NumElements, Out, X);
}

} // extern "C"

#endif // CUDA_MATH_FOUND
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
///
9+
/// \file
10+
/// This file contains platform-specific definitions and forward declarations
11+
/// for device-side APIs used by the kernels.
12+
///
13+
//===----------------------------------------------------------------------===//
14+
15+
#ifndef CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
16+
#define CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
17+
18+
#include <stdint.h>
19+
20+
typedef _Float16 float16;
21+
22+
#ifdef __AMDGPU__
23+
24+
// The ROCm device library uses control globals to alter codegen for the
25+
// different targets. To avoid needing to link them in manually, we simply
26+
// define them here.
27+
extern "C" {
28+
extern const inline uint8_t __oclc_unsafe_math_opt = 0;
29+
extern const inline uint8_t __oclc_daz_opt = 0;
30+
extern const inline uint8_t __oclc_correctly_rounded_sqrt32 = 1;
31+
extern const inline uint8_t __oclc_finite_only_opt = 0;
32+
extern const inline uint32_t __oclc_ISA_version = 9000;
33+
}
34+
35+
// These aliases cause Clang to emit the control constants with ODR linkage.
36+
// This allows us to link against the symbols without preventing them from being
37+
// optimized out or causing symbol collisions.
38+
[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__;
39+
[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__;
40+
[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t
41+
__oclc_correctly_rounded_sqrt32__;
42+
[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__;
43+
[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__;
44+
45+
#endif // __AMDGPU__
46+
47+
#ifdef CUDA_MATH_FOUND
48+
49+
extern "C" {
50+
51+
float __nv_acosf(float);
52+
float __nv_acoshf(float);
53+
float __nv_asinf(float);
54+
float __nv_asinhf(float);
55+
float __nv_atanf(float);
56+
float __nv_atanhf(float);
57+
float __nv_cbrtf(float);
58+
float __nv_cosf(float);
59+
float __nv_coshf(float);
60+
float __nv_cospif(float);
61+
float __nv_erff(float);
62+
float __nv_expf(float);
63+
float __nv_exp10f(float);
64+
float __nv_exp2f(float);
65+
float __nv_expm1f(float);
66+
float __nv_logf(float);
67+
float __nv_log10f(float);
68+
float __nv_log1pf(float);
69+
float __nv_log2f(float);
70+
float __nv_sinf(float);
71+
void __nv_sincosf(float, float *, float *);
72+
float __nv_sinhf(float);
73+
float __nv_sinpif(float);
74+
float __nv_tanf(float);
75+
float __nv_tanhf(float);
76+
} // extern "C"
77+
78+
#endif // CUDA_MATH_FOUND
79+
80+
#ifdef HIP_MATH_FOUND
81+
82+
extern "C" {
83+
84+
float __ocml_acos_f32(float);
85+
float __ocml_acosh_f32(float);
86+
float __ocml_asin_f32(float);
87+
float __ocml_asinh_f32(float);
88+
float __ocml_atan_f32(float);
89+
float __ocml_atanh_f32(float);
90+
float __ocml_cbrt_f32(float);
91+
float __ocml_cos_f32(float);
92+
float __ocml_cosh_f32(float);
93+
float __ocml_cospi_f32(float);
94+
float __ocml_erf_f32(float);
95+
float __ocml_exp_f32(float);
96+
float __ocml_exp10_f32(float);
97+
float __ocml_exp2_f32(float);
98+
float __ocml_expm1_f32(float);
99+
float __ocml_log_f32(float);
100+
float __ocml_log10_f32(float);
101+
float __ocml_log1p_f32(float);
102+
float __ocml_log2_f32(float);
103+
float __ocml_sin_f32(float);
104+
float __ocml_sincos_f32(float, float *);
105+
float __ocml_sinh_f32(float);
106+
float __ocml_sinpi_f32(float);
107+
float __ocml_tan_f32(float);
108+
float __ocml_tanh_f32(float);
109+
} // extern "C"
110+
111+
#endif // HIP_MATH_FOUND
112+
113+
#endif // CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP

0 commit comments

Comments
 (0)