Skip to content

[Offload][Conformance] Add support for CUDA Math and HIP Math providers #152362

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Aug 7, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions offload/unittests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ endif ()
set(OFFLOAD_UNITTESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR})

function(add_offload_test_device_code test_filename test_name)
cmake_parse_arguments(
"ARGS" "WITH_DEVICE_MATH_LIBS" "" "" ${ARGN})

set(SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${test_filename})
set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)

Expand All @@ -37,13 +40,25 @@ function(add_offload_test_device_code test_filename test_name)
endif()

if(nvptx_arch AND CUDAToolkit_FOUND)
set(nvptx_compile_flags ${ARGS_UNPARSED_ARGUMENTS})

if(ARGS_WITH_DEVICE_MATH_LIBS)
file(GLOB libdevice_paths "${CUDAToolkit_LIBRARY_ROOT}/nvvm/libdevice/libdevice.*.bc")
if(libdevice_paths)
list(GET libdevice_paths 0 libdevice_path)
list(APPEND nvptx_compile_flags "-Xclang" "-mlink-builtin-bitcode")
list(APPEND nvptx_compile_flags "-Xclang" "${libdevice_path}")
list(APPEND nvptx_compile_flags "-DCUDA_MATH_FOUND=1")
endif()
endif()

set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.nvptx64.bin")
add_custom_command(
OUTPUT ${output_file}
COMMAND ${CMAKE_CXX_COMPILER}
-I${OFFLOAD_UNITTESTS_DIR}
--target=nvptx64-nvidia-cuda -march=${nvptx_arch}
-nogpulib --cuda-path=${cuda_path} -flto ${ARGN}
-nogpulib --cuda-path=${cuda_path} -flto ${nvptx_compile_flags}
${SRC_PATH} -o ${output_file}
DEPENDS ${SRC_PATH}
)
Expand All @@ -62,13 +77,25 @@ function(add_offload_test_device_code test_filename test_name)
endif()

if(amdgpu_arch)
set(amdgpu_compile_flags ${ARGS_UNPARSED_ARGUMENTS})

if(ARGS_WITH_DEVICE_MATH_LIBS)
find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
if(AMDDeviceLibs_FOUND)
get_target_property(ocml_path ocml IMPORTED_LOCATION)
list(APPEND amdgpu_compile_flags "-Xclang" "-mlink-builtin-bitcode")
list(APPEND amdgpu_compile_flags "-Xclang" "${ocml_path}")
list(APPEND amdgpu_compile_flags "-DHIP_MATH_FOUND=1")
endif()
endif()

set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.amdgpu.bin")
add_custom_command(
OUTPUT ${output_file}
COMMAND ${CMAKE_CXX_COMPILER}
-I${OFFLOAD_UNITTESTS_DIR}
--target=amdgcn-amd-amdhsa -mcpu=${amdgpu_arch}
-nogpulib -flto ${ARGN} ${SRC_PATH} -o ${output_file}
-nogpulib -flto ${amdgpu_compile_flags} ${SRC_PATH} -o ${output_file}
DEPENDS ${SRC_PATH}
)
add_custom_target(${test_name}.amdgpu DEPENDS ${output_file})
Expand Down
8 changes: 7 additions & 1 deletion offload/unittests/Conformance/device_code/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
add_offload_test_device_code(CUDAMath.cpp cuda-math WITH_DEVICE_MATH_LIBS -O3 -stdlib -fno-builtin)
add_offload_test_device_code(HIPMath.cpp hip-math WITH_DEVICE_MATH_LIBS -O3 -stdlib -fno-builtin)
add_offload_test_device_code(LLVMLibm.cpp llvm-libm -O3 -stdlib -fno-builtin)

add_custom_target(conformance_device_binaries DEPENDS llvm-libm.bin)
add_custom_target(conformance_device_binaries DEPENDS
cuda-math.bin
hip-math.bin
llvm-libm.bin
)
set(OFFLOAD_CONFORMANCE_DEVICE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
178 changes: 178 additions & 0 deletions offload/unittests/Conformance/device_code/CUDAMath.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains the implementation of the device kernels that wrap the
/// math functions from the cuda-math provider.
///
//===----------------------------------------------------------------------===//

#ifdef CUDA_MATH_FOUND

#include "Conformance/device_code/DeviceAPIs.hpp"
#include "Conformance/device_code/KernelRunner.hpp"

#include <gpuintrin.h>
#include <stddef.h>

using namespace kernels;

//===----------------------------------------------------------------------===//
// Helpers
//===----------------------------------------------------------------------===//

/// Adapts the two-output __nv_sincosf to the single-output shape expected by
/// runKernelBody, returning only the sine component.
static inline float sincosfSin(float X) {
  float SinResult;
  float CosDiscarded;
  __nv_sincosf(X, &SinResult, &CosDiscarded);
  return SinResult;
}

/// Adapts the two-output __nv_sincosf to the single-output shape expected by
/// runKernelBody, returning only the cosine component.
static inline float sincosfCos(float X) {
  float SinDiscarded;
  float CosResult;
  __nv_sincosf(X, &SinDiscarded, &CosResult);
  return CosResult;
}

//===----------------------------------------------------------------------===//
// Kernels
//===----------------------------------------------------------------------===//

// Each kernel below is a thin wrapper that applies one cuda-math (libdevice)
// entry point element-wise over the input buffer via runKernelBody. The
// kernels use C linkage so the conformance host runner can look them up by
// their unmangled names.
extern "C" {

__gpu_kernel void acosfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_acosf>(NumElements, Out, X);
}

__gpu_kernel void acoshfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_acoshf>(NumElements, Out, X);
}

__gpu_kernel void asinfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_asinf>(NumElements, Out, X);
}

__gpu_kernel void asinhfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_asinhf>(NumElements, Out, X);
}

__gpu_kernel void atanfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_atanf>(NumElements, Out, X);
}

__gpu_kernel void atanhfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_atanhf>(NumElements, Out, X);
}

__gpu_kernel void cbrtfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_cbrtf>(NumElements, Out, X);
}

__gpu_kernel void cosfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_cosf>(NumElements, Out, X);
}

__gpu_kernel void coshfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_coshf>(NumElements, Out, X);
}

__gpu_kernel void cospifKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_cospif>(NumElements, Out, X);
}

__gpu_kernel void erffKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_erff>(NumElements, Out, X);
}

__gpu_kernel void expfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_expf>(NumElements, Out, X);
}

__gpu_kernel void exp10fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_exp10f>(NumElements, Out, X);
}

__gpu_kernel void exp2fKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_exp2f>(NumElements, Out, X);
}

__gpu_kernel void expm1fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_expm1f>(NumElements, Out, X);
}

__gpu_kernel void logfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_logf>(NumElements, Out, X);
}

__gpu_kernel void log10fKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_log10f>(NumElements, Out, X);
}

__gpu_kernel void log1pfKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_log1pf>(NumElements, Out, X);
}

__gpu_kernel void log2fKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_log2f>(NumElements, Out, X);
}

__gpu_kernel void sinfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_sinf>(NumElements, Out, X);
}

// The two sincosf kernels go through the file-local helpers above because
// runKernelBody takes a single-result function, while __nv_sincosf produces
// both results through output pointers.
__gpu_kernel void sincosfSinKernel(const float *X, float *Out,
                                   size_t NumElements) noexcept {
  runKernelBody<sincosfSin>(NumElements, Out, X);
}

__gpu_kernel void sincosfCosKernel(const float *X, float *Out,
                                   size_t NumElements) noexcept {
  runKernelBody<sincosfCos>(NumElements, Out, X);
}

__gpu_kernel void sinhfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_sinhf>(NumElements, Out, X);
}

__gpu_kernel void sinpifKernel(const float *X, float *Out,
                               size_t NumElements) noexcept {
  runKernelBody<__nv_sinpif>(NumElements, Out, X);
}

__gpu_kernel void tanfKernel(const float *X, float *Out,
                             size_t NumElements) noexcept {
  runKernelBody<__nv_tanf>(NumElements, Out, X);
}

__gpu_kernel void tanhfKernel(const float *X, float *Out,
                              size_t NumElements) noexcept {
  runKernelBody<__nv_tanhf>(NumElements, Out, X);
}
} // extern "C"

#endif // CUDA_MATH_FOUND
113 changes: 113 additions & 0 deletions offload/unittests/Conformance/device_code/DeviceAPIs.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains platform-specific definitions and forward declarations
/// for device-side APIs used by the kernels.
///
//===----------------------------------------------------------------------===//

#ifndef CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
#define CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP

#include <stdint.h>

typedef _Float16 float16;

#ifdef __AMDGPU__

// The ROCm device library uses control globals to alter codegen for the
// different targets. To avoid needing to link them in manually, we simply
// define them here. The values are chosen for strict conformance testing:
// unsafe-math and finite-only shortcuts off, correctly rounded sqrt on
// (semantics per the ROCm device-libs control constants — confirm against
// the ocml documentation if these are extended).
extern "C" {
extern const inline uint8_t __oclc_unsafe_math_opt = 0;
extern const inline uint8_t __oclc_daz_opt = 0;
extern const inline uint8_t __oclc_correctly_rounded_sqrt32 = 1;
extern const inline uint8_t __oclc_finite_only_opt = 0;
// NOTE(review): ISA version is hardcoded to 9000 (presumably gfx900) —
// confirm this is sufficient for all architectures the tests target.
extern const inline uint32_t __oclc_ISA_version = 9000;
}

// These aliases cause Clang to emit the control constants with ODR linkage.
// This allows us to link against the symbols without preventing them from being
// optimized out or causing symbol collisions.
[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__;
[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__;
[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t
    __oclc_correctly_rounded_sqrt32__;
[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__;
[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__;

#endif // __AMDGPU__

#ifdef CUDA_MATH_FOUND

// Forward declarations for the single-precision math entry points supplied by
// NVIDIA's libdevice bitcode library, which the build links in via
// -mlink-builtin-bitcode when CUDA_MATH_FOUND is defined. Declared here so the
// conformance kernels can reference them without a CUDA toolkit header.
extern "C" {

float __nv_acosf(float);
float __nv_acoshf(float);
float __nv_asinf(float);
float __nv_asinhf(float);
float __nv_atanf(float);
float __nv_atanhf(float);
float __nv_cbrtf(float);
float __nv_cosf(float);
float __nv_coshf(float);
float __nv_cospif(float);
float __nv_erff(float);
float __nv_expf(float);
float __nv_exp10f(float);
float __nv_exp2f(float);
float __nv_expm1f(float);
float __nv_logf(float);
float __nv_log10f(float);
float __nv_log1pf(float);
float __nv_log2f(float);
float __nv_sinf(float);
// Computes sine and cosine together; results are written through the two
// output pointers (sine first, then cosine — confirm against libdevice docs).
void __nv_sincosf(float, float *, float *);
float __nv_sinhf(float);
float __nv_sinpif(float);
float __nv_tanf(float);
float __nv_tanhf(float);
} // extern "C"

#endif // CUDA_MATH_FOUND

#ifdef HIP_MATH_FOUND

// Forward declarations for the single-precision math entry points supplied by
// the ROCm device library (ocml bitcode), which the build links in via
// -mlink-builtin-bitcode when HIP_MATH_FOUND is defined. Declared here so the
// conformance kernels can reference them without a HIP SDK header.
extern "C" {

float __ocml_acos_f32(float);
float __ocml_acosh_f32(float);
float __ocml_asin_f32(float);
float __ocml_asinh_f32(float);
float __ocml_atan_f32(float);
float __ocml_atanh_f32(float);
float __ocml_cbrt_f32(float);
float __ocml_cos_f32(float);
float __ocml_cosh_f32(float);
float __ocml_cospi_f32(float);
float __ocml_erf_f32(float);
float __ocml_exp_f32(float);
float __ocml_exp10_f32(float);
float __ocml_exp2_f32(float);
float __ocml_expm1_f32(float);
float __ocml_log_f32(float);
float __ocml_log10_f32(float);
float __ocml_log1p_f32(float);
float __ocml_log2_f32(float);
float __ocml_sin_f32(float);
// NOTE(review): presumably returns the sine and stores the cosine through the
// pointer argument, per ocml convention — confirm against the ocml spec.
float __ocml_sincos_f32(float, float *);
float __ocml_sinh_f32(float);
float __ocml_sinpi_f32(float);
float __ocml_tan_f32(float);
float __ocml_tanh_f32(float);
} // extern "C"

#endif // HIP_MATH_FOUND

#endif // CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
Loading
Loading