diff --git a/offload/unittests/CMakeLists.txt b/offload/unittests/CMakeLists.txt
index a0d5c01263056..308849a8364ac 100644
--- a/offload/unittests/CMakeLists.txt
+++ b/offload/unittests/CMakeLists.txt
@@ -18,6 +18,9 @@ endif ()
 set(OFFLOAD_UNITTESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
 function(add_offload_test_device_code test_filename test_name)
+  cmake_parse_arguments(
+    "OFFLOAD_TESTS" "WITH_DEVICE_MATH_LIBS" "" "" ${ARGN})
+
   set(SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${test_filename})
   set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
@@ -37,13 +40,25 @@ function(add_offload_test_device_code test_filename test_name)
   endif()
 
   if(nvptx_arch AND CUDAToolkit_FOUND)
+    set(nvptx_compile_flags ${OFFLOAD_TESTS_UNPARSED_ARGUMENTS})
+
+    if(OFFLOAD_TESTS_WITH_DEVICE_MATH_LIBS)
+      file(GLOB libdevice_paths "${CUDAToolkit_LIBRARY_ROOT}/nvvm/libdevice/libdevice.*.bc")
+      if(libdevice_paths)
+        list(GET libdevice_paths 0 libdevice_path)
+        list(APPEND nvptx_compile_flags "-Xclang" "-mlink-builtin-bitcode")
+        list(APPEND nvptx_compile_flags "-Xclang" "${libdevice_path}")
+        list(APPEND nvptx_compile_flags "-DCUDA_MATH_FOUND=1")
+      endif()
+    endif()
+
     set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.nvptx64.bin")
     add_custom_command(
       OUTPUT ${output_file}
       COMMAND ${CMAKE_CXX_COMPILER} -I${OFFLOAD_UNITTESTS_DIR}
              --target=nvptx64-nvidia-cuda -march=${nvptx_arch}
-             -nogpulib --cuda-path=${cuda_path} -flto ${ARGN}
+             -nogpulib --cuda-path=${cuda_path} -flto ${nvptx_compile_flags}
              ${SRC_PATH} -o ${output_file}
       DEPENDS ${SRC_PATH}
     )
@@ -62,13 +77,25 @@ function(add_offload_test_device_code test_filename test_name)
   endif()
 
   if(amdgpu_arch)
+    set(amdgpu_compile_flags ${OFFLOAD_TESTS_UNPARSED_ARGUMENTS})
+
+    if(OFFLOAD_TESTS_WITH_DEVICE_MATH_LIBS)
+      find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
+      if(AMDDeviceLibs_FOUND)
+        get_target_property(ocml_path ocml IMPORTED_LOCATION)
+        list(APPEND amdgpu_compile_flags "-Xclang" "-mlink-builtin-bitcode")
+        list(APPEND amdgpu_compile_flags "-Xclang" "${ocml_path}")
+        list(APPEND amdgpu_compile_flags "-DHIP_MATH_FOUND=1")
+      endif()
+    endif()
+
     set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.amdgpu.bin")
     add_custom_command(
       OUTPUT ${output_file}
       COMMAND ${CMAKE_CXX_COMPILER} -I${OFFLOAD_UNITTESTS_DIR}
              --target=amdgcn-amd-amdhsa -mcpu=${amdgpu_arch}
-             -nogpulib -flto ${ARGN} ${SRC_PATH} -o ${output_file}
+             -nogpulib -flto ${amdgpu_compile_flags} ${SRC_PATH} -o ${output_file}
       DEPENDS ${SRC_PATH}
     )
     add_custom_target(${test_name}.amdgpu DEPENDS ${output_file})
diff --git a/offload/unittests/Conformance/device_code/CMakeLists.txt b/offload/unittests/Conformance/device_code/CMakeLists.txt
index 789dd167bb9ff..a0c5369f24ae1 100644
--- a/offload/unittests/Conformance/device_code/CMakeLists.txt
+++ b/offload/unittests/Conformance/device_code/CMakeLists.txt
@@ -1,4 +1,10 @@
+add_offload_test_device_code(CUDAMath.cpp cuda-math WITH_DEVICE_MATH_LIBS -O3 -stdlib -fno-builtin)
+add_offload_test_device_code(HIPMath.cpp hip-math WITH_DEVICE_MATH_LIBS -O3 -stdlib -fno-builtin)
 add_offload_test_device_code(LLVMLibm.cpp llvm-libm -O3 -stdlib -fno-builtin)
 
-add_custom_target(conformance_device_binaries DEPENDS llvm-libm.bin)
+add_custom_target(conformance_device_binaries DEPENDS
+  cuda-math.bin
+  hip-math.bin
+  llvm-libm.bin
+)
 
 set(OFFLOAD_CONFORMANCE_DEVICE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/Conformance/device_code/CUDAMath.cpp b/offload/unittests/Conformance/device_code/CUDAMath.cpp
new file mode 100644
index 0000000000000..a351e924b8f89
--- /dev/null
+++ b/offload/unittests/Conformance/device_code/CUDAMath.cpp
@@ -0,0 +1,178 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the implementation of the device kernels that wrap the
+/// math functions from the cuda-math provider.
+///
+//===----------------------------------------------------------------------===//
+
+#ifdef CUDA_MATH_FOUND
+
+#include "Conformance/device_code/DeviceAPIs.hpp"
+#include "Conformance/device_code/KernelRunner.hpp"
+
+#include <gpuintrin.h>
+#include <stddef.h>
+
+using namespace kernels;
+
+//===----------------------------------------------------------------------===//
+// Helpers
+//===----------------------------------------------------------------------===//
+
+static inline float sincosfSin(float X) {
+  float SinX, CosX;
+  __nv_sincosf(X, &SinX, &CosX);
+  return SinX;
+}
+
+static inline float sincosfCos(float X) {
+  float SinX, CosX;
+  __nv_sincosf(X, &SinX, &CosX);
+  return CosX;
+}
+
+//===----------------------------------------------------------------------===//
+// Kernels
+//===----------------------------------------------------------------------===//
+
+extern "C" {
+
+__gpu_kernel void acosfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_acosf>(NumElements, Out, X);
+}
+
+__gpu_kernel void acoshfKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_acoshf>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_asinf>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinhfKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_asinhf>(NumElements, Out, X);
+}
+
+__gpu_kernel void atanfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_atanf>(NumElements, Out, X);
+}
+
+__gpu_kernel void atanhfKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_atanhf>(NumElements, Out, X);
+}
+
+__gpu_kernel void cbrtfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_cbrtf>(NumElements, Out, X);
+}
+
+__gpu_kernel void cosfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_cosf>(NumElements, Out, X);
+}
+
+__gpu_kernel void coshfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_coshf>(NumElements, Out, X);
+}
+
+__gpu_kernel void cospifKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_cospif>(NumElements, Out, X);
+}
+
+__gpu_kernel void erffKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_erff>(NumElements, Out, X);
+}
+
+__gpu_kernel void expfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_expf>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp10fKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_exp10f>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp2fKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_exp2f>(NumElements, Out, X);
+}
+
+__gpu_kernel void expm1fKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_expm1f>(NumElements, Out, X);
+}
+
+__gpu_kernel void logfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_logf>(NumElements, Out, X);
+}
+
+__gpu_kernel void log10fKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_log10f>(NumElements, Out, X);
+}
+
+__gpu_kernel void log1pfKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_log1pf>(NumElements, Out, X);
+}
+
+__gpu_kernel void log2fKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_log2f>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_sinf>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosfSinKernel(const float *X, float *Out,
+                                   size_t NumElements) noexcept {
+  runKernelBody<sincosfSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosfCosKernel(const float *X, float *Out,
+                                   size_t NumElements) noexcept {
+  runKernelBody<sincosfCos>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinhfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_sinhf>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinpifKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__nv_sinpif>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__nv_tanf>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanhfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__nv_tanhf>(NumElements, Out, X);
+}
+} // extern "C"
+
+#endif // CUDA_MATH_FOUND
diff --git a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
new file mode 100644
index 0000000000000..8476dcbeff0c9
--- /dev/null
+++ b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
@@ -0,0 +1,113 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains platform-specific definitions and forward declarations
+/// for device-side APIs used by the kernels.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
+#define CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
+
+#include <stdint.h>
+
+typedef _Float16 float16;
+
+#ifdef __AMDGPU__
+
+// The ROCm device library uses control globals to alter codegen for the
+// different targets. To avoid needing to link them in manually, we simply
+// define them here.
+extern "C" {
+extern const inline uint8_t __oclc_unsafe_math_opt = 0;
+extern const inline uint8_t __oclc_daz_opt = 0;
+extern const inline uint8_t __oclc_correctly_rounded_sqrt32 = 1;
+extern const inline uint8_t __oclc_finite_only_opt = 0;
+extern const inline uint32_t __oclc_ISA_version = 9000;
+}
+
+// These aliases cause Clang to emit the control constants with ODR linkage.
+// This allows us to link against the symbols without preventing them from being
+// optimized out or causing symbol collisions.
+[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__;
+[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__;
+[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t
+    __oclc_correctly_rounded_sqrt32__;
+[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__;
+[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__;
+
+#endif // __AMDGPU__
+
+#ifdef CUDA_MATH_FOUND
+
+extern "C" {
+
+float __nv_acosf(float);
+float __nv_acoshf(float);
+float __nv_asinf(float);
+float __nv_asinhf(float);
+float __nv_atanf(float);
+float __nv_atanhf(float);
+float __nv_cbrtf(float);
+float __nv_cosf(float);
+float __nv_coshf(float);
+float __nv_cospif(float);
+float __nv_erff(float);
+float __nv_expf(float);
+float __nv_exp10f(float);
+float __nv_exp2f(float);
+float __nv_expm1f(float);
+float __nv_logf(float);
+float __nv_log10f(float);
+float __nv_log1pf(float);
+float __nv_log2f(float);
+float __nv_sinf(float);
+void __nv_sincosf(float, float *, float *);
+float __nv_sinhf(float);
+float __nv_sinpif(float);
+float __nv_tanf(float);
+float __nv_tanhf(float);
+} // extern "C"
+
+#endif // CUDA_MATH_FOUND
+
+#ifdef HIP_MATH_FOUND
+
+extern "C" {
+
+float __ocml_acos_f32(float);
+float __ocml_acosh_f32(float);
+float __ocml_asin_f32(float);
+float __ocml_asinh_f32(float);
+float __ocml_atan_f32(float);
+float __ocml_atanh_f32(float);
+float __ocml_cbrt_f32(float);
+float __ocml_cos_f32(float);
+float __ocml_cosh_f32(float);
+float __ocml_cospi_f32(float);
+float __ocml_erf_f32(float);
+float __ocml_exp_f32(float);
+float __ocml_exp10_f32(float);
+float __ocml_exp2_f32(float);
+float __ocml_expm1_f32(float);
+float __ocml_log_f32(float);
+float __ocml_log10_f32(float);
+float __ocml_log1p_f32(float);
+float __ocml_log2_f32(float);
+float __ocml_sin_f32(float);
+float __ocml_sincos_f32(float, float *);
+float __ocml_sinh_f32(float);
+float __ocml_sinpi_f32(float);
+float __ocml_tan_f32(float);
+float __ocml_tanh_f32(float);
+} // extern "C"
+
+#endif // HIP_MATH_FOUND
+
+#endif // CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
diff --git a/offload/unittests/Conformance/device_code/HIPMath.cpp b/offload/unittests/Conformance/device_code/HIPMath.cpp
new file mode 100644
index 0000000000000..36efe6b2696ab
--- /dev/null
+++ b/offload/unittests/Conformance/device_code/HIPMath.cpp
@@ -0,0 +1,178 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the implementation of the device kernels that wrap the
+/// math functions from the hip-math provider.
+///
+//===----------------------------------------------------------------------===//
+
+#ifdef HIP_MATH_FOUND
+
+#include "Conformance/device_code/DeviceAPIs.hpp"
+#include "Conformance/device_code/KernelRunner.hpp"
+
+#include <gpuintrin.h>
+#include <stddef.h>
+
+using namespace kernels;
+
+//===----------------------------------------------------------------------===//
+// Helpers
+//===----------------------------------------------------------------------===//
+
+static inline float sincosfSin(float X) {
+  float CosX;
+  float SinX = __ocml_sincos_f32(X, &CosX);
+  return SinX;
+}
+
+static inline float sincosfCos(float X) {
+  float CosX;
+  float SinX = __ocml_sincos_f32(X, &CosX);
+  return CosX;
+}
+
+//===----------------------------------------------------------------------===//
+// Kernels
+//===----------------------------------------------------------------------===//
+
+extern "C" {
+
+__gpu_kernel void acosfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_acos_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void acoshfKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_acosh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_asin_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinhfKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_asinh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void atanfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_atan_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void atanhfKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_atanh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void cbrtfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_cbrt_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void cosfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_cos_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void coshfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_cosh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void cospifKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_cospi_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void erffKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_erf_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void expfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp10fKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp10_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp2fKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_exp2_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void expm1fKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_expm1_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void logfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_log_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void log10fKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_log10_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void log1pfKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_log1p_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void log2fKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_log2_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_sin_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosfSinKernel(const float *X, float *Out,
+                                   size_t NumElements) noexcept {
+  runKernelBody<sincosfSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosfCosKernel(const float *X, float *Out,
+                                   size_t NumElements) noexcept {
+  runKernelBody<sincosfCos>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinhfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_sinh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinpifKernel(const float *X, float *Out,
+                               size_t NumElements) noexcept {
+  runKernelBody<__ocml_sinpi_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanfKernel(const float *X, float *Out,
+                             size_t NumElements) noexcept {
+  runKernelBody<__ocml_tan_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanhfKernel(const float *X, float *Out,
+                              size_t NumElements) noexcept {
+  runKernelBody<__ocml_tanh_f32>(NumElements, Out, X);
+}
+} // extern "C"
+
+#endif // HIP_MATH_FOUND
diff --git a/offload/unittests/Conformance/device_code/Common.hpp b/offload/unittests/Conformance/device_code/KernelRunner.hpp
similarity index 70%
rename from offload/unittests/Conformance/device_code/Common.hpp
rename to offload/unittests/Conformance/device_code/KernelRunner.hpp
index bcf3ac617b54c..e64a62fbdf018 100644
--- a/offload/unittests/Conformance/device_code/Common.hpp
+++ b/offload/unittests/Conformance/device_code/KernelRunner.hpp
@@ -7,21 +7,19 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This file contains common utilities for defining device kernel wrappers to
-/// math functions.
+/// This file contains the definition of the runKernelBody, a template helper
+/// that executes the per-thread logic of a math function's kernel wrapper.
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef CONFORMANCE_DEVICE_CODE_COMMON_HPP
-#define CONFORMANCE_DEVICE_CODE_COMMON_HPP
+#ifndef CONFORMANCE_DEVICE_CODE_KERNELRUNNER_HPP
+#define CONFORMANCE_DEVICE_CODE_KERNELRUNNER_HPP
 
 #include <gpuintrin.h>
 #include <stddef.h>
 #include <stdint.h>
 
-namespace common {
-
-typedef _Float16 float16;
+namespace kernels {
 
 template <auto Func, typename OutType, typename... InTypes>
 void runKernelBody(size_t NumElements, OutType *Out, const InTypes *...Ins) {
@@ -32,6 +30,6 @@ void runKernelBody(size_t NumElements, OutType *Out, const InTypes *...Ins) {
     Out[Index] = Func(Ins[Index]...);
   }
 }
-} // namespace common
+} // namespace kernels
 
-#endif // CONFORMANCE_DEVICE_CODE_COMMON_HPP
+#endif // CONFORMANCE_DEVICE_CODE_KERNELRUNNER_HPP
diff --git a/offload/unittests/Conformance/device_code/LLVMLibm.cpp b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
index f137ba3d23752..8869d87017486 100644
--- a/offload/unittests/Conformance/device_code/LLVMLibm.cpp
+++ b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
@@ -12,13 +12,14 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "Conformance/device_code/Common.hpp"
+#include "Conformance/device_code/DeviceAPIs.hpp"
+#include "Conformance/device_code/KernelRunner.hpp"
 
 #include <gpuintrin.h>
 #include <math.h>
 #include <stddef.h>
 
-using namespace common;
+using namespace kernels;
 
 //===----------------------------------------------------------------------===//
 // Helpers
diff --git a/offload/unittests/Conformance/include/mathtest/TestRunner.hpp b/offload/unittests/Conformance/include/mathtest/TestRunner.hpp
index f89d151d0161e..ab17f1d83768a 100644
--- a/offload/unittests/Conformance/include/mathtest/TestRunner.hpp
+++ b/offload/unittests/Conformance/include/mathtest/TestRunner.hpp
@@ -41,11 +41,11 @@ void printPreamble(const TestConfig &Config, size_t Index,
                    size_t Total) noexcept {
   using FunctionConfig = FunctionConfig<Func>;
 
-  llvm::outs() << "[" << (Index + 1) << "/" << Total << "] "
+  llvm::errs() << "[" << (Index + 1) << "/" << Total << "] "
                << "Running conformance test '" << FunctionConfig::Name
                << "' with '" << Config.Provider << "' on '" << Config.Platform
                << "'\n";
-  llvm::outs().flush();
+  llvm::errs().flush();
 }
 
 template