diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 6ca134b12a479..cf8c9902ca7f0 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -40,6 +40,7 @@ add_unittest_framework_library( LibcGpuBenchmarkMain.cpp HDRS LibcGpuBenchmark.h + Random.h DEPENDS libc.benchmarks.gpu.timing.timing libc.hdr.stdint_proxy @@ -49,12 +50,17 @@ add_unittest_framework_library( libc.src.__support.CPP.algorithm libc.src.__support.CPP.atomic libc.src.__support.CPP.array + libc.src.__support.CPP.optional libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.nearest_integer_operations libc.src.__support.FPUtil.sqrt + libc.src.__support.sign libc.src.__support.fixedvector libc.src.__support.GPU.utils libc.src.__support.time.gpu.time_utils + libc.src.__support.macros.attributes + libc.src.__support.macros.config + libc.src.__support.macros.properties.types libc.src.stdio.printf libc.src.time.clock ) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index e36e93c7efc18..b310d49a60fd1 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -1,6 +1,8 @@ #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H +#include "benchmarks/gpu/Random.h" + #include "benchmarks/gpu/timing/timing.h" #include "hdr/stdint_proxy.h" @@ -175,94 +177,6 @@ class Benchmark { } }; -class RandomGenerator { - uint64_t state; - - static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept { - x += 0x9E3779B97F4A7C15ULL; - x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL; - x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL; - x = (x ^ (x >> 31)); - return x ? 
x : 0x9E3779B97F4A7C15ULL; - } - -public: - explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept - : state(splitmix64(seed)) {} - - LIBC_INLINE uint64_t next64() noexcept { - uint64_t x = state; - x ^= x >> 12; - x ^= x << 25; - x ^= x >> 27; - state = x; - return x * 0x2545F4914F6CDD1DULL; - } - - LIBC_INLINE uint32_t next32() noexcept { - return static_cast(next64() >> 32); - } -}; - -// We want random floating-point values whose *unbiased* exponent e is -// approximately uniform in [min_exp, max_exp]. That is, -// 2^min_exp <= |value| < 2^(max_exp + 1). -// Caveats / boundaries: -// - e = -EXP_BIAS ==> subnormal range (biased exponent = 0). We ensure a -// non-zero mantissa so we don't accidentally produce 0. -// - e in [1 - EXP_BIAS, EXP_BIAS] ==> normal numbers. -// - e = EXP_BIAS + 1 ==> Inf/NaN. We do not include it by default; max_exp -// defaults to EXP_BIAS. -template -static T -get_rand_input(RandomGenerator &rng, - int min_exp = -LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS, - int max_exp = LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS) { - using FPBits = LIBC_NAMESPACE::fputil::FPBits; - using Storage = typename FPBits::StorageType; - - // Sanitize and clamp requested range to what the format supports - if (min_exp > max_exp) { - auto tmp = min_exp; - min_exp = max_exp; - max_exp = tmp; - }; - min_exp = cpp::max(min_exp, -FPBits::EXP_BIAS); - max_exp = cpp::min(max_exp, FPBits::EXP_BIAS); - - // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo - // bias - auto sample_in_range = [&](uint64_t r) -> int32_t { - const uint64_t range = static_cast( - static_cast(max_exp) - static_cast(min_exp) + 1); - const uint64_t threshold = (-range) % range; - while (r < threshold) - r = rng.next64(); - return static_cast(min_exp + static_cast(r % range)); - }; - const int32_t e = sample_in_range(rng.next64()); - - // Start from random bits to get random sign and mantissa - FPBits xbits([&] { - if constexpr (cpp::is_same_v) - return 
FPBits(rng.next64()); - else - return FPBits(rng.next32()); - }()); - - if (e == -FPBits::EXP_BIAS) { - // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0 - xbits.set_biased_exponent(Storage(0)); - if (xbits.get_mantissa() == Storage(0)) - xbits.set_mantissa(Storage(1)); - } else { - // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS] - const int32_t biased = e + FPBits::EXP_BIAS; - xbits.set_biased_exponent(static_cast(biased)); - } - return xbits.get_val(); -} - template class MathPerf { static LIBC_INLINE uint64_t make_seed(uint64_t base_seed, uint64_t salt) { const uint64_t tid = gpu::get_thread_id(); @@ -271,9 +185,9 @@ template class MathPerf { public: // Returns cycles-per-call (lower is better) - template - static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp, - uint32_t call_index) { + template + static uint64_t run_throughput(T (*f)(T), const Dist &dist, + uint32_t call_index) { cpp::array inputs; uint64_t base_seed = static_cast(call_index); @@ -281,7 +195,7 @@ template class MathPerf { RandomGenerator rng(make_seed(base_seed, salt)); for (size_t i = 0; i < N; ++i) - inputs[i] = get_rand_input(rng, min_exp, max_exp); + inputs[i] = dist(rng); uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); @@ -289,11 +203,9 @@ template class MathPerf { } // Returns cycles-per-call (lower is better) - template - static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp, - int arg1_max_exp, int arg2_min_exp, - int arg2_max_exp, - uint32_t call_index) { + template + static uint64_t run_throughput(T (*f)(T, T), const Dist1 &dist1, + const Dist2 &dist2, uint32_t call_index) { cpp::array inputs1; cpp::array inputs2; @@ -302,8 +214,8 @@ template class MathPerf { RandomGenerator rng(make_seed(base_seed, salt)); for (size_t i = 0; i < N; ++i) { - inputs1[i] = get_rand_input(rng, arg1_min_exp, arg1_max_exp); - inputs2[i] = get_rand_input(rng, arg2_min_exp, arg2_max_exp); + inputs1[i] = dist1(rng); + inputs2[i] = 
dist2(rng); } uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); diff --git a/libc/benchmarks/gpu/Random.h b/libc/benchmarks/gpu/Random.h new file mode 100644 index 0000000000000..f7d272289a6d9 --- /dev/null +++ b/libc/benchmarks/gpu/Random.h @@ -0,0 +1,190 @@ +//===-- Pseudo-random number generation utilities ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_BENCHMARKS_GPU_RANDOM_H +#define LLVM_LIBC_BENCHMARKS_GPU_RANDOM_H + +#include "hdr/stdint_proxy.h" +#include "src/__support/CPP/algorithm.h" +#include "src/__support/CPP/optional.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" +#include "src/__support/sign.h" + +namespace LIBC_NAMESPACE_DECL { +namespace benchmarks { + +// Pseudo-random number generator (PRNG) that produces unsigned 64-bit, 32-bit, +// and 16-bit integers. The implementation is based on the xorshift* generator, +// seeded using SplitMix64 for robust initialization. For more details, see: +// https://en.wikipedia.org/wiki/Xorshift +class RandomGenerator { + uint64_t state; + + static LIBC_INLINE uint64_t splitmix64(uint64_t x) noexcept { + x += 0x9E3779B97F4A7C15ULL; + x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL; + x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL; + x = (x ^ (x >> 31)); + return x ? 
x : 0x9E3779B97F4A7C15ULL; + } + +public: + explicit LIBC_INLINE RandomGenerator(uint64_t seed) noexcept + : state(splitmix64(seed)) {} + + LIBC_INLINE uint64_t next64() noexcept { + uint64_t x = state; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + state = x; + return x * 0x2545F4914F6CDD1DULL; + } + + LIBC_INLINE uint32_t next32() noexcept { + return static_cast(next64() >> 32); + } + + LIBC_INLINE uint16_t next16() noexcept { + return static_cast(next64() >> 48); + } +}; + +// Generates random floating-point numbers whose unbiased binary exponent is +// sampled uniformly in `[min_exp, max_exp]`. An exponent of -EXP_BIAS selects +// the subnormal range (biased exponent 0); normals start at 1 - EXP_BIAS. The +// significand is random; the sign is random unless fixed. No Inf/NaN/zero. +template class UniformExponent { + static_assert(cpp::is_same_v || cpp::is_same_v || + cpp::is_same_v, + "UniformExponent supports float16, float, and double"); + + using FPBits = LIBC_NAMESPACE::fputil::FPBits; + using Storage = typename FPBits::StorageType; + +public: + explicit UniformExponent(int min_exp = -FPBits::EXP_BIAS, + int max_exp = FPBits::EXP_BIAS, + cpp::optional forced_sign = cpp::nullopt) + : min_exp(clamp_exponent(cpp::min(min_exp, max_exp))), + max_exp(clamp_exponent(cpp::max(min_exp, max_exp))), + forced_sign(forced_sign) {} + + LIBC_INLINE T operator()(RandomGenerator &rng) const noexcept { + // Sample unbiased exponent e uniformly in [min_exp, max_exp] without modulo + // bias, using rejection sampling + auto sample_in_range = [&](uint64_t r) -> int32_t { + const uint64_t range = static_cast( + static_cast(max_exp) - static_cast(min_exp) + 1); + const uint64_t threshold = (-range) % range; + while (r < threshold) + r = rng.next64(); + return static_cast(min_exp + static_cast(r % range)); + }; + const int32_t e = sample_in_range(rng.next64()); + + // Start from random bits to get random sign and mantissa + FPBits xbits([&] { + if constexpr (cpp::is_same_v) + return
FPBits(rng.next64()); + else if constexpr (cpp::is_same_v) + return FPBits(rng.next32()); + else + return FPBits(rng.next16()); + }()); + + if (e == -FPBits::EXP_BIAS) { + // Subnormal: biased exponent must be 0; ensure mantissa != 0 to avoid 0 + xbits.set_biased_exponent(Storage(0)); + if (xbits.get_mantissa() == Storage(0)) + xbits.set_mantissa(Storage(1)); + } else { + // Normal: biased exponent in [1, 2 * FPBits::EXP_BIAS] + const int32_t biased = e + FPBits::EXP_BIAS; + xbits.set_biased_exponent(static_cast(biased)); + } + + if (forced_sign) + xbits.set_sign(*forced_sign); + + return xbits.get_val(); + } + +private: + static LIBC_INLINE int clamp_exponent(int val) noexcept { + if (val < -FPBits::EXP_BIAS) + return -FPBits::EXP_BIAS; + + if (val > FPBits::EXP_BIAS) + return FPBits::EXP_BIAS; + + return val; + } + + const int min_exp; + const int max_exp; + const cpp::optional forced_sign; +}; + +// Generates random floating-point numbers that are uniformly distributed on +// a linear scale. Values are sampled from `[min_val, max_val)`. 
+template class UniformLinear { + static_assert(cpp::is_same_v || cpp::is_same_v || + cpp::is_same_v, + "UniformLinear supports float16, float, and double"); + + using FPBits = LIBC_NAMESPACE::fputil::FPBits; + using Storage = typename FPBits::StorageType; + + static constexpr T MAX_NORMAL = FPBits::max_normal().get_val(); + +public: + explicit UniformLinear(T min_val = -MAX_NORMAL, T max_val = MAX_NORMAL) + : min_val(clamp_val(cpp::min(min_val, max_val))), + max_val(clamp_val(cpp::max(min_val, max_val))) {} + + LIBC_INLINE T operator()(RandomGenerator &rng) const noexcept { + double u = standard_uniform(rng.next64()); + double a = static_cast(min_val); + double b = static_cast(max_val); + double y = a * (1.0 - u) + b * u; // lerp form: (b - a) may overflow + return static_cast(y); + } + +private: + static LIBC_INLINE T clamp_val(T val) noexcept { + if (val < -MAX_NORMAL) + return -MAX_NORMAL; + + if (val > MAX_NORMAL) + return MAX_NORMAL; + + return val; + } + + static LIBC_INLINE double standard_uniform(uint64_t x) noexcept { + constexpr int PREC_BITS = + LIBC_NAMESPACE::fputil::FPBits::SIG_LEN + 1; + constexpr int SHIFT_BITS = LIBC_NAMESPACE::fputil::FPBits::EXP_LEN; + constexpr double INV = 1.0 / static_cast(1ULL << PREC_BITS); + + return static_cast(x >> SHIFT_BITS) * INV; + } + + const T min_val; + const T max_val; +}; + +} // namespace benchmarks +} // namespace LIBC_NAMESPACE_DECL + +#endif diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt index 8417f23c124a0..53da45d9eb2ba 100644 --- a/libc/benchmarks/gpu/src/math/CMakeLists.txt +++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt @@ -25,15 +25,19 @@ if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) endif() add_benchmark( - sin_benchmark + atan2_benchmark SUITE libc-gpu-math-benchmarks SRCS - sin_benchmark.cpp + atan2_benchmark.cpp + HDRS + platform.h DEPENDS libc.hdr.stdint_proxy - libc.src.math.sin - libc.src.math.sinf + libc.src.__support.macros.attributes + libc.src.__support.macros.config +
libc.src.__support.macros.properties.types + libc.src.math.atan2 COMPILE_OPTIONS ${math_benchmark_flags} LOADER_ARGS @@ -41,14 +45,143 @@ add_benchmark( ) add_benchmark( - atan2_benchmark + exp_benchmark SUITE libc-gpu-math-benchmarks SRCS - atan2_benchmark.cpp + exp_benchmark.cpp + HDRS + platform.h DEPENDS libc.hdr.stdint_proxy - libc.src.math.atan2 + libc.src.__support.macros.attributes + libc.src.__support.macros.config + libc.src.__support.macros.properties.types + libc.src.math.exp + COMPILE_OPTIONS + ${math_benchmark_flags} + LOADER_ARGS + --threads 64 +) + +add_benchmark( + expf_benchmark + SUITE + libc-gpu-math-benchmarks + SRCS + expf_benchmark.cpp + HDRS + platform.h + DEPENDS + libc.hdr.stdint_proxy + libc.src.__support.macros.attributes + libc.src.__support.macros.config + libc.src.__support.macros.properties.types + libc.src.math.expf + COMPILE_OPTIONS + ${math_benchmark_flags} + LOADER_ARGS + --threads 64 +) + +add_benchmark( + expf16_benchmark + SUITE + libc-gpu-math-benchmarks + SRCS + expf16_benchmark.cpp + HDRS + platform.h + DEPENDS + libc.hdr.stdint_proxy + libc.src.__support.macros.attributes + libc.src.__support.macros.config + libc.src.__support.macros.properties.types + libc.src.math.expf16 + COMPILE_OPTIONS + ${math_benchmark_flags} + LOADER_ARGS + --threads 64 +) + +add_benchmark( + log_benchmark + SUITE + libc-gpu-math-benchmarks + SRCS + log_benchmark.cpp + HDRS + platform.h + DEPENDS + libc.hdr.stdint_proxy + libc.src.__support.macros.attributes + libc.src.__support.macros.config + libc.src.__support.macros.properties.types + libc.src.__support.sign + libc.src.math.log + COMPILE_OPTIONS + ${math_benchmark_flags} + LOADER_ARGS + --threads 64 +) + +add_benchmark( + logf_benchmark + SUITE + libc-gpu-math-benchmarks + SRCS + logf_benchmark.cpp + HDRS + platform.h + DEPENDS + libc.hdr.stdint_proxy + libc.src.__support.macros.attributes + libc.src.__support.macros.config + libc.src.__support.macros.properties.types + libc.src.__support.sign 
+ libc.src.math.logf + COMPILE_OPTIONS + ${math_benchmark_flags} + LOADER_ARGS + --threads 64 +) + +add_benchmark( + logf16_benchmark + SUITE + libc-gpu-math-benchmarks + SRCS + logf16_benchmark.cpp + HDRS + platform.h + DEPENDS + libc.hdr.stdint_proxy + libc.src.__support.macros.attributes + libc.src.__support.macros.config + libc.src.__support.macros.properties.types + libc.src.__support.sign + libc.src.math.logf16 + COMPILE_OPTIONS + ${math_benchmark_flags} + LOADER_ARGS + --threads 64 +) + +add_benchmark( + sin_benchmark + SUITE + libc-gpu-math-benchmarks + SRCS + sin_benchmark.cpp + HDRS + platform.h + DEPENDS + libc.hdr.stdint_proxy + libc.src.__support.macros.attributes + libc.src.__support.macros.config + libc.src.__support.macros.properties.types + libc.src.math.sin + libc.src.math.sinf COMPILE_OPTIONS ${math_benchmark_flags} LOADER_ARGS diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp index 82bb0c5d7de49..6039f0c66b2ad 100644 --- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp @@ -9,8 +9,11 @@ #define BM_RANDOM_INPUTS(T, Func, MinExp, MaxExp, N) \ [](uint32_t call_index) { \ - return LIBC_NAMESPACE::benchmarks::MathPerf::run_throughput_in_range< \ - N>(Func, MinExp, MaxExp, MinExp, MaxExp, call_index); \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const UniformExponent dist(MinExp, MaxExp); \ + return MathPerf::template run_throughput(Func, dist, dist, \ + call_index); \ } #define BENCH(T, Name, Func, MinExp, MaxExp) \ diff --git a/libc/benchmarks/gpu/src/math/exp_benchmark.cpp b/libc/benchmarks/gpu/src/math/exp_benchmark.cpp new file mode 100644 index 0000000000000..2398c4b9f17bd --- /dev/null +++ b/libc/benchmarks/gpu/src/math/exp_benchmark.cpp @@ -0,0 +1,59 @@ +//===-- GPU benchmark for exp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmarks/gpu/LibcGpuBenchmark.h" +#include "benchmarks/gpu/Random.h" + +#include "hdr/stdint_proxy.h" +#include "src/math/exp.h" + +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" +#endif + +#define RANDOM_INPUT(T, Func, Dist, Min, Max, N) \ + [](uint32_t call_index) { \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const Dist dist(Min, Max); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ + } + +#define BENCH(T, Name, Func, Dist, Min, Max) \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpGpuBenchmark, Name##_1, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 1)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpGpuBenchmark, Name##_128, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 128)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpGpuBenchmark, Name##_1024, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 1024)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpGpuBenchmark, Name##_4096, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 4096)) + +using LIBC_NAMESPACE::exp; +// NOTE: UniformExponent maps e == -EXP_BIAS (-1023 here) to subnormals. +BENCH(double, ExpSubnormal, exp, UniformExponent, -1023, -1023); +BENCH(double, ExpCoreRange, exp, UniformLinear, -10.0, 10.0); +BENCH(double, ExpFinite, exp, UniformLinear, -745.0, 709.0); +BENCH(double, ExpUnderflow, exp, UniformLinear, -746.0, -745.0); +BENCH(double, ExpOverflow, exp, UniformLinear, 709.0, 710.0); + +#ifdef NVPTX_MATH_FOUND +BENCH(double, NvExpSubnormal, __nv_exp, UniformExponent, -1023, -1023); +BENCH(double, NvExpCoreRange, __nv_exp, UniformLinear, -10.0, 10.0); +BENCH(double, NvExpFinite, __nv_exp, UniformLinear, -745.0, 709.0); +BENCH(double, NvExpUnderflow, __nv_exp, UniformLinear, -746.0, -745.0); +BENCH(double, NvExpOverflow, __nv_exp, UniformLinear, 709.0, 710.0); +#endif + +#ifdef AMDGPU_MATH_FOUND +BENCH(double, AmdExpSubnormal,
__ocml_exp_f64, UniformExponent, -1023, -1023); +BENCH(double, AmdExpCoreRange, __ocml_exp_f64, UniformLinear, -10.0, 10.0); +BENCH(double, AmdExpFinite, __ocml_exp_f64, UniformLinear, -745.0, 709.0); +BENCH(double, AmdExpUnderflow, __ocml_exp_f64, UniformLinear, -746.0, -745.0); +BENCH(double, AmdExpOverflow, __ocml_exp_f64, UniformLinear, 709.0, 710.0); +#endif diff --git a/libc/benchmarks/gpu/src/math/expf16_benchmark.cpp b/libc/benchmarks/gpu/src/math/expf16_benchmark.cpp new file mode 100644 index 0000000000000..20e045b893ec0 --- /dev/null +++ b/libc/benchmarks/gpu/src/math/expf16_benchmark.cpp @@ -0,0 +1,56 @@ +//===-- GPU benchmark for expf16 ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmarks/gpu/LibcGpuBenchmark.h" +#include "benchmarks/gpu/Random.h" + +#include "hdr/stdint_proxy.h" +#include "src/__support/macros/properties/types.h" +#include "src/math/expf16.h" + +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" +#endif + +#define RANDOM_INPUT(T, Func, Dist, Min, Max, N) \ + [](uint32_t call_index) { \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const Dist dist(Min, Max); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ + } + +#define BENCH(T, Name, Func, Dist, Min, Max) \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpf16GpuBenchmark, Name##_1, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 1)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpf16GpuBenchmark, Name##_128, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 128)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpf16GpuBenchmark, Name##_1024, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 1024)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpf16GpuBenchmark, Name##_4096, \ +
RANDOM_INPUT(T, Func, Dist, Min, Max, 4096)) + +using LIBC_NAMESPACE::expf16; +// NOTE: UniformExponent maps e == -EXP_BIAS (-15 here) to subnormals. +BENCH(float16, Expf16Subnormal, expf16, UniformExponent, -15, -15); +BENCH(float16, Expf16CoreRange, expf16, UniformLinear, -10.0f16, 10.0f16); +BENCH(float16, Expf16Finite, expf16, UniformLinear, -16.0f16, 11.0f16); +BENCH(float16, Expf16Underflow, expf16, UniformLinear, -17.0f16, -16.0f16); +BENCH(float16, Expf16Overflow, expf16, UniformLinear, 11.0f16, 12.0f16); + +#ifdef AMDGPU_MATH_FOUND +BENCH(float16, AmdExpf16Subnormal, __ocml_exp_f16, UniformExponent, -15, -15); +BENCH(float16, AmdExpf16CoreRange, __ocml_exp_f16, UniformLinear, -10.0f16, + 10.0f16); +BENCH(float16, AmdExpf16Finite, __ocml_exp_f16, UniformLinear, -16.0f16, + 11.0f16); +BENCH(float16, AmdExpf16Underflow, __ocml_exp_f16, UniformLinear, -17.0f16, + -16.0f16); +BENCH(float16, AmdExpf16Overflow, __ocml_exp_f16, UniformLinear, 11.0f16, + 12.0f16); +#endif diff --git a/libc/benchmarks/gpu/src/math/expf_benchmark.cpp b/libc/benchmarks/gpu/src/math/expf_benchmark.cpp new file mode 100644 index 0000000000000..4ef54c53baf48 --- /dev/null +++ b/libc/benchmarks/gpu/src/math/expf_benchmark.cpp @@ -0,0 +1,59 @@ +//===-- GPU benchmark for expf --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmarks/gpu/LibcGpuBenchmark.h" +#include "benchmarks/gpu/Random.h" + +#include "hdr/stdint_proxy.h" +#include "src/math/expf.h" + +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" +#endif + +#define RANDOM_INPUT(T, Func, Dist, Min, Max, N) \ + [](uint32_t call_index) { \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const Dist dist(Min, Max); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ + } + +#define BENCH(T, Name, Func, Dist, Min, Max) \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpfGpuBenchmark, Name##_1, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 1)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpfGpuBenchmark, Name##_128, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 128)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpfGpuBenchmark, Name##_1024, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 1024)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcExpfGpuBenchmark, Name##_4096, \ + RANDOM_INPUT(T, Func, Dist, Min, Max, 4096)) + +using LIBC_NAMESPACE::expf; +// NOTE: UniformExponent maps e == -EXP_BIAS (-127 here) to subnormals. +BENCH(float, ExpfSubnormal, expf, UniformExponent, -127, -127); +BENCH(float, ExpfCoreRange, expf, UniformLinear, -10.0f, 10.0f); +BENCH(float, ExpfFinite, expf, UniformLinear, -103.0f, 88.0f); +BENCH(float, ExpfUnderflow, expf, UniformLinear, -104.0f, -103.0f); +BENCH(float, ExpfOverflow, expf, UniformLinear, 88.0f, 89.0f); + +#ifdef NVPTX_MATH_FOUND +BENCH(float, NvExpfSubnormal, __nv_expf, UniformExponent, -127, -127); +BENCH(float, NvExpfCoreRange, __nv_expf, UniformLinear, -10.0f, 10.0f); +BENCH(float, NvExpfFinite, __nv_expf, UniformLinear, -103.0f, 88.0f); +BENCH(float, NvExpfUnderflow, __nv_expf, UniformLinear, -104.0f, -103.0f); +BENCH(float, NvExpfOverflow, __nv_expf, UniformLinear, 88.0f, 89.0f); +#endif + +#ifdef AMDGPU_MATH_FOUND +BENCH(float, AmdExpfSubnormal, __ocml_exp_f32, UniformExponent, -127, -127);
+BENCH(float, AmdExpfCoreRange, __ocml_exp_f32, UniformLinear, -10.0f, 10.0f); +BENCH(float, AmdExpfFinite, __ocml_exp_f32, UniformLinear, -103.0f, 88.0f); +BENCH(float, AmdExpfUnderflow, __ocml_exp_f32, UniformLinear, -104.0f, -103.0f); +BENCH(float, AmdExpfOverflow, __ocml_exp_f32, UniformLinear, 88.0f, 89.0f); +#endif diff --git a/libc/benchmarks/gpu/src/math/log_benchmark.cpp b/libc/benchmarks/gpu/src/math/log_benchmark.cpp new file mode 100644 index 0000000000000..0ea1906ff053b --- /dev/null +++ b/libc/benchmarks/gpu/src/math/log_benchmark.cpp @@ -0,0 +1,68 @@ +//===-- GPU benchmark for log ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmarks/gpu/LibcGpuBenchmark.h" +#include "benchmarks/gpu/Random.h" + +#include "hdr/stdint_proxy.h" +#include "src/__support/sign.h" +#include "src/math/log.h" + +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" +#endif + +#define RANDOM_INPUT_UniformExponent(T, Func, Min, Max, N) \ + [](uint32_t call_index) { \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const UniformExponent dist(Min, Max, LIBC_NAMESPACE::Sign::POS); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ + } + +#define RANDOM_INPUT_UniformLinear(T, Func, Min, Max, N) \ + [](uint32_t call_index) { \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const UniformLinear dist(Min, Max); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ + } + +#define BENCH(T, Name, Func, Dist, Min, Max) \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogGpuBenchmark, Name##_1, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 1)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogGpuBenchmark, Name##_128, \ + 
RANDOM_INPUT_##Dist(T, Func, Min, Max, 128)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogGpuBenchmark, Name##_1024, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 1024)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogGpuBenchmark, Name##_4096, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 4096)) + +using LIBC_NAMESPACE::log; + +static constexpr double INV_E = 0x1.78b56362cef38p-2; // exp(-1.0) +static constexpr double E = 0x1.5bf0a8b145769p+1; // exp(+1.0) +// NOTE: UniformExponent maps e == -EXP_BIAS (-1023 here) to subnormals. +BENCH(double, LogSubnormal, log, UniformExponent, -1023, -1023); +BENCH(double, LogAroundOne, log, UniformLinear, INV_E, E); +BENCH(double, LogMedMag, log, UniformExponent, -10, 10); +BENCH(double, LogNormal, log, UniformExponent, -1022, 1023); + +#ifdef NVPTX_MATH_FOUND +BENCH(double, NvLogSubnormal, __nv_log, UniformExponent, -1023, -1023); +BENCH(double, NvLogAroundOne, __nv_log, UniformLinear, INV_E, E); +BENCH(double, NvLogMedMag, __nv_log, UniformExponent, -10, 10); +BENCH(double, NvLogNormal, __nv_log, UniformExponent, -1022, 1023); +#endif + +#ifdef AMDGPU_MATH_FOUND +BENCH(double, AmdLogSubnormal, __ocml_log_f64, UniformExponent, -1023, -1023); +BENCH(double, AmdLogAroundOne, __ocml_log_f64, UniformLinear, INV_E, E); +BENCH(double, AmdLogMedMag, __ocml_log_f64, UniformExponent, -10, 10); +BENCH(double, AmdLogNormal, __ocml_log_f64, UniformExponent, -1022, 1023); +#endif diff --git a/libc/benchmarks/gpu/src/math/logf16_benchmark.cpp b/libc/benchmarks/gpu/src/math/logf16_benchmark.cpp new file mode 100644 index 0000000000000..9748e15c4640b --- /dev/null +++ b/libc/benchmarks/gpu/src/math/logf16_benchmark.cpp @@ -0,0 +1,62 @@ +//===-- GPU benchmark for logf16 ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmarks/gpu/LibcGpuBenchmark.h" +#include "benchmarks/gpu/Random.h" + +#include "hdr/stdint_proxy.h" +#include "src/__support/macros/properties/types.h" +#include "src/__support/sign.h" +#include "src/math/logf16.h" + +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" +#endif + +#define RANDOM_INPUT_UniformExponent(T, Func, Min, Max, N) \ + [](uint32_t call_index) { \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const UniformExponent dist(Min, Max, LIBC_NAMESPACE::Sign::POS); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ + } + +#define RANDOM_INPUT_UniformLinear(T, Func, Min, Max, N) \ + [](uint32_t call_index) { \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const UniformLinear dist(Min, Max); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ + } + +#define BENCH(T, Name, Func, Dist, Min, Max) \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogf16GpuBenchmark, Name##_1, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 1)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogf16GpuBenchmark, Name##_128, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 128)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogf16GpuBenchmark, Name##_1024, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 1024)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogf16GpuBenchmark, Name##_4096, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 4096)) + +using LIBC_NAMESPACE::logf16; + +static constexpr float16 INV_E = 0x1.78b56362cef38p-2f16; // exp(-1.0) +static constexpr float16 E = 0x1.5bf0a8b145769p+1f16; // exp(+1.0) +// NOTE: UniformExponent maps e == -EXP_BIAS (-15 here) to subnormals. +BENCH(float16, Logf16Subnormal, logf16, UniformExponent, -15, -15); +BENCH(float16, Logf16AroundOne, logf16, UniformLinear, INV_E, E); +BENCH(float16, Logf16MedMag, logf16, UniformExponent, -10, 10); +BENCH(float16, Logf16Normal, logf16, UniformExponent, -14, 15); + +#ifdef
AMDGPU_MATH_FOUND +BENCH(float16, AmdLogf16Subnormal, __ocml_log_f16, UniformExponent, -15, -15); +BENCH(float16, AmdLogf16AroundOne, __ocml_log_f16, UniformLinear, INV_E, E); +BENCH(float16, AmdLogf16MedMag, __ocml_log_f16, UniformExponent, -10, 10); +BENCH(float16, AmdLogf16Normal, __ocml_log_f16, UniformExponent, -14, 15); +#endif diff --git a/libc/benchmarks/gpu/src/math/logf_benchmark.cpp b/libc/benchmarks/gpu/src/math/logf_benchmark.cpp new file mode 100644 index 0000000000000..c4e5a226a18f5 --- /dev/null +++ b/libc/benchmarks/gpu/src/math/logf_benchmark.cpp @@ -0,0 +1,68 @@ +//===-- GPU benchmark for logf --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmarks/gpu/LibcGpuBenchmark.h" +#include "benchmarks/gpu/Random.h" + +#include "hdr/stdint_proxy.h" +#include "src/__support/sign.h" +#include "src/math/logf.h" + +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" +#endif + +#define RANDOM_INPUT_UniformExponent(T, Func, Min, Max, N) \ + [](uint32_t call_index) { \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const UniformExponent dist(Min, Max, LIBC_NAMESPACE::Sign::POS); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ + } + +#define RANDOM_INPUT_UniformLinear(T, Func, Min, Max, N) \ + [](uint32_t call_index) { \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const UniformLinear dist(Min, Max); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ + } + +#define BENCH(T, Name, Func, Dist, Min, Max) \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogfGpuBenchmark, Name##_1, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 1)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogfGpuBenchmark,
Name##_128, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 128)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogfGpuBenchmark, Name##_1024, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 1024)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcLogfGpuBenchmark, Name##_4096, \ + RANDOM_INPUT_##Dist(T, Func, Min, Max, 4096)) + +using LIBC_NAMESPACE::logf; + +static constexpr float INV_E = 0x1.78b56362cef38p-2f; // exp(-1.0) +static constexpr float E = 0x1.5bf0a8b145769p+1f; // exp(+1.0) +// NOTE: UniformExponent maps e == -EXP_BIAS (-127 here) to subnormals. +BENCH(float, LogfSubnormal, logf, UniformExponent, -127, -127); +BENCH(float, LogfAroundOne, logf, UniformLinear, INV_E, E); +BENCH(float, LogfMedMag, logf, UniformExponent, -10, 10); +BENCH(float, LogfNormal, logf, UniformExponent, -126, 127); + +#ifdef NVPTX_MATH_FOUND +BENCH(float, NvLogfSubnormal, __nv_logf, UniformExponent, -127, -127); +BENCH(float, NvLogfAroundOne, __nv_logf, UniformLinear, INV_E, E); +BENCH(float, NvLogfMedMag, __nv_logf, UniformExponent, -10, 10); +BENCH(float, NvLogfNormal, __nv_logf, UniformExponent, -126, 127); +#endif + +#ifdef AMDGPU_MATH_FOUND +BENCH(float, AmdLogfSubnormal, __ocml_log_f32, UniformExponent, -127, -127); +BENCH(float, AmdLogfAroundOne, __ocml_log_f32, UniformLinear, INV_E, E); +BENCH(float, AmdLogfMedMag, __ocml_log_f32, UniformExponent, -10, 10); +BENCH(float, AmdLogfNormal, __ocml_log_f32, UniformExponent, -126, 127); +#endif diff --git a/libc/benchmarks/gpu/src/math/platform.h b/libc/benchmarks/gpu/src/math/platform.h index 2dfa9f2299d46..e675d1e7b0d12 100644 --- a/libc/benchmarks/gpu/src/math/platform.h +++ b/libc/benchmarks/gpu/src/math/platform.h @@ -11,6 +11,7 @@ #include "hdr/stdint_proxy.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" namespace LIBC_NAMESPACE_DECL { @@ -41,17 +42,27 @@ extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000; // Forward declarations for the vendor math libraries.
extern "C" { #ifdef AMDGPU_MATH_FOUND -double __ocml_sin_f64(double); -float __ocml_sin_f32(float); double __ocml_atan2_f64(double, double); float __ocml_atan2_f32(float, float); +double __ocml_exp_f64(double); +float __ocml_exp_f32(float); +float16 __ocml_exp_f16(float16); +double __ocml_log_f64(double); +float __ocml_log_f32(float); +float16 __ocml_log_f16(float16); +double __ocml_sin_f64(double); +float __ocml_sin_f32(float); #endif #ifdef NVPTX_MATH_FOUND -double __nv_sin(double); -float __nv_sinf(float); double __nv_atan2(double, double); float __nv_atan2f(float, float); +double __nv_exp(double); +float __nv_expf(float); +double __nv_log(double); +float __nv_logf(float); +double __nv_sin(double); +float __nv_sinf(float); #endif } diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp index 5fe95c3f3b268..5ed82c845decc 100644 --- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp @@ -1,4 +1,5 @@ #include "benchmarks/gpu/LibcGpuBenchmark.h" +#include "benchmarks/gpu/Random.h" #include "hdr/stdint_proxy.h" #include "src/math/sin.h" @@ -10,8 +11,10 @@ #define BM_RANDOM_INPUT(T, Func, MinExp, MaxExp, N) \ [](uint32_t call_index) { \ - return LIBC_NAMESPACE::benchmarks::MathPerf::run_throughput_in_range< \ - N>(Func, MinExp, MaxExp, call_index); \ + using namespace LIBC_NAMESPACE::benchmarks; \ + \ + const UniformExponent dist(MinExp, MaxExp); \ + return MathPerf::template run_throughput(Func, dist, call_index); \ } #define BENCH(T, Name, Func, MinExp, MaxExp) \