-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[libc] Polish GPU benchmarking #153900
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] Polish GPU benchmarking #153900
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-libc Author: Leandro Lacerda (leandrolcampos) ChangesThis patch provides cleanups and improvements for the GPU benchmarking infrastructure. The key changes are:
Full diff: https://github.com/llvm/llvm-project/pull/153900.diff 9 Files Affected:
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
deleted file mode 100644
index d5996a74f6dd7..0000000000000
--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-#include "benchmarks/gpu/BenchmarkLogger.h"
-#include "hdr/stdint_proxy.h"
-#include "src/__support/CPP/string.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/OSUtil/io.h" // write_to_stderr
-#include "src/__support/big_int.h" // is_big_int
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
-#include "src/__support/uint128.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace benchmarks {
-
-// cpp::string_view specialization
-template <>
-BenchmarkLogger &
- BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
- LIBC_NAMESPACE::write_to_stderr(str);
- return *this;
-}
-
-// cpp::string specialization
-template <>
-BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
- return *this << static_cast<cpp::string_view>(str);
-}
-
-// const char* specialization
-template <>
-BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
- return *this << cpp::string_view(str);
-}
-
-// char* specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
- return *this << cpp::string_view(str);
-}
-
-// char specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
- return *this << cpp::string_view(&ch, 1);
-}
-
-// bool specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
- return *this << (cond ? "true" : "false");
-}
-
-// void * specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
- return *this << "0x" << cpp::to_string(reinterpret_cast<uintptr_t>(addr));
-}
-
-template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
- if constexpr (is_big_int_v<T> ||
- (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
- (sizeof(T) > sizeof(uint64_t)))) {
- static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
- const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
- return *this << buffer.view();
- } else {
- return *this << cpp::to_string(t);
- }
-}
-
-// is_integral specializations
-// char is already specialized to handle character
-template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
-template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
-template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
-template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned char>(unsigned char);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned short>(unsigned short);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned int>(unsigned int);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned long>(unsigned long);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
-
-#ifdef LIBC_TYPES_HAS_INT128
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
-#endif // LIBC_TYPES_HAS_INT128
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>);
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
-
-// TODO: Add floating point formatting once it's supported by StringStream.
-
-BenchmarkLogger log;
-
-} // namespace benchmarks
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
deleted file mode 100644
index 2b22aba085f86..0000000000000
--- a/libc/benchmarks/gpu/BenchmarkLogger.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//===-- Utilities to log to standard output during tests --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
-#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
-
-#include "src/__support/macros/config.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace benchmarks {
-
-// A class to log to standard output in the context of hermetic tests.
-struct BenchmarkLogger {
- constexpr BenchmarkLogger() = default;
- template <typename T> BenchmarkLogger &operator<<(T);
-};
-
-// A global TestLogger instance to be used in tests.
-extern BenchmarkLogger log;
-
-} // namespace benchmarks
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index beedac78d4826..6ca134b12a479 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -38,31 +38,25 @@ add_unittest_framework_library(
SRCS
LibcGpuBenchmark.cpp
LibcGpuBenchmarkMain.cpp
- BenchmarkLogger.cpp
HDRS
LibcGpuBenchmark.h
- BenchmarkLogger.h
DEPENDS
+ libc.benchmarks.gpu.timing.timing
libc.hdr.stdint_proxy
- libc.src.__support.big_int
- libc.src.__support.c_string
libc.src.__support.CPP.string
libc.src.__support.CPP.string_view
libc.src.__support.CPP.type_traits
- libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
libc.src.__support.CPP.array
- libc.src.__support.fixed_point.fx_rep
- libc.src.__support.macros.properties.types
- libc.src.__support.OSUtil.osutil
- libc.src.__support.uint128
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.nearest_integer_operations
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
- libc.src.time.clock
- libc.benchmarks.gpu.timing.timing
+ libc.src.__support.GPU.utils
+ libc.src.__support.time.gpu.time_utils
libc.src.stdio.printf
+ libc.src.time.clock
)
add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index ef816c51a87d7..a4a0ff4ec46e5 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -2,16 +2,17 @@
#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
-#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/NearestIntegerOperations.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/fixedvector.h"
#include "src/__support/macros/config.h"
#include "src/__support/time/gpu/time_utils.h"
#include "src/stdio/printf.h"
+#include "src/time/clock.h"
namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
@@ -134,11 +135,13 @@ void print_results(Benchmark *b) {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
LIBC_NAMESPACE::printf(
- "%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
+ "%-24s |%15.0f |%9.0f |%8llu |%8llu |%15llu |%9u |\n",
b->get_test_name().data(), final_result.cycles,
- final_result.standard_deviation, (unsigned long long)final_result.min,
- (unsigned long long)final_result.max,
- (unsigned long long)final_result.total_iterations, (unsigned)num_threads);
+ final_result.standard_deviation,
+ static_cast<unsigned long long>(final_result.min),
+ static_cast<unsigned long long>(final_result.max),
+ static_cast<unsigned long long>(final_result.total_iterations),
+ static_cast<unsigned>(num_threads));
}
void print_header() {
@@ -147,7 +150,7 @@ void print_header() {
benchmarks[0]->get_suite_name().data());
LIBC_NAMESPACE::printf("%s", RESET);
cpp::string titles = "Benchmark | Cycles (Mean) | Stddev | "
- " Min | Max | Iterations | Threads |\n";
+ " Min | Max | Iterations | Threads |\n";
LIBC_NAMESPACE::printf(titles.data());
cpp::string separator(titles.size(), '-');
@@ -226,7 +229,8 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
change_ratio < options.epsilon)
break;
- iterations = static_cast<uint32_t>(iterations * options.scaling_factor);
+ iterations = static_cast<uint32_t>(
+ fputil::ceil(iterations * options.scaling_factor));
}
const auto &estimator = rep.get_estimator();
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 60f69edf86556..e36e93c7efc18 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -1,18 +1,16 @@
#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
-#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
+
#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
-#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/macros/config.h"
-#include "src/time/clock.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index d6a89d04dab97..f85152e69c346 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -4,10 +4,11 @@ add_header_library(
timing.h
DEPENDS
libc.hdr.stdint_proxy
- libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.array
+ libc.src.__support.CPP.atomic
libc.src.__support.CPP.type_traits
+ libc.src.__support.GPU.utils
)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index de721a2d6ce6b..b4a174f729817 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -15,7 +15,6 @@
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
-#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 801080e7a6e98..4615f53e3d247 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -4,10 +4,11 @@ add_header_library(
timing.h
DEPENDS
libc.hdr.stdint_proxy
- libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.array
+ libc.src.__support.CPP.atomic
libc.src.__support.CPP.type_traits
+ libc.src.__support.GPU.utils
)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 133032ca08423..0c93a67129b8d 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -13,9 +13,7 @@
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
-#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
-#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
@@ -66,7 +64,7 @@ template <typename F, typename T>
uint64_t stop = gpu::processor_clock();
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
- volatile T output = result;
+ volatile auto output = result;
// Return the time elapsed.
return stop - start;
|
Here's what I get on my NVIDIA GeForce RTX 4070 Laptop GPU. Note that the [1/4] Running hermetic test libc.benchmarks.gpu.src.ctype.isalnum_benchmark
Running Suite: LlvmLibcIsAlNumGpuBenchmark
Benchmark | Cycles (Mean) | Stddev | Min | Max | Iterations | Threads |
------------------------------------------------------------------------------------------------------
IsAlnum | 53 | 0 | 53 | 53 | 11904 | 64 |
IsAlnumSingleThread | 53 | 0 | 53 | 53 | 186 | 1 |
IsAlnumSingleWave | 53 | 0 | 53 | 53 | 5952 | 32 |
IsAlnumCapital | 53 | 0 | 53 | 53 | 11904 | 64 |
IsAlnumNotAlnum | 43 | 0 | 43 | 43 | 11904 | 64 |
[2/4] Running hermetic test libc.benchmarks.gpu.src.ctype.isalpha_benchmark
Running Suite: LlvmLibcIsAlphaGpuBenchmark
Benchmark | Cycles (Mean) | Stddev | Min | Max | Iterations | Threads |
------------------------------------------------------------------------------------------------------
IsAlpha | 53 | 0 | 53 | 53 | 186 | 1 |
[3/4] Running hermetic test libc.benchmarks.gpu.src.math.sin_benchmark
Running Suite: LlvmLibcSinGpuBenchmark
Benchmark | Cycles (Mean) | Stddev | Min | Max | Iterations | Threads |
------------------------------------------------------------------------------------------------------
Sin_1 | 3120 | 153 | 2934 | 3610 | 2735008 | 32 |
Sin_128 | 364 | 6 | 346 | 382 | 12032 | 32 |
Sin_1024 | 354 | 2 | 348 | 359 | 5952 | 32 |
Sin_4096 | 361 | 1 | 358 | 363 | 4160 | 32 |
SinTwoPi_1 | 2217 | 11 | 2204 | 2494 | 24032 | 32 |
SinTwoPi_128 | 262 | 2 | 258 | 268 | 24032 | 32 |
SinTwoPi_1024 | 271 | 1 | 271 | 276 | 24032 | 32 |
SinTwoPi_4096 | 282 | 0 | 281 | 282 | 1984 | 32 |
SinTwoPow30_1 | 3133 | 11 | 3116 | 3177 | 1984 | 32 |
SinTwoPow30_128 | 348 | 2 | 345 | 352 | 1344 | 32 |
SinTwoPow30_1024 | 359 | 1 | 358 | 361 | 1984 | 32 |
SinTwoPow30_4096 | 368 | 0 | 367 | 368 | 576 | 32 |
SinVeryLarge_1 | 2855 | 15 | 2821 | 3082 | 8480 | 32 |
SinVeryLarge_128 | 316 | 2 | 310 | 324 | 17024 | 32 |
SinVeryLarge_1024 | 317 | 1 | 316 | 320 | 1344 | 32 |
SinVeryLarge_4096 | 325 | 0 | 325 | 326 | 896 | 32 |
NvSin_1 | 2521 | 69 | 2264 | 2876 | 5952 | 32 |
NvSin_128 | 1860 | 2 | 1857 | 1865 | 576 | 32 |
NvSin_1024 | 2069 | 1 | 2068 | 2071 | 352 | 32 |
NvSin_4096 | 2087 | 0 | 2087 | 2088 | 352 | 32 |
NvSinTwoPi_1 | 1106 | 1 | 1104 | 1108 | 1344 | 32 |
NvSinTwoPi_128 | 925 | 0 | 925 | 925 | 352 | 32 |
NvSinTwoPi_1024 | 1136 | 0 | 1136 | 1136 | 352 | 32 |
NvSinTwoPi_4096 | 1155 | 0 | 1155 | 1155 | 352 | 32 |
NvSinTwoPow30_1 | 1106 | 1 | 1104 | 1107 | 1984 | 32 |
NvSinTwoPow30_128 | 925 | 0 | 925 | 927 | 576 | 32 |
NvSinTwoPow30_1024 | 1136 | 0 | 1136 | 1136 | 352 | 32 |
NvSinTwoPow30_4096 | 1155 | 0 | 1155 | 1155 | 352 | 32 |
NvSinVeryLarge_1 | 2496 | 23 | 2251 | 2840 | 12032 | 32 |
NvSinVeryLarge_128 | 1827 | 1 | 1826 | 1829 | 576 | 32 |
NvSinVeryLarge_1024 | 2034 | 0 | 2034 | 2035 | 352 | 32 |
NvSinVeryLarge_4096 | 2052 | 0 | 2052 | 2052 | 352 | 32 |
Sinf_1 | 2200 | 170 | 1521 | 2397 | 507776 | 32 |
Sinf_128 | 240 | 5 | 218 | 255 | 47616 | 32 |
Sinf_1024 | 240 | 2 | 235 | 248 | 5952 | 32 |
Sinf_4096 | 259 | 1 | 257 | 261 | 896 | 32 |
SinfTwoPi_1 | 1441 | 11 | 1425 | 1760 | 33856 | 32 |
SinfTwoPi_128 | 147 | 1 | 146 | 150 | 12032 | 32 |
SinfTwoPi_1024 | 147 | 0 | 145 | 148 | 4160 | 32 |
SinfTwoPi_4096 | 165 | 0 | 165 | 167 | 4160 | 32 |
SinfTwoPow30_1 | 1087 | 10 | 1078 | 1160 | 1984 | 32 |
SinfTwoPow30_128 | 102 | 1 | 101 | 104 | 576 | 32 |
SinfTwoPow30_1024 | 102 | 0 | 100 | 103 | 2880 | 32 |
SinfTwoPow30_4096 | 121 | 0 | 121 | 122 | 4160 | 32 |
SinfVeryLarge_1 | 1924 | 14 | 1867 | 2277 | 24032 | 32 |
SinfVeryLarge_128 | 205 | 1 | 205 | 207 | 896 | 32 |
SinfVeryLarge_1024 | 206 | 0 | 204 | 206 | 5952 | 32 |
SinfVeryLarge_4096 | 225 | 0 | 224 | 226 | 8480 | 32 |
NvSinf_1 | 1023 | 6 | 1018 | 1036 | 2880 | 32 |
NvSinf_128 | 786 | 0 | 786 | 788 | 576 | 32 |
NvSinf_1024 | 975 | 1 | 970 | 975 | 2880 | 32 |
NvSinf_4096 | 1010 | 0 | 1010 | 1010 | 352 | 32 |
NvSinfTwoPi_1 | 162 | 3 | 162 | 509 | 362464 | 32 |
NvSinfTwoPi_128 | 141 | 0 | 141 | 143 | 2880 | 32 |
NvSinfTwoPi_1024 | 331 | 0 | 330 | 331 | 896 | 32 |
NvSinfTwoPi_4096 | 366 | 0 | 366 | 366 | 352 | 32 |
NvSinfTwoPow30_1 | 1025 | 10 | 1018 | 1280 | 47616 | 32 |
NvSinfTwoPow30_128 | 776 | 0 | 776 | 777 | 352 | 32 |
NvSinfTwoPow30_1024 | 966 | 0 | 966 | 967 | 352 | 32 |
NvSinfTwoPow30_4096 | 1000 | 0 | 1000 | 1000 | 352 | 32 |
NvSinfVeryLarge_1 | 1002 | 2 | 1000 | 1023 | 4160 | 32 |
NvSinfVeryLarge_128 | 757 | 0 | 757 | 758 | 576 | 32 |
NvSinfVeryLarge_1024 | 948 | 0 | 948 | 948 | 352 | 32 |
NvSinfVeryLarge_4096 | 981 | 0 | 981 | 981 | 352 | 32 |
[4/4] Running hermetic test libc.benchmarks.gpu.src.math.atan2_benchmark
Running Suite: LlvmLibcAtan2GpuBenchmark
Benchmark | Cycles (Mean) | Stddev | Min | Max | Iterations | Threads |
------------------------------------------------------------------------------------------------------
Atan2_1 | 4081 | 954 | 1892 | 5264 | 24032 | 32 |
Atan2_128 | 2505 | 37 | 2346 | 2601 | 33856 | 32 |
Atan2_1024 | 2747 | 13 | 2708 | 2783 | 2880 | 32 |
Atan2_4096 | 2749 | 6 | 2737 | 2759 | 576 | 32 |
Atan2TwoPi_1 | 2737 | 16 | 2728 | 3156 | 24032 | 32 |
Atan2TwoPi_128 | 1068 | 3 | 1064 | 1109 | 8480 | 32 |
Atan2TwoPi_1024 | 1301 | 1 | 1300 | 1303 | 576 | 32 |
Atan2TwoPi_4096 | 1302 | 0 | 1302 | 1303 | 576 | 32 |
Atan2TwoPow30_1 | 2733 | 16 | 2722 | 3157 | 24032 | 32 |
Atan2TwoPow30_128 | 1072 | 3 | 1065 | 1108 | 5952 | 32 |
Atan2TwoPow30_1024 | 1302 | 1 | 1301 | 1304 | 896 | 32 |
Atan2TwoPow30_4096 | 1303 | 0 | 1302 | 1303 | 352 | 32 |
Atan2Large_1 | 3569 | 382 | 1125 | 3878 | 131648 | 32 |
Atan2Large_128 | 1811 | 17 | 1766 | 1839 | 1984 | 32 |
Atan2Large_1024 | 2050 | 4 | 2043 | 2059 | 576 | 32 |
Atan2Large_4096 | 2049 | 2 | 2046 | 2053 | 352 | 32 |
NvAtan2_1 | 2909 | 38 | 2866 | 3341 | 17024 | 32 |
NvAtan2_128 | 2836 | 3 | 2830 | 2853 | 2880 | 32 |
NvAtan2_1024 | 3075 | 1 | 3074 | 3076 | 352 | 32 |
NvAtan2_4096 | 3075 | 1 | 3074 | 3076 | 352 | 32 |
NvAtan2TwoPi_1 | 2032 | 13 | 2032 | 2383 | 24032 | 32 |
NvAtan2TwoPi_128 | 1979 | 3 | 1979 | 1999 | 1984 | 32 |
NvAtan2TwoPi_1024 | 2218 | 0 | 2217 | 2218 | 352 | 32 |
NvAtan2TwoPi_4096 | 2218 | 0 | 2218 | 2218 | 352 | 32 |
NvAtan2TwoPow30_1 | 2032 | 8 | 2032 | 2184 | 12032 | 32 |
NvAtan2TwoPow30_128 | 1979 | 2 | 1979 | 1998 | 1984 | 32 |
NvAtan2TwoPow30_1024 | 2218 | 0 | 2217 | 2218 | 352 | 32 |
NvAtan2TwoPow30_4096 | 2218 | 0 | 2218 | 2219 | 576 | 32 |
NvAtan2Large_1 | 2032 | 12 | 2032 | 2354 | 24032 | 32 |
NvAtan2Large_128 | 1979 | 2 | 1979 | 1997 | 1984 | 32 |
NvAtan2Large_1024 | 2218 | 0 | 2217 | 2218 | 352 | 32 |
NvAtan2Large_4096 | 2219 | 0 | 2218 | 2219 | 352 | 32 | |
@@ -66,7 +64,7 @@ template <typename F, typename T> | |||
uint64_t stop = gpu::processor_clock(); | |||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); | |||
asm("" ::"r"(stop)); | |||
volatile T output = result; | |||
volatile auto output = result; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Weird that we need this here and not in the AMDGPU version, seems weird that these wouldn't be the same type. What were you seeing here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The reason I had to touch the NVPTX version (and not the AMDGPU one) is simply that only the NVPTX latency()
had this instruction at the end:
volatile T output = result;
In the ctype benches, we instantiate latency<int (*)(int), char>
. The function returns int
, but the template parameter T
(the input type) is char
. That line therefore assigns an int
to a volatile char
, which produces:
[4/15] Building CXX object libc/benchmarks/gpu/src/ctype/CMakeFiles/libc.benchmarks.gpu.src.ctype.isalnum_benchmark.__build__.dir/isalnum_benchmark.cpp.o
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp:1:
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/LibcGpuBenchmark.h:4:
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/timing/timing.h:17:
/home/leandro/llvm-project/libc/benchmarks/gpu/timing/nvptx/timing.h:67:23: warning: implicit conversion loses integer precision: 'int' to 'volatile char' [-Wimplicit-int-conversion]
67 | volatile T output = result;
| ~~~~~~ ^~~~~~
/home/leandro/llvm-project/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp:7:26: note: in instantiation of function template specialization '__llvm_libc_22_0_0_git::latency<int (*)(int), char>' requested here
7 | return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
| ^
1 warning generated.
[6/15] Building CXX object libc/benchmarks/gpu/src/ctype/CMakeFiles/libc.benchmarks.gpu.src.ctype.isalpha_benchmark.__build__.dir/isalpha_benchmark.cpp.o
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp:1:
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/LibcGpuBenchmark.h:4:
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/timing/timing.h:17:
/home/leandro/llvm-project/libc/benchmarks/gpu/timing/nvptx/timing.h:67:23: warning: implicit conversion loses integer precision: 'int' to 'volatile char' [-Wimplicit-int-conversion]
67 | volatile T output = result;
| ~~~~~~ ^~~~~~
/home/leandro/llvm-project/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp:7:26: note: in instantiation of function template specialization '__llvm_libc_22_0_0_git::latency<int (*)(int), char>' requested here
7 | return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
| ^
1 warning generated.
On AMDGPU, there is no such instruction in latency()
(it seems to rely on the v_or_b32
asm to use the value), and it also has a small-type special-case for the asm operand (char/bool), so there’s no narrowing assignment there.
I didn’t dive deeper here because latency()
isn’t used by the math benchmarks; I only wanted to make the ctype benches warning-free.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I vaguely remember needing that because the PTX optimizer would defy the inline assembly so I still needed it. That's fine it needed.
This patch provides cleanups and improvements for the GPU benchmarking infrastructure. The key changes are:
BenchmarkLogger
files: This is dead code that added maintenance and cognitive overhead without providing functionality.