Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 0 additions & 97 deletions libc/benchmarks/gpu/BenchmarkLogger.cpp

This file was deleted.

29 changes: 0 additions & 29 deletions libc/benchmarks/gpu/BenchmarkLogger.h

This file was deleted.

16 changes: 5 additions & 11 deletions libc/benchmarks/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,31 +38,25 @@ add_unittest_framework_library(
SRCS
LibcGpuBenchmark.cpp
LibcGpuBenchmarkMain.cpp
BenchmarkLogger.cpp
HDRS
LibcGpuBenchmark.h
BenchmarkLogger.h
DEPENDS
libc.benchmarks.gpu.timing.timing
libc.hdr.stdint_proxy
libc.src.__support.big_int
libc.src.__support.c_string
libc.src.__support.CPP.string
libc.src.__support.CPP.string_view
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
libc.src.__support.CPP.array
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.nearest_integer_operations
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
libc.benchmarks.gpu.timing.timing
libc.src.__support.GPU.utils
libc.src.__support.time.gpu.time_utils
libc.src.stdio.printf
libc.src.time.clock
)

add_subdirectory(src)
18 changes: 11 additions & 7 deletions libc/benchmarks/gpu/LibcGpuBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@

#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/NearestIntegerOperations.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/fixedvector.h"
#include "src/__support/macros/config.h"
#include "src/__support/time/gpu/time_utils.h"
#include "src/stdio/printf.h"
#include "src/time/clock.h"

namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
Expand Down Expand Up @@ -134,11 +135,13 @@ void print_results(Benchmark *b) {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

LIBC_NAMESPACE::printf(
"%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
"%-24s |%15.0f |%9.0f |%8llu |%8llu |%15llu |%9u |\n",
b->get_test_name().data(), final_result.cycles,
final_result.standard_deviation, (unsigned long long)final_result.min,
(unsigned long long)final_result.max,
(unsigned long long)final_result.total_iterations, (unsigned)num_threads);
final_result.standard_deviation,
static_cast<unsigned long long>(final_result.min),
static_cast<unsigned long long>(final_result.max),
static_cast<unsigned long long>(final_result.total_iterations),
static_cast<unsigned>(num_threads));
}

void print_header() {
Expand All @@ -147,7 +150,7 @@ void print_header() {
benchmarks[0]->get_suite_name().data());
LIBC_NAMESPACE::printf("%s", RESET);
cpp::string titles = "Benchmark | Cycles (Mean) | Stddev | "
" Min | Max | Iterations | Threads |\n";
" Min | Max | Iterations | Threads |\n";
LIBC_NAMESPACE::printf(titles.data());

cpp::string separator(titles.size(), '-');
Expand Down Expand Up @@ -226,7 +229,8 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
change_ratio < options.epsilon)
break;

iterations = static_cast<uint32_t>(iterations * options.scaling_factor);
iterations = static_cast<uint32_t>(
fputil::ceil(iterations * options.scaling_factor));
}

const auto &estimator = rep.get_estimator();
Expand Down
4 changes: 1 addition & 3 deletions libc/benchmarks/gpu/LibcGpuBenchmark.h
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H

#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"

#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/macros/config.h"
#include "src/time/clock.h"

namespace LIBC_NAMESPACE_DECL {

Expand Down
3 changes: 2 additions & 1 deletion libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ add_header_library(
timing.h
DEPENDS
libc.hdr.stdint_proxy
libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.array
libc.src.__support.CPP.atomic
libc.src.__support.CPP.type_traits
libc.src.__support.GPU.utils
)
1 change: 0 additions & 1 deletion libc/benchmarks/gpu/timing/amdgpu/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"

Expand Down
3 changes: 2 additions & 1 deletion libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ add_header_library(
timing.h
DEPENDS
libc.hdr.stdint_proxy
libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.array
libc.src.__support.CPP.atomic
libc.src.__support.CPP.type_traits
libc.src.__support.GPU.utils
)
4 changes: 1 addition & 3 deletions libc/benchmarks/gpu/timing/nvptx/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"

Expand Down Expand Up @@ -66,7 +64,7 @@ template <typename F, typename T>
uint64_t stop = gpu::processor_clock();
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
volatile T output = result;
volatile auto output = result;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Weird that we need this here and not in the AMDGPU version; it seems odd that these wouldn't be the same type. What were you seeing here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I had to touch the NVPTX version (and not the AMDGPU one) is simply that only the NVPTX latency() had this statement at the end:

volatile T output = result;

In the ctype benches, we instantiate latency<int (*)(int), char>. The function returns int, but the template parameter T (the input type) is char. That line therefore assigns an int to a volatile char, which produces:

[4/15] Building CXX object libc/benchmarks/gpu/src/ctype/CMakeFiles/libc.benchmarks.gpu.src.ctype.isalnum_benchmark.__build__.dir/isalnum_benchmark.cpp.o
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp:1:
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/LibcGpuBenchmark.h:4:
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/timing/timing.h:17:
/home/leandro/llvm-project/libc/benchmarks/gpu/timing/nvptx/timing.h:67:23: warning: implicit conversion loses integer precision: 'int' to 'volatile char' [-Wimplicit-int-conversion]
   67 |   volatile T output = result;
      |              ~~~~~~   ^~~~~~
/home/leandro/llvm-project/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp:7:26: note: in instantiation of function template specialization '__llvm_libc_22_0_0_git::latency<int (*)(int), char>' requested here
    7 |   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
      |                          ^
1 warning generated.
[6/15] Building CXX object libc/benchmarks/gpu/src/ctype/CMakeFiles/libc.benchmarks.gpu.src.ctype.isalpha_benchmark.__build__.dir/isalpha_benchmark.cpp.o
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp:1:
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/LibcGpuBenchmark.h:4:
In file included from /home/leandro/llvm-project/libc/benchmarks/gpu/timing/timing.h:17:
/home/leandro/llvm-project/libc/benchmarks/gpu/timing/nvptx/timing.h:67:23: warning: implicit conversion loses integer precision: 'int' to 'volatile char' [-Wimplicit-int-conversion]
   67 |   volatile T output = result;
      |              ~~~~~~   ^~~~~~
/home/leandro/llvm-project/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp:7:26: note: in instantiation of function template specialization '__llvm_libc_22_0_0_git::latency<int (*)(int), char>' requested here
    7 |   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
      |                          ^
1 warning generated.

On AMDGPU, there is no such statement in latency() (it seems to rely on the v_or_b32 asm to use the value), and it also has a small-type special case for the asm operand (char/bool), so there’s no narrowing assignment there.

I didn’t dive deeper here because latency() isn’t used by the math benchmarks; I only wanted to make the ctype benches warning-free.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I vaguely remember needing that because the PTX optimizer would defy the inline assembly, so I still needed it. That's fine if needed.


// Return the time elapsed.
return stop - start;
Expand Down
Loading