Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
d717ce2
Add minimum to benchmark reporting, add repetitions to GitHub workflow
ProExpertProg Nov 21, 2024
bcc7034
Don't recompute redundant derivatives
ProExpertProg Nov 18, 2024
0f09ef7
constexpr for constants
ProExpertProg Nov 19, 2024
0339918
Merge remote-tracking branch 'origin/main' into luka/opt/logging
ProExpertProg Nov 21, 2024
144fa5b
Added grid class (with tests)
ProExpertProg Oct 21, 2024
de0c4c2
Added grid stuff to naive and related utilities
ProExpertProg Oct 21, 2024
beac1e5
compile errs
ProExpertProg Oct 23, 2024
3834192
Add transformer class
ProExpertProg Oct 23, 2024
bb806fc
Extract exporting (and importing) to Exporter. Currently complex is f…
ProExpertProg Oct 30, 2024
c42d912
Change export to save as layout-right. Verified the recorded moments …
ProExpertProg Oct 30, 2024
e6bf417
Formatting
ProExpertProg Oct 30, 2024
4e57bde
Remove old apar function
ProExpertProg Oct 30, 2024
fb465de
Add PrintTo utility for mdspan printing in GoogleTest, and print in m…
ProExpertProg Nov 12, 2024
c254187
Extract filtering to Filter class (TODO tests). Add cached versions a…
ProExpertProg Nov 6, 2024
ffcb60f
Remove extra mdspans
ProExpertProg Nov 14, 2024
f7b0bb6
filter tests
ProExpertProg Nov 10, 2024
3edb54f
Exporter
ProExpertProg Nov 22, 2024
6c77a1d
const Filter
ProExpertProg Nov 14, 2024
0227db1
Extract bracket calculation (correct)
ProExpertProg Nov 14, 2024
0c8292c
Extract PrepareDerivatives
ProExpertProg Nov 14, 2024
c7a3191
Move kx/ky/kperp to grid, move hyper to own file
ProExpertProg Nov 22, 2024
0fe404f
add std:: for exp and pow
ProExpertProg Nov 19, 2024
166e971
Update exp_* signatures, save hyper, use `dt` member
ProExpertProg Nov 22, 2024
9d41c18
Extract exp_* to classes
ProExpertProg Nov 22, 2024
b905f3b
Bench for exps
ProExpertProg Nov 22, 2024
8b4780b
Add moment-like bench
ProExpertProg Nov 22, 2024
cd745f3
Add cached exp to benchmark
ProExpertProg Nov 22, 2024
2798877
Add update and M scaling for benchmarks, save dt in cached
ProExpertProg Nov 24, 2024
fe28dca
Add test for cached exps
ProExpertProg Nov 24, 2024
8234115
Add vectorized version of filter
ProExpertProg Nov 10, 2024
20d7e0f
Extract vector dereferences outside of loop.
ProExpertProg Nov 10, 2024
908b92f
2 vectors along kx at a time, single vector to load values in, use pe…
ProExpertProg Nov 12, 2024
3b91e0b
Extract duplication utility, allows compilation without AVX-512
ProExpertProg Nov 12, 2024
ce0fb6a
Add tile along KY dimension, significant speed improvement (30-60%)
ProExpertProg Nov 14, 2024
b9aa679
Add vectorized PrepareDerivatives, test, and benchmark
ProExpertProg Nov 15, 2024
192795d
Manually extract common subexpression
ProExpertProg Nov 15, 2024
ac747dc
Vectorized norm
ProExpertProg Nov 15, 2024
87eb4a3
vector derivatives, Hou-Li
ProExpertProg Nov 24, 2024
d7c3f38
Add cilk_scope to bench/main
ProExpertProg Nov 24, 2024
79bfa0d
Spawn and cilk_for everything in Naive, Grid::for_each_* loops, and v…
ProExpertProg Nov 24, 2024
6fca533
TEMP: reducers, initial grid changes, no vector
ProExpertProg Nov 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ AllowShortLoopsOnASingleLine: false
BreakBeforeBraces: Attach
InsertBraces: true
ColumnLimit: 100
ForEachMacros: [ cilk_for ]

IncludeCategories:
- Regex: '^<'
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
cmake ../cpp -G Ninja -D CMAKE_BUILD_TYPE=Release -D ENABLE_CILK=OFF
ninja
- name: benchmark
run: ./cmake-build-release/bench/bench-naive --benchmark_out_format=json --benchmark_out=../bench-naive.json --benchmark_min_time=5x
run: ./cmake-build-release/bench/bench-naive --benchmark_out_format=json --benchmark_out=../bench-naive.json --benchmark_repetitions=10

# Compare to baseline (main)
- uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
Expand All @@ -48,7 +48,7 @@ jobs:
cmake ../cpp -G Ninja -D CMAKE_BUILD_TYPE=Release -D ENABLE_CILK=OFF
ninja
- name: benchmark-main
run: ./cmake-build-release-main/bench/bench-naive --benchmark_out_format=json --benchmark_out=../bench-naive-main.json --benchmark_min_time=5x
run: ./cmake-build-release-main/bench/bench-naive --benchmark_out_format=json --benchmark_out=../bench-naive-main.json --benchmark_repetitions=10
- name: compare-install
run: |
# Use cmake-build-release-main because cmake-build-release is gone after checkout
Expand Down
7 changes: 0 additions & 7 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 30 additions & 3 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,16 @@ FetchContent_Declare(
GIT_TAG e593f6695c6065e6b345fe2862f04a519ed484e0
)

FetchContent_MakeAvailable(argparse mdspan cnpy spdlog)
FetchContent_Declare(
eve
GIT_REPOSITORY https://github.com/jfalcou/eve.git
GIT_TAG b2d8b637e71d132654c52480549e9b79944d1f74
)

option(EVE_BUILD_TEST "Build EVE tests" OFF)
FetchContent_MakeAvailable(argparse mdspan cnpy spdlog eve)
target_include_directories(cnpy PUBLIC ${CNPY_SOURCE_DIR})
#target_include_directories(cnpy-static PUBLIC ${CNPY_SOURCE_DIR})

# ==========================
# Project code
Expand All @@ -44,23 +52,42 @@ target_link_libraries(fftw-cpp INTERFACE mdspan)
option(ENABLE_AVX512 "Enable AVX512 instructions (e.g. might want to turn off for the sake of Valgrind)" OFF)
if (ENABLE_AVX512)
add_compile_options("-mavx512f" "-march=native")
add_compile_definitions(AVX512_ENABLED)
endif ()

option(ENABLE_CILK "Enable Cilk parallelism" ON)
option(ENABLE_CILKSAN "Enable Cilk Sanitizer" OFF)
option(ENABLE_CILKSAN_WORKAROUND "Enable beta feature" OFF)
if (ENABLE_CILK)
add_compile_options("-fopencilk")
add_link_options("-fopencilk")
add_compile_definitions(CILK_ENABLED)
if (ENABLE_CILKSAN)
add_compile_options("-fsanitize=cilk")
add_link_options("-fsanitize=cilk")
add_compile_definitions(CILKSAN_ENABLED)
if (ENABLE_CILKSAN_WORKAROUND)
add_compile_options("-mllvm" "-cilksan-bc-path=/opt/OpenCilk-2.0.0/lib/clang/14.0.6/lib/x86_64-unknown-linux-gnu/libcilksan.bc")
add_link_options("-mllvm" "-cilksan-bc-path=/opt/OpenCilk-2.0.0/lib/clang/14.0.6/lib/x86_64-unknown-linux-gnu/libcilksan.bc")
endif ()
endif ()
endif ()

# TODO: cleanup (currently just to test formatting)
add_executable(adaptive_hermite_refinement lib/test-triangle.cpp)
target_include_directories(adaptive_hermite_refinement PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(adaptive_hermite_refinement argparse mdspan)

add_library(src-lib OBJECT lib/Naive.cpp lib/Naive.hpp lib/HermiteRunner.cpp lib/HermiteRunner.hpp)
add_library(src-lib OBJECT lib/Naive.cpp lib/Naive.hpp lib/HermiteRunner.cpp lib/HermiteRunner.hpp
lib/Transformer.hpp lib/Transformer.cpp
lib/Exporter.hpp lib/Exporter.cpp
lib/Filter.hpp lib/Filter.cpp
lib/PrepareDerivatives.hpp lib/PrepareDerivatives.cpp
lib/Brackets.hpp lib/Brackets.cpp
)
target_include_directories(src-lib PUBLIC lib/)
target_link_libraries(src-lib mdspan fftw-cpp cnpy spdlog::spdlog)
# TODO give eve only to vectorized targets
target_link_libraries(src-lib mdspan fftw-cpp cnpy spdlog::spdlog eve::eve)

add_subdirectory(bench)

Expand Down
11 changes: 10 additions & 1 deletion cpp/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,18 @@ FetchContent_Declare(
)
FetchContent_MakeAvailable(google-benchmark)

add_executable(bench-naive naive.cpp main-no-log.cpp)
add_executable(bench-naive naive.cpp main.cpp)
target_link_libraries(bench-naive src-lib benchmark::benchmark)

add_executable(bench-hl-filter hl-filter.cpp main.cpp)
target_link_libraries(bench-hl-filter src-lib benchmark::benchmark)

add_executable(bench-exps exps.cpp main.cpp)
target_link_libraries(bench-exps src-lib benchmark::benchmark)

add_executable(bench-prepare prepare-derivatives.cpp main.cpp)
target_link_libraries(bench-prepare src-lib benchmark::benchmark)

if (ENABLE_CILK)
add_executable(bench-fftw-cilk fftw-cilk.cpp)
target_link_libraries(bench-fftw-cilk fftw-cpp benchmark::benchmark fftw3_threads)
Expand Down
9 changes: 9 additions & 0 deletions cpp/bench/benchmark-util.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once
#include <benchmark/benchmark.h>

#include <algorithm>
#include <vector>

/// Add `min` as a statistic to the benchmark, useful for serial execution.
/// Not reported when only running 1 repetition.
///
/// Used exactly like `BENCHMARK(...)`; further configuration (e.g. `->ArgsProduct(...)`,
/// `->Unit(...)`) can be chained onto the result.
///
/// <algorithm> and <vector> are included by this header because the macro expansion names
/// `std::min_element` and `std::vector` in every translation unit that uses it.
#define BENCHMARK_WMIN(...)                                                                        \
  BENCHMARK(__VA_ARGS__)->ComputeStatistics("min", [](const std::vector<double> &v) {              \
    return *std::min_element(v.begin(), v.end());                                                  \
  })
72 changes: 72 additions & 0 deletions cpp/bench/exps.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#include "CachedExponentials.hpp"
#include "Exponentials.hpp"

#include "benchmark-util.hpp"
#include <iostream>

using namespace ahr;
using namespace ahr::exp;
/// Benchmark for exponentials that are evaluated as `exp(kx, ky)`.
///
/// Benchmark arguments: range(0) = M, range(1) = X, range(2) = Y; all three are passed to the
/// Grid constructor. Each timed iteration multiplies `dt` by 1.2, calls `exp.update(hyper, dt)`,
/// and then evaluates `exp(kx, ky)` at every point visited by `Grid::for_each_kxky`, M times over.
template <space_like Exp> static void BM_ExpKXKY(benchmark::State &state) {
  Dim const M = state.range(0);
  Dim const X = state.range(1);
  Dim const Y = state.range(2);

  Grid grid{M, X, Y};
  Real dt = 1.0;
  HyperCoefficients hyper = HyperCoefficients::calculate(dt, grid);
  Exp exp{grid};

  for (auto _ : state) {
    // A different dt is handed to update() on every iteration
    // (presumably so cached variants cannot reuse the previous result - TODO confirm).
    dt *= 1.2;
    exp.update(hyper, dt);
    // The index has the same type as M (Dim), as in BM_ExpM, so the comparison is not mixed-type.
    for (Dim m = 0; m < M; ++m) {
      grid.for_each_kxky([&](Dim kx, Dim ky) { benchmark::DoNotOptimize(exp(kx, ky)); });
    }
  }
}

/// Benchmark for exponentials that are evaluated as `exp(m)`.
///
/// Benchmark arguments: range(0) = M, range(1) = X, range(2) = Y; all three are passed to the
/// Grid constructor. Each timed iteration multiplies the time step by 1.2, calls
/// `update(hyper, dt)`, and then, for every m in [0, M), evaluates `exp(m)` once per point
/// visited by `Grid::for_each_kxky`.
template <moment_like Exp> static void BM_ExpM(benchmark::State &state) {
  Dim const numM = state.range(0);
  Dim const numX = state.range(1);
  Dim const numY = state.range(2);

  Grid grid{numM, numX, numY};
  Real step = 1.0;
  HyperCoefficients hyper = HyperCoefficients::calculate(step, grid);

  Exp expo{grid};

  for (auto _ : state) {
    step *= 1.2;
    expo.update(hyper, step);

    Dim m = 0;
    while (m < numM) {
      // Only m is passed to the exponential; the (kx, ky) indices are not used.
      grid.for_each_kxky([&](Dim, Dim) { benchmark::DoNotOptimize(expo(m)); });
      ++m;
    }
  }
}

// Benchmark registrations. Each ArgsProduct tuple is {M, X, Y}, matching the
// state.range(0..2) reads in BM_ExpKXKY / BM_ExpM; results are reported in milliseconds.
// First group: exponential types used directly.
BENCHMARK_WMIN(BM_ExpKXKY<Eta>)
->ArgsProduct({{2, 4, 16}, {2048, 4096}, {2048, 4096}})
->Unit(benchmark::kMillisecond);
BENCHMARK_WMIN(BM_ExpKXKY<Nu>)
->ArgsProduct({{2, 4, 16}, {2048, 4096}, {2048, 4096}})
->Unit(benchmark::kMillisecond);
BENCHMARK_WMIN(BM_ExpKXKY<NuG>)
->ArgsProduct({{2, 4, 16}, {2048, 4096}, {2048, 4096}})
->Unit(benchmark::kMillisecond);
// Note: the M values for BM_ExpM are {4, 8, 16}, not {2, 4, 16} as above.
BENCHMARK_WMIN(BM_ExpM<GM>)
->ArgsProduct({{4, 8, 16}, {2048, 4096}, {2048, 4096}})
->Unit(benchmark::kMillisecond);

// Second group: the same exponential types wrapped in CachedKXKY / CachedM,
// registered with the same argument sets as their unwrapped counterparts.
BENCHMARK_WMIN(BM_ExpKXKY<CachedKXKY<Eta>>)
->ArgsProduct({{2, 4, 16}, {2048, 4096}, {2048, 4096}})
->Unit(benchmark::kMillisecond);
BENCHMARK_WMIN(BM_ExpKXKY<CachedKXKY<Nu>>)
->ArgsProduct({{2, 4, 16}, {2048, 4096}, {2048, 4096}})
->Unit(benchmark::kMillisecond);
BENCHMARK_WMIN(BM_ExpKXKY<CachedKXKY<NuG>>)
->ArgsProduct({{2, 4, 16}, {2048, 4096}, {2048, 4096}})
->Unit(benchmark::kMillisecond);
BENCHMARK_WMIN(BM_ExpM<CachedM<GM>>)
->ArgsProduct({{4, 8, 16}, {2048, 4096}, {2048, 4096}})
->Unit(benchmark::kMillisecond);
16 changes: 11 additions & 5 deletions cpp/bench/fftw-cilk.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "fftw-cpp/fftw-cpp.h"

#include <benchmark/benchmark.h>
#include "benchmark-util.hpp"
#include <cilk/cilk.h>
#include <cilk/cilkscale.h>
#include <cstring>
Expand Down Expand Up @@ -76,15 +76,21 @@ static void BM_FFTW_2D(benchmark::State &state) {
auto constexpr K = 1024;
auto constexpr M = K * K;

BENCHMARK(BM_copy_1D)->RangeMultiplier(2)->Range(16 * M, 64 * M)->Unit(benchmark::kMillisecond);
BENCHMARK_WMIN(BM_copy_1D)
->RangeMultiplier(2)
->Range(16 * M, 64 * M)
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_FFTW_1D)->RangeMultiplier(2)->Range(16 * M, 64 * M)->Unit(benchmark::kMillisecond);
BENCHMARK_WMIN(BM_FFTW_1D)
->RangeMultiplier(2)
->Range(16 * M, 64 * M)
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_copy_2D)
BENCHMARK_WMIN(BM_copy_2D)
->ArgsProduct({{4 * K, 8 * K, 16 * K}, {4 * K, 8 * K, 16 * K}})
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_FFTW_2D)
BENCHMARK_WMIN(BM_FFTW_2D)
->ArgsProduct({{4 * K, 8 * K, 16 * K}, {4 * K, 8 * K, 16 * K}})
->Unit(benchmark::kMillisecond);

Expand Down
41 changes: 41 additions & 0 deletions cpp/bench/hl-filter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include "Filter.hpp"

#include <benchmark/benchmark.h>

static constexpr auto N_WARMUP_ITERS = 5;

using namespace ahr;
/// Benchmark applying a filter of type `Filter` to a single buffer obtained from the grid.
///
/// Benchmark arguments: range(0) = X, range(1) = Y; the Grid is constructed as {1, X, Y}.
/// The filter is invoked N_WARMUP_ITERS times outside the timed loop, then once per timed
/// iteration, always on the same buffer.
template <typename Filter> static void BM_HouLiFilter(benchmark::State &state) {
  Dim const sizeX = state.range(0);
  Dim const sizeY = state.range(1);

  Grid grid{1, sizeX, sizeY};
  Filter applyFilter{grid};

  auto data = grid.cBufXY();

  // Untimed warm-up invocations.
  for (auto remaining = N_WARMUP_ITERS; remaining > 0; --remaining) {
    applyFilter(data);
  }

  // Timed section.
  for (auto _ : state) {
    applyFilter(data);
  }
}

// Benchmark registrations, one per filter implementation. Each ArgsProduct tuple is {X, Y},
// matching the state.range(0..1) reads in BM_HouLiFilter; results are reported in milliseconds.
// NOTE(review): these use plain BENCHMARK rather than BENCHMARK_WMIN, so no `min` statistic
// is added here - confirm whether that is intentional.
BENCHMARK(BM_HouLiFilter<HouLiFilter>)
->ArgsProduct({{2048, 4096, 8192}, {2048, 4096, 8192}})
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_HouLiFilter<HouLiFilterCached>)
->ArgsProduct({{2048, 4096, 8192}, {2048, 4096, 8192}})
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_HouLiFilter<HouLiFilterCached1D>)
->ArgsProduct({{2048, 4096, 8192}, {2048, 4096, 8192}})
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_HouLiFilter<HouLiFilterCached1DVector>)
->ArgsProduct({{2048, 4096, 8192}, {2048, 4096, 8192}})
->Unit(benchmark::kMillisecond);
11 changes: 8 additions & 3 deletions cpp/bench/main-no-log.cpp → cpp/bench/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@
#include <spdlog/cfg/env.h>
#include <spdlog/spdlog.h>

#include "cilk.hpp"

int main(int argc, char **argv) {
// No logging in benchmarks (unless overridden via environment variable)
spdlog::set_level(spdlog::level::off);
spdlog::cfg::load_env_levels();

benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
// Invoke a cilk_scope to avoid Cilk startup/shutdown in benchmarks
cilk_scope {
benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
}
}
6 changes: 2 additions & 4 deletions cpp/bench/naive.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
#include "Naive.hpp"

#include <benchmark/benchmark.h>
#include "benchmark-util.hpp"
#include <iostream>

using namespace ahr;
static void BM_Naive(benchmark::State &state) {
std::ostringstream oss;

Dim const M = state.range(0);
Dim const X = state.range(1);
Dim const N = state.range(2);
Expand All @@ -26,7 +24,7 @@ static void BM_Naive(benchmark::State &state) {
}
}

BENCHMARK(BM_Naive)
BENCHMARK_WMIN(BM_Naive)
->ArgsProduct({{2, 4, 10, 45}, {128, 256, 512}, {5}})
->ArgsProduct({{2, 4}, {1024, 2048}, {5}})
->ArgsProduct({{2}, {4096}, {5}})
Expand Down
35 changes: 35 additions & 0 deletions cpp/bench/prepare-derivatives.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#include "PrepareDerivatives.hpp"

#include <benchmark/benchmark.h>

static constexpr auto N_WARMUP_ITERS = 5;

using namespace ahr;
/// Benchmark invoking a `Prepare` functor on three buffers obtained from the grid.
///
/// Benchmark arguments: range(0) = X, range(1) = Y; the Grid is constructed as {1, X, Y}.
/// The functor is called as `prepare(buf, {buf2, buf3})`: N_WARMUP_ITERS times outside the
/// timed loop, then once per timed iteration, always with the same three buffers.
template <typename Prepare> static void BM_Prepare(benchmark::State &state) {
  Dim const sizeX = state.range(0);
  Dim const sizeY = state.range(1);

  Grid grid{1, sizeX, sizeY};
  Prepare runPrepare{grid};

  auto bufA = grid.cBufXY();
  auto bufB = grid.cBufXY();
  auto bufC = grid.cBufXY();

  // Untimed warm-up invocations.
  for (auto remaining = N_WARMUP_ITERS; remaining > 0; --remaining) {
    runPrepare(bufA, {bufB, bufC});
  }

  // Timed section.
  for (auto _ : state) {
    runPrepare(bufA, {bufB, bufC});
  }
}

// Benchmark registrations for both PrepareDerivatives implementations. Each ArgsProduct tuple
// is {X, Y}, matching the state.range(0..1) reads in BM_Prepare; results are in milliseconds.
// NOTE(review): these use plain BENCHMARK rather than BENCHMARK_WMIN, so no `min` statistic
// is added here - confirm whether that is intentional.
BENCHMARK(BM_Prepare<PrepareDerivatives>)
->ArgsProduct({{2048, 4096, 8192}, {2048, 4096, 8192}})
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_Prepare<PrepareDerivativesVector>)
->ArgsProduct({{2048, 4096, 8192}, {2048, 4096, 8192}})
->Unit(benchmark::kMillisecond);
Loading
Loading