diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index 401b06d268..3969b6068f 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -30,129 +30,60 @@
 // The exact meaning of each argument depends on the benchmark, allocator, and size components used.
 // Refer to the 'argsName()' function in each component to find detailed descriptions of these arguments.
 
-static void default_alloc_fix_size(benchmark::internal::Benchmark *benchmark) {
-    benchmark->Args({10000, 0, 4096});
-    benchmark->Args({10000, 100000, 4096});
+static void multithreaded(benchmark::internal::Benchmark *benchmark) {
     benchmark->Threads(4);
     benchmark->Threads(1);
 }
 
-static void
-default_alloc_uniform_size(benchmark::internal::Benchmark *benchmark) {
-    benchmark->Args({10000, 0, 8, 64 * 1024, 8});
-    benchmark->Threads(4);
-    benchmark->Threads(1);
-}
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, glibc_fix, fixed_alloc_size,
-                              glibc_malloc);
-
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, glibc_fix)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, glibc_uniform,
-                              uniform_alloc_size, glibc_malloc);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, glibc_uniform)
-    ->Apply(&default_alloc_uniform_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, os_provider, fixed_alloc_size,
-                              provider_allocator<os_provider>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, os_provider)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, proxy_pool, fixed_alloc_size,
-                              pool_allocator<proxy_pool<os_provider>>);
-
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, proxy_pool)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, disjoint_pool_fix,
-                              fixed_alloc_size,
-                              pool_allocator<disjoint_pool<os_provider>>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, disjoint_pool_fix)
-    ->Apply(&default_alloc_fix_size);
-
-// TODO: debug why this crashes
-/*UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, disjoint_pool_uniform,
-                              uniform_alloc_size,
-                              pool_allocator<disjoint_pool<os_provider>>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, disjoint_pool_uniform)
-    ->Apply(&default_alloc_uniform_size);
-*/
-
-#ifdef UMF_POOL_JEMALLOC_ENABLED
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, jemalloc_pool_fix,
-                              fixed_alloc_size,
-                              pool_allocator<jemalloc_pool<os_provider>>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, jemalloc_pool_fix)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, jemalloc_pool_uniform,
-                              uniform_alloc_size,
-                              pool_allocator<jemalloc_pool<os_provider>>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, jemalloc_pool_uniform)
-    ->Apply(&default_alloc_uniform_size);
-
-#endif
-#ifdef UMF_POOL_SCALABLE_ENABLED
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, scalable_pool_fix,
-                              fixed_alloc_size,
-                              pool_allocator<scalable_pool<os_provider>>);
-
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, scalable_pool_fix)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, scalable_pool_uniform,
-                              uniform_alloc_size,
-                              pool_allocator<scalable_pool<os_provider>>);
-
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, scalable_pool_uniform)
-    ->Apply(&default_alloc_uniform_size);
-#endif
-// Multiple allocs/free
 static void
 default_multiple_alloc_fix_size(benchmark::internal::Benchmark *benchmark) {
-    benchmark->Args({10000, 4096});
-    benchmark->Threads(4);
-    benchmark->Threads(1);
+    benchmark->Args({10000, 1, 4096});
+    benchmark->Iterations(500000);
 }
 
 static void
 default_multiple_alloc_uniform_size(benchmark::internal::Benchmark *benchmark) {
-    benchmark->Args({10000, 8, 64 * 1024, 8});
-    benchmark->Threads(4);
-    benchmark->Threads(1);
+    benchmark->Args({10000, 1, 8, 4096, 8});
+    benchmark->Args({10000, 1, 8, 128, 8});
+    benchmark->Iterations(500000);
 }
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, glibc_fix,
                               fixed_alloc_size, glibc_malloc);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, glibc_fix)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, glibc_uniform,
                               uniform_alloc_size, glibc_malloc);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, glibc_uniform)
-    ->Apply(&default_multiple_alloc_uniform_size);
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, proxy_pool,
                               fixed_alloc_size,
                               pool_allocator<proxy_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, proxy_pool)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    // reduce iterations, as this benchmark is slower than others
+    ->Iterations(50000);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, os_provider,
                               fixed_alloc_size,
                               provider_allocator<os_provider>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, os_provider)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    // reduce iterations, as this benchmark is slower than others
+    ->Iterations(50000);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               disjoint_pool_fix, fixed_alloc_size,
                               pool_allocator<disjoint_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_fix)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               disjoint_pool_uniform, uniform_alloc_size,
@@ -165,13 +96,15 @@ UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, jemalloc_pool_fix,
                               fixed_alloc_size,
                               pool_allocator<jemalloc_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, jemalloc_pool_fix)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               jemalloc_pool_uniform, uniform_alloc_size,
                               pool_allocator<jemalloc_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, jemalloc_pool_uniform)
-    ->Apply(&default_multiple_alloc_uniform_size);
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
 
 #endif
 
@@ -181,14 +114,25 @@ UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, scalable_pool_fix,
                               pool_allocator<scalable_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, scalable_pool_fix)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               scalable_pool_uniform, uniform_alloc_size,
                               pool_allocator<scalable_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, scalable_pool_uniform)
-    ->Apply(&default_multiple_alloc_uniform_size);
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
 #endif
-BENCHMARK_MAIN();
+
+//BENCHMARK_MAIN();
+int main(int argc, char **argv) {
+    if (initAffinityMask()) {
+        return -1;
+    }
+    benchmark::Initialize(&argc, argv);
+    benchmark::RunSpecifiedBenchmarks();
+    benchmark::Shutdown();
+}
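For reference, the hand-written `main()` above replaces `BENCHMARK_MAIN()` so that `initAffinityMask()` (defined in benchmark.hpp below) runs before Google Benchmark spawns any worker threads. In recent Google Benchmark releases the macro expands to roughly the following; this is an approximate sketch, the exact definition lives in `<benchmark/benchmark.h>` and varies between versions:

```cpp
// Approximate expansion of BENCHMARK_MAIN(), shown for comparison only.
// The replacement main() above adds the affinity-mask setup in front and
// drops the unrecognized-argument check.
#include <benchmark/benchmark.h>

int main(int argc, char **argv) {
    ::benchmark::Initialize(&argc, argv);
    if (::benchmark::ReportUnrecognizedArguments(argc, argv)) {
        return 1;
    }
    ::benchmark::RunSpecifiedBenchmarks();
    ::benchmark::Shutdown();
    return 0;
}
```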
diff --git a/benchmark/benchmark.hpp b/benchmark/benchmark.hpp
index 50e75f8fb2..a960d89bca 100644
--- a/benchmark/benchmark.hpp
+++ b/benchmark/benchmark.hpp
@@ -70,8 +70,10 @@
  * - Additional benchmarking scenarios can be created by extending `benchmark_interface`.
  */
 
-#include <random>
+#include <malloc.h>
 
 #include <benchmark/benchmark.h>
+
+#include <random>
 #include <umf/memory_pool.h>
 #include <umf/memory_provider.h>
 
@@ -83,13 +85,92 @@ struct alloc_data {
     size_t size;
 };
 
+struct next_alloc_data {
+    size_t offset;
+    size_t size;
+};
+
+#ifndef WIN32
+std::vector<cpu_set_t> affinityMask;
+
+int initAffinityMask() {
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+
+    if (sched_getaffinity(0, sizeof(mask), &mask) == -1) {
+        perror("sched_getaffinity");
+        return 1;
+    }
+
+    for (int cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+        if (CPU_ISSET(cpu, &mask)) {
+            cpu_set_t cpuMask;
+            CPU_ZERO(&cpuMask);
+            CPU_SET(cpu, &cpuMask);
+            affinityMask.push_back(cpuMask);
+        }
+    }
+    // Reverse affinityMask so that CPU 0 is used last if possible:
+    // CPU 0 is usually the core most heavily used by other applications
+    // on the system.
+    std::reverse(affinityMask.begin(), affinityMask.end());
+    return 0;
+}
+
+void setAffinity(benchmark::State &state) {
+    size_t tid = state.thread_index();
+    if (tid >= affinityMask.size()) {
+        state.SkipWithError("Not enough CPUs available to set affinity");
+        return;
+    }
+
+    auto &mask = affinityMask[tid];
+
+    if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
+        state.SkipWithError("Failed to set affinity");
+    }
+}
+
+#else
+int initAffinityMask() {
+    printf("Setting CPU affinity is not supported on Windows; "
+           "benchmark results may be unstable\n");
+    return 0;
+}
+
+void setAffinity([[maybe_unused]] benchmark::State &state) {
+    // Not implemented for Windows
+}
+
+#endif
+
+// Barrier that blocks until all benchmark threads have reached it.
+inline void waitForAllThreads(const benchmark::State &state) {
+    static std::atomic<int> count{0};
+    static std::atomic<int> generation{0};
+
+    const int totalThreads = state.threads();
+    int gen = generation.load(std::memory_order_relaxed);
+
+    int c = count.fetch_add(1, std::memory_order_acq_rel) + 1;
+
+    if (c == totalThreads) {
+        // Last thread: reset the count and bump the generation
+        count.store(0, std::memory_order_relaxed);
+        generation.fetch_add(1, std::memory_order_acq_rel);
+    } else {
+        // Not the last thread: spin until the generation changes
+        while (generation.load(std::memory_order_acquire) == gen) {
+            std::this_thread::yield();
+        }
+    }
+}
+
 template <typename Provider,
           typename = std::enable_if_t<
               std::is_base_of<provider_interface, Provider>::value>>
 class provider_allocator : public allocator_interface {
   public:
-    unsigned SetUp(::benchmark::State &state, unsigned r) override {
+    unsigned SetUp(::benchmark::State &state, unsigned argPos) override {
         provider.SetUp(state);
-        return r;
+        return argPos;
     }
 
     void TearDown(::benchmark::State &state) override {
@@ -118,9 +199,9 @@ class provider_allocator : public allocator_interface {
 
 // TODO: assert Pool to be a pool_interface.
 template <typename Pool> class pool_allocator : public allocator_interface {
   public:
-    unsigned SetUp(::benchmark::State &state, unsigned r) override {
+    unsigned SetUp(::benchmark::State &state, unsigned argPos) override {
         pool.SetUp(state);
-        return r;
+        return argPos;
     }
 
     void TearDown(::benchmark::State &state) override { pool.TearDown(state); }
 
@@ -141,19 +222,28 @@ template <typename Size, typename Allocator>
 struct benchmark_interface : public benchmark::Fixture {
-    void SetUp(::benchmark::State &state) {
-        int argPos = alloc_size.SetUp(state, 0);
-        allocator.SetUp(state, argPos);
+    int parseArgs(::benchmark::State &state, int argPos) {
+        Size generator;
+        argPos = generator.SetUp(state, argPos);
+        argPos = allocator.SetUp(state, argPos);
+        alloc_sizes.resize(state.threads());
+        for (auto &i : alloc_sizes) {
+            i = generator;
+        }
+        return argPos;
     }
 
+    void SetUp(::benchmark::State &state) { parseArgs(state, 0); }
+
     void TearDown(::benchmark::State &state) {
-        alloc_size.TearDown(state);
+        for (auto &i : alloc_sizes) {
+            i.TearDown(state);
+        }
         allocator.TearDown(state);
     }
 
     virtual void bench(::benchmark::State &state) = 0;
 
-    static std::vector<std::string> argsName() {
+    virtual std::vector<std::string> argsName() {
         auto s = Size::argsName();
         auto a = Allocator::argsName();
         std::vector<std::string> res = {};
@@ -163,209 +253,167 @@ struct benchmark_interface : public benchmark::Fixture {
     }
 
     virtual std::string name() { return Allocator::name(); }
-    virtual int64_t iterations() { return 10000; }
+
     static void defaultArgs(Benchmark *benchmark) {
         auto *bench =
             static_cast<benchmark_interface<Size, Allocator> *>(benchmark);
-        benchmark->ArgNames(bench->argsName())
-            ->Name(bench->name())
-            ->Iterations(bench->iterations());
+        benchmark->ArgNames(bench->argsName())->Name(bench->name());
     }
-    Size alloc_size;
+
+    std::vector<Size> alloc_sizes;
     Allocator allocator;
 };
 
-// This class benchmarks speed of alloc() operations.
+// This class benchmarks performance of random deallocations and (re)allocations
 template <
     typename Size, typename Alloc,
     typename =
         std::enable_if_t<std::is_base_of<alloc_size_interface, Size>::value>,
     typename =
         std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
-class alloc_benchmark : public benchmark_interface<Size, Alloc> {
+class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
+    using distribution = std::uniform_int_distribution<size_t>;
+    template <typename T> using vector2d = std::vector<std::vector<T>>;
+    using base = benchmark_interface<Size, Alloc>;
+
+    int allocsPerIterations = 10;
+    bool thread_local_allocations = true;
+    size_t max_allocs = 0;
+
+    vector2d<alloc_data> allocations;
+    std::vector<size_t> iters;
+
+    vector2d<next_alloc_data> next;
+    std::vector<std::vector<next_alloc_data>::const_iterator> next_iter;
+    int64_t iterations;
+
   public:
-    size_t max_allocs = 1000;
-    size_t pre_allocs = 0;
     void SetUp(::benchmark::State &state) override {
-        if (state.thread_index() != 0) {
-            return;
-        }
+        auto tid = state.thread_index();
 
-        // unpack arguments
-        int argPos = 0;
-        max_allocs = state.range(argPos++);
-        pre_allocs = state.range(argPos++);
-        // pass rest of the arguments to "alloc_size" and "allocator"
-        argPos = base::alloc_size.SetUp(state, argPos);
-        base::allocator.SetUp(state, argPos);
-
-        // initialize allocations tracking vectors (one per thread)
-        // and iterators for these vectors.
-        allocations.resize(state.threads());
-        iters.resize(state.threads());
-
-        for (auto &i : iters) {
-            i = pre_allocs;
+        if (tid == 0) {
+            // unpack arguments
+            iterations = state.max_iterations;
+            int argPos = 0;
+            max_allocs = state.range(argPos++);
+            thread_local_allocations = state.range(argPos++);
+            base::parseArgs(state, argPos);
+
+            allocations.resize(state.threads());
+            next.resize(state.threads());
+            next_iter.resize(state.threads());
+
+#ifndef WIN32
+            // Ensure that the system malloc does not keep memory pooled on the heap
+            malloc_trim(0);
+#endif
         }
-
-        // do "pre_alloc" allocations before actual benchmark.
-        for (auto &i : allocations) {
-            i.resize(max_allocs + pre_allocs);
-
-            for (size_t j = 0; j < pre_allocs; j++) {
-                i[j].ptr =
-                    base::allocator.benchAlloc(base::alloc_size.nextSize());
-                if (i[j].ptr == NULL) {
-                    state.SkipWithError("preallocation failed");
-                    return;
-                }
-                i[j].size = base::alloc_size.nextSize();
-            }
+        setAffinity(state);
+        // sync threads to ensure that thread 0 has parsed the arguments
+        // and finished all initialization
+        waitForAllThreads(state);
+        // prepare workload for warm-up
+        prealloc(state);
+        prepareWorkload(state);
+        // start warm-up with all threads at once
+        waitForAllThreads(state);
+        // warm up
+        for (int j = 0; j < iterations; j++) {
+            bench(state);
         }
+        waitForAllThreads(state);
+        // prepare workload for the actual benchmark
+        freeAllocs(state);
+        prealloc(state);
+        prepareWorkload(state);
     }
 
     void TearDown(::benchmark::State &state) override {
-        if (state.thread_index() != 0) {
-            return;
-        }
-        for (auto &i : allocations) {
-            for (auto &j : i) {
-                if (j.ptr != NULL) {
-                    base::allocator.benchFree(j.ptr, j.size);
-                    j.ptr = NULL;
-                    j.size = 0;
-                }
-            }
-        }
+        auto tid = state.thread_index();
+        freeAllocs(state);
+        waitForAllThreads(state);
+        if (tid == 0) {
+            // release memory used by the benchmark
+            next.clear();
+            next_iter.clear();
+            allocations.clear();
+            iters.clear();
+        }
         base::TearDown(state);
     }
 
     void bench(benchmark::State &state) override {
         auto tid = state.thread_index();
-        auto s = base::alloc_size.nextSize();
-        auto &i = iters[tid];
-        allocations[tid][i].ptr = base::allocator.benchAlloc(s);
-        if (allocations[tid][i].ptr == NULL) {
-            state.SkipWithError("allocation failed");
-            return;
-        }
-        allocations[tid][i].size = s;
-        i++;
-        if (i >= max_allocs + pre_allocs) {
-            // This benchmark tests only allocations -
-            // if allocation tracker is full we pause benchmark to dealloc all allocations -
-            // excluding pre-allocated ones.
-            state.PauseTiming();
-            while (i > pre_allocs) {
-                auto &allocation = allocations[tid][--i];
-                base::allocator.benchFree(allocation.ptr, allocation.size);
-                allocation.ptr = NULL;
-                allocation.size = 0;
+        auto &allocation = allocations[tid];
+        for (int i = 0; i < allocsPerIterations; i++) {
+            auto &n = *next_iter[tid]++;
+            auto &alloc = allocation[n.offset];
+            base::allocator.benchFree(alloc.ptr, alloc.size);
+
+            alloc.size = n.size;
+            alloc.ptr = base::allocator.benchAlloc(alloc.size);
+
+            if (alloc.ptr == NULL) {
+                state.SkipWithError("allocation failed");
             }
-            state.ResumeTiming();
         }
     }
 
+    virtual std::string name() {
+        return base::name() + "/multiple_malloc_free";
+    }
+
     virtual std::vector<std::string> argsName() {
         auto n = benchmark_interface<Size, Alloc>::argsName();
-        std::vector<std::string> res = {"max_allocs", "pre_allocs"};
+        std::vector<std::string> res = {"max_allocs",
+                                        "thread_local_allocations"};
         res.insert(res.end(), n.begin(), n.end());
         return res;
     }
 
-    virtual std::string name() { return base::name() + "/alloc"; }
-    virtual int64_t iterations() { return 200000; }
-
-  protected:
-    using base = benchmark_interface<Size, Alloc>;
-    std::vector<std::vector<alloc_data>> allocations;
-    std::vector<size_t> iters;
-};
-
-// This class benchmarks performance of random deallocations and (re)allocations
-template <
-    typename Size, typename Alloc,
-    typename =
-        std::enable_if_t<std::is_base_of<alloc_size_interface, Size>::value>,
-    typename =
-        std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
-class multiple_malloc_free_benchmark : public alloc_benchmark<Size, Alloc> {
-    using distribution = std::uniform_int_distribution<size_t>;
-    using base = alloc_benchmark<Size, Alloc>;
-
-  public:
-    int reallocs = 100;
-    void SetUp(::benchmark::State &state) override {
-        if (state.thread_index() != 0) {
-            return;
-        }
-        // unpack arguments
-        int argPos = 0;
-        base::max_allocs = state.range(argPos++);
-
-        // pass rest of the arguments to "alloc_size" and "allocator"
-        argPos = base::alloc_size.SetUp(state, argPos);
-        base::allocator.SetUp(state, argPos);
-
-        // perform initial allocations which will be later freed and reallocated
-        base::allocations.resize(state.threads());
-        for (auto &i : base::allocations) {
-            i.resize(base::max_allocs);
-
-            for (size_t j = 0; j < base::max_allocs; j++) {
-                i[j].ptr =
-                    base::allocator.benchAlloc(base::alloc_size.nextSize());
-                if (i[j].ptr == NULL) {
-                    state.SkipWithError("preallocation failed");
-                    return;
-                }
-                i[j].size = base::alloc_size.nextSize();
-            }
-        }
-        dist.param(distribution::param_type(0, base::max_allocs - 1));
-    }
-
-    void bench(benchmark::State &state) override {
-        auto tid = state.thread_index();
-        auto &allocation = base::allocations[tid];
-        std::vector<size_t> to_alloc;
-        for (int j = 0; j < reallocs; j++) {
-            auto idx = dist(generator);
-            if (allocation[idx].ptr == NULL) {
-                continue;
-            }
-            to_alloc.push_back(idx);
-
-            base::allocator.benchFree(allocation[idx].ptr,
-                                      allocation[idx].size);
-            allocation[idx].ptr = NULL;
-            allocation[idx].size = 0;
-        }
-
-        for (auto idx : to_alloc) {
-            auto s = base::alloc_size.nextSize();
-            allocation[idx].ptr = base::allocator.benchAlloc(s);
-            if (allocation[idx].ptr == NULL) {
-                state.SkipWithError("allocation failed");
-            }
-            allocation[idx].size = s;
-        }
-    }
-
-    virtual std::string name() {
-        return base::base::name() + "/multiple_malloc_free";
-    }
-
-    virtual std::vector<std::string> argsName() {
-        auto n = benchmark_interface<Size, Alloc>::argsName();
-        std::vector<std::string> res = {"max_allocs"};
-        res.insert(res.end(), n.begin(), n.end());
-        return res;
-    }
-
-    virtual int64_t iterations() { return 2000; }
-
-    std::default_random_engine generator;
-    distribution dist;
+  private:
+    void prealloc(benchmark::State &state) {
+        auto tid = state.thread_index();
+        auto &i = allocations[tid];
+        i.resize(max_allocs);
+        auto sizeGenerator = base::alloc_sizes[tid];
+        for (size_t j = 0; j < max_allocs; j++) {
+            auto size = sizeGenerator.nextSize();
+            i[j].ptr = base::allocator.benchAlloc(size);
+            if (i[j].ptr == NULL) {
+                state.SkipWithError("preallocation failed");
+                return;
+            }
+            i[j].size = size;
+        }
+    }
+
+    void freeAllocs(benchmark::State &state) {
+        auto tid = state.thread_index();
+        auto &i = allocations[tid];
+        for (auto &j : i) {
+            if (j.ptr != NULL) {
+                base::allocator.benchFree(j.ptr, j.size);
+                j.ptr = NULL;
+                j.size = 0;
+            }
+        }
+    }
+
+    void prepareWorkload(benchmark::State &state) {
+        auto tid = state.thread_index();
+        auto &n = next[tid];
+        std::default_random_engine generator;
+        distribution dist;
+        generator.seed(0);
+        dist.param(distribution::param_type(0, max_allocs - 1));
+        auto sizeGenerator = base::alloc_sizes[tid];
+
+        n.clear();
+        for (int64_t j = 0; j < state.max_iterations * allocsPerIterations;
+             j++) {
+            n.push_back({dist(generator), sizeGenerator.nextSize()});
+        }
+        next_iter[tid] = n.cbegin();
+    }
 };
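The thread synchronization in `SetUp()`/`TearDown()` above leans on `waitForAllThreads()`, a spin barrier built from two atomics: a counter of arrived threads and a generation number that the last arrival bumps to release the waiters, which also re-arms the barrier for the next use. A minimal standalone demo of the same pattern (hypothetical driver code, not part of the patch):

```cpp
// Standalone demo of the generation-based spin barrier used by
// waitForAllThreads() above.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

void waitForAll(int totalThreads) {
    static std::atomic<int> count{0};
    static std::atomic<int> generation{0};

    int gen = generation.load(std::memory_order_relaxed);
    if (count.fetch_add(1, std::memory_order_acq_rel) + 1 == totalThreads) {
        // Last arrival: reset the counter, then release the waiters by
        // bumping the generation.
        count.store(0, std::memory_order_relaxed);
        generation.fetch_add(1, std::memory_order_acq_rel);
    } else {
        while (generation.load(std::memory_order_acquire) == gen) {
            std::this_thread::yield();
        }
    }
}

int main() {
    const int n = 4;
    std::vector<std::thread> threads;
    for (int i = 0; i < n; i++) {
        threads.emplace_back([n] {
            // ... per-thread setup would run here ...
            waitForAll(n); // nobody proceeds until all n threads arrive
            std::printf("past the barrier\n");
        });
    }
    for (auto &t : threads) {
        t.join();
    }
    return 0;
}
```

Because waiters watch the generation rather than the counter, a fast thread that immediately reaches the next `waitForAll()` call cannot slip through a barrier that has not fully drained.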
diff --git a/benchmark/benchmark_size.hpp b/benchmark/benchmark_size.hpp
index d17a6b2869..44e4bf1da8 100644
--- a/benchmark/benchmark_size.hpp
+++ b/benchmark/benchmark_size.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2024 Intel Corporation
+ * Copyright (C) 2024-2025 Intel Corporation
  *
  * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -45,7 +45,7 @@ class uniform_alloc_size : public alloc_size_interface {
             state.SkipWithError("min and max must be divisible by granularity");
             return argPos;
         }
-
+        generator.seed(0);
         dist.param(distribution::param_type(min / gran, max / gran));
         multiplier = gran;
         return argPos;
@@ -53,11 +53,11 @@ class uniform_alloc_size : public alloc_size_interface {
     void TearDown([[maybe_unused]] ::benchmark::State &state) override {}
     size_t nextSize() override { return dist(generator) * multiplier; }
     static std::vector<std::string> argsName() {
-        return {"min size", "max size", "granularity"};
+        return {"min_size", "max_size", "granularity"};
    }
 
   private:
     std::default_random_engine generator;
     distribution dist;
-    size_t multiplier;
+    size_t multiplier = 1;
 };
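The renamed `min_size`/`max_size`/`granularity` arguments and the deterministic `generator.seed(0)` make runs reproducible across invocations. As the header comment in benchmark.hpp notes, new scenarios plug in by extending the size and allocator interfaces; a hypothetical size generator modeled on `uniform_alloc_size` might look like this (a sketch only: `alternating_alloc_size` is not part of the patch, and the exact `alloc_size_interface` signatures are assumed from benchmark_size.hpp):

```cpp
// Hypothetical size generator (not part of the patch), modeled on
// uniform_alloc_size; it alternates between two sizes to stress
// allocator size-class switching.
#include "benchmark_size.hpp"

class alternating_alloc_size : public alloc_size_interface {
  public:
    unsigned SetUp(::benchmark::State &state, unsigned argPos) override {
        small_size = state.range(argPos++);
        large_size = state.range(argPos++);
        return argPos;
    }
    void TearDown([[maybe_unused]] ::benchmark::State &state) override {}
    size_t nextSize() override {
        toggle = !toggle;
        return toggle ? small_size : large_size;
    }
    static std::vector<std::string> argsName() {
        return {"small_size", "large_size"};
    }

  private:
    size_t small_size = 8;
    size_t large_size = 4096;
    bool toggle = false;
};
```

Note that `benchmark_interface` now clones one size generator per thread (the `alloc_sizes` vector), so a generator like this only has to be copyable and, like `uniform_alloc_size`, deterministic after `SetUp()`.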