diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index f9b176b6ae..d1734b5d07 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -32,10 +32,9 @@ // The exact meaning of each argument depends on the benchmark, allocator, and size components used. // Refer to the 'argsName()' function in each component to find detailed descriptions of these arguments. -template static void multithreaded(benchmark::internal::Benchmark *benchmark) { benchmark->Threads(1); - benchmark->DenseThreadRange(4, max_threads, 4); + benchmark->Threads(4); } static void singlethreaded(benchmark::internal::Benchmark *benchmark) { @@ -92,16 +91,14 @@ UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, disjoint_pool_fix, pool_allocator>); UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_fix) ->Apply(&default_multiple_alloc_fix_size) - // Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts. - ->Apply(&multithreaded<4>); + ->Apply(&multithreaded); UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, disjoint_pool_uniform, uniform_alloc_size, pool_allocator>); UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_uniform) ->Apply(&default_multiple_alloc_uniform_size) - // Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts. 
- ->Apply(&multithreaded<4>); + ->Apply(&multithreaded); #ifdef UMF_POOL_JEMALLOC_ENABLED UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, jemalloc_pool_fix, @@ -159,6 +156,70 @@ UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, fixed_provider) // reduce iterations, to match os_provider benchmark ->Iterations(50000); +// peak +UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_fix, fixed_alloc_size, + glibc_malloc); + +UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_fix) + ->Apply(&default_multiple_alloc_fix_size) + ->Apply(&multithreaded); + +UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_uniform, + uniform_alloc_size, glibc_malloc); +UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_uniform) + ->Apply(&default_multiple_alloc_uniform_size) + ->Apply(&multithreaded); + +UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_fix, + fixed_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_fix) + ->Apply(&default_multiple_alloc_fix_size) + ->Apply(&multithreaded); + +UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_uniform, + uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_uniform) + ->Apply(&default_multiple_alloc_uniform_size) + ->Apply(&multithreaded); + +#ifdef UMF_POOL_JEMALLOC_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_fix, + fixed_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_fix) + ->Apply(&default_multiple_alloc_fix_size) + ->Apply(&multithreaded); + +UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_uniform, + uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_uniform) + ->Apply(&default_multiple_alloc_uniform_size) + ->Apply(&multithreaded); + +#endif + +#ifdef UMF_POOL_SCALABLE_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, 
scalable_pool_fix, + fixed_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_fix) + ->Apply(&default_multiple_alloc_fix_size) + ->Apply(&multithreaded); + +UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, scalable_pool_uniform, + uniform_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_uniform) + ->Apply(&default_multiple_alloc_uniform_size) + ->Apply(&multithreaded); + +#endif + //BENCHMARK_MAIN(); int main(int argc, char **argv) { if (initAffinityMask()) { diff --git a/benchmark/benchmark.hpp b/benchmark/benchmark.hpp index 1749a1390c..8be2673d97 100644 --- a/benchmark/benchmark.hpp +++ b/benchmark/benchmark.hpp @@ -70,6 +70,7 @@ * - Additional benchmarking scenarios can be created by extending `benchmark_interface`. */ +#include #include #include @@ -86,6 +87,7 @@ struct alloc_data { }; struct next_alloc_data { + bool alloc; // true if allocation, false if deallocation size_t offset; size_t size; }; @@ -288,10 +290,9 @@ template < typename = std::enable_if_t::value>> class multiple_malloc_free_benchmark : public benchmark_interface { - using distribution = std::uniform_int_distribution; + protected: template using vector2d = std::vector>; using base = benchmark_interface; - int allocsPerIterations = 10; bool thread_local_allocations = true; size_t max_allocs = 0; @@ -299,7 +300,7 @@ class multiple_malloc_free_benchmark : public benchmark_interface { vector2d allocations; vector2d next; using next_alloc_data_iterator = - std::vector::const_iterator; + typename std::vector::const_iterator; std::vector> next_iter; int64_t iterations; @@ -386,15 +387,20 @@ class multiple_malloc_free_benchmark : public benchmark_interface { auto tid = state.thread_index(); auto &allocation = allocations[tid]; auto &iter = next_iter[tid]; + for (int i = 0; i < allocsPerIterations; i++) { auto &n = *(*iter)++; auto &alloc = allocation[n.offset]; - base::allocator.benchFree(alloc.ptr, 
alloc.size); - alloc.size = n.size; - alloc.ptr = base::allocator.benchAlloc(alloc.size); - - if (alloc.ptr == NULL) { - state.SkipWithError("allocation failed"); + if (n.alloc) { + alloc.ptr = base::allocator.benchAlloc(n.size); + if (alloc.ptr == NULL) { + state.SkipWithError("allocation failed"); + } + alloc.size = n.size; + } else { + base::allocator.benchFree(alloc.ptr, alloc.size); + alloc.ptr = NULL; + alloc.size = 0; } } } @@ -412,13 +418,14 @@ class multiple_malloc_free_benchmark : public benchmark_interface { } private: - void prealloc(benchmark::State &state) { + virtual void prealloc(benchmark::State &state) { auto tid = state.thread_index(); auto &i = allocations[tid]; i.resize(max_allocs); auto sizeGenerator = base::alloc_sizes[tid]; - for (size_t j = 0; j < max_allocs; j++) { + // Preallocate half of the available slots, for allocations + for (size_t j = 0; j < max_allocs / 2; j++) { auto size = sizeGenerator.nextSize(); i[j].ptr = base::allocator.benchAlloc(size); if (i[j].ptr == NULL) { @@ -441,20 +448,168 @@ class multiple_malloc_free_benchmark : public benchmark_interface { } } - void prepareWorkload(benchmark::State &state) { + virtual void prepareWorkload(benchmark::State &state) { auto tid = state.thread_index(); auto &n = next[tid]; + + // Create generators for random index selection and binary decision. + using distribution = std::uniform_int_distribution; std::default_random_engine generator; - distribution dist; + distribution dist_offset(0, max_allocs - 1); + distribution dist_opt_type(0, 1); generator.seed(0); - dist.param(distribution::param_type(0, max_allocs - 1)); + auto sizeGenerator = base::alloc_sizes[tid]; + std::vector free; + std::vector allocated; + free.reserve(max_allocs / 2); + allocated.reserve(max_allocs / 2); + // Preallocate memory: initially, half the indices are allocated. 
+ // See the prealloc() function. + size_t i = 0; + while (i < max_allocs / 2) { + allocated.push_back(i++); + } + // The remaining indices are marked as free. + while (i < max_allocs) { + free.push_back(i++); + } n.clear(); for (int64_t j = 0; j < state.max_iterations * allocsPerIterations; j++) { - n.push_back({dist(generator), sizeGenerator.nextSize()}); + // Decide whether to allocate or free: + // - If no allocations exist, allocation is forced. + // - If the maximum number of allocations is reached, a free is forced + // - Otherwise, use a binary random choice (0 or 1) + if (allocated.empty() || + (dist_opt_type(generator) == 0 && !free.empty())) { + // Allocation: + std::swap(free[dist_offset(generator) % free.size()], + free.back()); + auto offset = free.back(); + free.pop_back(); + + n.push_back({true, offset, sizeGenerator.nextSize()}); + allocated.push_back(offset); + } else { + // Free + std::swap(allocated[dist_offset(generator) % allocated.size()], + allocated.back()); + auto offset = allocated.back(); + allocated.pop_back(); + + n.push_back({false, offset, 0}); + free.push_back(offset); + } } + next_iter[tid] = std::make_unique(n.cbegin()); } }; +// This class benchmarks performance by randomly allocating and freeing memory. +// Initially, it slowly increases the memory footprint, and later decreases it. +template < + typename Size, typename Alloc, + typename = + std::enable_if_t::value>, + typename = + std::enable_if_t::value>> +class peak_alloc_benchmark + : public multiple_malloc_free_benchmark { + using base = multiple_malloc_free_benchmark; + virtual void prepareWorkload(benchmark::State &state) override { + // Retrieve the thread index and corresponding operation buffer. + auto tid = state.thread_index(); + auto &n = this->next[tid]; + + // Set up the random generators for index selection and decision making. 
+ std::default_random_engine generator; + std::uniform_int_distribution dist_offset(0, + this->max_allocs - 1); + std::uniform_real_distribution dist_opt_type(0, 1); + generator.seed(0); + auto sizeGenerator = this->alloc_sizes[tid]; + + n.clear(); + std::vector free; + std::vector allocated; + free.reserve(this->max_allocs); + // Initially, all indices are available. + for (size_t i = 0; i < this->max_allocs; i++) { + free.push_back(i); + } + + // Total number of allocation/free operations to simulate. + int64_t operations_number = + state.max_iterations * this->allocsPerIterations; + for (int64_t j = 0; j < operations_number; j++) { + int64_t target_allocation; + + // Determine the target number of allocations based on the progress of the iterations. + // In the first half of the iterations, the target allocation increases linearly. + // In the second half, it decreases linearly. + if (j < operations_number / 2) { + target_allocation = 2 * static_cast(this->max_allocs) * + j / operations_number; + } else { + target_allocation = -2 * + static_cast(this->max_allocs) * + j / operations_number + + 2 * static_cast(this->max_allocs); + } + + // x represents the gap between the target and current allocations. + auto x = static_cast(target_allocation - + static_cast(allocated.size())); + + // Use a normal CDF with high sigma so that when x is positive, + // we are slightly more likely to allocate, + // and when x is negative, slightly more likely to free memory, + // keeping the overall change gradual. + + const double sigma = 1000; + auto cdf = normalCDF(x, sigma); + + // Decide whether to allocate or free: + // - If no allocations exist, allocation is forced. 
+ // - If the maximum number of allocations is reached, a free is forced + // - Otherwise, based on the computed probability, choose whether to allocate or free + if (allocated.empty() || + (!free.empty() && cdf > dist_opt_type(generator))) { + // Allocation + std::swap(free[dist_offset(generator) % free.size()], + free.back()); + auto offset = free.back(); + free.pop_back(); + n.push_back({true, offset, sizeGenerator.nextSize()}); + allocated.push_back(offset); + } else { + // Free + std::swap(allocated[dist_offset(generator) % allocated.size()], + allocated.back()); + auto offset = allocated.back(); + allocated.pop_back(); + n.push_back({false, offset, 0}); + free.push_back(offset); + } + } + + this->next_iter[tid] = + std::make_unique::const_iterator>( + n.cbegin()); + } + + virtual void prealloc(benchmark::State &state) { + auto tid = state.thread_index(); + auto &i = base::allocations[tid]; + i.resize(base::max_allocs); + } + virtual std::string name() { return base::base::name() + "/peak_alloc"; } + + private: + // Function to calculate the CDF of a normal distribution + double normalCDF(double x, double sigma = 1.0, double mu = 0.0) { + return 0.5 * (1 + std::erf((x - mu) / (sigma * std::sqrt(2.0)))); + } +};