Skip to content

Commit 653309e

Browse files
committed
benchmark workload redesign
Changed the existing workload to perform multiple alloc/free operations in a row. Added a workload that first increases the number of live allocations and then decreases it.
1 parent dc6c91c commit 653309e

File tree

2 files changed

+209
-21
lines changed

2 files changed

+209
-21
lines changed

benchmark/benchmark.cpp

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,9 @@
3232
// The exact meaning of each argument depends on the benchmark, allocator, and size components used.
3333
// Refer to the 'argsName()' function in each component to find detailed descriptions of these arguments.
3434

35-
template <size_t max_threads = 12>
3635
static void multithreaded(benchmark::internal::Benchmark *benchmark) {
3736
benchmark->Threads(1);
38-
benchmark->DenseThreadRange(4, max_threads, 4);
37+
benchmark->Threads(4);
3938
}
4039

4140
static void singlethreaded(benchmark::internal::Benchmark *benchmark) {
@@ -92,16 +91,14 @@ UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, disjoint_pool_fix,
9291
pool_allocator<disjoint_pool<os_provider>>);
9392
UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_fix)
9493
->Apply(&default_multiple_alloc_fix_size)
95-
// Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts.
96-
->Apply(&multithreaded<4>);
94+
->Apply(&multithreaded);
9795

9896
UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
9997
disjoint_pool_uniform, uniform_alloc_size,
10098
pool_allocator<disjoint_pool<os_provider>>);
10199
UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_uniform)
102100
->Apply(&default_multiple_alloc_uniform_size)
103-
// Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts.
104-
->Apply(&multithreaded<4>);
101+
->Apply(&multithreaded);
105102

106103
#ifdef UMF_POOL_JEMALLOC_ENABLED
107104
UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, jemalloc_pool_fix,
@@ -159,6 +156,70 @@ UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, fixed_provider)
159156
// reduce iterations, to match os_provider benchmark
160157
->Iterations(50000);
161158

159+
// peak
160+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_fix, fixed_alloc_size,
161+
glibc_malloc);
162+
163+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_fix)
164+
->Apply(&default_multiple_alloc_fix_size)
165+
->Apply(&multithreaded);
166+
167+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_uniform,
168+
uniform_alloc_size, glibc_malloc);
169+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_uniform)
170+
->Apply(&default_multiple_alloc_uniform_size)
171+
->Apply(&multithreaded);
172+
173+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_fix,
174+
fixed_alloc_size,
175+
pool_allocator<disjoint_pool<os_provider>>);
176+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_fix)
177+
->Apply(&default_multiple_alloc_fix_size)
178+
->Apply(&multithreaded);
179+
180+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_uniform,
181+
uniform_alloc_size,
182+
pool_allocator<disjoint_pool<os_provider>>);
183+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_uniform)
184+
->Apply(&default_multiple_alloc_uniform_size)
185+
->Apply(&multithreaded);
186+
187+
#ifdef UMF_POOL_JEMALLOC_ENABLED
188+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_fix,
189+
fixed_alloc_size,
190+
pool_allocator<jemalloc_pool<os_provider>>);
191+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_fix)
192+
->Apply(&default_multiple_alloc_fix_size)
193+
->Apply(&multithreaded);
194+
195+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_uniform,
196+
uniform_alloc_size,
197+
pool_allocator<jemalloc_pool<os_provider>>);
198+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_uniform)
199+
->Apply(&default_multiple_alloc_uniform_size)
200+
->Apply(&multithreaded);
201+
202+
#endif
203+
204+
#ifdef UMF_POOL_SCALABLE_ENABLED
205+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, scalable_pool_fix,
206+
fixed_alloc_size,
207+
pool_allocator<scalable_pool<os_provider>>);
208+
209+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_fix)
210+
->Apply(&default_multiple_alloc_fix_size)
211+
->Apply(&multithreaded);
212+
213+
UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, scalable_pool_uniform,
214+
uniform_alloc_size,
215+
pool_allocator<scalable_pool<os_provider>>);
216+
217+
UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_uniform)
218+
->Apply(&default_multiple_alloc_uniform_size)
219+
->Apply(&multithreaded);
220+
221+
#endif
222+
162223
//BENCHMARK_MAIN();
163224
int main(int argc, char **argv) {
164225
if (initAffinityMask()) {

benchmark/benchmark.hpp

Lines changed: 142 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
* - Additional benchmarking scenarios can be created by extending `benchmark_interface`.
7171
*/
7272

73+
#include <list>
7374
#include <malloc.h>
7475
#include <random>
7576

@@ -86,6 +87,7 @@ struct alloc_data {
8687
};
8788

8889
struct next_alloc_data {
90+
bool alloc; // true if allocation, false if deallocation
8991
size_t offset;
9092
size_t size;
9193
};
@@ -288,18 +290,17 @@ template <
288290
typename =
289291
std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
290292
class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
291-
using distribution = std::uniform_int_distribution<size_t>;
293+
protected:
292294
template <class T> using vector2d = std::vector<std::vector<T>>;
293295
using base = benchmark_interface<Size, Alloc>;
294-
295296
int allocsPerIterations = 10;
296297
bool thread_local_allocations = true;
297298
size_t max_allocs = 0;
298299

299300
vector2d<alloc_data> allocations;
300301
vector2d<next_alloc_data> next;
301302
using next_alloc_data_iterator =
302-
std::vector<next_alloc_data>::const_iterator;
303+
typename std::vector<next_alloc_data>::const_iterator;
303304
std::vector<std::unique_ptr<next_alloc_data_iterator>> next_iter;
304305
int64_t iterations;
305306

@@ -386,15 +387,20 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
386387
auto tid = state.thread_index();
387388
auto &allocation = allocations[tid];
388389
auto &iter = next_iter[tid];
390+
389391
for (int i = 0; i < allocsPerIterations; i++) {
390392
auto &n = *(*iter)++;
391393
auto &alloc = allocation[n.offset];
392-
base::allocator.benchFree(alloc.ptr, alloc.size);
393-
alloc.size = n.size;
394-
alloc.ptr = base::allocator.benchAlloc(alloc.size);
395-
396-
if (alloc.ptr == NULL) {
397-
state.SkipWithError("allocation failed");
394+
if (n.alloc) {
395+
alloc.ptr = base::allocator.benchAlloc(n.size);
396+
if (alloc.ptr == NULL) {
397+
state.SkipWithError("allocation failed");
398+
}
399+
alloc.size = n.size;
400+
} else {
401+
base::allocator.benchFree(alloc.ptr, alloc.size);
402+
alloc.ptr = NULL;
403+
alloc.size = 0;
398404
}
399405
}
400406
}
@@ -412,13 +418,13 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
412418
}
413419

414420
private:
415-
void prealloc(benchmark::State &state) {
421+
virtual void prealloc(benchmark::State &state) {
416422
auto tid = state.thread_index();
417423
auto &i = allocations[tid];
418424
i.resize(max_allocs);
419425
auto sizeGenerator = base::alloc_sizes[tid];
420426

421-
for (size_t j = 0; j < max_allocs; j++) {
427+
for (size_t j = 0; j < max_allocs / 2; j++) {
422428
auto size = sizeGenerator.nextSize();
423429
i[j].ptr = base::allocator.benchAlloc(size);
424430
if (i[j].ptr == NULL) {
@@ -441,20 +447,141 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
441447
}
442448
}
443449

444-
void prepareWorkload(benchmark::State &state) {
450+
virtual void prepareWorkload(benchmark::State &state) {
445451
auto tid = state.thread_index();
446452
auto &n = next[tid];
453+
using distribution = std::uniform_int_distribution<size_t>;
447454
std::default_random_engine generator;
448-
distribution dist;
455+
distribution dist_offset(0, max_allocs - 1);
456+
distribution dist_opt_type(0, 1);
449457
generator.seed(0);
450-
dist.param(distribution::param_type(0, max_allocs - 1));
451458
auto sizeGenerator = base::alloc_sizes[tid];
459+
std::vector<size_t> free;
460+
std::vector<size_t> allocated;
461+
462+
// this benchmark preallocates memory, so we start with some allocations
463+
size_t i = 0;
464+
for (; i < max_allocs / 2; i++) {
465+
allocated.push_back(i);
466+
}
467+
for (; i < max_allocs; i++) {
468+
free.push_back(i);
469+
}
452470

453471
n.clear();
454472
for (int64_t j = 0; j < state.max_iterations * allocsPerIterations;
455473
j++) {
456-
n.push_back({dist(generator), sizeGenerator.nextSize()});
474+
if (allocated.empty() ||
475+
(dist_opt_type(generator) == 0 && !free.empty())) {
476+
477+
std::swap(free[dist_offset(generator) % free.size()],
478+
free.back());
479+
auto offset = free.back();
480+
free.pop_back();
481+
482+
n.push_back({true, offset, sizeGenerator.nextSize()});
483+
allocated.push_back(offset);
484+
} else {
485+
std::swap(allocated[dist_offset(generator) % allocated.size()],
486+
allocated.back());
487+
auto offset = allocated.back();
488+
allocated.pop_back();
489+
490+
n.push_back({false, offset, 0});
491+
free.push_back(offset);
492+
}
457493
}
494+
458495
next_iter[tid] = std::make_unique<next_alloc_data_iterator>(n.cbegin());
459496
}
460497
};
498+
499+
// This class benchmarks performance randomly allocates and frees,
500+
// Firstly slowly increasing memory footprint, and later decreasing
501+
template <
502+
typename Size, typename Alloc,
503+
typename =
504+
std::enable_if_t<std::is_base_of<alloc_size_interface, Size>::value>,
505+
typename =
506+
std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
507+
class peak_alloc_benchmark
508+
: public multiple_malloc_free_benchmark<Size, Alloc> {
509+
using base = multiple_malloc_free_benchmark<Size, Alloc>;
510+
virtual void prepareWorkload(benchmark::State &state) override {
511+
512+
auto tid = state.thread_index();
513+
auto &n = this->next[tid];
514+
std::default_random_engine generator;
515+
std::uniform_int_distribution<size_t> dist_offset(0,
516+
this->max_allocs - 1);
517+
std::uniform_real_distribution<double> dist_opt_type(0, 1);
518+
generator.seed(0);
519+
auto sizeGenerator = this->alloc_sizes[tid];
520+
521+
n.clear();
522+
std::vector<size_t> free;
523+
std::vector<size_t> allocated;
524+
// we start without any allocations
525+
for (size_t i = 0; i < this->max_allocs; i++) {
526+
free.push_back(i);
527+
}
528+
529+
int64_t iterations = state.max_iterations * this->allocsPerIterations;
530+
for (int64_t j = 0; j < iterations; j++) {
531+
int64_t target_allocation;
532+
int64_t max_allocs = static_cast<int64_t>(this->max_allocs);
533+
if (j < iterations / 2) {
534+
target_allocation = 2 * max_allocs * j / iterations;
535+
} else {
536+
target_allocation =
537+
-2 * max_allocs * j / iterations + 2 * max_allocs;
538+
}
539+
540+
auto x = static_cast<double>(target_allocation -
541+
static_cast<double>(allocated.size()));
542+
// high sigma value cause small changes in probability between alloc and free
543+
// based on offset x from target_allocation number.
544+
// sigma == 1000 causes that actual number of allocation are +/-40 of target number
545+
const double sigma = 1000;
546+
auto cdf = normalCDF(x, sigma);
547+
548+
if (allocated.empty() ||
549+
(!free.empty() && cdf > dist_opt_type(generator))) {
550+
// allocate
551+
std::swap(free[dist_offset(generator) % free.size()],
552+
free.back());
553+
auto offset = free.back();
554+
free.pop_back();
555+
n.push_back({true, offset, sizeGenerator.nextSize()});
556+
allocated.push_back(offset);
557+
558+
} else {
559+
// free
560+
std::swap(allocated[dist_offset(generator) % allocated.size()],
561+
allocated.back());
562+
auto offset = allocated.back();
563+
allocated.pop_back();
564+
565+
n.push_back({false, offset, 0});
566+
free.push_back(offset);
567+
}
568+
}
569+
570+
this->next_iter[tid] =
571+
std::make_unique<std::vector<next_alloc_data>::const_iterator>(
572+
n.cbegin());
573+
}
574+
575+
virtual void prealloc(benchmark::State &state) {
576+
auto tid = state.thread_index();
577+
auto &i = base::allocations[tid];
578+
i.resize(base::max_allocs);
579+
}
580+
virtual std::string name() { return base::base::name() + "/peak_alloc"; }
581+
582+
private:
583+
// Function to calculate the CDF of a normal distribution
584+
double normalCDF(double x, double sigma = 1.0, double mu = 0.0) {
585+
return 0.5 * (1 + std::erf((x - mu) / (sigma * std::sqrt(2.0))));
586+
}
587+
};

0 commit comments

Comments
 (0)