diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index 401b06d268..3969b6068f 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -30,129 +30,60 @@
 // The exact meaning of each argument depends on the benchmark, allocator, and size components used.
 // Refer to the 'argsName()' function in each component to find detailed descriptions of these arguments.
 
-static void default_alloc_fix_size(benchmark::internal::Benchmark *benchmark) {
-    benchmark->Args({10000, 0, 4096});
-    benchmark->Args({10000, 100000, 4096});
+static void multithreaded(benchmark::internal::Benchmark *benchmark) {
     benchmark->Threads(4);
     benchmark->Threads(1);
 }
 
-static void
-default_alloc_uniform_size(benchmark::internal::Benchmark *benchmark) {
-    benchmark->Args({10000, 0, 8, 64 * 1024, 8});
-    benchmark->Threads(4);
-    benchmark->Threads(1);
-}
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, glibc_fix, fixed_alloc_size,
-                              glibc_malloc);
-
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, glibc_fix)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, glibc_uniform,
-                              uniform_alloc_size, glibc_malloc);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, glibc_uniform)
-    ->Apply(&default_alloc_uniform_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, os_provider, fixed_alloc_size,
-                              provider_allocator<os_provider>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, os_provider)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, proxy_pool, fixed_alloc_size,
-                              pool_allocator<proxy_pool<os_provider>>);
-
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, proxy_pool)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, disjoint_pool_fix,
-                              fixed_alloc_size,
-                              pool_allocator<disjoint_pool<os_provider>>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, disjoint_pool_fix)
-    ->Apply(&default_alloc_fix_size);
-
-// TODO: debug why this crashes
-/*UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, disjoint_pool_uniform,
-                              uniform_alloc_size,
-                              pool_allocator<disjoint_pool<os_provider>>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, disjoint_pool_uniform)
-    ->Apply(&default_alloc_uniform_size);
-*/
-
-#ifdef UMF_POOL_JEMALLOC_ENABLED
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, jemalloc_pool_fix,
-                              fixed_alloc_size,
-                              pool_allocator<jemalloc_pool<os_provider>>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, jemalloc_pool_fix)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, jemalloc_pool_uniform,
-                              uniform_alloc_size,
-                              pool_allocator<jemalloc_pool<os_provider>>);
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, jemalloc_pool_uniform)
-    ->Apply(&default_alloc_uniform_size);
-
-#endif
-#ifdef UMF_POOL_SCALABLE_ENABLED
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, scalable_pool_fix,
-                              fixed_alloc_size,
-                              pool_allocator<scalable_pool<os_provider>>);
-
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, scalable_pool_fix)
-    ->Apply(&default_alloc_fix_size);
-
-UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, scalable_pool_uniform,
-                              uniform_alloc_size,
-                              pool_allocator<scalable_pool<os_provider>>);
-
-UMF_BENCHMARK_REGISTER_F(alloc_benchmark, scalable_pool_uniform)
-    ->Apply(&default_alloc_uniform_size);
-#endif
-// Multiple allocs/free
 static void
 default_multiple_alloc_fix_size(benchmark::internal::Benchmark *benchmark) {
-    benchmark->Args({10000, 4096});
-    benchmark->Threads(4);
-    benchmark->Threads(1);
+    benchmark->Args({10000, 1, 4096});
+    benchmark->Iterations(500000);
 }
 
 static void
 default_multiple_alloc_uniform_size(benchmark::internal::Benchmark *benchmark) {
-    benchmark->Args({10000, 8, 64 * 1024, 8});
-    benchmark->Threads(4);
-    benchmark->Threads(1);
+    benchmark->Args({10000, 1, 8, 4096, 8});
+    benchmark->Args({10000, 1, 8, 128, 8});
+    benchmark->Iterations(500000);
 }
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, glibc_fix,
                               fixed_alloc_size, glibc_malloc);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, glibc_fix)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, glibc_uniform,
                               uniform_alloc_size, glibc_malloc);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, glibc_uniform)
-    ->Apply(&default_multiple_alloc_uniform_size);
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, proxy_pool,
                               fixed_alloc_size,
                               pool_allocator<proxy_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, proxy_pool)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    // reduce iterations, as this benchmark is slower than others
+    ->Iterations(50000);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, os_provider,
                               fixed_alloc_size,
                               provider_allocator<os_provider>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, os_provider)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    // reduce iterations, as this benchmark is slower than others
+    ->Iterations(50000);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               disjoint_pool_fix, fixed_alloc_size,
                               pool_allocator<disjoint_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_fix)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               disjoint_pool_uniform, uniform_alloc_size,
@@ -165,13 +96,15 @@ UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, jemalloc_pool_fix,
                               fixed_alloc_size,
                               pool_allocator<jemalloc_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, jemalloc_pool_fix)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               jemalloc_pool_uniform, uniform_alloc_size,
                               pool_allocator<jemalloc_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, jemalloc_pool_uniform)
-    ->Apply(&default_multiple_alloc_uniform_size);
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
 
 #endif
 
@@ -181,14 +114,25 @@ UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, scalable_pool_fix,
                               pool_allocator<scalable_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, scalable_pool_fix)
-    ->Apply(&default_multiple_alloc_fix_size);
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
 
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               scalable_pool_uniform, uniform_alloc_size,
                               pool_allocator<scalable_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, scalable_pool_uniform)
-    ->Apply(&default_multiple_alloc_uniform_size);
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
 #endif
-BENCHMARK_MAIN();
+
+//BENCHMARK_MAIN();
+int main(int argc, char **argv) {
+    if (initAffinityMask()) {
+        return -1;
+    }
+    benchmark::Initialize(&argc, argv);
+    benchmark::RunSpecifiedBenchmarks();
+    benchmark::Shutdown();
+}
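For reference, the hand-written `main()` above replaces `BENCHMARK_MAIN()` so that `initAffinityMask()` (defined in benchmark.hpp below) runs before Google Benchmark spawns any worker threads. In recent Google Benchmark releases the macro expands to roughly the following; this is an approximate sketch, the exact definition lives in `<benchmark/benchmark.h>` and varies between versions:

```cpp
// Approximate expansion of BENCHMARK_MAIN(), shown for comparison only.
// The replacement main() above adds the affinity-mask setup in front and
// drops the unrecognized-argument check.
#include <benchmark/benchmark.h>

int main(int argc, char **argv) {
    ::benchmark::Initialize(&argc, argv);
    if (::benchmark::ReportUnrecognizedArguments(argc, argv)) {
        return 1;
    }
    ::benchmark::RunSpecifiedBenchmarks();
    ::benchmark::Shutdown();
    return 0;
}
```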
diff --git a/benchmark/benchmark.hpp b/benchmark/benchmark.hpp
index 50e75f8fb2..a960d89bca 100644
--- a/benchmark/benchmark.hpp
+++ b/benchmark/benchmark.hpp
@@ -70,8 +70,10 @@
  * - Additional benchmarking scenarios can be created by extending `benchmark_interface`.
  */
 
-#include <random>
+#include <malloc.h>
 
 #include <benchmark/benchmark.h>
+
+#include <random>
 #include <umf/memory_pool.h>
 #include <umf/memory_provider.h>
 
@@ -83,13 +85,92 @@ struct alloc_data {
     size_t size;
 };
 
+struct next_alloc_data {
+    size_t offset;
+    size_t size;
+};
+
+#ifndef WIN32
+std::vector<cpu_set_t> affinityMask;
+
+int initAffinityMask() {
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+
+    if (sched_getaffinity(0, sizeof(mask), &mask) == -1) {
+        perror("sched_getaffinity");
+        return 1;
+    }
+
+    for (int cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+        if (CPU_ISSET(cpu, &mask)) {
+            cpu_set_t cpuMask;
+            CPU_ZERO(&cpuMask);
+            CPU_SET(cpu, &cpuMask);
+            affinityMask.push_back(cpuMask);
+        }
+    }
+    // Reverse affinityMask so that CPU 0 is used last if possible:
+    // CPU 0 is usually the core most heavily used by other applications
+    // on the system.
+    std::reverse(affinityMask.begin(), affinityMask.end());
+    return 0;
+}
+
+void setAffinity(benchmark::State &state) {
+    size_t tid = state.thread_index();
+    if (tid >= affinityMask.size()) {
+        state.SkipWithError("Not enough CPUs available to set affinity");
+        return;
+    }
+
+    auto &mask = affinityMask[tid];
+
+    if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
+        state.SkipWithError("Failed to set affinity");
+    }
+}
+
+#else
+int initAffinityMask() {
+    printf("Setting CPU affinity is not supported on Windows; "
+           "benchmark results may be unstable\n");
+    return 0;
+}
+
+void setAffinity([[maybe_unused]] benchmark::State &state) {
+    // Not implemented for Windows
+}
+
+#endif
+
+// Barrier that blocks until all benchmark threads have reached it.
+inline void waitForAllThreads(const benchmark::State &state) {
+    static std::atomic<int> count{0};
+    static std::atomic<int> generation{0};
+
+    const int totalThreads = state.threads();
+    int gen = generation.load(std::memory_order_relaxed);
+
+    int c = count.fetch_add(1, std::memory_order_acq_rel) + 1;
+
+    if (c == totalThreads) {
+        // Last thread: reset the count and bump the generation
+        count.store(0, std::memory_order_relaxed);
+        generation.fetch_add(1, std::memory_order_acq_rel);
+    } else {
+        // Not the last thread: spin until the generation changes
+        while (generation.load(std::memory_order_acquire) == gen) {
+            std::this_thread::yield();
+        }
+    }
+}
+
 template <typename Provider,
           typename = std::enable_if_t<
               std::is_base_of<provider_interface, Provider>::value>>
 class provider_allocator : public allocator_interface {
   public:
-    unsigned SetUp(::benchmark::State &state, unsigned r) override {
+    unsigned SetUp(::benchmark::State &state, unsigned argPos) override {
         provider.SetUp(state);
-        return r;
+        return argPos;
     }
 
     void TearDown(::benchmark::State &state) override {
@@ -118,9 +199,9 @@ class provider_allocator : public allocator_interface {
 
 // TODO: assert Pool to be a pool_interface.
 template <typename Pool> class pool_allocator : public allocator_interface {
   public:
-    unsigned SetUp(::benchmark::State &state, unsigned r) override {
+    unsigned SetUp(::benchmark::State &state, unsigned argPos) override {
         pool.SetUp(state);
-        return r;
+        return argPos;
     }
 
     void TearDown(::benchmark::State &state) override { pool.TearDown(state); }
 
@@ -141,19 +222,28 @@ template <typename Size, typename Allocator>
 struct benchmark_interface : public benchmark::Fixture {
-    void SetUp(::benchmark::State &state) {
-        int argPos = alloc_size.SetUp(state, 0);
-        allocator.SetUp(state, argPos);
+    int parseArgs(::benchmark::State &state, int argPos) {
+        Size generator;
+        argPos = generator.SetUp(state, argPos);
+        argPos = allocator.SetUp(state, argPos);
+        alloc_sizes.resize(state.threads());
+        for (auto &i : alloc_sizes) {
+            i = generator;
+        }
+        return argPos;
     }
 
+    void SetUp(::benchmark::State &state) { parseArgs(state, 0); }
+
     void TearDown(::benchmark::State &state) {
-        alloc_size.TearDown(state);
+        for (auto &i : alloc_sizes) {
+            i.TearDown(state);
+        }
         allocator.TearDown(state);
     }
 
     virtual void bench(::benchmark::State &state) = 0;
 
-    static std::vector<std::string> argsName() {
+    virtual std::vector<std::string> argsName() {
         auto s = Size::argsName();
         auto a = Allocator::argsName();
         std::vector<std::string> res = {};
@@ -163,209 +253,167 @@ struct benchmark_interface : public benchmark::Fixture {
     }
 
     virtual std::string name() { return Allocator::name(); }
-    virtual int64_t iterations() { return 10000; }
+
     static void defaultArgs(Benchmark *benchmark) {
         auto *bench =
             static_cast<benchmark_interface<Size, Allocator> *>(benchmark);
-        benchmark->ArgNames(bench->argsName())
-            ->Name(bench->name())
-            ->Iterations(bench->iterations());
+        benchmark->ArgNames(bench->argsName())->Name(bench->name());
     }
-    Size alloc_size;
+
+    std::vector<Size> alloc_sizes;
     Allocator allocator;
 };
 
-// This class benchmarks speed of alloc() operations.
+// This class benchmarks performance of random deallocations and (re)allocations
 template <
     typename Size, typename Alloc,
     typename =
         std::enable_if_t<std::is_base_of<alloc_size_interface, Size>::value>,
     typename =
         std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
-class alloc_benchmark : public benchmark_interface<Size, Alloc> {
+class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
+    using distribution = std::uniform_int_distribution<size_t>;
+    template <typename T> using vector2d = std::vector<std::vector<T>>;
+    using base = benchmark_interface<Size, Alloc>;
+
+    int allocsPerIterations = 10;
+    bool thread_local_allocations = true;
+    size_t max_allocs = 0;
+
+    vector2d<alloc_data> allocations;
+    std::vector<size_t> iters;
+
+    vector2d<next_alloc_data> next;
+    std::vector<std::vector<next_alloc_data>::const_iterator> next_iter;
+    int64_t iterations;
+
   public:
-    size_t max_allocs = 1000;
-    size_t pre_allocs = 0;
     void SetUp(::benchmark::State &state) override {
-        if (state.thread_index() != 0) {
-            return;
-        }
+        auto tid = state.thread_index();
 
-        // unpack arguments
-        int argPos = 0;
-        max_allocs = state.range(argPos++);
-        pre_allocs = state.range(argPos++);
-        // pass rest of the arguments to "alloc_size" and "allocator"
-        argPos = base::alloc_size.SetUp(state, argPos);
-        base::allocator.SetUp(state, argPos);
-
-        // initialize allocations tracking vectors (one per thread)
-        // and iterators for these vectors.
-        allocations.resize(state.threads());
-        iters.resize(state.threads());
-
-        for (auto &i : iters) {
-            i = pre_allocs;
+        if (tid == 0) {
+            // unpack arguments
+            iterations = state.max_iterations;
+            int argPos = 0;
+            max_allocs = state.range(argPos++);
+            thread_local_allocations = state.range(argPos++);
+            base::parseArgs(state, argPos);
+
+            allocations.resize(state.threads());
+            next.resize(state.threads());
+            next_iter.resize(state.threads());
+
+#ifndef WIN32
+            // Ensure that the system malloc does not keep memory pooled on the heap
+            malloc_trim(0);
+#endif
         }
-
-        // do "pre_alloc" allocations before actual benchmark.
-        for (auto &i : allocations) {
-            i.resize(max_allocs + pre_allocs);
-
-            for (size_t j = 0; j < pre_allocs; j++) {
-                i[j].ptr =
-                    base::allocator.benchAlloc(base::alloc_size.nextSize());
-                if (i[j].ptr == NULL) {
-                    state.SkipWithError("preallocation failed");
-                    return;
-                }
-                i[j].size = base::alloc_size.nextSize();
-            }
+        setAffinity(state);
+        // sync threads to ensure that thread 0 has parsed the arguments
+        // and finished all initialization
+        waitForAllThreads(state);
+        // prepare workload for warm-up
+        prealloc(state);
+        prepareWorkload(state);
+        // start warm-up with all threads at once
+        waitForAllThreads(state);
+        // warm up
+        for (int j = 0; j < iterations; j++) {
+            bench(state);
         }
+        waitForAllThreads(state);
+        // prepare workload for the actual benchmark
+        freeAllocs(state);
+        prealloc(state);
+        prepareWorkload(state);
     }
 
     void TearDown(::benchmark::State &state) override {
-        if (state.thread_index() != 0) {
-            return;
-        }
-        for (auto &i : allocations) {
-            for (auto &j : i) {
-                if (j.ptr != NULL) {
-                    base::allocator.benchFree(j.ptr, j.size);
-                    j.ptr = NULL;
-                    j.size = 0;
-                }
-            }
-        }
+        auto tid = state.thread_index();
+        freeAllocs(state);
+        waitForAllThreads(state);
+        if (tid == 0) {
+            // release memory used by the benchmark
+            next.clear();
+            next_iter.clear();
+            allocations.clear();
+            iters.clear();
+        }
         base::TearDown(state);
     }
 
     void bench(benchmark::State &state) override {
         auto tid = state.thread_index();
-        auto s = base::alloc_size.nextSize();
-        auto &i = iters[tid];
-        allocations[tid][i].ptr = base::allocator.benchAlloc(s);
-        if (allocations[tid][i].ptr == NULL) {
-            state.SkipWithError("allocation failed");
-            return;
-        }
-        allocations[tid][i].size = s;
-        i++;
-        if (i >= max_allocs + pre_allocs) {
-            // This benchmark tests only allocations -
-            // if allocation tracker is full we pause benchmark to dealloc all allocations -
-            // excluding pre-allocated ones.
-            state.PauseTiming();
-            while (i > pre_allocs) {
-                auto &allocation = allocations[tid][--i];
-                base::allocator.benchFree(allocation.ptr, allocation.size);
-                allocation.ptr = NULL;
-                allocation.size = 0;
+        auto &allocation = allocations[tid];
+        for (int i = 0; i < allocsPerIterations; i++) {
+            auto &n = *next_iter[tid]++;
+            auto &alloc = allocation[n.offset];
+            base::allocator.benchFree(alloc.ptr, alloc.size);
+
+            alloc.size = n.size;
+            alloc.ptr = base::allocator.benchAlloc(alloc.size);
+
+            if (alloc.ptr == NULL) {
+                state.SkipWithError("allocation failed");
             }
-            state.ResumeTiming();
         }
     }
 
+    virtual std::string name() {
+        return base::name() + "/multiple_malloc_free";
+    }
+
     virtual std::vector<std::string> argsName() {
         auto n = benchmark_interface<Size, Alloc>::argsName();
-        std::vector<std::string> res = {"max_allocs", "pre_allocs"};
+        std::vector<std::string> res = {"max_allocs",
+                                        "thread_local_allocations"};
         res.insert(res.end(), n.begin(), n.end());
         return res;
     }
 
-    virtual std::string name() { return base::name() + "/alloc"; }
-    virtual int64_t iterations() { return 200000; }
-
-  protected:
-    using base = benchmark_interface<Size, Alloc>;
-    std::vector<std::vector<alloc_data>> allocations;
-    std::vector<size_t> iters;
-};
-
-// This class benchmarks performance of random deallocations and (re)allocations
-template <
-    typename Size, typename Alloc,
-    typename =
-        std::enable_if_t<std::is_base_of<alloc_size_interface, Size>::value>,
-    typename =
-        std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
-class multiple_malloc_free_benchmark : public alloc_benchmark<Size, Alloc> {
-    using distribution = std::uniform_int_distribution<size_t>;
-    using base = alloc_benchmark<Size, Alloc>;
-
-  public:
-    int reallocs = 100;
-    void SetUp(::benchmark::State &state) override {
-        if (state.thread_index() != 0) {
-            return;
-        }
-        // unpack arguments
-        int argPos = 0;
-        base::max_allocs = state.range(argPos++);
-
-        // pass rest of the arguments to "alloc_size" and "allocator"
-        argPos = base::alloc_size.SetUp(state, argPos);
-        base::allocator.SetUp(state, argPos);
-
-        // perform initial allocations which will be later freed and reallocated
-        base::allocations.resize(state.threads());
-        for (auto &i : base::allocations) {
-            i.resize(base::max_allocs);
-
-            for (size_t j = 0; j < base::max_allocs; j++) {
-                i[j].ptr =
-                    base::allocator.benchAlloc(base::alloc_size.nextSize());
-                if (i[j].ptr == NULL) {
-                    state.SkipWithError("preallocation failed");
-                    return;
-                }
-                i[j].size = base::alloc_size.nextSize();
-            }
-        }
-        dist.param(distribution::param_type(0, base::max_allocs - 1));
-    }
-
-    void bench(benchmark::State &state) override {
-        auto tid = state.thread_index();
-        auto &allocation = base::allocations[tid];
-        std::vector<size_t> to_alloc;
-        for (int j = 0; j < reallocs; j++) {
-            auto idx = dist(generator);
-            if (allocation[idx].ptr == NULL) {
-                continue;
-            }
-            to_alloc.push_back(idx);
-
-            base::allocator.benchFree(allocation[idx].ptr,
-                                      allocation[idx].size);
-            allocation[idx].ptr = NULL;
-            allocation[idx].size = 0;
-        }
-
-        for (auto idx : to_alloc) {
-            auto s = base::alloc_size.nextSize();
-            allocation[idx].ptr = base::allocator.benchAlloc(s);
-            if (allocation[idx].ptr == NULL) {
-                state.SkipWithError("allocation failed");
-            }
-            allocation[idx].size = s;
-        }
-    }
-
-    virtual std::string name() {
-        return base::base::name() + "/multiple_malloc_free";
-    }
-
-    virtual std::vector<std::string> argsName() {
-        auto n = benchmark_interface<Size, Alloc>::argsName();
-        std::vector<std::string> res = {"max_allocs"};
-        res.insert(res.end(), n.begin(), n.end());
-        return res;
-    }
-
-    virtual int64_t iterations() { return 2000; }
-
-    std::default_random_engine generator;
-    distribution dist;
+  private:
+    void prealloc(benchmark::State &state) {
+        auto tid = state.thread_index();
+        auto &i = allocations[tid];
+        i.resize(max_allocs);
+        auto sizeGenerator = base::alloc_sizes[tid];
+        for (size_t j = 0; j < max_allocs; j++) {
+            auto size = sizeGenerator.nextSize();
+            i[j].ptr = base::allocator.benchAlloc(size);
+            if (i[j].ptr == NULL) {
+                state.SkipWithError("preallocation failed");
+                return;
+            }
+            i[j].size = size;
+        }
+    }
+
+    void freeAllocs(benchmark::State &state) {
+        auto tid = state.thread_index();
+        auto &i = allocations[tid];
+        for (auto &j : i) {
+            if (j.ptr != NULL) {
+                base::allocator.benchFree(j.ptr, j.size);
+                j.ptr = NULL;
+                j.size = 0;
+            }
+        }
+    }
+
+    void prepareWorkload(benchmark::State &state) {
+        auto tid = state.thread_index();
+        auto &n = next[tid];
+        std::default_random_engine generator;
+        distribution dist;
+        generator.seed(0);
+        dist.param(distribution::param_type(0, max_allocs - 1));
+        auto sizeGenerator = base::alloc_sizes[tid];
+
+        n.clear();
+        for (int64_t j = 0; j < state.max_iterations * allocsPerIterations;
+             j++) {
+            n.push_back({dist(generator), sizeGenerator.nextSize()});
+        }
+        next_iter[tid] = n.cbegin();
+    }
 };
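The thread synchronization in `SetUp()`/`TearDown()` above leans on `waitForAllThreads()`, a spin barrier built from two atomics: a counter of arrived threads and a generation number that the last arrival bumps to release the waiters, which also re-arms the barrier for the next use. A minimal standalone demo of the same pattern (hypothetical driver code, not part of the patch):

```cpp
// Standalone demo of the generation-based spin barrier used by
// waitForAllThreads() above.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

void waitForAll(int totalThreads) {
    static std::atomic<int> count{0};
    static std::atomic<int> generation{0};

    int gen = generation.load(std::memory_order_relaxed);
    if (count.fetch_add(1, std::memory_order_acq_rel) + 1 == totalThreads) {
        // Last arrival: reset the counter, then release the waiters by
        // bumping the generation.
        count.store(0, std::memory_order_relaxed);
        generation.fetch_add(1, std::memory_order_acq_rel);
    } else {
        while (generation.load(std::memory_order_acquire) == gen) {
            std::this_thread::yield();
        }
    }
}

int main() {
    const int n = 4;
    std::vector<std::thread> threads;
    for (int i = 0; i < n; i++) {
        threads.emplace_back([n] {
            // ... per-thread setup would run here ...
            waitForAll(n); // nobody proceeds until all n threads arrive
            std::printf("past the barrier\n");
        });
    }
    for (auto &t : threads) {
        t.join();
    }
    return 0;
}
```

Because waiters watch the generation rather than the counter, a fast thread that immediately reaches the next `waitForAll()` call cannot slip through a barrier that has not fully drained.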
diff --git a/benchmark/benchmark_size.hpp b/benchmark/benchmark_size.hpp
index d17a6b2869..44e4bf1da8 100644
--- a/benchmark/benchmark_size.hpp
+++ b/benchmark/benchmark_size.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2024 Intel Corporation
+ * Copyright (C) 2024-2025 Intel Corporation
  *
  * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -45,7 +45,7 @@ class uniform_alloc_size : public alloc_size_interface {
             state.SkipWithError("min and max must be divisible by granularity");
             return argPos;
         }
-
+        generator.seed(0);
         dist.param(distribution::param_type(min / gran, max / gran));
         multiplier = gran;
         return argPos;
@@ -53,11 +53,11 @@ class uniform_alloc_size : public alloc_size_interface {
     void TearDown([[maybe_unused]] ::benchmark::State &state) override {}
     size_t nextSize() override { return dist(generator) * multiplier; }
     static std::vector<std::string> argsName() {
-        return {"min size", "max size", "granularity"};
+        return {"min_size", "max_size", "granularity"};
    }
 
   private:
     std::default_random_engine generator;
     distribution dist;
-    size_t multiplier;
+    size_t multiplier = 1;
 };
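The renamed `min_size`/`max_size`/`granularity` arguments and the deterministic `generator.seed(0)` make runs reproducible across invocations. As the header comment in benchmark.hpp notes, new scenarios plug in by extending the size and allocator interfaces; a hypothetical size generator modeled on `uniform_alloc_size` might look like this (a sketch only: `alternating_alloc_size` is not part of the patch, and the exact `alloc_size_interface` signatures are assumed from benchmark_size.hpp):

```cpp
// Hypothetical size generator (not part of the patch), modeled on
// uniform_alloc_size; it alternates between two sizes to stress
// allocator size-class switching.
#include "benchmark_size.hpp"

class alternating_alloc_size : public alloc_size_interface {
  public:
    unsigned SetUp(::benchmark::State &state, unsigned argPos) override {
        small_size = state.range(argPos++);
        large_size = state.range(argPos++);
        return argPos;
    }
    void TearDown([[maybe_unused]] ::benchmark::State &state) override {}
    size_t nextSize() override {
        toggle = !toggle;
        return toggle ? small_size : large_size;
    }
    static std::vector<std::string> argsName() {
        return {"small_size", "large_size"};
    }

  private:
    size_t small_size = 8;
    size_t large_size = 4096;
    bool toggle = false;
};
```

Note that `benchmark_interface` now clones one size generator per thread (the `alloc_sizes` vector), so a generator like this only has to be copyable and, like `uniform_alloc_size`, deterministic after `SetUp()`.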