diff --git a/.github/workflows/reusable_gpu.yml b/.github/workflows/reusable_gpu.yml index a09f43e6d9..739aab9e18 100644 --- a/.github/workflows/reusable_gpu.yml +++ b/.github/workflows/reusable_gpu.yml @@ -112,6 +112,7 @@ jobs: run: ctest --output-on-failure --test-dir examples -C ${{matrix.build_type}} - name: Run benchmarks + if: matrix.build_type == 'Release' working-directory: ${{env.BUILD_DIR}} run: ctest --output-on-failure --test-dir benchmark -C ${{matrix.build_type}} --exclude-regex umf-bench-multithreaded diff --git a/CMakeLists.txt b/CMakeLists.txt index cc3a24e5fe..4dcc293d2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -320,7 +320,7 @@ endif() # compiler is required. Moreover, if these options are not set, CMake will set # up a strict C build, without C++ support. set(OPTIONS_REQUIRING_CXX "UMF_BUILD_TESTS" "UMF_BUILD_LIBUMF_POOL_DISJOINT" - "UMF_BUILD_BENCHMARKS_MT") + "UMF_BUILD_BENCHMARKS_MT" "UMF_BUILD_BENCHMARKS") foreach(option_name ${OPTIONS_REQUIRING_CXX}) if(${option_name}) enable_language(CXX) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index aaf50c1c0f..5605519ee2 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,7 +1,24 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +include(FetchContent) +FetchContent_Declare( + googlebenchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.9.0) + +set(BENCHMARK_ENABLE_GTEST_TESTS + OFF + CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_TESTING + OFF + CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_INSTALL + OFF + CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googlebenchmark) + # In MSVC builds, there is no way to determine the actual build type during the # CMake configuration step. Therefore, this message is printed in all MSVC # builds. 
@@ -32,7 +49,7 @@ function(add_umf_benchmark) "${multiValueArgs}" ${ARGN}) - set(BENCH_NAME umf-bench-${ARG_NAME}) + set(BENCH_NAME umf-${ARG_NAME}) set(BENCH_LIBS ${ARG_LIBS} umf) @@ -52,13 +69,17 @@ function(add_umf_benchmark) COMMAND ${BENCH_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - # Benchmark passes if it prints "PASSED" in the output, because ubench of - # scalable pool fails if the confidence interval exceeds maximum permitted - # 2.5%. - set_tests_properties( - ${BENCH_NAME} PROPERTIES - LABELS "benchmark" - PASS_REGULAR_EXPRESSION "PASSED") + if("${BENCH_NAME}" STREQUAL "umf-ubench") + # Benchmark passes if it prints "PASSED" in the output, because ubench + # of scalable pool fails if the confidence interval exceeds maximum + # permitted 2.5%. + set_tests_properties( + ${BENCH_NAME} PROPERTIES + LABELS "benchmark" + PASS_REGULAR_EXPRESSION "PASSED") + else() + set_tests_properties(${BENCH_NAME} PROPERTIES LABELS "benchmark") + endif() if(WINDOWS) # append PATH to DLLs @@ -68,11 +89,11 @@ function(add_umf_benchmark) if(UMF_BUILD_LIBUMF_POOL_DISJOINT) target_compile_definitions(${BENCH_NAME} - PRIVATE UMF_BUILD_LIBUMF_POOL_DISJOINT=1) + PRIVATE UMF_POOL_DISJOINT_ENABLED=1) endif() - if(UMF_BUILD_LIBUMF_POOL_JEMALLOC) + if(UMF_POOL_JEMALLOC_ENABLED) target_compile_definitions(${BENCH_NAME} - PRIVATE UMF_BUILD_LIBUMF_POOL_JEMALLOC=1) + PRIVATE UMF_POOL_JEMALLOC_ENABLED=1) endif() if(UMF_POOL_SCALABLE_ENABLED) target_compile_definitions(${BENCH_NAME} @@ -80,7 +101,7 @@ function(add_umf_benchmark) endif() if(UMF_BUILD_LEVEL_ZERO_PROVIDER) target_compile_definitions(${BENCH_NAME} - PRIVATE UMF_BUILD_LEVEL_ZERO_PROVIDER=1) + PRIVATE UMF_PROVIDER_LEVEL_ZERO_ENABLED=1) target_include_directories( ${BENCH_NAME} PRIVATE ${UMF_CMAKE_SOURCE_DIR}/test/common ${LEVEL_ZERO_INCLUDE_DIRS}) @@ -121,6 +142,12 @@ add_umf_benchmark( LIBS ${LIBS_OPTIONAL} LIBDIRS ${LIB_DIRS}) +add_umf_benchmark( + NAME benchmark + SRCS benchmark.cpp + LIBS ${LIBS_OPTIONAL} 
benchmark::benchmark + LIBDIRS ${LIB_DIRS}) + if(UMF_BUILD_BENCHMARKS_MT) add_umf_benchmark( NAME multithreaded diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp new file mode 100644 index 0000000000..c10bbda877 --- /dev/null +++ b/benchmark/benchmark.cpp @@ -0,0 +1,362 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +#include +#include +#ifdef UMF_POOL_SCALABLE_ENABLED +#include +#endif +#include + +#ifdef UMF_POOL_DISJOINT_ENABLED +#include +#endif + +#ifdef UMF_POOL_JEMALLOC_ENABLED +#include +#endif + +#include "benchmark.hpp" + +struct glibc_malloc : public allocator_interface { + unsigned SetUp([[maybe_unused]] ::benchmark::State &state, + unsigned argPos) override { + return argPos; + } + void TearDown([[maybe_unused]] ::benchmark::State &state) override{}; + void *benchAlloc(size_t size) override { return malloc(size); } + void benchFree(void *ptr, [[maybe_unused]] size_t size) override { + free(ptr); + } + static std::string name() { return "glibc"; } +}; + +struct os_provider : public provider_interface { + umf_os_memory_provider_params_handle_t params = NULL; + os_provider() { + umfOsMemoryProviderParamsCreate(¶ms); + return; + } + + ~os_provider() { + if (params != NULL) { + umfOsMemoryProviderParamsDestroy(params); + } + } + + void *getParams() override { return params; } + umf_memory_provider_ops_t *getOps() override { + return umfOsMemoryProviderOps(); + } + static std::string name() { return "os_provider"; } +}; + +template +struct proxy_pool : public pool_interface { + umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) override { + return umfProxyPoolOps(); + } + void *getParams([[maybe_unused]] ::benchmark::State &state) override { + return nullptr; + } + static std::string name() { return "proxy_pool<" + Provider::name() + ">"; } +}; + +#ifdef 
UMF_POOL_DISJOINT_ENABLED +template +struct disjoint_pool : public pool_interface { + umf_disjoint_pool_params_handle_t disjoint_memory_pool_params; + + disjoint_pool() { + disjoint_memory_pool_params = NULL; + auto ret = umfDisjointPoolParamsCreate(&disjoint_memory_pool_params); + if (ret != UMF_RESULT_SUCCESS) { + return; + } + + // those function should never fail, so error handling is minimal. + ret = umfDisjointPoolParamsSetSlabMinSize(disjoint_memory_pool_params, + 4096); + if (ret != UMF_RESULT_SUCCESS) { + goto err; + } + + ret = umfDisjointPoolParamsSetCapacity(disjoint_memory_pool_params, 4); + if (ret != UMF_RESULT_SUCCESS) { + goto err; + } + + ret = umfDisjointPoolParamsSetMinBucketSize(disjoint_memory_pool_params, + 4096); + if (ret != UMF_RESULT_SUCCESS) { + goto err; + } + + ret = umfDisjointPoolParamsSetMaxPoolableSize( + disjoint_memory_pool_params, 4096 * 16); + + if (ret != UMF_RESULT_SUCCESS) { + goto err; + } + return; + err: + + umfDisjointPoolParamsDestroy(disjoint_memory_pool_params); + disjoint_memory_pool_params = NULL; + } + + ~disjoint_pool() { + if (disjoint_memory_pool_params != NULL) { + umfDisjointPoolParamsDestroy(disjoint_memory_pool_params); + } + } + + umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) override { + return umfDisjointPoolOps(); + } + void *getParams([[maybe_unused]] ::benchmark::State &state) override { + + if (disjoint_memory_pool_params == NULL) { + state.SkipWithError("Failed to create disjoint pool params"); + } + + return disjoint_memory_pool_params; + } + static std::string name() { + return "disjoint_pool<" + Provider::name() + ">"; + } +}; +#endif + +#ifdef UMF_POOL_JEMALLOC_ENABLED +template +struct jemalloc_pool : public pool_interface { + umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) override { + return umfJemallocPoolOps(); + } + void *getParams([[maybe_unused]] ::benchmark::State &state) override { + return NULL; + } + static std::string name() 
{ + return "jemalloc_pool<" + Provider::name() + ">"; + } +}; +#endif + +#ifdef UMF_POOL_SCALABLE_ENABLED +template +struct scalable_pool : public pool_interface { + virtual umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) override { + return umfScalablePoolOps(); + } + virtual void * + getParams([[maybe_unused]] ::benchmark::State &state) override { + return NULL; + } + static std::string name() { + return "scalable_pool<" + Provider::name() + ">"; + } +}; +#endif +// Benchmarks scenarios: + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, glibc_fix, fixed_alloc_size, + glibc_malloc); + +// The benchmark arguments specified in Args() are, in order: +// benchmark arguments, allocator arguments, size generator arguments. +// The exact meaning of each argument depends on the benchmark, allocator, and size components used. +// Refer to the 'argsName()' function in each component to find detailed descriptions of these arguments. +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, glibc_fix) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, glibc_uniform, + uniform_alloc_size, glibc_malloc); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, glibc_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, os_provider, fixed_alloc_size, + provider_allocator); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, os_provider) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, proxy_pool, fixed_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, proxy_pool) + ->Args({1000, 0, 4096}) + ->Args({1000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +#ifdef UMF_POOL_DISJOINT_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, disjoint_pool_fix, + fixed_alloc_size, + pool_allocator>); 
+UMF_BENCHMARK_REGISTER_F(alloc_benchmark, disjoint_pool_fix) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +// TODO: debug why this crashes +/*UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, disjoint_pool_uniform, + uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, disjoint_pool_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + // ->Threads(4) + ->Threads(1); +*/ +#endif + +#ifdef UMF_POOL_JEMALLOC_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, jemalloc_pool_fix, + fixed_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, jemalloc_pool_fix) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, jemalloc_pool_uniform, + uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, jemalloc_pool_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +#endif +#ifdef UMF_POOL_SCALABLE_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, scalable_pool_fix, + fixed_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, scalable_pool_fix) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, scalable_pool_uniform, + uniform_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, scalable_pool_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); +#endif +// Multiple allocs/free + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, glibc_fix, + fixed_alloc_size, glibc_malloc); + +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, glibc_fix) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, glibc_uniform, + uniform_alloc_size, glibc_malloc); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, 
glibc_uniform) + ->Args({10000, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, proxy_pool, + fixed_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, proxy_pool) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, os_provider, + fixed_alloc_size, + provider_allocator); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, os_provider) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +#ifdef UMF_POOL_DISJOINT_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, disjoint_pool_fix, + fixed_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_fix) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +// TODO: debug why this crashes +/*UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, + disjoint_pool_uniform, uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); +*/ +#endif + +#ifdef UMF_POOL_JEMALLOC_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, jemalloc_pool_fix, + fixed_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, jemalloc_pool_fix) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, + jemalloc_pool_uniform, uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, jemalloc_pool_uniform) + ->Args({1000, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +#endif + +#ifdef UMF_POOL_SCALABLE_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, scalable_pool_fix, + fixed_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, 
scalable_pool_fix) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, + scalable_pool_uniform, uniform_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, scalable_pool_uniform) + ->Args({10000, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +#endif +BENCHMARK_MAIN(); diff --git a/benchmark/benchmark.hpp b/benchmark/benchmark.hpp new file mode 100644 index 0000000000..ead6b39e75 --- /dev/null +++ b/benchmark/benchmark.hpp @@ -0,0 +1,382 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +/* + * This file defines a benchmarking framework for evaluating memory allocation + * and deallocation performance using the Unified Memory Framework (UMF). The + * design is modular and extensible, allowing for flexible benchmarking of different + * allocation strategies, size distributions, and memory providers. + * + * **Key Design Features:** + * - **Modular Components**: The framework is built using interfaces and templates, + * which allows for easy extension and customization of allocation strategies, + * size distributions, and memory providers. + * - **Flexible Allocation Size Generators**: Includes classes like `fixed_alloc_size` + * and `uniform_alloc_size` that generate allocation sizes based on different + * strategies. These classes implement the `alloc_size_interface`. + * - **Abstract Allocator Interface**: The `allocator_interface` defines the basic + * methods for memory allocation and deallocation. Concrete allocators like + * `provider_allocator` and `pool_allocator` implement this interface to work + * with different memory providers and pools. 
+ * - **Benchmarking Classes**: Classes like `alloc_benchmark` and `multiple_malloc_free_benchmark` + * templates the allocation size generator and allocator to perform benchmarks. + * It manages the setup, execution, and teardown of the benchmark. + * - **Threaded Execution Support**: The benchmarks support multi-threaded execution + * by maintaining thread-specific allocation data and synchronization. + * + * **Component Interactions:** + * - **Size Generators and Allocators**: The `alloc_benchmark` class uses a size + * generator (e.g., `fixed_alloc_size` or `uniform_alloc_size`) to determine the + * sizes of memory allocations, and an allocator (e.g., `provider_allocator` or + * `pool_allocator`) to perform the actual memory operations. + * - **Benchmark Execution**: During the benchmark, `alloc_benchmark` repeatedly + * calls the `bench` method, which performs allocations and deallocations using + * the allocator and size generator. + * - **Allocator Adapters**: The `provider_allocator` and `pool_allocator` adapt + * specific memory providers and pools to the `allocator_interface`, allowing + * them to be used interchangeably in the benchmark classes. This abstraction + * enables benchmarking different memory management strategies without changing + * the core benchmarking logic. + * - **Pre-allocations and Iterations**: The `alloc_benchmark` can perform a set + * number of pre-allocations before the benchmark starts, and manages allocation + * and deallocation cycles to simulate memory pressure and fragmentation. + * - **Derived Benchmarks**: `multiple_malloc_free_benchmark` extends + * `alloc_benchmark` to perform multiple random deallocations and reallocations + * in each iteration, using a uniform distribution to select which allocations + * to free and reallocate. This models workloads with frequent memory churn. + * + * **Execution Flow:** + * 1. **Setup Phase**: + * - The benchmark class initializes the size generator and allocator. 
+ * - Pre-allocations are performed if specified. + * - Thread-specific data structures for allocations are prepared. + * 2. **Benchmark Loop**: + * - For each iteration, the `bench` method is called. + * - The size generator provides the next allocation size. + * - The allocator performs the allocation. + * - Allocations are tracked per thread. + * 3. **Teardown Phase**: + * - All remaining allocations are freed. + * - Allocator and size generator are cleaned up. + * + * **Customization and Extension:** + * - New size generators can be created by implementing the `alloc_size_interface`. + * - New allocators can be adapted by implementing the `allocator_interface`. + * - Additional benchmarking scenarios can be created by extending `benchmark_interface`. + */ + +#include +#include +#include +#include + +#include "benchmark_interfaces.hpp" + +struct alloc_data { + void *ptr; + size_t size; +}; + +#define UMF_BENCHMARK_TEMPLATE_DEFINE(BaseClass, Method, ...) \ + BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, __VA_ARGS__) \ + (benchmark::State & state) { \ + for (auto _ : state) { \ + bench(state); \ + } \ + } + +#define UMF_BENCHMARK_REGISTER_F(BaseClass, Method) \ + BENCHMARK_REGISTER_F(BaseClass, Method) \ + ->ArgNames( \ + BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::argsName()) \ + ->Name(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::name()) \ + ->MinWarmUpTime(1) + +class fixed_alloc_size : public alloc_size_interface { + public: + unsigned SetUp(::benchmark::State &state, unsigned argPos) override { + size = state.range(argPos); + return argPos + 1; + } + void TearDown([[maybe_unused]] ::benchmark::State &state) override {} + size_t nextSize() override { return size; }; + static std::vector argsName() { return {"size"}; } + + private: + size_t size; +}; + +class uniform_alloc_size : public alloc_size_interface { + using distribution = std::uniform_int_distribution; + + public: + unsigned SetUp(::benchmark::State &state, unsigned argPos) override { + 
auto min = state.range(argPos++); + auto max = state.range(argPos++); + auto gran = state.range(argPos++); + if (min % gran != 0 || max % gran != 0) { + state.SkipWithError("min and max must be divisible by granularity"); + return argPos; + } + + dist.param(distribution::param_type(min / gran, max / gran)); + multiplier = gran; + return argPos; + } + void TearDown([[maybe_unused]] ::benchmark::State &state) override {} + size_t nextSize() override { return dist(generator) * multiplier; } + static std::vector argsName() { + return {"min size", "max size", "granularity"}; + } + + private: + std::default_random_engine generator; + distribution dist; + size_t multiplier; +}; + +// This class benchmarks speed of alloc() operations. +template < + typename Size, typename Alloc, + typename = + std::enable_if_t::value>, + typename = + std::enable_if_t::value>> +class alloc_benchmark : public benchmark_interface { + public: + size_t max_allocs = 1000; + size_t pre_allocs = 0; + void SetUp(::benchmark::State &state) override { + if (state.thread_index() != 0) { + return; + } + + // unpack arguments + int argPos = 0; + max_allocs = state.range(argPos++); + pre_allocs = state.range(argPos++); + // pass rest of the arguments to "alloc_size" and "allocator" + argPos = base::alloc_size.SetUp(state, argPos); + base::allocator.SetUp(state, argPos); + + // initialize allocations tracking vectors (one per thread) + // and iterators for these vectors. + allocations.resize(state.threads()); + iters.resize(state.threads()); + + for (auto &i : iters) { + i = pre_allocs; + } + + // do "pre_alloc" allocations before actual benchmark. 
+ for (auto &i : allocations) { + i.resize(max_allocs + pre_allocs); + + for (size_t j = 0; j < pre_allocs; j++) { + i[j].ptr = + base::allocator.benchAlloc(base::alloc_size.nextSize()); + if (i[j].ptr == NULL) { + state.SkipWithError("preallocation failed"); + return; + } + i[j].size = base::alloc_size.nextSize(); + } + } + } + + void TearDown(::benchmark::State &state) override { + if (state.thread_index() != 0) { + return; + } + for (auto &i : allocations) { + for (auto &j : i) { + if (j.ptr != NULL) { + base::allocator.benchFree(j.ptr, j.size); + j.ptr = NULL; + j.size = 0; + } + } + } + + base::TearDown(state); + } + + void bench(benchmark::State &state) override { + auto tid = state.thread_index(); + auto s = base::alloc_size.nextSize(); + auto &i = iters[tid]; + allocations[tid][i].ptr = base::allocator.benchAlloc(s); + if (allocations[tid][i].ptr == NULL) { + state.SkipWithError("allocation failed"); + return; + } + allocations[tid][i].size = s; + i++; + if (i >= max_allocs + pre_allocs) { + // This benchmark tests only allocations - + // if allocation tracker is full we pause benchmark to dealloc all allocations - + // excluding pre-allocated ones. 
+ state.PauseTiming(); + while (i > pre_allocs) { + auto &allocation = allocations[tid][--i]; + base::allocator.benchFree(allocation.ptr, allocation.size); + allocation.ptr = NULL; + allocation.size = 0; + } + state.ResumeTiming(); + } + } + static std::vector argsName() { + auto n = benchmark_interface::argsName(); + std::vector res = {"max_allocs", "pre_allocs"}; + res.insert(res.end(), n.begin(), n.end()); + return res; + } + static std::string name() { return base::name() + "/alloc"; } + + protected: + using base = benchmark_interface; + std::vector> allocations; + std::vector iters; +}; + +// This class benchmarks performance of random deallocations and (re)allocations +template < + typename Size, typename Alloc, + typename = + std::enable_if_t::value>, + typename = + std::enable_if_t::value>> +class multiple_malloc_free_benchmark : public alloc_benchmark { + using distribution = std::uniform_int_distribution; + using base = alloc_benchmark; + + public: + int reallocs = 100; + void SetUp(::benchmark::State &state) override { + if (state.thread_index() != 0) { + return; + } + // unpack arguments + int argPos = 0; + base::max_allocs = state.range(argPos++); + + // pass rest of the arguments to "alloc_size" and "allocator" + argPos = base::alloc_size.SetUp(state, argPos); + base::allocator.SetUp(state, argPos); + + // perform initial allocations which will be later freed and reallocated + base::allocations.resize(state.threads()); + for (auto &i : base::allocations) { + i.resize(base::max_allocs); + + for (size_t j = 0; j < base::max_allocs; j++) { + i[j].ptr = + base::allocator.benchAlloc(base::alloc_size.nextSize()); + if (i[j].ptr == NULL) { + state.SkipWithError("preallocation failed"); + return; + } + i[j].size = base::alloc_size.nextSize(); + } + } + dist.param(distribution::param_type(0, base::max_allocs - 1)); + } + + void bench(benchmark::State &state) override { + auto tid = state.thread_index(); + auto &allocation = base::allocations[tid]; + 
std::vector to_alloc; + for (int j = 0; j < reallocs; j++) { + auto idx = dist(generator); + if (allocation[idx].ptr == NULL) { + continue; + } + to_alloc.push_back(idx); + + base::allocator.benchFree(allocation[idx].ptr, + allocation[idx].size); + allocation[idx].ptr = NULL; + allocation[idx].size = 0; + } + + for (auto idx : to_alloc) { + auto s = base::alloc_size.nextSize(); + allocation[idx].ptr = base::allocator.benchAlloc(s); + if (allocation[idx].ptr == NULL) { + state.SkipWithError("allocation failed"); + } + allocation[idx].size = s; + } + } + + static std::string name() { + return base::base::name() + "/multiple_malloc_free"; + } + static std::vector argsName() { + auto n = benchmark_interface::argsName(); + std::vector res = {"max_allocs"}; + res.insert(res.end(), n.begin(), n.end()); + return res; + } + std::default_random_engine generator; + distribution dist; +}; + +template ::value>> +class provider_allocator : public allocator_interface { + public: + unsigned SetUp(::benchmark::State &state, unsigned r) override { + provider.SetUp(state); + return r; + } + + void TearDown(::benchmark::State &state) override { + provider.TearDown(state); + } + + void *benchAlloc(size_t size) override { + void *ptr; + if (umfMemoryProviderAlloc(provider.provider, size, 0, &ptr) != + UMF_RESULT_SUCCESS) { + return NULL; + } + return ptr; + } + void benchFree(void *ptr, size_t size) override { + umfMemoryProviderFree(provider.provider, ptr, size); + } + static std::string name() { return Provider::name(); } + + private: + Provider provider; +}; + +// TODO: assert Pool to be a pool_interface. 
+template class pool_allocator : public allocator_interface { + public: + unsigned SetUp(::benchmark::State &state, unsigned r) override { + pool.SetUp(state); + return r; + } + + void TearDown(::benchmark::State &state) override { pool.TearDown(state); } + + virtual void *benchAlloc(size_t size) override { + return umfPoolMalloc(pool.pool, size); + } + virtual void benchFree(void *ptr, [[maybe_unused]] size_t size) override { + umfPoolFree(pool.pool, ptr); + } + + static std::string name() { return Pool::name(); } + + private: + Pool pool; +}; diff --git a/benchmark/benchmark_interfaces.hpp b/benchmark/benchmark_interfaces.hpp new file mode 100644 index 0000000000..8681160626 --- /dev/null +++ b/benchmark/benchmark_interfaces.hpp @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +#include +#include +#include + +#include +#include +#include + +class alloc_size_interface { + public: + virtual unsigned SetUp([[maybe_unused]] ::benchmark::State &state, + [[maybe_unused]] unsigned argPos) = 0; + virtual void TearDown([[maybe_unused]] ::benchmark::State &state) = 0; + virtual size_t nextSize() = 0; + static std::vector argsName() { return {""}; }; +}; + +class allocator_interface { + public: + virtual unsigned SetUp([[maybe_unused]] ::benchmark::State &state, + [[maybe_unused]] unsigned argPos) = 0; + virtual void TearDown([[maybe_unused]] ::benchmark::State &state) = 0; + virtual void *benchAlloc(size_t size) = 0; + virtual void benchFree(void *ptr, [[maybe_unused]] size_t size) = 0; + static std::vector argsName() { return {}; } +}; + +template +struct benchmark_interface : public benchmark::Fixture { + void SetUp(::benchmark::State &state) { + int argPos = alloc_size.SetUp(state, 0); + allocator.SetUp(state, argPos); + } + void TearDown(::benchmark::State &state) { + alloc_size.TearDown(state); + 
allocator.TearDown(state); + } + + virtual void bench(::benchmark::State &state) = 0; + + static std::vector argsName() { + auto s = Size::argsName(); + auto a = Allocator::argsName(); + std::vector res = {}; + res.insert(res.end(), s.begin(), s.end()); + res.insert(res.end(), a.begin(), a.end()); + return res; + } + static std::string name() { return Allocator::name(); } + + Size alloc_size; + Allocator allocator; +}; + +struct provider_interface { + umf_memory_provider_handle_t provider = NULL; + virtual void SetUp(::benchmark::State &state) { + if (state.thread_index() != 0) { + return; + } + auto umf_result = + umfMemoryProviderCreate(getOps(), getParams(), &provider); + if (umf_result != UMF_RESULT_SUCCESS) { + state.SkipWithError("umfMemoryProviderCreate() failed"); + } + } + + virtual void TearDown([[maybe_unused]] ::benchmark::State &state) { + if (state.thread_index() != 0) { + return; + } + + if (provider) { + umfMemoryProviderDestroy(provider); + } + } + + virtual umf_memory_provider_ops_t *getOps() { return nullptr; } + virtual void *getParams() { return nullptr; } +}; + +template ::value>> +struct pool_interface { + virtual void SetUp(::benchmark::State &state) { + provider.SetUp(state); + if (state.thread_index() != 0) { + return; + } + auto umf_result = umfPoolCreate(getOps(state), provider.provider, + getParams(state), 0, &pool); + if (umf_result != UMF_RESULT_SUCCESS) { + state.SkipWithError("umfPoolCreate() failed"); + } + } + virtual void TearDown([[maybe_unused]] ::benchmark::State &state) { + if (state.thread_index() != 0) { + return; + } + // TODO: The scalable pool destruction process can race with other threads + // performing TLS (Thread-Local Storage) destruction. + // As a temporary workaround, we introduce a delay (sleep) + // to ensure the pool is destroyed only after all threads have completed. 
+ // Issue: #933 + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + if (pool) { + umfPoolDestroy(pool); + } + }; + + virtual umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) { + return nullptr; + } + virtual void *getParams([[maybe_unused]] ::benchmark::State &state) { + return nullptr; + } + T provider; + umf_memory_pool_handle_t pool; +}; diff --git a/benchmark/multithread.cpp b/benchmark/multithread.cpp index efb46729c3..4558942ecb 100644 --- a/benchmark/multithread.cpp +++ b/benchmark/multithread.cpp @@ -113,7 +113,7 @@ int main() { std::cout << "skipping scalable_pool mt_alloc_free" << std::endl; #endif -#if defined(UMF_BUILD_LIBUMF_POOL_JEMALLOC) +#if defined(UMF_POOL_JEMALLOC_ENABLED) std::cout << "jemalloc_pool mt_alloc_free: "; mt_alloc_free(poolCreateExtParams{umfJemallocPoolOps(), nullptr, umfOsMemoryProviderOps(), osParams}); @@ -121,7 +121,7 @@ int main() { std::cout << "skipping jemalloc_pool mt_alloc_free" << std::endl; #endif -#if defined(UMF_BUILD_LIBUMF_POOL_DISJOINT) +#if defined(UMF_POOL_DISJOINT_ENABLED) umf_disjoint_pool_params_handle_t hDisjointParams = nullptr; umf_result_t ret = umfDisjointPoolParamsCreate(&hDisjointParams); if (ret != UMF_RESULT_SUCCESS) { diff --git a/benchmark/ubench.c b/benchmark/ubench.c index 142112e83a..5f1bfe9e48 100644 --- a/benchmark/ubench.c +++ b/benchmark/ubench.c @@ -20,11 +20,11 @@ #include #include -#ifdef UMF_BUILD_LIBUMF_POOL_DISJOINT +#ifdef UMF_POOL_DISJOINT_ENABLED #include #endif -#ifdef UMF_BUILD_LIBUMF_POOL_JEMALLOC +#ifdef UMF_POOL_JEMALLOC_ENABLED #include #endif @@ -244,7 +244,7 @@ UBENCH_EX(simple, proxy_pool_with_os_memory_provider) { free(array); } -#if (defined UMF_BUILD_LIBUMF_POOL_DISJOINT) +#if (defined UMF_POOL_DISJOINT_ENABLED) ////////////////// DISJOINT POOL WITH OS MEMORY PROVIDER UBENCH_EX(simple, disjoint_pool_with_os_memory_provider) { @@ -327,9 +327,9 @@ UBENCH_EX(simple, disjoint_pool_with_os_memory_provider) { 
umfMemoryProviderDestroy(os_memory_provider); free(array); } -#endif /* (defined UMF_BUILD_LIBUMF_POOL_DISJOINT) */ +#endif /* (defined UMF_POOL_DISJOINT_ENABLED) */ -#if (defined UMF_BUILD_LIBUMF_POOL_JEMALLOC) +#if (defined UMF_POOL_JEMALLOC_ENABLED) ////////////////// JEMALLOC POOL WITH OS MEMORY PROVIDER UBENCH_EX(simple, jemalloc_pool_with_os_memory_provider) { @@ -373,7 +373,7 @@ UBENCH_EX(simple, jemalloc_pool_with_os_memory_provider) { umfMemoryProviderDestroy(os_memory_provider); free(array); } -#endif /* (defined UMF_BUILD_LIBUMF_POOL_JEMALLOC) */ +#endif /* (defined UMF_POOL_JEMALLOC_ENABLED) */ #if (defined UMF_POOL_SCALABLE_ENABLED) ////////////////// SCALABLE (TBB) POOL WITH OS MEMORY PROVIDER @@ -421,7 +421,7 @@ UBENCH_EX(simple, scalable_pool_with_os_memory_provider) { } #endif /* (defined UMF_POOL_SCALABLE_ENABLED) */ -#if (defined UMF_BUILD_LIBUMF_POOL_DISJOINT && \ +#if (defined UMF_POOL_DISJOINT_ENABLED && \ defined UMF_BUILD_LEVEL_ZERO_PROVIDER && defined UMF_BUILD_GPU_TESTS) static void do_ipc_get_put_benchmark(alloc_t *allocs, size_t num_allocs, size_t repeats, @@ -630,7 +630,7 @@ UBENCH_EX(ipc, disjoint_pool_with_level_zero_provider) { err_destroy_context: utils_ze_destroy_context(context); } -#endif /* (defined UMF_BUILD_LIBUMF_POOL_DISJOINT && defined UMF_BUILD_LEVEL_ZERO_PROVIDER && defined UMF_BUILD_GPU_TESTS) */ +#endif /* (defined UMF_POOL_DISJOINT_ENABLED && defined UMF_BUILD_LEVEL_ZERO_PROVIDER && defined UMF_BUILD_GPU_TESTS) */ // TODO add IPC benchmark for CUDA