diff --git a/benchmark/benchmark.hpp b/benchmark/benchmark.hpp
index a960d89bca..b096716b3c 100644
--- a/benchmark/benchmark.hpp
+++ b/benchmark/benchmark.hpp
@@ -173,6 +173,14 @@ class provider_allocator : public allocator_interface {
         return argPos;
     }
 
+    void preBench(::benchmark::State &state) override {
+        provider.preBench(state); // delegate to the wrapped provider
+    }
+
+    void postBench(::benchmark::State &state) override {
+        provider.postBench(state); // delegate to the wrapped provider
+    }
+
     void TearDown(::benchmark::State &state) override {
         provider.TearDown(state);
     }
@@ -204,13 +212,18 @@ template <typename Pool> class pool_allocator : public allocator_interface {
         return argPos;
     }
 
+    void preBench(::benchmark::State &state) override { pool.preBench(state); }
+    void postBench(::benchmark::State &state) override {
+        pool.postBench(state); // both hooks delegate to the wrapped pool
+    }
+
     void TearDown(::benchmark::State &state) override { pool.TearDown(state); }
 
-    virtual void *benchAlloc(size_t size) override {
+    void *benchAlloc(size_t size) override {
         return umfPoolMalloc(pool.pool, size);
     }
 
-    virtual void benchFree(void *ptr, [[maybe_unused]] size_t size) override {
+    void benchFree(void *ptr, [[maybe_unused]] size_t size) override {
         umfPoolFree(pool.pool, ptr);
     }
 
@@ -241,7 +254,7 @@ struct benchmark_interface : public benchmark::Fixture {
         allocator.TearDown(state);
     }
 
-    virtual void bench(::benchmark::State &state) = 0;
+    void bench([[maybe_unused]] ::benchmark::State &state) {} // default no-op
 
     virtual std::vector<std::string> argsName() {
         auto s = Size::argsName();
@@ -260,6 +273,9 @@ struct benchmark_interface : public benchmark::Fixture {
         benchmark->ArgNames(bench->argsName())->Name(bench->name());
     }
 
+    void custom_counters(::benchmark::State &state) {
+        allocator.custom_counters(state); // forwarded to the Allocator member
+    }
     std::vector<Size> alloc_sizes;
     Allocator allocator;
 };
@@ -282,7 +298,7 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
 
     vector2d<alloc_data> allocations;
     std::vector<unsigned> iters;
-
+    std::vector<size_t> memused;
     vector2d<next_alloc_data> next;
     std::vector<std::vector<next_alloc_data>::const_iterator> next_iter;
     int64_t iterations;
@@ -302,6 +318,7 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
             allocations.resize(state.threads());
             next.resize(state.threads());
             next_iter.resize(state.threads());
+            memused.assign(state.threads(), 0);
 
 #ifndef WIN32
             // Ensure that system malloc does not have memory pooled on the heap
@@ -323,13 +340,36 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
         waitForAllThreads(state);
         // prepare workload for actual benchmark.
         freeAllocs(state);
+
         prealloc(state);
         prepareWorkload(state);
+        waitForAllThreads(state);
+        base::allocator.preBench(state);
     }
 
     void TearDown(::benchmark::State &state) override {
+        base::allocator.postBench(state);
         auto tid = state.thread_index();
+        if (tid == 0) {
+            size_t current_memory_allocated = 0;
+            for (const auto &used : memused) {
+                current_memory_allocated += used;
+            }
+
+            auto memory_used = state.counters["provider_memory_allocated"];
+
+            if (memory_used != 0) {
+                state.counters["benchmark_memory_allocated"] =
+                    static_cast<double>(current_memory_allocated);
+                state.counters["memory_overhead"] =
+                    100.0 * (memory_used - current_memory_allocated) /
+                    memory_used;
+            } else {
+                state.counters.erase("provider_memory_allocated");
+            }
+        }
 
+        waitForAllThreads(state);
         freeAllocs(state);
         waitForAllThreads(state);
         if (tid == 0) {
@@ -342,20 +382,22 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
         base::TearDown(state);
     }
 
-    void bench(benchmark::State &state) override {
+    void bench(benchmark::State &state) {
         auto tid = state.thread_index();
         auto &allocation = allocations[tid];
+        auto &memuse = memused[tid]; // bytes this thread currently holds
         for (int i = 0; i < allocsPerIterations; i++) {
             auto &n = *next_iter[tid]++;
             auto &alloc = allocation[n.offset];
             base::allocator.benchFree(alloc.ptr, alloc.size);
-
+            memuse -= alloc.size; // the slot's previous block was just freed
             alloc.size = n.size;
             alloc.ptr = base::allocator.benchAlloc(alloc.size);
 
             if (alloc.ptr == NULL) {
                 state.SkipWithError("allocation failed");
             }
+            memuse += alloc.size; // NOTE(review): also added when ptr is NULL
         }
     }
 
@@ -376,7 +418,9 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
         auto tid = state.thread_index();
         auto &i = allocations[tid];
         i.resize(max_allocs);
+        auto &memuse = memused[tid];
         auto sizeGenerator = base::alloc_sizes[tid];
+
         for (size_t j = 0; j < max_allocs; j++) {
             auto size = sizeGenerator.nextSize();
             i[j].ptr = base::allocator.benchAlloc(size);
@@ -385,6 +429,7 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
                 return;
             }
             i[j].size = size;
+            memuse += size;
         }
     }
 
@@ -394,6 +439,7 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
         for (auto &j : i) {
             if (j.ptr != NULL) {
                 base::allocator.benchFree(j.ptr, j.size);
+                memused[tid] -= j.size;
                 j.ptr = NULL;
                 j.size = 0;
             }
diff --git a/benchmark/benchmark_umf.hpp b/benchmark/benchmark_umf.hpp
index cfc9982d2c..9553d6fdb5 100644
--- a/benchmark/benchmark_umf.hpp
+++ b/benchmark/benchmark_umf.hpp
@@ -11,8 +11,6 @@
 #include <benchmark/benchmark.h>
 #include <umf/memory_pool.h>
 #include <umf/memory_provider.h>
-
-#include <benchmark/benchmark.h>
 #include <umf/pools/pool_disjoint.h>
 #include <umf/pools/pool_proxy.h>
 
@@ -30,7 +28,7 @@ struct provider_interface {
     using params_ptr = std::unique_ptr<void, void (*)(void *)>;
 
     umf_memory_provider_handle_t provider = NULL;
-    virtual void SetUp(::benchmark::State &state) {
+    void SetUp(::benchmark::State &state) {
         if (state.thread_index() != 0) {
             return;
         }
@@ -42,7 +40,27 @@ struct provider_interface {
         }
     }
 
-    virtual void TearDown([[maybe_unused]] ::benchmark::State &state) {
+    void preBench(::benchmark::State &state) {
+        // Only thread 0 issues the reset; the call's result is not checked.
+        if (state.thread_index() == 0) {
+            umfCtlExec("umf.provider.by_handle.stats.reset", provider, NULL);
+        }
+    }
+
+    void postBench(::benchmark::State &state) {
+        if (state.thread_index() != 0) {
+            return;
+        }
+        // Publish the provider's allocated-memory stat when the query works.
+        size_t allocated = 0;
+        if (umfCtlGet("umf.provider.by_handle.stats.allocated_memory",
+                      provider, &allocated) == UMF_RESULT_SUCCESS) {
+            state.counters["provider_memory_allocated"] =
+                static_cast<double>(allocated);
+        }
+    }
+
+    void TearDown([[maybe_unused]] ::benchmark::State &state) {
         if (state.thread_index() != 0) {
             return;
         }
@@ -53,9 +71,7 @@ struct provider_interface {
     }
 
     virtual umf_memory_provider_ops_t *
-    getOps([[maybe_unused]] ::benchmark::State &state) {
-        return nullptr;
-    }
+    getOps([[maybe_unused]] ::benchmark::State &state) = 0; // no default ops
 
     virtual params_ptr getParams([[maybe_unused]] ::benchmark::State &state) {
         return {nullptr, [](void *) {}};
@@ -68,7 +84,7 @@ template <typename T,
 struct pool_interface {
     using params_ptr = std::unique_ptr<void, void (*)(void *)>;
 
-    virtual void SetUp(::benchmark::State &state) {
+    void SetUp(::benchmark::State &state) {
         provider.SetUp(state);
         if (state.thread_index() != 0) {
             return;
@@ -80,7 +96,22 @@ struct pool_interface {
             state.SkipWithError("umfPoolCreate() failed");
         }
     }
-    virtual void TearDown([[maybe_unused]] ::benchmark::State &state) {
+
+    void preBench(::benchmark::State &state) {
+        // The pool has no work of its own before the benchmark body, so
+        // the hook is just passed on to the provider member. It is called
+        // for every thread index; no per-thread filtering happens here.
+        provider.preBench(state);
+    }
+
+    void postBench(::benchmark::State &state) {
+        // The pool has no work of its own after the benchmark body, so
+        // the hook is just passed on to the provider member. It is called
+        // for every thread index; no per-thread filtering happens here.
+        provider.postBench(state);
+    }
+
+    void TearDown([[maybe_unused]] ::benchmark::State &state) {
         if (state.thread_index() != 0) {
             return;
         }
@@ -93,15 +124,17 @@ struct pool_interface {
         if (pool) {
             umfPoolDestroy(pool);
         }
+
+        provider.TearDown(state);
     };
 
     virtual umf_memory_pool_ops_t *
-    getOps([[maybe_unused]] ::benchmark::State &state) {
-        return nullptr;
-    }
+    getOps([[maybe_unused]] ::benchmark::State &state) = 0; // no default ops
+
     virtual params_ptr getParams([[maybe_unused]] ::benchmark::State &state) {
         return {nullptr, [](void *) {}};
     }
+
     T provider;
     umf_memory_pool_handle_t pool;
 };
@@ -110,6 +143,8 @@ class allocator_interface {
   public:
     virtual unsigned SetUp([[maybe_unused]] ::benchmark::State &state,
                            [[maybe_unused]] unsigned argPos) = 0;
+    virtual void preBench([[maybe_unused]] ::benchmark::State &state) = 0;
+    virtual void postBench([[maybe_unused]] ::benchmark::State &state) = 0;
     virtual void TearDown([[maybe_unused]] ::benchmark::State &state) = 0;
     virtual void *benchAlloc(size_t size) = 0;
     virtual void benchFree(void *ptr, [[maybe_unused]] size_t size) = 0;
@@ -121,7 +156,9 @@ struct glibc_malloc : public allocator_interface {
                    unsigned argPos) override {
         return argPos;
     }
-    void TearDown([[maybe_unused]] ::benchmark::State &state) override{};
+    void preBench([[maybe_unused]] ::benchmark::State &state) override {}
+    void postBench([[maybe_unused]] ::benchmark::State &state) override {}
+    void TearDown([[maybe_unused]] ::benchmark::State &state) override {}
     void *benchAlloc(size_t size) override { return malloc(size); }
     void benchFree(void *ptr, [[maybe_unused]] size_t size) override {
         free(ptr);
@@ -163,7 +200,7 @@ struct fixed_provider : public provider_interface {
     char *mem = NULL;
     const size_t size = 1024 * 1024 * 1024; // 1GB
   public:
-    virtual void SetUp(::benchmark::State &state) override {
+    void SetUp(::benchmark::State &state) {
         if (state.thread_index() != 0) {
             return;
         }
@@ -175,7 +212,7 @@ struct fixed_provider : public provider_interface {
         provider_interface::SetUp(state);
     }
 
-    virtual void TearDown(::benchmark::State &state) override {
+    void TearDown(::benchmark::State &state) {
         if (state.thread_index() != 0) {
             return;
         }
@@ -295,7 +332,7 @@ struct jemalloc_pool : public pool_interface<Provider> {
 #ifdef UMF_POOL_SCALABLE_ENABLED
 template <typename Provider>
 struct scalable_pool : public pool_interface<Provider> {
-    virtual umf_memory_pool_ops_t *
+    umf_memory_pool_ops_t *
     getOps([[maybe_unused]] ::benchmark::State &state) override {
         return umfScalablePoolOps();
     }
diff --git a/src/provider/provider_os_memory.c b/src/provider/provider_os_memory.c
index f3e5c7fa02..1ecb397fe8 100644
--- a/src/provider/provider_os_memory.c
+++ b/src/provider/provider_os_memory.c
@@ -6,19 +6,21 @@
 */
 
 #include <assert.h>
+#include <ctl/ctl.h>
 #include <errno.h>
 #include <limits.h>
+
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-
-#include <ctl/ctl.h>
 #include <umf.h>
 #include <umf/base.h>
 #include <umf/memory_provider.h>
 #include <umf/memory_provider_ops.h>
 #include <umf/providers/provider_os_memory.h>
+
+#include "utils_assert.h"
 // OS Memory Provider requires HWLOC
 #if defined(UMF_NO_HWLOC)
 
@@ -187,12 +189,77 @@ static int CTL_READ_HANDLER(ipc_enabled)(void *ctx,
     return 0;
 }
 
+static int CTL_READ_HANDLER(peak_memory)(void *ctx,
+                                         umf_ctl_query_source_t source,
+                                         void *arg,
+                                         umf_ctl_index_utlist_t *indexes,
+                                         const char *extra_name,
+                                         umf_ctl_query_type_t query_type) {
+    /* ctx: provider; arg: size_t out-param for peak_memory; rest unused */
+    (void)source, (void)indexes, (void)extra_name, (void)query_type;
+    size_t *arg_out = arg;
+    os_memory_provider_t *os_provider = (os_memory_provider_t *)ctx;
+    COMPILE_ERROR_ON(sizeof(os_provider->stats.peak_memory) !=
+                     sizeof(uint64_t));
+    COMPILE_ERROR_ON(sizeof(*arg_out) != sizeof(uint64_t));
+    utils_atomic_load_acquire_u64((uint64_t *)&os_provider->stats.peak_memory,
+                                  (uint64_t *)arg_out);
+    return 0;
+}
+
+static int CTL_READ_HANDLER(allocated_memory)(void *ctx,
+                                              umf_ctl_query_source_t source,
+                                              void *arg,
+                                              umf_ctl_index_utlist_t *indexes,
+                                              const char *extra_name,
+                                              umf_ctl_query_type_t query_type) {
+    /* only ctx (the provider) and arg (size_t out-param) are used here */
+    (void)source, (void)indexes, (void)extra_name, (void)query_type;
+
+    os_memory_provider_t *prov = (os_memory_provider_t *)ctx;
+    size_t *out = arg;
+    /* both sides are accessed through uint64_t pointers below */
+    COMPILE_ERROR_ON(sizeof(*out) != sizeof(uint64_t));
+    COMPILE_ERROR_ON(sizeof(prov->stats.allocated_memory) != sizeof(uint64_t));
+    utils_atomic_load_acquire_u64((uint64_t *)&prov->stats.allocated_memory,
+                                  (uint64_t *)out);
+    return 0;
+}
+
+static int CTL_RUNNABLE_HANDLER(reset)(void *ctx, umf_ctl_query_source_t source,
+                                       void *arg,
+                                       umf_ctl_index_utlist_t *indexes,
+                                       const char *extra_name,
+                                       umf_ctl_query_type_t query_type) {
+    /* only ctx (the provider) is used by this handler */
+    (void)source, (void)indexes, (void)arg, (void)extra_name, (void)query_type;
+
+    os_memory_provider_t *prov = (os_memory_provider_t *)ctx;
+    size_t current;
+
+    /* the fields are accessed through uint64_t pointers below */
+    COMPILE_ERROR_ON(sizeof(current) != sizeof(uint64_t));
+    COMPILE_ERROR_ON(sizeof(prov->stats.allocated_memory) != sizeof(uint64_t));
+
+    /* "reset" restarts peak tracking from the current allocation level:
+     * peak_memory is overwritten with the present allocated_memory value */
+    utils_atomic_load_acquire_u64((uint64_t *)&prov->stats.allocated_memory,
+                                  (uint64_t *)&current);
+    utils_atomic_store_release_u64((uint64_t *)&prov->stats.peak_memory,
+                                   (uint64_t)current);
+    return 0;
+}
+static const umf_ctl_node_t CTL_NODE(stats)[] = {
+    CTL_LEAF_RO(allocated_memory), CTL_LEAF_RO(peak_memory),
+    CTL_LEAF_RUNNABLE(reset), CTL_NODE_END}; // two read-only leaves + reset
+
 static const umf_ctl_node_t CTL_NODE(params)[] = {CTL_LEAF_RO(ipc_enabled),
                                                   CTL_NODE_END};
 
 static void initialize_os_ctl(void) {
     os_memory_ctl_root = ctl_new();
     CTL_REGISTER_MODULE(os_memory_ctl_root, params);
+    CTL_REGISTER_MODULE(os_memory_ctl_root, stats); // adds the stats leaves
 }
 
 static void os_store_last_native_error(int32_t native_error, int errno_value) {
@@ -1109,6 +1176,29 @@ static umf_result_t os_alloc(void *provider, size_t size, size_t alignment,
 
     *resultPtr = addr;
 
+    COMPILE_ERROR_ON(sizeof(os_provider->stats.allocated_memory) !=
+                     sizeof(uint64_t));
+    COMPILE_ERROR_ON(sizeof(os_provider->stats.peak_memory) !=
+                     sizeof(uint64_t));
+    COMPILE_ERROR_ON(sizeof(size) != sizeof(uint64_t));
+    // TODO: Change to memory_order_relaxed when we will have a proper wrapper
+    size_t allocated =
+        utils_fetch_and_add_u64(
+            (uint64_t *)&os_provider->stats.allocated_memory, (uint64_t)size) +
+        size;
+
+    uint64_t peak;
+    utils_atomic_load_acquire_u64((uint64_t *)&os_provider->stats.peak_memory,
+                                  &peak);
+
+    while (allocated > peak && !utils_compare_exchange_u64(
+                                   (uint64_t *)&os_provider->stats.peak_memory,
+                                   &peak, (uint64_t *)&allocated)) {
+        /* If the compare-exchange fails, 'peak' is updated to the current value of peak_memory.
+       We then re-check whether allocated is still greater than the updated peak value. */
+        ;
+    }
+
     return UMF_RESULT_SUCCESS;
 
 err_unmap:
@@ -1136,6 +1226,14 @@ static umf_result_t os_free(void *provider, void *ptr, size_t size) {
         return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC;
     }
 
+    COMPILE_ERROR_ON(sizeof(size) != sizeof(uint64_t));
+    COMPILE_ERROR_ON(sizeof(os_provider->stats.allocated_memory) !=
+                     sizeof(uint64_t));
+
+    // TODO: Change it to memory_order_relaxed when we will have a proper wrapper
+    utils_fetch_and_sub_u64((uint64_t *)&os_provider->stats.allocated_memory,
+                            size);
+
     return UMF_RESULT_SUCCESS;
 }
 
diff --git a/src/provider/provider_os_memory_internal.h b/src/provider/provider_os_memory_internal.h
index 4a603b1dad..a3f35cbd3e 100644
--- a/src/provider/provider_os_memory_internal.h
+++ b/src/provider/provider_os_memory_internal.h
@@ -10,7 +10,6 @@
 
 #include <limits.h>
 #include <stdbool.h>
-
 #if defined(_WIN32) && !defined(NAME_MAX)
 #include <stdlib.h>
 #define NAME_MAX _MAX_FNAME
@@ -68,6 +67,10 @@ typedef struct os_memory_provider_t {
     size_t partitions_weight_sum;
 
     hwloc_topology_t topo;
+    struct {
+        size_t allocated_memory; // += size in os_alloc, -= size in os_free
+        size_t peak_memory;      // raised in os_alloc; set by stats reset
+    } stats;
 } os_memory_provider_t;
 
 #ifdef __cplusplus
diff --git a/src/utils/utils_concurrency.h b/src/utils/utils_concurrency.h
index 0104b86468..638c1c4262 100644
--- a/src/utils/utils_concurrency.h
+++ b/src/utils/utils_concurrency.h
@@ -120,11 +120,15 @@ static inline void utils_atomic_load_acquire_ptr(void **ptr, void **out) {
     *(uintptr_t *)out = ret;
 }
 
+static inline void utils_atomic_store_release_u64(uint64_t *ptr, uint64_t val) {
+    ASSERT_IS_ALIGNED((uintptr_t)ptr, 8);
+    InterlockedExchange64((LONG64 volatile *)ptr, val); // old value unused
+}
+
 static inline void utils_atomic_store_release_ptr(void **ptr, void *val) {
     ASSERT_IS_ALIGNED((uintptr_t)ptr, 8);
     InterlockedExchangePointer(ptr, val);
 }
-
 static inline uint64_t utils_atomic_increment_u64(uint64_t *ptr) {
     ASSERT_IS_ALIGNED((uintptr_t)ptr, 8);
     // return incremented value
@@ -183,6 +187,12 @@ static inline void utils_atomic_load_acquire_ptr(void **ptr, void **out) {
     utils_annotate_acquire(ptr);
 }
 
+static inline void utils_atomic_store_release_u64(uint64_t *ptr, uint64_t val) {
+    ASSERT_IS_ALIGNED((uintptr_t)ptr, 8); // ptr must be 8-byte aligned
+    utils_annotate_release(ptr);
+    __atomic_store_n(ptr, val, memory_order_release);
+}
+
 static inline void utils_atomic_store_release_ptr(void **ptr, void *val) {
     ASSERT_IS_ALIGNED((uintptr_t)ptr, 8);
     utils_annotate_release(ptr);