diff --git a/benchmark/benchmark.hpp b/benchmark/benchmark.hpp index a960d89bca..b096716b3c 100644 --- a/benchmark/benchmark.hpp +++ b/benchmark/benchmark.hpp @@ -173,6 +173,14 @@ class provider_allocator : public allocator_interface { return argPos; } + void preBench(::benchmark::State &state) override { + provider.preBench(state); + } + + void postBench(::benchmark::State &state) override { + provider.postBench(state); + } + void TearDown(::benchmark::State &state) override { provider.TearDown(state); } @@ -204,13 +212,18 @@ template class pool_allocator : public allocator_interface { return argPos; } + void preBench(::benchmark::State &state) override { pool.preBench(state); } + void postBench(::benchmark::State &state) override { + pool.postBench(state); + } + void TearDown(::benchmark::State &state) override { pool.TearDown(state); } - virtual void *benchAlloc(size_t size) override { + void *benchAlloc(size_t size) override { return umfPoolMalloc(pool.pool, size); } - virtual void benchFree(void *ptr, [[maybe_unused]] size_t size) override { + void benchFree(void *ptr, [[maybe_unused]] size_t size) override { umfPoolFree(pool.pool, ptr); } @@ -241,7 +254,7 @@ struct benchmark_interface : public benchmark::Fixture { allocator.TearDown(state); } - virtual void bench(::benchmark::State &state) = 0; + void bench([[maybe_unused]] ::benchmark::State &state){}; virtual std::vector argsName() { auto s = Size::argsName(); @@ -260,6 +273,9 @@ struct benchmark_interface : public benchmark::Fixture { benchmark->ArgNames(bench->argsName())->Name(bench->name()); } + void custom_counters(::benchmark::State &state) { + allocator.custom_counters(state); + } std::vector alloc_sizes; Allocator allocator; }; @@ -282,7 +298,7 @@ class multiple_malloc_free_benchmark : public benchmark_interface { vector2d allocations; std::vector iters; - + std::vector memused; vector2d next; std::vector::const_iterator> next_iter; int64_t iterations; @@ -302,6 +318,7 @@ class multiple_malloc_free_benchmark : public benchmark_interface { allocations.resize(state.threads()); next.resize(state.threads()); next_iter.resize(state.threads()); + memused.assign(state.threads(), 0); #ifndef WIN32 // Ensure that system malloc does not have memory pooled on the heap @@ -323,13 +340,36 @@ class multiple_malloc_free_benchmark : public benchmark_interface { waitForAllThreads(state); // prepare workload for actual benchmark. freeAllocs(state); + prealloc(state); prepareWorkload(state); + waitForAllThreads(state); + base::allocator.preBench(state); } void TearDown(::benchmark::State &state) override { + base::allocator.postBench(state); auto tid = state.thread_index(); + if (tid == 0) { + size_t current_memory_allocated = 0; + for (const auto &used : memused) { + current_memory_allocated += used; + } + + auto memory_used = state.counters["provider_memory_allocated"]; + + if (memory_used != 0) { + state.counters["benchmark_memory_allocated"] = + static_cast(current_memory_allocated); + state.counters["memory_overhead"] = + 100.0 * (memory_used - current_memory_allocated) / + memory_used; + } else { + state.counters.erase("provider_memory_allocated"); + } + } + waitForAllThreads(state); freeAllocs(state); waitForAllThreads(state); if (tid == 0) { @@ -342,20 +382,22 @@ class multiple_malloc_free_benchmark : public benchmark_interface { base::TearDown(state); } - void bench(benchmark::State &state) override { + void bench(benchmark::State &state) { auto tid = state.thread_index(); auto &allocation = allocations[tid]; + auto &memuse = memused[tid]; for (int i = 0; i < allocsPerIterations; i++) { auto &n = *next_iter[tid]++; auto &alloc = allocation[n.offset]; base::allocator.benchFree(alloc.ptr, alloc.size); - + memuse -= alloc.size; alloc.size = n.size; alloc.ptr = base::allocator.benchAlloc(alloc.size); if (alloc.ptr == NULL) { state.SkipWithError("allocation failed"); } + memuse += alloc.size; } } @@ -376,7 +418,9 @@ class multiple_malloc_free_benchmark : public benchmark_interface { auto tid = state.thread_index(); auto &i = allocations[tid]; i.resize(max_allocs); + auto &memuse = memused[tid]; auto sizeGenerator = base::alloc_sizes[tid]; + for (size_t j = 0; j < max_allocs; j++) { auto size = sizeGenerator.nextSize(); i[j].ptr = base::allocator.benchAlloc(size); @@ -385,6 +429,7 @@ class multiple_malloc_free_benchmark : public benchmark_interface { return; } i[j].size = size; + memuse += size; } } @@ -394,6 +439,7 @@ class multiple_malloc_free_benchmark : public benchmark_interface { for (auto &j : i) { if (j.ptr != NULL) { base::allocator.benchFree(j.ptr, j.size); + memused[tid] -= j.size; j.ptr = NULL; j.size = 0; } diff --git a/benchmark/benchmark_umf.hpp b/benchmark/benchmark_umf.hpp index cfc9982d2c..9553d6fdb5 100644 --- a/benchmark/benchmark_umf.hpp +++ b/benchmark/benchmark_umf.hpp @@ -11,8 +11,6 @@ #include #include #include - -#include #include #include @@ -30,7 +28,7 @@ struct provider_interface { using params_ptr = std::unique_ptr; umf_memory_provider_handle_t provider = NULL; - virtual void SetUp(::benchmark::State &state) { + void SetUp(::benchmark::State &state) { if (state.thread_index() != 0) { return; } @@ -42,7 +40,27 @@ struct provider_interface { } } - virtual void TearDown([[maybe_unused]] ::benchmark::State &state) { + void preBench([[maybe_unused]] ::benchmark::State &state) { + if (state.thread_index() != 0) { + return; + } + umfCtlExec("umf.provider.by_handle.stats.reset", provider, NULL); + } + + void postBench([[maybe_unused]] ::benchmark::State &state) { + if (state.thread_index() != 0) { + return; + } + size_t arg; + umf_result_t ret = umfCtlGet( + "umf.provider.by_handle.stats.allocated_memory", provider, &arg); + if (ret == UMF_RESULT_SUCCESS) { + state.counters["provider_memory_allocated"] = + static_cast(arg); + } + } + + void TearDown([[maybe_unused]] ::benchmark::State &state) { if (state.thread_index() != 0) { return; } @@ -53,9 +71,7 @@ struct provider_interface { } virtual umf_memory_provider_ops_t * - getOps([[maybe_unused]] ::benchmark::State &state) { - return nullptr; - } + getOps([[maybe_unused]] ::benchmark::State &state) = 0; virtual params_ptr getParams([[maybe_unused]] ::benchmark::State &state) { return {nullptr, [](void *) {}}; @@ -68,7 +84,7 @@ template ; - virtual void SetUp(::benchmark::State &state) { + void SetUp(::benchmark::State &state) { provider.SetUp(state); if (state.thread_index() != 0) { return; @@ -80,7 +96,22 @@ struct pool_interface { state.SkipWithError("umfPoolCreate() failed"); } } - virtual void TearDown([[maybe_unused]] ::benchmark::State &state) { + + void preBench([[maybe_unused]] ::benchmark::State &state) { + provider.preBench(state); + if (state.thread_index() != 0) { + return; + } + } + + void postBench([[maybe_unused]] ::benchmark::State &state) { + provider.postBench(state); + if (state.thread_index() != 0) { + return; + } + } + + void TearDown([[maybe_unused]] ::benchmark::State &state) { if (state.thread_index() != 0) { return; } @@ -93,15 +124,17 @@ struct pool_interface { if (pool) { umfPoolDestroy(pool); } + + provider.TearDown(state); }; virtual umf_memory_pool_ops_t * - getOps([[maybe_unused]] ::benchmark::State &state) { - return nullptr; - } + getOps([[maybe_unused]] ::benchmark::State &state) = 0; + virtual params_ptr getParams([[maybe_unused]] ::benchmark::State &state) { return {nullptr, [](void *) {}}; } + T provider; umf_memory_pool_handle_t pool; }; @@ -110,6 +143,8 @@ class allocator_interface { public: virtual unsigned SetUp([[maybe_unused]] ::benchmark::State &state, [[maybe_unused]] unsigned argPos) = 0; + virtual void preBench([[maybe_unused]] ::benchmark::State &state) = 0; + virtual void postBench([[maybe_unused]] ::benchmark::State &state) = 0; virtual void TearDown([[maybe_unused]] ::benchmark::State &state) = 0; virtual void *benchAlloc(size_t size) = 0; virtual void benchFree(void *ptr, [[maybe_unused]] size_t size) = 0; @@ -121,7 +156,9 @@ struct glibc_malloc : public allocator_interface { unsigned argPos) override { return argPos; } - void TearDown([[maybe_unused]] ::benchmark::State &state) override{}; + void preBench([[maybe_unused]] ::benchmark::State &state) override {} + void postBench([[maybe_unused]] ::benchmark::State &state) override {} + void TearDown([[maybe_unused]] ::benchmark::State &state) override {} void *benchAlloc(size_t size) override { return malloc(size); } void benchFree(void *ptr, [[maybe_unused]] size_t size) override { free(ptr); @@ -163,7 +200,7 @@ struct fixed_provider : public provider_interface { char *mem = NULL; const size_t size = 1024 * 1024 * 1024; // 1GB public: - virtual void SetUp(::benchmark::State &state) override { + void SetUp(::benchmark::State &state) { if (state.thread_index() != 0) { return; } @@ -175,7 +212,7 @@ struct fixed_provider : public provider_interface { provider_interface::SetUp(state); } - virtual void TearDown(::benchmark::State &state) override { + void TearDown(::benchmark::State &state) { if (state.thread_index() != 0) { return; } @@ -295,7 +332,7 @@ struct jemalloc_pool : public pool_interface { #ifdef UMF_POOL_SCALABLE_ENABLED template struct scalable_pool : public pool_interface { - virtual umf_memory_pool_ops_t * + umf_memory_pool_ops_t * getOps([[maybe_unused]] ::benchmark::State &state) override { return umfScalablePoolOps(); } diff --git a/src/provider/provider_os_memory.c b/src/provider/provider_os_memory.c index f3e5c7fa02..1ecb397fe8 100644 --- a/src/provider/provider_os_memory.c +++ b/src/provider/provider_os_memory.c @@ -6,19 +6,21 @@ */ #include +#include #include #include + #include #include #include #include - -#include #include #include #include #include #include + +#include "utils_assert.h" // OS Memory Provider requires HWLOC #if defined(UMF_NO_HWLOC) @@ -187,12 +189,77 @@ static int CTL_READ_HANDLER(ipc_enabled)(void *ctx, return 0; } +static int CTL_READ_HANDLER(peak_memory)(void *ctx, + umf_ctl_query_source_t source, + void *arg, + umf_ctl_index_utlist_t *indexes, + const char *extra_name, + umf_ctl_query_type_t query_type) { + /* suppress unused-parameter errors */ + (void)source, (void)indexes, (void)ctx, (void)extra_name, (void)query_type; + + size_t *arg_out = arg; + os_memory_provider_t *os_provider = (os_memory_provider_t *)ctx; + COMPILE_ERROR_ON(sizeof(os_provider->stats.peak_memory) != + sizeof(uint64_t)); + utils_atomic_load_acquire_u64((uint64_t *)&os_provider->stats.peak_memory, + (uint64_t *)arg_out); + return 0; +} + +static int CTL_READ_HANDLER(allocated_memory)(void *ctx, + umf_ctl_query_source_t source, + void *arg, + umf_ctl_index_utlist_t *indexes, + const char *extra_name, + umf_ctl_query_type_t query_type) { + /* suppress unused-parameter errors */ + (void)source, (void)indexes, (void)ctx, (void)extra_name, (void)query_type; + + size_t *arg_out = arg; + os_memory_provider_t *os_provider = (os_memory_provider_t *)ctx; + COMPILE_ERROR_ON(sizeof(os_provider->stats.allocated_memory) != + sizeof(uint64_t)); + COMPILE_ERROR_ON(sizeof(*arg_out) != sizeof(uint64_t)); + utils_atomic_load_acquire_u64( + (uint64_t *)&os_provider->stats.allocated_memory, (uint64_t *)arg_out); + return 0; +} + +static int CTL_RUNNABLE_HANDLER(reset)(void *ctx, umf_ctl_query_source_t source, + void *arg, + umf_ctl_index_utlist_t *indexes, + const char *extra_name, + umf_ctl_query_type_t query_type) { + /* suppress unused-parameter errors */ + (void)source, (void)indexes, (void)arg, (void)extra_name, (void)query_type; + + os_memory_provider_t *os_provider = (os_memory_provider_t *)ctx; + size_t allocated; + + COMPILE_ERROR_ON(sizeof(os_provider->stats.allocated_memory) != + sizeof(uint64_t)); + COMPILE_ERROR_ON(sizeof(allocated) != sizeof(uint64_t)); + + utils_atomic_load_acquire_u64( + (uint64_t *)&os_provider->stats.allocated_memory, + (uint64_t *)&allocated); + utils_atomic_store_release_u64((uint64_t *)&os_provider->stats.peak_memory, + (uint64_t)allocated); + + return 0; +} +static const umf_ctl_node_t CTL_NODE(stats)[] = { + CTL_LEAF_RO(allocated_memory), CTL_LEAF_RO(peak_memory), + CTL_LEAF_RUNNABLE(reset), CTL_NODE_END}; + static const umf_ctl_node_t CTL_NODE(params)[] = {CTL_LEAF_RO(ipc_enabled), CTL_NODE_END}; static void initialize_os_ctl(void) { os_memory_ctl_root = ctl_new(); CTL_REGISTER_MODULE(os_memory_ctl_root, params); + CTL_REGISTER_MODULE(os_memory_ctl_root, stats); } static void os_store_last_native_error(int32_t native_error, int errno_value) { @@ -1109,6 +1176,29 @@ static umf_result_t os_alloc(void *provider, size_t size, size_t alignment, *resultPtr = addr; + COMPILE_ERROR_ON(sizeof(os_provider->stats.allocated_memory) != + sizeof(uint64_t)); + COMPILE_ERROR_ON(sizeof(os_provider->stats.peak_memory) != + sizeof(uint64_t)); + COMPILE_ERROR_ON(sizeof(size) != sizeof(uint64_t)); + // TODO: Change to memory_order_relaxed when we will have a proper wrapper + size_t allocated = + utils_fetch_and_add_u64( + (uint64_t *)&os_provider->stats.allocated_memory, (uint64_t)size) + + size; + + uint64_t peak; + utils_atomic_load_acquire_u64((uint64_t *)&os_provider->stats.peak_memory, + &peak); + + while (allocated > peak && !utils_compare_exchange_u64( + (uint64_t *)&os_provider->stats.peak_memory, + &peak, (uint64_t *)&allocated)) { + /* If the compare-exchange fails, 'peak' is updated to the current value of peak_memory. + We then re-check whether allocated is still greater than the updated peak value. */ + ; + } + return UMF_RESULT_SUCCESS; err_unmap: @@ -1136,6 +1226,14 @@ static umf_result_t os_free(void *provider, void *ptr, size_t size) { return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; } + COMPILE_ERROR_ON(sizeof(size) != sizeof(uint64_t)); + COMPILE_ERROR_ON(sizeof(os_provider->stats.allocated_memory) != + sizeof(uint64_t)); + + // TODO: Change it to memory_order_relaxed when we will have a proper wrapper + utils_fetch_and_sub_u64((uint64_t *)&os_provider->stats.allocated_memory, + size); + return UMF_RESULT_SUCCESS; } diff --git a/src/provider/provider_os_memory_internal.h b/src/provider/provider_os_memory_internal.h index 4a603b1dad..a3f35cbd3e 100644 --- a/src/provider/provider_os_memory_internal.h +++ b/src/provider/provider_os_memory_internal.h @@ -10,7 +10,6 @@ #include #include - #if defined(_WIN32) && !defined(NAME_MAX) #include #define NAME_MAX _MAX_FNAME @@ -68,6 +67,10 @@ typedef struct os_memory_provider_t { size_t partitions_weight_sum; hwloc_topology_t topo; + struct { + size_t allocated_memory; + size_t peak_memory; + } stats; } os_memory_provider_t; #ifdef __cplusplus diff --git a/src/utils/utils_concurrency.h b/src/utils/utils_concurrency.h index 0104b86468..638c1c4262 100644 --- a/src/utils/utils_concurrency.h +++ b/src/utils/utils_concurrency.h @@ -120,11 +120,15 @@ static inline void utils_atomic_load_acquire_ptr(void **ptr, void **out) { *(uintptr_t *)out = ret; } +static inline void utils_atomic_store_release_u64(uint64_t *ptr, uint64_t val) { + ASSERT_IS_ALIGNED((uintptr_t)ptr, 8); + InterlockedExchange64((LONG64 volatile *)ptr, val); +} + static inline void utils_atomic_store_release_ptr(void **ptr, void *val) { ASSERT_IS_ALIGNED((uintptr_t)ptr, 8); InterlockedExchangePointer(ptr, val); } - static inline uint64_t utils_atomic_increment_u64(uint64_t *ptr) { ASSERT_IS_ALIGNED((uintptr_t)ptr, 8); // return incremented value @@ -183,6 +187,12 @@ static inline void utils_atomic_load_acquire_ptr(void **ptr, void **out) { utils_annotate_acquire(ptr); } +static inline void utils_atomic_store_release_u64(uint64_t *ptr, uint64_t val) { + ASSERT_IS_ALIGNED((uintptr_t)ptr, 8); + utils_annotate_release(ptr); + __atomic_store_n(ptr, val, memory_order_release); +} + static inline void utils_atomic_store_release_ptr(void **ptr, void *val) { ASSERT_IS_ALIGNED((uintptr_t)ptr, 8); utils_annotate_release(ptr);