diff --git a/.github/workflows/reusable_gpu.yml b/.github/workflows/reusable_gpu.yml index a09f43e6d9..739aab9e18 100644 --- a/.github/workflows/reusable_gpu.yml +++ b/.github/workflows/reusable_gpu.yml @@ -112,6 +112,7 @@ jobs: run: ctest --output-on-failure --test-dir examples -C ${{matrix.build_type}} - name: Run benchmarks + if: matrix.build_type == 'Release' working-directory: ${{env.BUILD_DIR}} run: ctest --output-on-failure --test-dir benchmark -C ${{matrix.build_type}} --exclude-regex umf-bench-multithreaded diff --git a/CMakeLists.txt b/CMakeLists.txt index cc3a24e5fe..4dcc293d2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -320,7 +320,7 @@ endif() # compiler is required. Moreover, if these options are not set, CMake will set # up a strict C build, without C++ support. set(OPTIONS_REQUIRING_CXX "UMF_BUILD_TESTS" "UMF_BUILD_LIBUMF_POOL_DISJOINT" - "UMF_BUILD_BENCHMARKS_MT") + "UMF_BUILD_BENCHMARKS_MT" "UMF_BUILD_BENCHMARKS") foreach(option_name ${OPTIONS_REQUIRING_CXX}) if(${option_name}) enable_language(CXX) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index aaf50c1c0f..5605519ee2 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,7 +1,24 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +include(FetchContent) +FetchContent_Declare( + googlebenchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.9.0) + +set(BENCHMARK_ENABLE_GTEST_TESTS + OFF + CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_TESTING + OFF + CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_INSTALL + OFF + CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googlebenchmark) + # In MSVC builds, there is no way to determine the actual build type during the # CMake configuration step. Therefore, this message is printed in all MSVC # builds. 
@@ -32,7 +49,7 @@ function(add_umf_benchmark) "${multiValueArgs}" ${ARGN}) - set(BENCH_NAME umf-bench-${ARG_NAME}) + set(BENCH_NAME umf-${ARG_NAME}) set(BENCH_LIBS ${ARG_LIBS} umf) @@ -52,13 +69,17 @@ function(add_umf_benchmark) COMMAND ${BENCH_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - # Benchmark passes if it prints "PASSED" in the output, because ubench of - # scalable pool fails if the confidence interval exceeds maximum permitted - # 2.5%. - set_tests_properties( - ${BENCH_NAME} PROPERTIES - LABELS "benchmark" - PASS_REGULAR_EXPRESSION "PASSED") + if("${BENCH_NAME}" STREQUAL "umf-ubench") + # Benchmark passes if it prints "PASSED" in the output, because ubench + # of scalable pool fails if the confidence interval exceeds maximum + # permitted 2.5%. + set_tests_properties( + ${BENCH_NAME} PROPERTIES + LABELS "benchmark" + PASS_REGULAR_EXPRESSION "PASSED") + else() + set_tests_properties(${BENCH_NAME} PROPERTIES LABELS "benchmark") + endif() if(WINDOWS) # append PATH to DLLs @@ -68,11 +89,11 @@ function(add_umf_benchmark) if(UMF_BUILD_LIBUMF_POOL_DISJOINT) target_compile_definitions(${BENCH_NAME} - PRIVATE UMF_BUILD_LIBUMF_POOL_DISJOINT=1) + PRIVATE UMF_POOL_DISJOINT_ENABLED=1) endif() - if(UMF_BUILD_LIBUMF_POOL_JEMALLOC) + if(UMF_POOL_JEMALLOC_ENABLED) target_compile_definitions(${BENCH_NAME} - PRIVATE UMF_BUILD_LIBUMF_POOL_JEMALLOC=1) + PRIVATE UMF_POOL_JEMALLOC_ENABLED=1) endif() if(UMF_POOL_SCALABLE_ENABLED) target_compile_definitions(${BENCH_NAME} @@ -80,7 +101,7 @@ function(add_umf_benchmark) endif() if(UMF_BUILD_LEVEL_ZERO_PROVIDER) target_compile_definitions(${BENCH_NAME} - PRIVATE UMF_BUILD_LEVEL_ZERO_PROVIDER=1) + PRIVATE UMF_PROVIDER_LEVEL_ZERO_ENABLED=1) target_include_directories( ${BENCH_NAME} PRIVATE ${UMF_CMAKE_SOURCE_DIR}/test/common ${LEVEL_ZERO_INCLUDE_DIRS}) @@ -121,6 +142,12 @@ add_umf_benchmark( LIBS ${LIBS_OPTIONAL} LIBDIRS ${LIB_DIRS}) +add_umf_benchmark( + NAME benchmark + SRCS benchmark.cpp + LIBS ${LIBS_OPTIONAL} 
benchmark::benchmark + LIBDIRS ${LIB_DIRS}) + if(UMF_BUILD_BENCHMARKS_MT) add_umf_benchmark( NAME multithreaded diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp new file mode 100644 index 0000000000..c10bbda877 --- /dev/null +++ b/benchmark/benchmark.cpp @@ -0,0 +1,362 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +#include +#include +#ifdef UMF_POOL_SCALABLE_ENABLED +#include +#endif +#include + +#ifdef UMF_POOL_DISJOINT_ENABLED +#include +#endif + +#ifdef UMF_POOL_JEMALLOC_ENABLED +#include +#endif + +#include "benchmark.hpp" + +struct glibc_malloc : public allocator_interface { + unsigned SetUp([[maybe_unused]] ::benchmark::State &state, + unsigned argPos) override { + return argPos; + } + void TearDown([[maybe_unused]] ::benchmark::State &state) override{}; + void *benchAlloc(size_t size) override { return malloc(size); } + void benchFree(void *ptr, [[maybe_unused]] size_t size) override { + free(ptr); + } + static std::string name() { return "glibc"; } +}; + +struct os_provider : public provider_interface { + umf_os_memory_provider_params_handle_t params = NULL; + os_provider() { + umfOsMemoryProviderParamsCreate(¶ms); + return; + } + + ~os_provider() { + if (params != NULL) { + umfOsMemoryProviderParamsDestroy(params); + } + } + + void *getParams() override { return params; } + umf_memory_provider_ops_t *getOps() override { + return umfOsMemoryProviderOps(); + } + static std::string name() { return "os_provider"; } +}; + +template +struct proxy_pool : public pool_interface { + umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) override { + return umfProxyPoolOps(); + } + void *getParams([[maybe_unused]] ::benchmark::State &state) override { + return nullptr; + } + static std::string name() { return "proxy_pool<" + Provider::name() + ">"; } +}; + +#ifdef 
UMF_POOL_DISJOINT_ENABLED +template +struct disjoint_pool : public pool_interface { + umf_disjoint_pool_params_handle_t disjoint_memory_pool_params; + + disjoint_pool() { + disjoint_memory_pool_params = NULL; + auto ret = umfDisjointPoolParamsCreate(&disjoint_memory_pool_params); + if (ret != UMF_RESULT_SUCCESS) { + return; + } + + // those function should never fail, so error handling is minimal. + ret = umfDisjointPoolParamsSetSlabMinSize(disjoint_memory_pool_params, + 4096); + if (ret != UMF_RESULT_SUCCESS) { + goto err; + } + + ret = umfDisjointPoolParamsSetCapacity(disjoint_memory_pool_params, 4); + if (ret != UMF_RESULT_SUCCESS) { + goto err; + } + + ret = umfDisjointPoolParamsSetMinBucketSize(disjoint_memory_pool_params, + 4096); + if (ret != UMF_RESULT_SUCCESS) { + goto err; + } + + ret = umfDisjointPoolParamsSetMaxPoolableSize( + disjoint_memory_pool_params, 4096 * 16); + + if (ret != UMF_RESULT_SUCCESS) { + goto err; + } + return; + err: + + umfDisjointPoolParamsDestroy(disjoint_memory_pool_params); + disjoint_memory_pool_params = NULL; + } + + ~disjoint_pool() { + if (disjoint_memory_pool_params != NULL) { + umfDisjointPoolParamsDestroy(disjoint_memory_pool_params); + } + } + + umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) override { + return umfDisjointPoolOps(); + } + void *getParams([[maybe_unused]] ::benchmark::State &state) override { + + if (disjoint_memory_pool_params == NULL) { + state.SkipWithError("Failed to create disjoint pool params"); + } + + return disjoint_memory_pool_params; + } + static std::string name() { + return "disjoint_pool<" + Provider::name() + ">"; + } +}; +#endif + +#ifdef UMF_POOL_JEMALLOC_ENABLED +template +struct jemalloc_pool : public pool_interface { + umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) override { + return umfJemallocPoolOps(); + } + void *getParams([[maybe_unused]] ::benchmark::State &state) override { + return NULL; + } + static std::string name() 
{ + return "jemalloc_pool<" + Provider::name() + ">"; + } +}; +#endif + +#ifdef UMF_POOL_SCALABLE_ENABLED +template +struct scalable_pool : public pool_interface { + virtual umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) override { + return umfScalablePoolOps(); + } + virtual void * + getParams([[maybe_unused]] ::benchmark::State &state) override { + return NULL; + } + static std::string name() { + return "scalable_pool<" + Provider::name() + ">"; + } +}; +#endif +// Benchmarks scenarios: + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, glibc_fix, fixed_alloc_size, + glibc_malloc); + +// The benchmark arguments specified in Args() are, in order: +// benchmark arguments, allocator arguments, size generator arguments. +// The exact meaning of each argument depends on the benchmark, allocator, and size components used. +// Refer to the 'argsName()' function in each component to find detailed descriptions of these arguments. +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, glibc_fix) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, glibc_uniform, + uniform_alloc_size, glibc_malloc); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, glibc_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, os_provider, fixed_alloc_size, + provider_allocator); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, os_provider) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, proxy_pool, fixed_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, proxy_pool) + ->Args({1000, 0, 4096}) + ->Args({1000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +#ifdef UMF_POOL_DISJOINT_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, disjoint_pool_fix, + fixed_alloc_size, + pool_allocator>); 
+UMF_BENCHMARK_REGISTER_F(alloc_benchmark, disjoint_pool_fix) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +// TODO: debug why this crashes +/*UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, disjoint_pool_uniform, + uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, disjoint_pool_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + // ->Threads(4) + ->Threads(1); +*/ +#endif + +#ifdef UMF_POOL_JEMALLOC_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, jemalloc_pool_fix, + fixed_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, jemalloc_pool_fix) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, jemalloc_pool_uniform, + uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, jemalloc_pool_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +#endif +#ifdef UMF_POOL_SCALABLE_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, scalable_pool_fix, + fixed_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, scalable_pool_fix) + ->Args({10000, 0, 4096}) + ->Args({10000, 100000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(alloc_benchmark, scalable_pool_uniform, + uniform_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(alloc_benchmark, scalable_pool_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); +#endif +// Multiple allocs/free + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, glibc_fix, + fixed_alloc_size, glibc_malloc); + +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, glibc_fix) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, glibc_uniform, + uniform_alloc_size, glibc_malloc); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, 
glibc_uniform) + ->Args({10000, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, proxy_pool, + fixed_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, proxy_pool) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, os_provider, + fixed_alloc_size, + provider_allocator); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, os_provider) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +#ifdef UMF_POOL_DISJOINT_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, disjoint_pool_fix, + fixed_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_fix) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +// TODO: debug why this crashes +/*UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, + disjoint_pool_uniform, uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_uniform) + ->Args({10000, 0, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); +*/ +#endif + +#ifdef UMF_POOL_JEMALLOC_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, jemalloc_pool_fix, + fixed_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, jemalloc_pool_fix) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, + jemalloc_pool_uniform, uniform_alloc_size, + pool_allocator>); +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, jemalloc_pool_uniform) + ->Args({1000, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +#endif + +#ifdef UMF_POOL_SCALABLE_ENABLED +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, scalable_pool_fix, + fixed_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, 
scalable_pool_fix) + ->Args({10000, 4096}) + ->Threads(4) + ->Threads(1); + +UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, + scalable_pool_uniform, uniform_alloc_size, + pool_allocator>); + +UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, scalable_pool_uniform) + ->Args({10000, 8, 64 * 1024, 8}) + ->Threads(4) + ->Threads(1); + +#endif +BENCHMARK_MAIN(); diff --git a/benchmark/benchmark.hpp b/benchmark/benchmark.hpp new file mode 100644 index 0000000000..ead6b39e75 --- /dev/null +++ b/benchmark/benchmark.hpp @@ -0,0 +1,382 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +/* + * This file defines a benchmarking framework for evaluating memory allocation + * and deallocation performance using the Unified Memory Framework (UMF). The + * design is modular and extensible, allowing for flexible benchmarking of different + * allocation strategies, size distributions, and memory providers. + * + * **Key Design Features:** + * - **Modular Components**: The framework is built using interfaces and templates, + * which allows for easy extension and customization of allocation strategies, + * size distributions, and memory providers. + * - **Flexible Allocation Size Generators**: Includes classes like `fixed_alloc_size` + * and `uniform_alloc_size` that generate allocation sizes based on different + * strategies. These classes implement the `alloc_size_interface`. + * - **Abstract Allocator Interface**: The `allocator_interface` defines the basic + * methods for memory allocation and deallocation. Concrete allocators like + * `provider_allocator` and `pool_allocator` implement this interface to work + * with different memory providers and pools. 
+ * - **Benchmarking Classes**: Classes like `alloc_benchmark` and `multiple_malloc_free_benchmark` + * templates the allocation size generator and allocator to perform benchmarks. + * It manages the setup, execution, and teardown of the benchmark. + * - **Threaded Execution Support**: The benchmarks support multi-threaded execution + * by maintaining thread-specific allocation data and synchronization. + * + * **Component Interactions:** + * - **Size Generators and Allocators**: The `alloc_benchmark` class uses a size + * generator (e.g., `fixed_alloc_size` or `uniform_alloc_size`) to determine the + * sizes of memory allocations, and an allocator (e.g., `provider_allocator` or + * `pool_allocator`) to perform the actual memory operations. + * - **Benchmark Execution**: During the benchmark, `alloc_benchmark` repeatedly + * calls the `bench` method, which performs allocations and deallocations using + * the allocator and size generator. + * - **Allocator Adapters**: The `provider_allocator` and `pool_allocator` adapt + * specific memory providers and pools to the `allocator_interface`, allowing + * them to be used interchangeably in the benchmark classes. This abstraction + * enables benchmarking different memory management strategies without changing + * the core benchmarking logic. + * - **Pre-allocations and Iterations**: The `alloc_benchmark` can perform a set + * number of pre-allocations before the benchmark starts, and manages allocation + * and deallocation cycles to simulate memory pressure and fragmentation. + * - **Derived Benchmarks**: `multiple_malloc_free_benchmark` extends + * `alloc_benchmark` to perform multiple random deallocations and reallocations + * in each iteration, using a uniform distribution to select which allocations + * to free and reallocate. This models workloads with frequent memory churn. + * + * **Execution Flow:** + * 1. **Setup Phase**: + * - The benchmark class initializes the size generator and allocator. 
+ * - Pre-allocations are performed if specified. + * - Thread-specific data structures for allocations are prepared. + * 2. **Benchmark Loop**: + * - For each iteration, the `bench` method is called. + * - The size generator provides the next allocation size. + * - The allocator performs the allocation. + * - Allocations are tracked per thread. + * 3. **Teardown Phase**: + * - All remaining allocations are freed. + * - Allocator and size generator are cleaned up. + * + * **Customization and Extension:** + * - New size generators can be created by implementing the `alloc_size_interface`. + * - New allocators can be adapted by implementing the `allocator_interface`. + * - Additional benchmarking scenarios can be created by extending `benchmark_interface`. + */ + +#include +#include +#include +#include + +#include "benchmark_interfaces.hpp" + +struct alloc_data { + void *ptr; + size_t size; +}; + +#define UMF_BENCHMARK_TEMPLATE_DEFINE(BaseClass, Method, ...) \ + BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, __VA_ARGS__) \ + (benchmark::State & state) { \ + for (auto _ : state) { \ + bench(state); \ + } \ + } + +#define UMF_BENCHMARK_REGISTER_F(BaseClass, Method) \ + BENCHMARK_REGISTER_F(BaseClass, Method) \ + ->ArgNames( \ + BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::argsName()) \ + ->Name(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::name()) \ + ->MinWarmUpTime(1) + +class fixed_alloc_size : public alloc_size_interface { + public: + unsigned SetUp(::benchmark::State &state, unsigned argPos) override { + size = state.range(argPos); + return argPos + 1; + } + void TearDown([[maybe_unused]] ::benchmark::State &state) override {} + size_t nextSize() override { return size; }; + static std::vector argsName() { return {"size"}; } + + private: + size_t size; +}; + +class uniform_alloc_size : public alloc_size_interface { + using distribution = std::uniform_int_distribution; + + public: + unsigned SetUp(::benchmark::State &state, unsigned argPos) override { + 
auto min = state.range(argPos++); + auto max = state.range(argPos++); + auto gran = state.range(argPos++); + if (min % gran != 0 || max % gran != 0) { + state.SkipWithError("min and max must be divisible by granularity"); + return argPos; + } + + dist.param(distribution::param_type(min / gran, max / gran)); + multiplier = gran; + return argPos; + } + void TearDown([[maybe_unused]] ::benchmark::State &state) override {} + size_t nextSize() override { return dist(generator) * multiplier; } + static std::vector argsName() { + return {"min size", "max size", "granularity"}; + } + + private: + std::default_random_engine generator; + distribution dist; + size_t multiplier; +}; + +// This class benchmarks speed of alloc() operations. +template < + typename Size, typename Alloc, + typename = + std::enable_if_t::value>, + typename = + std::enable_if_t::value>> +class alloc_benchmark : public benchmark_interface { + public: + size_t max_allocs = 1000; + size_t pre_allocs = 0; + void SetUp(::benchmark::State &state) override { + if (state.thread_index() != 0) { + return; + } + + // unpack arguments + int argPos = 0; + max_allocs = state.range(argPos++); + pre_allocs = state.range(argPos++); + // pass rest of the arguments to "alloc_size" and "allocator" + argPos = base::alloc_size.SetUp(state, argPos); + base::allocator.SetUp(state, argPos); + + // initialize allocations tracking vectors (one per thread) + // and iterators for these vectors. + allocations.resize(state.threads()); + iters.resize(state.threads()); + + for (auto &i : iters) { + i = pre_allocs; + } + + // do "pre_alloc" allocations before actual benchmark. 
+ for (auto &i : allocations) { + i.resize(max_allocs + pre_allocs); + + for (size_t j = 0; j < pre_allocs; j++) { + i[j].ptr = + base::allocator.benchAlloc(base::alloc_size.nextSize()); + if (i[j].ptr == NULL) { + state.SkipWithError("preallocation failed"); + return; + } + i[j].size = base::alloc_size.nextSize(); + } + } + } + + void TearDown(::benchmark::State &state) override { + if (state.thread_index() != 0) { + return; + } + for (auto &i : allocations) { + for (auto &j : i) { + if (j.ptr != NULL) { + base::allocator.benchFree(j.ptr, j.size); + j.ptr = NULL; + j.size = 0; + } + } + } + + base::TearDown(state); + } + + void bench(benchmark::State &state) override { + auto tid = state.thread_index(); + auto s = base::alloc_size.nextSize(); + auto &i = iters[tid]; + allocations[tid][i].ptr = base::allocator.benchAlloc(s); + if (allocations[tid][i].ptr == NULL) { + state.SkipWithError("allocation failed"); + return; + } + allocations[tid][i].size = s; + i++; + if (i >= max_allocs + pre_allocs) { + // This benchmark tests only allocations - + // if allocation tracker is full we pause benchmark to dealloc all allocations - + // excluding pre-allocated ones. 
+ state.PauseTiming(); + while (i > pre_allocs) { + auto &allocation = allocations[tid][--i]; + base::allocator.benchFree(allocation.ptr, allocation.size); + allocation.ptr = NULL; + allocation.size = 0; + } + state.ResumeTiming(); + } + } + static std::vector argsName() { + auto n = benchmark_interface::argsName(); + std::vector res = {"max_allocs", "pre_allocs"}; + res.insert(res.end(), n.begin(), n.end()); + return res; + } + static std::string name() { return base::name() + "/alloc"; } + + protected: + using base = benchmark_interface; + std::vector> allocations; + std::vector iters; +}; + +// This class benchmarks performance of random deallocations and (re)allocations +template < + typename Size, typename Alloc, + typename = + std::enable_if_t::value>, + typename = + std::enable_if_t::value>> +class multiple_malloc_free_benchmark : public alloc_benchmark { + using distribution = std::uniform_int_distribution; + using base = alloc_benchmark; + + public: + int reallocs = 100; + void SetUp(::benchmark::State &state) override { + if (state.thread_index() != 0) { + return; + } + // unpack arguments + int argPos = 0; + base::max_allocs = state.range(argPos++); + + // pass rest of the arguments to "alloc_size" and "allocator" + argPos = base::alloc_size.SetUp(state, argPos); + base::allocator.SetUp(state, argPos); + + // perform initial allocations which will be later freed and reallocated + base::allocations.resize(state.threads()); + for (auto &i : base::allocations) { + i.resize(base::max_allocs); + + for (size_t j = 0; j < base::max_allocs; j++) { + i[j].ptr = + base::allocator.benchAlloc(base::alloc_size.nextSize()); + if (i[j].ptr == NULL) { + state.SkipWithError("preallocation failed"); + return; + } + i[j].size = base::alloc_size.nextSize(); + } + } + dist.param(distribution::param_type(0, base::max_allocs - 1)); + } + + void bench(benchmark::State &state) override { + auto tid = state.thread_index(); + auto &allocation = base::allocations[tid]; + 
std::vector to_alloc; + for (int j = 0; j < reallocs; j++) { + auto idx = dist(generator); + if (allocation[idx].ptr == NULL) { + continue; + } + to_alloc.push_back(idx); + + base::allocator.benchFree(allocation[idx].ptr, + allocation[idx].size); + allocation[idx].ptr = NULL; + allocation[idx].size = 0; + } + + for (auto idx : to_alloc) { + auto s = base::alloc_size.nextSize(); + allocation[idx].ptr = base::allocator.benchAlloc(s); + if (allocation[idx].ptr == NULL) { + state.SkipWithError("allocation failed"); + } + allocation[idx].size = s; + } + } + + static std::string name() { + return base::base::name() + "/multiple_malloc_free"; + } + static std::vector argsName() { + auto n = benchmark_interface::argsName(); + std::vector res = {"max_allocs"}; + res.insert(res.end(), n.begin(), n.end()); + return res; + } + std::default_random_engine generator; + distribution dist; +}; + +template ::value>> +class provider_allocator : public allocator_interface { + public: + unsigned SetUp(::benchmark::State &state, unsigned r) override { + provider.SetUp(state); + return r; + } + + void TearDown(::benchmark::State &state) override { + provider.TearDown(state); + } + + void *benchAlloc(size_t size) override { + void *ptr; + if (umfMemoryProviderAlloc(provider.provider, size, 0, &ptr) != + UMF_RESULT_SUCCESS) { + return NULL; + } + return ptr; + } + void benchFree(void *ptr, size_t size) override { + umfMemoryProviderFree(provider.provider, ptr, size); + } + static std::string name() { return Provider::name(); } + + private: + Provider provider; +}; + +// TODO: assert Pool to be a pool_interface. 
+template class pool_allocator : public allocator_interface { + public: + unsigned SetUp(::benchmark::State &state, unsigned r) override { + pool.SetUp(state); + return r; + } + + void TearDown(::benchmark::State &state) override { pool.TearDown(state); } + + virtual void *benchAlloc(size_t size) override { + return umfPoolMalloc(pool.pool, size); + } + virtual void benchFree(void *ptr, [[maybe_unused]] size_t size) override { + umfPoolFree(pool.pool, ptr); + } + + static std::string name() { return Pool::name(); } + + private: + Pool pool; +}; diff --git a/benchmark/benchmark_interfaces.hpp b/benchmark/benchmark_interfaces.hpp new file mode 100644 index 0000000000..8681160626 --- /dev/null +++ b/benchmark/benchmark_interfaces.hpp @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +#include +#include +#include + +#include +#include +#include + +class alloc_size_interface { + public: + virtual unsigned SetUp([[maybe_unused]] ::benchmark::State &state, + [[maybe_unused]] unsigned argPos) = 0; + virtual void TearDown([[maybe_unused]] ::benchmark::State &state) = 0; + virtual size_t nextSize() = 0; + static std::vector argsName() { return {""}; }; +}; + +class allocator_interface { + public: + virtual unsigned SetUp([[maybe_unused]] ::benchmark::State &state, + [[maybe_unused]] unsigned argPos) = 0; + virtual void TearDown([[maybe_unused]] ::benchmark::State &state) = 0; + virtual void *benchAlloc(size_t size) = 0; + virtual void benchFree(void *ptr, [[maybe_unused]] size_t size) = 0; + static std::vector argsName() { return {}; } +}; + +template +struct benchmark_interface : public benchmark::Fixture { + void SetUp(::benchmark::State &state) { + int argPos = alloc_size.SetUp(state, 0); + allocator.SetUp(state, argPos); + } + void TearDown(::benchmark::State &state) { + alloc_size.TearDown(state); + 
allocator.TearDown(state); + } + + virtual void bench(::benchmark::State &state) = 0; + + static std::vector argsName() { + auto s = Size::argsName(); + auto a = Allocator::argsName(); + std::vector res = {}; + res.insert(res.end(), s.begin(), s.end()); + res.insert(res.end(), a.begin(), a.end()); + return res; + } + static std::string name() { return Allocator::name(); } + + Size alloc_size; + Allocator allocator; +}; + +struct provider_interface { + umf_memory_provider_handle_t provider = NULL; + virtual void SetUp(::benchmark::State &state) { + if (state.thread_index() != 0) { + return; + } + auto umf_result = + umfMemoryProviderCreate(getOps(), getParams(), &provider); + if (umf_result != UMF_RESULT_SUCCESS) { + state.SkipWithError("umfMemoryProviderCreate() failed"); + } + } + + virtual void TearDown([[maybe_unused]] ::benchmark::State &state) { + if (state.thread_index() != 0) { + return; + } + + if (provider) { + umfMemoryProviderDestroy(provider); + } + } + + virtual umf_memory_provider_ops_t *getOps() { return nullptr; } + virtual void *getParams() { return nullptr; } +}; + +template ::value>> +struct pool_interface { + virtual void SetUp(::benchmark::State &state) { + provider.SetUp(state); + if (state.thread_index() != 0) { + return; + } + auto umf_result = umfPoolCreate(getOps(state), provider.provider, + getParams(state), 0, &pool); + if (umf_result != UMF_RESULT_SUCCESS) { + state.SkipWithError("umfPoolCreate() failed"); + } + } + virtual void TearDown([[maybe_unused]] ::benchmark::State &state) { + if (state.thread_index() != 0) { + return; + } + // TODO: The scalable pool destruction process can race with other threads + // performing TLS (Thread-Local Storage) destruction. + // As a temporary workaround, we introduce a delay (sleep) + // to ensure the pool is destroyed only after all threads have completed. 
+ // Issue: #933 + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + if (pool) { + umfPoolDestroy(pool); + } + }; + + virtual umf_memory_pool_ops_t * + getOps([[maybe_unused]] ::benchmark::State &state) { + return nullptr; + } + virtual void *getParams([[maybe_unused]] ::benchmark::State &state) { + return nullptr; + } + T provider; + umf_memory_pool_handle_t pool; +}; diff --git a/benchmark/multithread.cpp b/benchmark/multithread.cpp index efb46729c3..4558942ecb 100644 --- a/benchmark/multithread.cpp +++ b/benchmark/multithread.cpp @@ -113,7 +113,7 @@ int main() { std::cout << "skipping scalable_pool mt_alloc_free" << std::endl; #endif -#if defined(UMF_BUILD_LIBUMF_POOL_JEMALLOC) +#if defined(UMF_POOL_JEMALLOC_ENABLED) std::cout << "jemalloc_pool mt_alloc_free: "; mt_alloc_free(poolCreateExtParams{umfJemallocPoolOps(), nullptr, umfOsMemoryProviderOps(), osParams}); @@ -121,7 +121,7 @@ int main() { std::cout << "skipping jemalloc_pool mt_alloc_free" << std::endl; #endif -#if defined(UMF_BUILD_LIBUMF_POOL_DISJOINT) +#if defined(UMF_POOL_DISJOINT_ENABLED) umf_disjoint_pool_params_handle_t hDisjointParams = nullptr; umf_result_t ret = umfDisjointPoolParamsCreate(&hDisjointParams); if (ret != UMF_RESULT_SUCCESS) { diff --git a/benchmark/ubench.c b/benchmark/ubench.c index 142112e83a..5f1bfe9e48 100644 --- a/benchmark/ubench.c +++ b/benchmark/ubench.c @@ -20,11 +20,11 @@ #include #include -#ifdef UMF_BUILD_LIBUMF_POOL_DISJOINT +#ifdef UMF_POOL_DISJOINT_ENABLED #include #endif -#ifdef UMF_BUILD_LIBUMF_POOL_JEMALLOC +#ifdef UMF_POOL_JEMALLOC_ENABLED #include #endif @@ -244,7 +244,7 @@ UBENCH_EX(simple, proxy_pool_with_os_memory_provider) { free(array); } -#if (defined UMF_BUILD_LIBUMF_POOL_DISJOINT) +#if (defined UMF_POOL_DISJOINT_ENABLED) ////////////////// DISJOINT POOL WITH OS MEMORY PROVIDER UBENCH_EX(simple, disjoint_pool_with_os_memory_provider) { @@ -327,9 +327,9 @@ UBENCH_EX(simple, disjoint_pool_with_os_memory_provider) { 
umfMemoryProviderDestroy(os_memory_provider); free(array); } -#endif /* (defined UMF_BUILD_LIBUMF_POOL_DISJOINT) */ +#endif /* (defined UMF_POOL_DISJOINT_ENABLED) */ -#if (defined UMF_BUILD_LIBUMF_POOL_JEMALLOC) +#if (defined UMF_POOL_JEMALLOC_ENABLED) ////////////////// JEMALLOC POOL WITH OS MEMORY PROVIDER UBENCH_EX(simple, jemalloc_pool_with_os_memory_provider) { @@ -373,7 +373,7 @@ UBENCH_EX(simple, jemalloc_pool_with_os_memory_provider) { umfMemoryProviderDestroy(os_memory_provider); free(array); } -#endif /* (defined UMF_BUILD_LIBUMF_POOL_JEMALLOC) */ +#endif /* (defined UMF_POOL_JEMALLOC_ENABLED) */ #if (defined UMF_POOL_SCALABLE_ENABLED) ////////////////// SCALABLE (TBB) POOL WITH OS MEMORY PROVIDER @@ -421,7 +421,7 @@ UBENCH_EX(simple, scalable_pool_with_os_memory_provider) { } #endif /* (defined UMF_POOL_SCALABLE_ENABLED) */ -#if (defined UMF_BUILD_LIBUMF_POOL_DISJOINT && \ +#if (defined UMF_POOL_DISJOINT_ENABLED && \ defined UMF_BUILD_LEVEL_ZERO_PROVIDER && defined UMF_BUILD_GPU_TESTS) static void do_ipc_get_put_benchmark(alloc_t *allocs, size_t num_allocs, size_t repeats, @@ -630,7 +630,7 @@ UBENCH_EX(ipc, disjoint_pool_with_level_zero_provider) { err_destroy_context: utils_ze_destroy_context(context); } -#endif /* (defined UMF_BUILD_LIBUMF_POOL_DISJOINT && defined UMF_BUILD_LEVEL_ZERO_PROVIDER && defined UMF_BUILD_GPU_TESTS) */ +#endif /* (defined UMF_POOL_DISJOINT_ENABLED && defined UMF_BUILD_LEVEL_ZERO_PROVIDER && defined UMF_BUILD_GPU_TESTS) */ // TODO add IPC benchmark for CUDA