diff --git a/cpp/benchmarks/async_priming/async_priming_bench.cpp b/cpp/benchmarks/async_priming/async_priming_bench.cpp index fe641b004..1b5204bcd 100644 --- a/cpp/benchmarks/async_priming/async_priming_bench.cpp +++ b/cpp/benchmarks/async_priming/async_priming_bench.cpp @@ -1,11 +1,10 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #include #include -#include #include #include diff --git a/cpp/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu b/cpp/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu index d34a55d71..3687a3081 100644 --- a/cpp/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu +++ b/cpp/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu @@ -37,7 +37,8 @@ __global__ void compute_bound_kernel(int64_t* out) *out = static_cast(clock_current); } -using MRFactoryFunc = std::function()>; +using any_device_resource = cuda::mr::any_resource; +using MRFactoryFunc = std::function; static void run_prewarm(rmm::cuda_stream_pool& stream_pool, rmm::device_async_resource_ref mr) { @@ -63,7 +64,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con { auto mr = factory(); - rmm::mr::set_current_device_resource_ref(mr.get()); + rmm::mr::set_current_device_resource_ref(mr); auto num_streams = state.range(0); auto num_kernels = state.range(1); @@ -71,10 +72,10 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con auto stream_pool = rmm::cuda_stream_pool(static_cast(num_streams)); - if (do_prewarm) { run_prewarm(stream_pool, mr.get()); } + if (do_prewarm) { run_prewarm(stream_pool, mr); } for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) - run_test(static_cast(num_kernels), stream_pool, mr.get()); + run_test(static_cast(num_kernels), stream_pool, mr); cudaDeviceSynchronize(); } @@ -83,31 +84,29 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con rmm::mr::reset_current_device_resource_ref(); } -inline auto make_cuda() { return std::make_shared(); } +inline any_device_resource make_cuda() { return rmm::mr::cuda_memory_resource{}; } -inline auto make_cuda_async() { return std::make_shared(); } +inline any_device_resource make_cuda_async() { return rmm::mr::cuda_async_memory_resource{}; } -inline auto make_pool() +inline any_device_resource make_pool() { - return std::make_shared(*make_cuda(), - rmm::percent_of_free_device_memory(50)); + rmm::mr::cuda_memory_resource cuda{}; + return rmm::mr::pool_memory_resource{cuda, rmm::percent_of_free_device_memory(50)}; } -inline auto make_arena() +inline any_device_resource make_arena() { - return std::make_shared( - rmm::mr::get_current_device_resource_ref()); + return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref()}; } -inline auto make_binning() +inline any_device_resource make_binning() { // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB // Larger allocations will use the pool resource constexpr auto min_bin_pow2{18}; constexpr auto max_bin_pow2{22}; - auto mr = - std::make_shared(*make_pool(), min_bin_pow2, max_bin_pow2); - return mr; + auto pool = make_pool(); + return rmm::mr::binning_memory_resource{pool, min_bin_pow2, max_bin_pow2}; } static void benchmark_range(benchmark::internal::Benchmark* bench) @@ -171,9 +170,9 @@ void run_profile(std::string const& resource_name, int kernel_count, int stream_ auto mr = mr_factory(); auto stream_pool = rmm::cuda_stream_pool(static_cast(stream_count)); - if (prewarm) { run_prewarm(stream_pool, mr.get()); } + if (prewarm) { run_prewarm(stream_pool, mr); } - run_test(static_cast(kernel_count), stream_pool, mr.get()); + run_test(static_cast(kernel_count), stream_pool, mr); } int main(int argc, char** argv) @@ -193,7 +192,7 @@ int main(int argc, char** argv) options.add_options()( // "r,resource", - "Type of device_memory_resource", + "Type of memory resource", cxxopts::value()->default_value("pool")); options.add_options()( // diff --git a/cpp/benchmarks/random_allocations/random_allocations.cpp b/cpp/benchmarks/random_allocations/random_allocations.cpp index db6b394e9..5ab201cab 100644 --- a/cpp/benchmarks/random_allocations/random_allocations.cpp +++ b/cpp/benchmarks/random_allocations/random_allocations.cpp @@ -8,9 +8,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -49,7 +49,7 @@ allocation remove_at(allocation_vector& allocs, std::size_t index) } template -void random_allocation_free(rmm::mr::device_memory_resource& mr, +void random_allocation_free(rmm::device_async_resource_ref mr, SizeDistribution size_distribution, std::size_t num_allocations, std::size_t max_usage, // in MiB @@ -127,7 +127,7 @@ void random_allocation_free(rmm::mr::device_memory_resource& mr, } // namespace void uniform_random_allocations( - rmm::mr::device_memory_resource& mr, + rmm::device_async_resource_ref mr, std::size_t num_allocations, // NOLINT(bugprone-easily-swappable-parameters) std::size_t max_allocation_size, // size in MiB std::size_t max_usage, @@ -138,7 +138,7 @@ void uniform_random_allocations( } // TODO figure out how to map a normal distribution to integers between 1 and max_allocation_size -/*void normal_random_allocations(rmm::mr::device_memory_resource& mr, +/*void normal_random_allocations(rmm::device_async_resource_ref mr, std::size_t num_allocations = 1000, std::size_t mean_allocation_size = 500, // in MiB std::size_t stddev_allocation_size = 500, // in MiB @@ -148,36 +148,36 @@ void uniform_random_allocations( }*/ /// MR factory functions -inline auto make_cuda() { return std::make_shared(); } +using any_device_resource = cuda::mr::any_resource; -inline auto make_cuda_async() { return std::make_shared(); } +inline any_device_resource make_cuda() { return rmm::mr::cuda_memory_resource{}; } -inline auto make_pool() +inline any_device_resource make_cuda_async() { return rmm::mr::cuda_async_memory_resource{}; } + +inline any_device_resource make_pool() { - return std::make_shared(*make_cuda(), - rmm::percent_of_free_device_memory(50)); + rmm::mr::cuda_memory_resource cuda{}; + return rmm::mr::pool_memory_resource{cuda, rmm::percent_of_free_device_memory(50)}; } -inline auto make_arena() +inline any_device_resource make_arena() { auto free = rmm::available_device_memory().first; constexpr auto reserve{64UL << 20}; // Leave some space for CUDA overhead. - return std::make_shared( - rmm::mr::get_current_device_resource_ref(), free - reserve); + return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref(), free - reserve}; } -inline auto make_binning() +inline any_device_resource make_binning() { // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB // Larger allocations will use the pool resource constexpr auto min_bin_pow2{18}; constexpr auto max_bin_pow2{22}; - auto mr = - std::make_shared(*make_pool(), min_bin_pow2, max_bin_pow2); - return mr; + auto pool = make_pool(); + return rmm::mr::binning_memory_resource{pool, min_bin_pow2, max_bin_pow2}; } -using MRFactoryFunc = std::function()>; +using MRFactoryFunc = std::function; constexpr std::size_t max_usage = 16000; @@ -190,7 +190,7 @@ static void BM_RandomAllocations(benchmark::State& state, MRFactoryFunc const& f try { for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) - uniform_random_allocations(*mr, num_allocations, max_size, max_usage); + uniform_random_allocations(mr, num_allocations, max_size, max_usage); } } catch (std::exception const& e) { std::cout << "Error: " << e.what() << "\n"; @@ -243,8 +243,7 @@ void declare_benchmark(std::string const& name) if (name == "cuda") { BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_mr, &make_cuda) // NOLINT ->Apply(benchmark_range); - } - if (name == "cuda_async") { + } else if (name == "cuda_async") { BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_async_mr, &make_cuda_async) // NOLINT ->Apply(benchmark_range); } else if (name == "binning") { @@ -268,7 +267,7 @@ static void profile_random_allocations(MRFactoryFunc const& factory, auto mr = factory(); try { - uniform_random_allocations(*mr, num_allocations, max_size, max_usage); + uniform_random_allocations(mr, num_allocations, max_size, max_usage); } catch (std::exception const& e) { std::cout << "Error: " << e.what() << "\n"; } @@ -288,7 +287,7 @@ int main(int argc, char** argv) options.add_options()( "p,profile", "Profiling mode: run once", cxxopts::value()->default_value("false")); options.add_options()("r,resource", - "Type of device_memory_resource", + "Type of memory resource", cxxopts::value()->default_value("pool")); options.add_options()("n,numallocs", "Number of allocations (default of 0 tests a range)", diff --git a/cpp/benchmarks/replay/replay.cpp b/cpp/benchmarks/replay/replay.cpp index 04e1833ee..b6e95ed6d 100644 --- a/cpp/benchmarks/replay/replay.cpp +++ b/cpp/benchmarks/replay/replay.cpp @@ -3,16 +3,17 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include #include #include #include #include #include #include -#include #include #include #include +#include #include #include @@ -31,56 +32,49 @@ #include #include #include +#include #include #include -/// MR factory functions -std::shared_ptr make_cuda(std::size_t = 0) -{ - return std::make_shared(); -} +using any_device_resource = cuda::mr::any_resource; -std::shared_ptr make_managed(std::size_t = 0) -{ - return std::make_shared(); -} +/// MR factory functions +any_device_resource make_cuda(std::size_t = 0) { return rmm::mr::cuda_memory_resource{}; } -std::shared_ptr make_simulated(std::size_t simulated_size) -{ - return std::make_shared(simulated_size); -} +any_device_resource make_managed(std::size_t = 0) { return rmm::mr::managed_memory_resource{}; } -inline auto make_pool(std::size_t simulated_size) +inline any_device_resource make_pool(std::size_t simulated_size) { if (simulated_size > 0) { - return std::make_shared( - *make_simulated(simulated_size), simulated_size, simulated_size); + rmm::mr::simulated_memory_resource sim{simulated_size}; + return rmm::mr::pool_memory_resource{sim, simulated_size, simulated_size}; } - return std::make_shared(*make_cuda(), 0); + rmm::mr::cuda_memory_resource cuda{}; + return rmm::mr::pool_memory_resource{cuda, 0}; } -inline auto make_arena(std::size_t simulated_size) +inline any_device_resource make_arena(std::size_t simulated_size) { if (simulated_size > 0) { - return std::make_shared( - rmm::mr::get_current_device_resource_ref(), simulated_size); + return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref(), + simulated_size}; } - return std::make_shared( - rmm::mr::get_current_device_resource_ref()); + return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref()}; } -inline auto make_binning(std::size_t simulated_size) +inline any_device_resource make_binning(std::size_t simulated_size) { - auto mr = std::make_shared(*make_pool(simulated_size)); + auto pool = make_pool(simulated_size); + auto mr = rmm::mr::binning_memory_resource{pool}; const auto min_size_exp{18}; const auto max_size_exp{22}; for (std::size_t i = min_size_exp; i <= max_size_exp; i++) { - mr->add_bin(1 << i); + mr.add_bin(1 << i); } return mr; } -using MRFactoryFunc = std::function(std::size_t)>; +using MRFactoryFunc = std::function; /** * @brief Represents an allocation made during the replay @@ -95,15 +89,15 @@ struct allocation { /** * @brief Function object for running a replay benchmark with the specified - * `device_memory_resource`. + * memory resource. * - * @tparam MR The type of the `device_memory_resource` to use for allocation + * @tparam MR The type of the memory resource to use for allocation * replay */ struct replay_benchmark { MRFactoryFunc factory_; std::size_t simulated_size_; - std::shared_ptr mr_{}; + std::optional mr_{}; std::vector> const& events_{}; // Maps a pointer from the event log to an active allocation @@ -173,7 +167,7 @@ struct replay_benchmark { { if (state.thread_index() == 0) { RMM_LOG_INFO("------ Start of Benchmark -----"); - mr_ = factory_(simulated_size_); + mr_.emplace(factory_(simulated_size_)); } // Can't release threads until MR is set up. barrier_.arrive_and_wait(); @@ -193,7 +187,7 @@ struct replay_benchmark { auto alloc = ptr_alloc.second; num_leaked++; total_leaked += alloc.size; - mr_->deallocate_sync(alloc.ptr, alloc.size); + mr_->deallocate_sync(alloc.ptr, alloc.size, rmm::CUDA_ALLOCATION_ALIGNMENT); } if (num_leaked > 0) { std::cout << "LOG shows leak of " << num_leaked << " allocations of " << total_leaked @@ -225,11 +219,11 @@ struct replay_benchmark { // rmm::detail::action::ALLOCATE_FAILURE is ignored. if (rmm::detail::action::ALLOCATE == event.act) { - auto ptr = mr_->allocate_sync(event.size); + auto ptr = mr_->allocate_sync(event.size, rmm::CUDA_ALLOCATION_ALIGNMENT); set_allocation(event.pointer, allocation{ptr, event.size}); } else if (rmm::detail::action::FREE == event.act) { auto alloc = remove_allocation(event.pointer); - mr_->deallocate_sync(alloc.ptr, event.size); + mr_->deallocate_sync(alloc.ptr, event.size, rmm::CUDA_ALLOCATION_ALIGNMENT); } event_index++; @@ -355,7 +349,7 @@ int main(int argc, char** argv) options.add_options()("f,file", "Name of RMM log file.", cxxopts::value()); options.add_options()("r,resource", - "Type of device_memory_resource", + "Type of memory resource", cxxopts::value()->default_value("pool")); options.add_options()( "s,size", diff --git a/cpp/benchmarks/utilities/log_parser.hpp b/cpp/benchmarks/utilities/log_parser.hpp index 5114b7ea0..886c1f585 100644 --- a/cpp/benchmarks/utilities/log_parser.hpp +++ b/cpp/benchmarks/utilities/log_parser.hpp @@ -8,7 +8,6 @@ #include "rapidcsv.h" #include -#include #include #include diff --git a/cpp/benchmarks/utilities/simulated_memory_resource.hpp b/cpp/benchmarks/utilities/simulated_memory_resource.hpp index 1676924ba..ff5254b83 100644 --- a/cpp/benchmarks/utilities/simulated_memory_resource.hpp +++ b/cpp/benchmarks/utilities/simulated_memory_resource.hpp @@ -4,12 +4,17 @@ */ #pragma once +#include #include +#include #include -#include +#include +#include #include +#include + namespace rmm::mr { /** @@ -20,7 +25,7 @@ namespace rmm::mr { * * Deallocation calls are ignored. */ -class simulated_memory_resource final : public device_memory_resource { +class simulated_memory_resource final { public: /** * @brief Construct a `simulated_memory_resource`. @@ -33,15 +38,13 @@ class simulated_memory_resource final : public device_memory_resource { { } - ~simulated_memory_resource() override = default; + ~simulated_memory_resource() = default; - // Disable copy (and move) semantics. - simulated_memory_resource(simulated_memory_resource const&) = delete; - simulated_memory_resource& operator=(simulated_memory_resource const&) = delete; - simulated_memory_resource(simulated_memory_resource&&) = delete; - simulated_memory_resource& operator=(simulated_memory_resource&&) = delete; + simulated_memory_resource(simulated_memory_resource const&) = default; + simulated_memory_resource& operator=(simulated_memory_resource const&) = default; + simulated_memory_resource(simulated_memory_resource&&) = default; + simulated_memory_resource& operator=(simulated_memory_resource&&) = default; - private: /** * @brief Allocates memory of size at least `bytes`. * @@ -52,7 +55,9 @@ class simulated_memory_resource final : public device_memory_resource { * @param bytes The size, in bytes, of the allocation * @return void* Pointer to the newly allocated memory */ - void* do_allocate(std::size_t bytes, cuda_stream_view) override + void* allocate([[maybe_unused]] cuda::stream_ref stream, + std::size_t bytes, + [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) RMM_EXPECTS(begin_ + bytes <= end_, @@ -68,12 +73,59 @@ class simulated_memory_resource final : public device_memory_resource { * @brief Deallocate memory pointed to by `ptr`. * * @note This call is ignored. + */ + void deallocate([[maybe_unused]] cuda::stream_ref stream, + void* /*ptr*/, + std::size_t /*bytes*/, + [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept + { + } + + /** + * @brief Allocates memory of size at least `bytes` synchronously. + * + * @param bytes The size, in bytes, of the allocation + * @param alignment The alignment of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_sync(std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) + { + return allocate(cuda::stream_ref{cudaStream_t{nullptr}}, bytes, alignment); + } + + /** + * @brief Deallocate memory pointed to by `ptr` synchronously. + * + * @note This call is ignored. * * @param ptr Pointer to be deallocated + * @param bytes The size, in bytes, of the allocation + * @param alignment The alignment of the allocation */ - void do_deallocate(void* /*ptr*/, std::size_t, cuda_stream_view) noexcept override {} + void deallocate_sync(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept + { + deallocate(cuda::stream_ref{cudaStream_t{nullptr}}, ptr, bytes, alignment); + } + + bool operator==(simulated_memory_resource const&) const noexcept { return true; } + bool operator!=(simulated_memory_resource const&) const noexcept { return false; } + RMM_CONSTEXPR_FRIEND void get_property(simulated_memory_resource const&, + cuda::mr::device_accessible) noexcept + { + } + + private: char* begin_{}; char* end_{}; }; + +static_assert(cuda::mr::synchronous_resource); +static_assert(cuda::mr::resource); +static_assert( + cuda::mr::synchronous_resource_with); +static_assert(cuda::mr::resource_with); + } // namespace rmm::mr diff --git a/cpp/include/rmm/cuda_stream.hpp b/cpp/include/rmm/cuda_stream.hpp index da620f5c5..cddcc45bd 100644 --- a/cpp/include/rmm/cuda_stream.hpp +++ b/cpp/include/rmm/cuda_stream.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -98,6 +98,13 @@ class cuda_stream { */ operator cuda_stream_view() const; + /** + * @brief Implicit conversion to cuda::stream_ref + * + * @return A stream_ref of the owned stream + */ + operator cuda::stream_ref() const; + /** * @brief Synchronize the owned CUDA stream. * diff --git a/cpp/include/rmm/detail/cccl_adaptors.hpp b/cpp/include/rmm/detail/cccl_adaptors.hpp index d8f6999e0..d2b33ec4d 100644 --- a/cpp/include/rmm/detail/cccl_adaptors.hpp +++ b/cpp/include/rmm/detail/cccl_adaptors.hpp @@ -32,6 +32,34 @@ inline constexpr bool is_specialization_of_v = false; template