Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions cpp/benchmarks/async_priming/async_priming_bench.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#include <rmm/cuda_device.hpp>
#include <rmm/mr/cuda_async_memory_resource.hpp>
#include <rmm/mr/device_memory_resource.hpp>

#include <benchmark/benchmark.h>
#include <benchmarks/utilities/cxxopts.hpp>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ __global__ void compute_bound_kernel(int64_t* out)
*out = static_cast<int64_t>(clock_current);
}

using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;
using any_device_resource = cuda::mr::any_resource<cuda::mr::device_accessible>;
Copy link

@vuule vuule Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this include resources that allocate memory that's also host accessible? i.e. does this template param mean "only device accessible"?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, a cuda::mr::any_resource<cuda::mr::device_accessible> can do a narrowing conversion of a host-device accessible resource.

In other words, managed and pinned memory resources are safe to store in a cuda::mr::any_resource<cuda::mr::device_accessible>, but you lose the ability to statically know that they are host-accessible once the property is cast away.

using MRFactoryFunc = std::function<any_device_resource()>;

static void run_prewarm(rmm::cuda_stream_pool& stream_pool, rmm::device_async_resource_ref mr)
{
Expand All @@ -63,18 +64,18 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
{
auto mr = factory();

rmm::mr::set_current_device_resource_ref(mr.get());
rmm::mr::set_current_device_resource_ref(mr);

auto num_streams = state.range(0);
auto num_kernels = state.range(1);
bool do_prewarm = state.range(2) != 0;

auto stream_pool = rmm::cuda_stream_pool(static_cast<std::size_t>(num_streams));

if (do_prewarm) { run_prewarm(stream_pool, mr.get()); }
if (do_prewarm) { run_prewarm(stream_pool, mr); }

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
run_test(static_cast<std::size_t>(num_kernels), stream_pool, mr.get());
run_test(static_cast<std::size_t>(num_kernels), stream_pool, mr);
cudaDeviceSynchronize();
}

Expand All @@ -83,31 +84,29 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
rmm::mr::reset_current_device_resource_ref();
}

inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
inline any_device_resource make_cuda() { return rmm::mr::cuda_memory_resource{}; }

inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
inline any_device_resource make_cuda_async() { return rmm::mr::cuda_async_memory_resource{}; }

inline auto make_pool()
inline any_device_resource make_pool()
{
return std::make_shared<rmm::mr::pool_memory_resource>(*make_cuda(),
rmm::percent_of_free_device_memory(50));
rmm::mr::cuda_memory_resource cuda{};
return rmm::mr::pool_memory_resource{cuda, rmm::percent_of_free_device_memory(50)};
}

inline auto make_arena()
inline any_device_resource make_arena()
{
return std::make_shared<rmm::mr::arena_memory_resource>(
rmm::mr::get_current_device_resource_ref());
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref()};
}

inline auto make_binning()
inline any_device_resource make_binning()
{
// Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB
// Larger allocations will use the pool resource
constexpr auto min_bin_pow2{18};
constexpr auto max_bin_pow2{22};
auto mr =
std::make_shared<rmm::mr::binning_memory_resource>(*make_pool(), min_bin_pow2, max_bin_pow2);
return mr;
auto pool = make_pool();
return rmm::mr::binning_memory_resource{pool, min_bin_pow2, max_bin_pow2};
}

static void benchmark_range(benchmark::internal::Benchmark* bench)
Expand Down Expand Up @@ -171,9 +170,9 @@ void run_profile(std::string const& resource_name, int kernel_count, int stream_
auto mr = mr_factory();
auto stream_pool = rmm::cuda_stream_pool(static_cast<std::size_t>(stream_count));

if (prewarm) { run_prewarm(stream_pool, mr.get()); }
if (prewarm) { run_prewarm(stream_pool, mr); }

run_test(static_cast<std::size_t>(kernel_count), stream_pool, mr.get());
run_test(static_cast<std::size_t>(kernel_count), stream_pool, mr);
}

int main(int argc, char** argv)
Expand All @@ -193,7 +192,7 @@ int main(int argc, char** argv)

options.add_options()( //
"r,resource",
"Type of device_memory_resource",
"Type of memory resource",
cxxopts::value<std::string>()->default_value("pool"));

options.add_options()( //
Expand Down
43 changes: 21 additions & 22 deletions cpp/benchmarks/random_allocations/random_allocations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
#include <rmm/mr/binning_memory_resource.hpp>
#include <rmm/mr/cuda_async_memory_resource.hpp>
#include <rmm/mr/cuda_memory_resource.hpp>
#include <rmm/mr/device_memory_resource.hpp>
#include <rmm/mr/per_device_resource.hpp>
#include <rmm/mr/pool_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <benchmark/benchmark.h>
#include <benchmarks/utilities/cxxopts.hpp>
Expand Down Expand Up @@ -49,7 +49,7 @@ allocation remove_at(allocation_vector& allocs, std::size_t index)
}

template <typename SizeDistribution>
void random_allocation_free(rmm::mr::device_memory_resource& mr,
void random_allocation_free(rmm::device_async_resource_ref mr,
SizeDistribution size_distribution,
std::size_t num_allocations,
std::size_t max_usage, // in MiB
Expand Down Expand Up @@ -127,7 +127,7 @@ void random_allocation_free(rmm::mr::device_memory_resource& mr,
} // namespace

void uniform_random_allocations(
rmm::mr::device_memory_resource& mr,
rmm::device_async_resource_ref mr,
std::size_t num_allocations, // NOLINT(bugprone-easily-swappable-parameters)
std::size_t max_allocation_size, // size in MiB
std::size_t max_usage,
Expand All @@ -138,7 +138,7 @@ void uniform_random_allocations(
}

// TODO figure out how to map a normal distribution to integers between 1 and max_allocation_size
/*void normal_random_allocations(rmm::mr::device_memory_resource& mr,
/*void normal_random_allocations(rmm::device_async_resource_ref mr,
std::size_t num_allocations = 1000,
std::size_t mean_allocation_size = 500, // in MiB
std::size_t stddev_allocation_size = 500, // in MiB
Expand All @@ -148,36 +148,36 @@ void uniform_random_allocations(
}*/

/// MR factory functions
inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
using any_device_resource = cuda::mr::any_resource<cuda::mr::device_accessible>;
Copy link
Member

@mhaseeb123 mhaseeb123 Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to define this in a common header? We seem to have it duplicated in a few places.

Copy link
Collaborator Author

@bdice bdice Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm trying to keep usage of the any_device_resource alias to a minimum, only for internal usage. I debated removing it entirely and requiring the explicit templated type everywhere. My reasoning for this is that we may want to allow other resource accessibility properties in the future (templating classes on Properties...) and I don't want to force the design into "device only" in too many places.


inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
inline any_device_resource make_cuda() { return rmm::mr::cuda_memory_resource{}; }

inline auto make_pool()
inline any_device_resource make_cuda_async() { return rmm::mr::cuda_async_memory_resource{}; }

inline any_device_resource make_pool()
{
return std::make_shared<rmm::mr::pool_memory_resource>(*make_cuda(),
rmm::percent_of_free_device_memory(50));
rmm::mr::cuda_memory_resource cuda{};
return rmm::mr::pool_memory_resource{cuda, rmm::percent_of_free_device_memory(50)};
}

inline auto make_arena()
inline any_device_resource make_arena()
{
auto free = rmm::available_device_memory().first;
constexpr auto reserve{64UL << 20}; // Leave some space for CUDA overhead.
return std::make_shared<rmm::mr::arena_memory_resource>(
rmm::mr::get_current_device_resource_ref(), free - reserve);
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref(), free - reserve};
}

inline auto make_binning()
inline any_device_resource make_binning()
{
// Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB
// Larger allocations will use the pool resource
constexpr auto min_bin_pow2{18};
constexpr auto max_bin_pow2{22};
auto mr =
std::make_shared<rmm::mr::binning_memory_resource>(*make_pool(), min_bin_pow2, max_bin_pow2);
return mr;
auto pool = make_pool();
return rmm::mr::binning_memory_resource{pool, min_bin_pow2, max_bin_pow2};
}

using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;
using MRFactoryFunc = std::function<any_device_resource()>;

constexpr std::size_t max_usage = 16000;

Expand All @@ -190,7 +190,7 @@ static void BM_RandomAllocations(benchmark::State& state, MRFactoryFunc const& f

try {
for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
uniform_random_allocations(*mr, num_allocations, max_size, max_usage);
uniform_random_allocations(mr, num_allocations, max_size, max_usage);
}
} catch (std::exception const& e) {
std::cout << "Error: " << e.what() << "\n";
Expand Down Expand Up @@ -243,8 +243,7 @@ void declare_benchmark(std::string const& name)
if (name == "cuda") {
BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_mr, &make_cuda) // NOLINT
->Apply(benchmark_range);
}
if (name == "cuda_async") {
} else if (name == "cuda_async") {
BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_async_mr, &make_cuda_async) // NOLINT
->Apply(benchmark_range);
} else if (name == "binning") {
Expand All @@ -268,7 +267,7 @@ static void profile_random_allocations(MRFactoryFunc const& factory,
auto mr = factory();

try {
uniform_random_allocations(*mr, num_allocations, max_size, max_usage);
uniform_random_allocations(mr, num_allocations, max_size, max_usage);
} catch (std::exception const& e) {
std::cout << "Error: " << e.what() << "\n";
}
Expand All @@ -288,7 +287,7 @@ int main(int argc, char** argv)
options.add_options()(
"p,profile", "Profiling mode: run once", cxxopts::value<bool>()->default_value("false"));
options.add_options()("r,resource",
"Type of device_memory_resource",
"Type of memory resource",
cxxopts::value<std::string>()->default_value("pool"));
options.add_options()("n,numallocs",
"Number of allocations (default of 0 tests a range)",
Expand Down
64 changes: 29 additions & 35 deletions cpp/benchmarks/replay/replay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
* SPDX-License-Identifier: Apache-2.0
*/

#include <rmm/aligned.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/arena_memory_resource.hpp>
#include <rmm/mr/binning_memory_resource.hpp>
#include <rmm/mr/cuda_memory_resource.hpp>
#include <rmm/mr/device_memory_resource.hpp>
#include <rmm/mr/managed_memory_resource.hpp>
#include <rmm/mr/per_device_resource.hpp>
#include <rmm/mr/pool_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/iterator>
#include <thrust/execution_policy.h>
Expand All @@ -31,56 +32,49 @@
#include <iterator>
#include <memory>
#include <numeric>
#include <optional>
#include <string>
#include <thread>

/// MR factory functions
std::shared_ptr<rmm::mr::device_memory_resource> make_cuda(std::size_t = 0)
{
return std::make_shared<rmm::mr::cuda_memory_resource>();
}
using any_device_resource = cuda::mr::any_resource<cuda::mr::device_accessible>;

std::shared_ptr<rmm::mr::device_memory_resource> make_managed(std::size_t = 0)
{
return std::make_shared<rmm::mr::managed_memory_resource>();
}
/// MR factory functions
any_device_resource make_cuda(std::size_t = 0) { return rmm::mr::cuda_memory_resource{}; }

std::shared_ptr<rmm::mr::device_memory_resource> make_simulated(std::size_t simulated_size)
{
return std::make_shared<rmm::mr::simulated_memory_resource>(simulated_size);
}
any_device_resource make_managed(std::size_t = 0) { return rmm::mr::managed_memory_resource{}; }

inline auto make_pool(std::size_t simulated_size)
inline any_device_resource make_pool(std::size_t simulated_size)
{
if (simulated_size > 0) {
return std::make_shared<rmm::mr::pool_memory_resource>(
*make_simulated(simulated_size), simulated_size, simulated_size);
rmm::mr::simulated_memory_resource sim{simulated_size};
return rmm::mr::pool_memory_resource{sim, simulated_size, simulated_size};
}
return std::make_shared<rmm::mr::pool_memory_resource>(*make_cuda(), 0);
rmm::mr::cuda_memory_resource cuda{};
return rmm::mr::pool_memory_resource{cuda, 0};
}

inline auto make_arena(std::size_t simulated_size)
inline any_device_resource make_arena(std::size_t simulated_size)
{
if (simulated_size > 0) {
return std::make_shared<rmm::mr::arena_memory_resource>(
rmm::mr::get_current_device_resource_ref(), simulated_size);
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref(),
simulated_size};
}
return std::make_shared<rmm::mr::arena_memory_resource>(
rmm::mr::get_current_device_resource_ref());
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref()};
}

inline auto make_binning(std::size_t simulated_size)
inline any_device_resource make_binning(std::size_t simulated_size)
{
auto mr = std::make_shared<rmm::mr::binning_memory_resource>(*make_pool(simulated_size));
auto pool = make_pool(simulated_size);
auto mr = rmm::mr::binning_memory_resource{pool};
const auto min_size_exp{18};
const auto max_size_exp{22};
for (std::size_t i = min_size_exp; i <= max_size_exp; i++) {
mr->add_bin(1 << i);
mr.add_bin(1 << i);
}
return mr;
}

using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>(std::size_t)>;
using MRFactoryFunc = std::function<any_device_resource(std::size_t)>;

/**
* @brief Represents an allocation made during the replay
Expand All @@ -95,15 +89,15 @@ struct allocation {

/**
* @brief Function object for running a replay benchmark with the specified
* `device_memory_resource`.
* memory resource.
*
* @tparam MR The type of the `device_memory_resource` to use for allocation
* @tparam MR The type of the memory resource to use for allocation
* replay
*/
struct replay_benchmark {
MRFactoryFunc factory_;
std::size_t simulated_size_;
std::shared_ptr<rmm::mr::device_memory_resource> mr_{};
std::optional<any_device_resource> mr_{};
std::vector<std::vector<rmm::detail::event>> const& events_{};

// Maps a pointer from the event log to an active allocation
Expand Down Expand Up @@ -173,7 +167,7 @@ struct replay_benchmark {
{
if (state.thread_index() == 0) {
RMM_LOG_INFO("------ Start of Benchmark -----");
mr_ = factory_(simulated_size_);
mr_.emplace(factory_(simulated_size_));
}
// Can't release threads until MR is set up.
barrier_.arrive_and_wait();
Expand All @@ -193,7 +187,7 @@ struct replay_benchmark {
auto alloc = ptr_alloc.second;
num_leaked++;
total_leaked += alloc.size;
mr_->deallocate_sync(alloc.ptr, alloc.size);
mr_->deallocate_sync(alloc.ptr, alloc.size, rmm::CUDA_ALLOCATION_ALIGNMENT);
}
if (num_leaked > 0) {
std::cout << "LOG shows leak of " << num_leaked << " allocations of " << total_leaked
Expand Down Expand Up @@ -225,11 +219,11 @@ struct replay_benchmark {

// rmm::detail::action::ALLOCATE_FAILURE is ignored.
if (rmm::detail::action::ALLOCATE == event.act) {
auto ptr = mr_->allocate_sync(event.size);
auto ptr = mr_->allocate_sync(event.size, rmm::CUDA_ALLOCATION_ALIGNMENT);
set_allocation(event.pointer, allocation{ptr, event.size});
} else if (rmm::detail::action::FREE == event.act) {
auto alloc = remove_allocation(event.pointer);
mr_->deallocate_sync(alloc.ptr, event.size);
mr_->deallocate_sync(alloc.ptr, event.size, rmm::CUDA_ALLOCATION_ALIGNMENT);
}

event_index++;
Expand Down Expand Up @@ -355,7 +349,7 @@ int main(int argc, char** argv)

options.add_options()("f,file", "Name of RMM log file.", cxxopts::value<std::string>());
options.add_options()("r,resource",
"Type of device_memory_resource",
"Type of memory resource",
cxxopts::value<std::string>()->default_value("pool"));
options.add_options()(
"s,size",
Expand Down
1 change: 0 additions & 1 deletion cpp/benchmarks/utilities/log_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include "rapidcsv.h"

#include <rmm/detail/error.hpp>
#include <rmm/mr/device_memory_resource.hpp>

#include <chrono>
#include <cstdint>
Expand Down
Loading
Loading