Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions cpp/benchmarks/async_priming/async_priming_bench.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#include <rmm/cuda_device.hpp>
#include <rmm/mr/cuda_async_memory_resource.hpp>
#include <rmm/mr/device_memory_resource.hpp>

#include <benchmark/benchmark.h>
#include <benchmarks/utilities/cxxopts.hpp>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ __global__ void compute_bound_kernel(int64_t* out)
*out = static_cast<int64_t>(clock_current);
}

using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;
using any_device_resource = cuda::mr::any_resource<cuda::mr::device_accessible>;
Copy link

@vuule vuule Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this include resources that allocate memory that's also host accessible? i.e. does this template param mean "only device accessible"?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, a cuda::mr::any_resource<cuda::mr::device_accessible> can do a narrowing conversion of a host-device accessible resource.

In other words, managed and pinned memory resources are safe to store in a cuda::mr::any_resource<cuda::mr::device_accessible>, but you lose the ability to statically know that they are host-accessible once the property is cast away.

using MRFactoryFunc = std::function<any_device_resource()>;

static void run_prewarm(rmm::cuda_stream_pool& stream_pool, rmm::device_async_resource_ref mr)
{
Expand All @@ -63,18 +64,18 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
{
auto mr = factory();

rmm::mr::set_current_device_resource_ref(mr.get());
rmm::mr::set_current_device_resource_ref(mr);

auto num_streams = state.range(0);
auto num_kernels = state.range(1);
bool do_prewarm = state.range(2) != 0;

auto stream_pool = rmm::cuda_stream_pool(static_cast<std::size_t>(num_streams));

if (do_prewarm) { run_prewarm(stream_pool, mr.get()); }
if (do_prewarm) { run_prewarm(stream_pool, mr); }

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
run_test(static_cast<std::size_t>(num_kernels), stream_pool, mr.get());
run_test(static_cast<std::size_t>(num_kernels), stream_pool, mr);
cudaDeviceSynchronize();
}

Expand All @@ -83,31 +84,29 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
rmm::mr::reset_current_device_resource_ref();
}

inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
inline any_device_resource make_cuda() { return rmm::mr::cuda_memory_resource{}; }

inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
inline any_device_resource make_cuda_async() { return rmm::mr::cuda_async_memory_resource{}; }

inline auto make_pool()
inline any_device_resource make_pool()
{
return std::make_shared<rmm::mr::pool_memory_resource>(*make_cuda(),
rmm::percent_of_free_device_memory(50));
rmm::mr::cuda_memory_resource cuda{};
return rmm::mr::pool_memory_resource{cuda, rmm::percent_of_free_device_memory(50)};
}

inline auto make_arena()
inline any_device_resource make_arena()
{
return std::make_shared<rmm::mr::arena_memory_resource>(
rmm::mr::get_current_device_resource_ref());
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref()};
}

inline auto make_binning()
inline any_device_resource make_binning()
{
// Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB
// Larger allocations will use the pool resource
constexpr auto min_bin_pow2{18};
constexpr auto max_bin_pow2{22};
auto mr =
std::make_shared<rmm::mr::binning_memory_resource>(*make_pool(), min_bin_pow2, max_bin_pow2);
return mr;
auto pool = make_pool();
return rmm::mr::binning_memory_resource{pool, min_bin_pow2, max_bin_pow2};
}

static void benchmark_range(benchmark::internal::Benchmark* bench)
Expand Down Expand Up @@ -171,9 +170,9 @@ void run_profile(std::string const& resource_name, int kernel_count, int stream_
auto mr = mr_factory();
auto stream_pool = rmm::cuda_stream_pool(static_cast<std::size_t>(stream_count));

if (prewarm) { run_prewarm(stream_pool, mr.get()); }
if (prewarm) { run_prewarm(stream_pool, mr); }

run_test(static_cast<std::size_t>(kernel_count), stream_pool, mr.get());
run_test(static_cast<std::size_t>(kernel_count), stream_pool, mr);
}

int main(int argc, char** argv)
Expand All @@ -193,7 +192,7 @@ int main(int argc, char** argv)

options.add_options()( //
"r,resource",
"Type of device_memory_resource",
"Type of memory resource",
cxxopts::value<std::string>()->default_value("pool"));

options.add_options()( //
Expand Down
43 changes: 21 additions & 22 deletions cpp/benchmarks/random_allocations/random_allocations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
#include <rmm/mr/binning_memory_resource.hpp>
#include <rmm/mr/cuda_async_memory_resource.hpp>
#include <rmm/mr/cuda_memory_resource.hpp>
#include <rmm/mr/device_memory_resource.hpp>
#include <rmm/mr/per_device_resource.hpp>
#include <rmm/mr/pool_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <benchmark/benchmark.h>
#include <benchmarks/utilities/cxxopts.hpp>
Expand Down Expand Up @@ -49,7 +49,7 @@ allocation remove_at(allocation_vector& allocs, std::size_t index)
}

template <typename SizeDistribution>
void random_allocation_free(rmm::mr::device_memory_resource& mr,
void random_allocation_free(rmm::device_async_resource_ref mr,
SizeDistribution size_distribution,
std::size_t num_allocations,
std::size_t max_usage, // in MiB
Expand Down Expand Up @@ -127,7 +127,7 @@ void random_allocation_free(rmm::mr::device_memory_resource& mr,
} // namespace

void uniform_random_allocations(
rmm::mr::device_memory_resource& mr,
rmm::device_async_resource_ref mr,
std::size_t num_allocations, // NOLINT(bugprone-easily-swappable-parameters)
std::size_t max_allocation_size, // size in MiB
std::size_t max_usage,
Expand All @@ -138,7 +138,7 @@ void uniform_random_allocations(
}

// TODO figure out how to map a normal distribution to integers between 1 and max_allocation_size
/*void normal_random_allocations(rmm::mr::device_memory_resource& mr,
/*void normal_random_allocations(rmm::device_async_resource_ref mr,
std::size_t num_allocations = 1000,
std::size_t mean_allocation_size = 500, // in MiB
std::size_t stddev_allocation_size = 500, // in MiB
Expand All @@ -148,36 +148,36 @@ void uniform_random_allocations(
}*/

/// MR factory functions
inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
using any_device_resource = cuda::mr::any_resource<cuda::mr::device_accessible>;
Copy link
Member

@mhaseeb123 mhaseeb123 Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to define this in a common header? We seem to have it duplicated in a few places.

Copy link
Collaborator Author

@bdice bdice Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm trying to keep usage of the any_device_resource alias to a minimum, only for internal usage. I debated removing it entirely and requiring the explicit templated type everywhere. My reasoning for this is that we may want to allow other resource accessibility properties in the future (templating classes on Properties...) and I don't want to force the design into "device only" in too many places.


inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
inline any_device_resource make_cuda() { return rmm::mr::cuda_memory_resource{}; }

inline auto make_pool()
inline any_device_resource make_cuda_async() { return rmm::mr::cuda_async_memory_resource{}; }

inline any_device_resource make_pool()
{
return std::make_shared<rmm::mr::pool_memory_resource>(*make_cuda(),
rmm::percent_of_free_device_memory(50));
rmm::mr::cuda_memory_resource cuda{};
return rmm::mr::pool_memory_resource{cuda, rmm::percent_of_free_device_memory(50)};
}

inline auto make_arena()
inline any_device_resource make_arena()
{
auto free = rmm::available_device_memory().first;
constexpr auto reserve{64UL << 20}; // Leave some space for CUDA overhead.
return std::make_shared<rmm::mr::arena_memory_resource>(
rmm::mr::get_current_device_resource_ref(), free - reserve);
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref(), free - reserve};
}

inline auto make_binning()
inline any_device_resource make_binning()
{
// Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB
// Larger allocations will use the pool resource
constexpr auto min_bin_pow2{18};
constexpr auto max_bin_pow2{22};
auto mr =
std::make_shared<rmm::mr::binning_memory_resource>(*make_pool(), min_bin_pow2, max_bin_pow2);
return mr;
auto pool = make_pool();
return rmm::mr::binning_memory_resource{pool, min_bin_pow2, max_bin_pow2};
}

using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;
using MRFactoryFunc = std::function<any_device_resource()>;

constexpr std::size_t max_usage = 16000;

Expand All @@ -190,7 +190,7 @@ static void BM_RandomAllocations(benchmark::State& state, MRFactoryFunc const& f

try {
for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
uniform_random_allocations(*mr, num_allocations, max_size, max_usage);
uniform_random_allocations(mr, num_allocations, max_size, max_usage);
}
} catch (std::exception const& e) {
std::cout << "Error: " << e.what() << "\n";
Expand Down Expand Up @@ -243,8 +243,7 @@ void declare_benchmark(std::string const& name)
if (name == "cuda") {
BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_mr, &make_cuda) // NOLINT
->Apply(benchmark_range);
}
if (name == "cuda_async") {
} else if (name == "cuda_async") {
BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_async_mr, &make_cuda_async) // NOLINT
->Apply(benchmark_range);
} else if (name == "binning") {
Expand All @@ -268,7 +267,7 @@ static void profile_random_allocations(MRFactoryFunc const& factory,
auto mr = factory();

try {
uniform_random_allocations(*mr, num_allocations, max_size, max_usage);
uniform_random_allocations(mr, num_allocations, max_size, max_usage);
} catch (std::exception const& e) {
std::cout << "Error: " << e.what() << "\n";
}
Expand All @@ -288,7 +287,7 @@ int main(int argc, char** argv)
options.add_options()(
"p,profile", "Profiling mode: run once", cxxopts::value<bool>()->default_value("false"));
options.add_options()("r,resource",
"Type of device_memory_resource",
"Type of memory resource",
cxxopts::value<std::string>()->default_value("pool"));
options.add_options()("n,numallocs",
"Number of allocations (default of 0 tests a range)",
Expand Down
64 changes: 29 additions & 35 deletions cpp/benchmarks/replay/replay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
* SPDX-License-Identifier: Apache-2.0
*/

#include <rmm/aligned.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/arena_memory_resource.hpp>
#include <rmm/mr/binning_memory_resource.hpp>
#include <rmm/mr/cuda_memory_resource.hpp>
#include <rmm/mr/device_memory_resource.hpp>
#include <rmm/mr/managed_memory_resource.hpp>
#include <rmm/mr/per_device_resource.hpp>
#include <rmm/mr/pool_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/iterator>
#include <thrust/execution_policy.h>
Expand All @@ -31,56 +32,49 @@
#include <iterator>
#include <memory>
#include <numeric>
#include <optional>
#include <string>
#include <thread>

/// MR factory functions
std::shared_ptr<rmm::mr::device_memory_resource> make_cuda(std::size_t = 0)
{
return std::make_shared<rmm::mr::cuda_memory_resource>();
}
using any_device_resource = cuda::mr::any_resource<cuda::mr::device_accessible>;

std::shared_ptr<rmm::mr::device_memory_resource> make_managed(std::size_t = 0)
{
return std::make_shared<rmm::mr::managed_memory_resource>();
}
/// MR factory functions
any_device_resource make_cuda(std::size_t = 0) { return rmm::mr::cuda_memory_resource{}; }

std::shared_ptr<rmm::mr::device_memory_resource> make_simulated(std::size_t simulated_size)
{
return std::make_shared<rmm::mr::simulated_memory_resource>(simulated_size);
}
any_device_resource make_managed(std::size_t = 0) { return rmm::mr::managed_memory_resource{}; }

inline auto make_pool(std::size_t simulated_size)
inline any_device_resource make_pool(std::size_t simulated_size)
{
if (simulated_size > 0) {
return std::make_shared<rmm::mr::pool_memory_resource>(
*make_simulated(simulated_size), simulated_size, simulated_size);
rmm::mr::simulated_memory_resource sim{simulated_size};
return rmm::mr::pool_memory_resource{sim, simulated_size, simulated_size};
}
return std::make_shared<rmm::mr::pool_memory_resource>(*make_cuda(), 0);
rmm::mr::cuda_memory_resource cuda{};
return rmm::mr::pool_memory_resource{cuda, 0};
}

inline auto make_arena(std::size_t simulated_size)
inline any_device_resource make_arena(std::size_t simulated_size)
{
if (simulated_size > 0) {
return std::make_shared<rmm::mr::arena_memory_resource>(
rmm::mr::get_current_device_resource_ref(), simulated_size);
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref(),
simulated_size};
}
return std::make_shared<rmm::mr::arena_memory_resource>(
rmm::mr::get_current_device_resource_ref());
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref()};
}

inline auto make_binning(std::size_t simulated_size)
inline any_device_resource make_binning(std::size_t simulated_size)
{
auto mr = std::make_shared<rmm::mr::binning_memory_resource>(*make_pool(simulated_size));
auto pool = make_pool(simulated_size);
auto mr = rmm::mr::binning_memory_resource{pool};
const auto min_size_exp{18};
const auto max_size_exp{22};
for (std::size_t i = min_size_exp; i <= max_size_exp; i++) {
mr->add_bin(1 << i);
mr.add_bin(1 << i);
}
return mr;
}

using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>(std::size_t)>;
using MRFactoryFunc = std::function<any_device_resource(std::size_t)>;

/**
* @brief Represents an allocation made during the replay
Expand All @@ -95,15 +89,15 @@ struct allocation {

/**
* @brief Function object for running a replay benchmark with the specified
* `device_memory_resource`.
* memory resource.
*
* @tparam MR The type of the `device_memory_resource` to use for allocation
* @tparam MR The type of the memory resource to use for allocation
* replay
*/
struct replay_benchmark {
MRFactoryFunc factory_;
std::size_t simulated_size_;
std::shared_ptr<rmm::mr::device_memory_resource> mr_{};
std::optional<any_device_resource> mr_{};
std::vector<std::vector<rmm::detail::event>> const& events_{};

// Maps a pointer from the event log to an active allocation
Expand Down Expand Up @@ -173,7 +167,7 @@ struct replay_benchmark {
{
if (state.thread_index() == 0) {
RMM_LOG_INFO("------ Start of Benchmark -----");
mr_ = factory_(simulated_size_);
mr_.emplace(factory_(simulated_size_));
}
// Can't release threads until MR is set up.
barrier_.arrive_and_wait();
Expand All @@ -193,7 +187,7 @@ struct replay_benchmark {
auto alloc = ptr_alloc.second;
num_leaked++;
total_leaked += alloc.size;
mr_->deallocate_sync(alloc.ptr, alloc.size);
mr_->deallocate_sync(alloc.ptr, alloc.size, rmm::CUDA_ALLOCATION_ALIGNMENT);
}
if (num_leaked > 0) {
std::cout << "LOG shows leak of " << num_leaked << " allocations of " << total_leaked
Expand Down Expand Up @@ -225,11 +219,11 @@ struct replay_benchmark {

// rmm::detail::action::ALLOCATE_FAILURE is ignored.
if (rmm::detail::action::ALLOCATE == event.act) {
auto ptr = mr_->allocate_sync(event.size);
auto ptr = mr_->allocate_sync(event.size, rmm::CUDA_ALLOCATION_ALIGNMENT);
set_allocation(event.pointer, allocation{ptr, event.size});
} else if (rmm::detail::action::FREE == event.act) {
auto alloc = remove_allocation(event.pointer);
mr_->deallocate_sync(alloc.ptr, event.size);
mr_->deallocate_sync(alloc.ptr, event.size, rmm::CUDA_ALLOCATION_ALIGNMENT);
}

event_index++;
Expand Down Expand Up @@ -355,7 +349,7 @@ int main(int argc, char** argv)

options.add_options()("f,file", "Name of RMM log file.", cxxopts::value<std::string>());
options.add_options()("r,resource",
"Type of device_memory_resource",
"Type of memory resource",
cxxopts::value<std::string>()->default_value("pool"));
options.add_options()(
"s,size",
Expand Down
1 change: 0 additions & 1 deletion cpp/benchmarks/utilities/log_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include "rapidcsv.h"

#include <rmm/detail/error.hpp>
#include <rmm/mr/device_memory_resource.hpp>

#include <chrono>
#include <cstdint>
Expand Down
Loading
Loading