rapidsai · bdice · Mar 25, 2026 · Mar 18, 2026 · Mar 25, 2026 · Mar 25, 2026
@@ -24,7 +24,7 @@ namespace mr {
 /**
  * @brief A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
  *
- * Allocation (do_allocate()) and deallocation (do_deallocate()) are thread-safe. Also,
+ * Allocation and deallocation are thread-safe. Also,
  * this class is compatible with CUDA per-thread default stream.
  *
  * GPU memory is divided into a global arena, per-thread arenas for default streams, and per-stream

@@ -28,12 +28,10 @@ namespace mr {
  *
  * * Returns a pointer to an allocation of at least `bytes` usable immediately on
  *   `stream`. The stream-ordered behavior requirements are identical to
- *   `device_memory_resource::allocate`.
+ *   those of a memory resource's `allocate` member function.
  *
- * * This signature is compatible with `do_allocate` but adds the extra function
- *   parameter `arg`. The `arg` is provided to the constructor of the
- *   `callback_memory_resource` and will be forwarded along to every invocation
- *   of the callback function.
+ * * The `arg` is provided to the constructor of the `callback_memory_resource`
+ *   and will be forwarded along to every invocation of the callback function.
  */
 using allocate_callback_t = std::function<void*(std::size_t, cuda_stream_view, void*)>;
 
@@ -46,12 +44,10 @@ using allocate_callback_t = std::function<void*(std::size_t, cuda_stream_view, v
  * * Deallocates memory pointed to by `ptr`. `bytes` specifies the size of the allocation
  *   in bytes, and must equal the value of `bytes` that was passed to the allocate callback
  *   function. The stream-ordered behavior requirements are identical to
- *   `device_memory_resource::deallocate`.
+ *   those of a memory resource's `deallocate` member function.
  *
- * * This signature is compatible with `do_deallocate` but adds the extra function
- *   parameter `arg`. The `arg` is provided to the constructor of the
- *   `callback_memory_resource` and will be forwarded along to every invocation
- *   of the callback function.
+ * * The `arg` is provided to the constructor of the `callback_memory_resource`
+ *   and will be forwarded along to every invocation of the callback function.
  */
 using deallocate_callback_t = std::function<void(void*, std::size_t, cuda_stream_view, void*)>;
 

@@ -68,8 +68,6 @@ class cuda_async_view_memory_resource final {
   cuda_async_view_memory_resource& operator=(cuda_async_view_memory_resource&&) =
     default;  ///< @default_move_assignment{cuda_async_view_memory_resource}
 
-  // -- CCCL memory resource interface (hides device_memory_resource versions) --
-
   /**
    * @brief Allocates memory of size at least \p bytes.
    *

@@ -34,8 +34,6 @@ class cuda_memory_resource final {
   cuda_memory_resource& operator=(cuda_memory_resource&&) =
     default;  ///< @default_move_assignment{cuda_memory_resource}
 
-  // -- CCCL memory resource interface (hides device_memory_resource versions) --
-
   /**
    * @brief Allocates memory of size at least \p bytes.
    *

@@ -480,8 +480,6 @@ inline auto max_free_size(std::set<superblock> const& superblocks)
  *
  * The global arena is a shared memory pool from which other arenas allocate superblocks.
  *
- * @tparam Upstream Memory resource to use for allocating the arena. Implements
- * rmm::mr::device_memory_resource interface.
  */
 class global_arena final {
  public:
@@ -778,8 +776,6 @@ class global_arena final {
  * An arena is a per-thread or per-non-default-stream memory pool. It allocates
  * superblocks from the global arena, and returns them when the superblocks become empty.
  *
- * @tparam Upstream Memory resource to use for allocating the global arena. Implements
- * rmm::mr::device_memory_resource interface.
  */
 class arena {
  public:
@@ -957,8 +953,6 @@ class arena {
  *
  * This is useful when a thread is about to terminate, and it contains a per-thread arena.
  *
- * @tparam Upstream Memory resource to use for allocating the global arena. Implements
- * rmm::mr::device_memory_resource interface.
  */
 class arena_cleaner {
  public:

@@ -76,39 +76,39 @@ class stream_ordered_memory_resource : public crtp<PoolResource> {
   stream_ordered_memory_resource& operator=(stream_ordered_memory_resource&&)      = delete;
 
   /**
-   * @brief Allocates memory of size at least `size` bytes.
+   * @brief Allocates memory of size at least `bytes` bytes.
    *
    * The returned pointer has at least 256B alignment.
    *
    * @throws `std::bad_alloc` if the requested allocation could not be fulfilled
    *
    * @param stream The stream in which to order this allocation
-   * @param size The size in bytes of the allocation
+   * @param bytes The size in bytes of the allocation
    * @param alignment Unused; alignment is always at least `CUDA_ALLOCATION_ALIGNMENT`
    * @return void* Pointer to the newly allocated memory
    */
-  void* allocate(cuda::stream_ref stream, std::size_t size, std::size_t /*alignment*/)
+  void* allocate(cuda::stream_ref stream, std::size_t bytes, std::size_t /*alignment*/)
   {
     auto const strm = cuda_stream_view{stream};
 
-    RMM_LOG_TRACE("[A][stream %s][%zuB]", rmm::detail::format_stream(strm), size);
+    RMM_LOG_TRACE("[A][stream %s][%zuB]", rmm::detail::format_stream(strm), bytes);
 
-    if (size == 0) { return nullptr; }
+    if (bytes == 0) { return nullptr; }
 
     lock_guard lock(mtx_);
 
     auto stream_event = get_event(strm);
 
-    size = rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
-    RMM_EXPECTS(size <= this->underlying().get_maximum_allocation_size(),
+    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
+    RMM_EXPECTS(bytes <= this->underlying().get_maximum_allocation_size(),
                 std::string("Maximum allocation size exceeded (failed to allocate ") +
-                  rmm::detail::format_bytes(size) + ")",
+                  rmm::detail::format_bytes(bytes) + ")",
                 rmm::out_of_memory);
-    auto const block = this->underlying().get_block(size, stream_event);
+    auto const block = this->underlying().get_block(bytes, stream_event);
 
     RMM_LOG_TRACE("[A][stream %s][%zuB][%p]",
                   rmm::detail::format_stream(stream_event.stream),
-                  size,
+                  bytes,
                   block.pointer());
 
     log_summary_trace();
@@ -121,29 +121,29 @@ class stream_ordered_memory_resource : public crtp<PoolResource> {
    *
    * @param stream The stream in which to order this deallocation
    * @param ptr Pointer to be deallocated
-   * @param size The size in bytes of the allocation to deallocate
+   * @param bytes The size in bytes of the allocation to deallocate
    * @param alignment Unused
    */
   void deallocate(cuda::stream_ref stream,
                   void* ptr,
-                  std::size_t size,
+                  std::size_t bytes,
                   std::size_t /*alignment*/) noexcept
   {
     auto const strm = cuda_stream_view{stream};
 
-    RMM_LOG_TRACE("[D][stream %s][%zuB][%p]", rmm::detail::format_stream(strm), size, ptr);
+    RMM_LOG_TRACE("[D][stream %s][%zuB][%p]", rmm::detail::format_stream(strm), bytes, ptr);
 
-    if (size == 0 || ptr == nullptr) { return; }
+    if (bytes == 0 || ptr == nullptr) { return; }
 
     lock_guard lock(mtx_);
     auto stream_event = get_event(strm);
 
-    size             = rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
-    auto const block = this->underlying().free_block(ptr, size);
+    bytes            = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
+    auto const block = this->underlying().free_block(ptr, bytes);
 
     // TODO: cudaEventRecord has significant overhead on deallocations. For the non-PTDS case
-    // we may be able to delay recording the event in some situations. But using events rather than
-    // streams allows stealing from deleted streams.
+    // we may be able to delay recording the event in some situations. But using events rather
+    // than streams allows stealing from deleted streams.
     RMM_ASSERT_CUDA_SUCCESS(cudaEventRecord(stream_event.event, strm.value()));
 
     stream_free_blocks_[stream_event].insert(block);