rapidsai · rapids-bot · Mar 18, 2026 · Mar 14, 2026 · Mar 18, 2026
@@ -1,6 +1,6 @@
 # =============================================================================
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 # =============================================================================
@@ -67,7 +67,7 @@ function(ConfigureBench)
 endfunction()
 
 if(BUILD_PRIMS_BENCH)
-  ConfigureBench(NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp)
+  ConfigureBench(NAME CORE_BENCH PATH core/bitset.cu core/copy.cu core/memory_tracking.cu main.cpp)
 
   ConfigureBench(NAME UTIL_BENCH PATH util/popc.cu main.cpp)
 

@@ -0,0 +1,129 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <common/benchmark.hpp>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/memory_tracking_resources.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <unistd.h>
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <filesystem>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace raft::bench::core {
+
+/** Parameters describing a single memory-tracking overhead benchmark case. */
+struct tracking_inputs {
+  /** Number of allocate/deallocate pairs issued per benchmark iteration. */
+  int num_allocs;
+  /** Size in bytes passed to every allocate/deallocate call. */
+  size_t alloc_size;
+  /**
+   * Sampling period handed to the tracking resources as std::chrono::microseconds.
+   * A negative value means the tracking wrapper is not created at all.
+   */
+  int64_t sample_rate_us;
+  /** true: allocate everything, then deallocate everything; false: allocate/deallocate in pairs. */
+  bool batch;
+};
+
+/**
+ * Benchmark fixture that times allocate/deallocate calls on the workspace memory resource,
+ * either directly through the fixture's `handle` or through a `raft::memory_tracking_resources`
+ * wrapper when a non-negative sampling period is requested.
+ */
+struct tracking_overhead : public fixture {
+  /**
+   * Store the parameters and, if `p.sample_rate_us >= 0`, create the tracking wrapper that
+   * writes to a freshly created temporary file.
+   *
+   * NOTE(review): the meaning of the `true` passed to the `fixture` base is defined by that
+   * class, which is not visible here.
+   */
+  tracking_overhead(const tracking_inputs& p) : fixture(true), params(p)
+  {
+    if (p.sample_rate_us >= 0) {
+      // mkstemp replaces the trailing XXXXXX in place and creates the file; only the name is
+      // needed here, so the descriptor is closed right away.
+      std::string tpl = (std::filesystem::temp_directory_path() / "raft_bench_XXXXXX").string();
+      int fd          = mkstemp(tpl.data());
+      // NOTE(review): a mkstemp failure (fd == -1) is not reported; the path is used as-is
+      // below regardless - confirm this best-effort behavior is intended.
+      if (fd != -1) close(fd);
+      tmp_path_ = std::move(tpl);
+      tracked_res_.emplace(handle, tmp_path_, std::chrono::microseconds{p.sample_rate_us});
+    }
+  }
+
+  /** Destroy the tracking wrapper (if any) and then delete the temporary file. */
+  ~tracking_overhead()
+  {
+    // Reset first so the wrapper is gone before its file is removed
+    // (presumably it may still write to the file while alive - verify).
+    tracked_res_.reset();
+    if (!tmp_path_.empty()) { std::remove(tmp_path_.c_str()); }
+  }
+
+  /**
+   * Publish the case parameters as counters, run the timed allocation loop on the selected
+   * resources, and report the number of processed items.
+   */
+  void run_benchmark(::benchmark::State& state) override
+  {
+    state.counters["alloc_size"]     = params.alloc_size;
+    state.counters["sample_rate_us"] = params.sample_rate_us;
+    state.counters["batch"]          = params.batch;
+
+    // Use the tracking wrapper when it was created, otherwise the plain fixture handle.
+    // NOTE(review): reinterpret_cast assumes memory_tracking_resources can be viewed as
+    // raft::resources; if it derives from raft::resources a static_cast would be safer - confirm.
+    run_allocs(state, tracked_res_ ? reinterpret_cast<raft::resources&>(*tracked_res_) : handle);
+
+    // Each allocation contributes two calls: one allocate and one deallocate.
+    state.SetItemsProcessed(state.iterations() * params.num_allocs * 2);
+  }
+
+ private:
+  /**
+   * Timed loop: performs `params.num_allocs` allocations of `params.alloc_size` bytes per
+   * iteration on the workspace resource of `res`, using the CUDA stream of `res`.
+   * Host wall-clock time of each iteration is measured with high_resolution_clock and passed to
+   * `state.SetIterationTime`; no stream synchronization is done inside this function.
+   */
+  void run_allocs(::benchmark::State& state, raft::resources& res)
+  {
+    auto mr = raft::resource::get_workspace_resource_ref(res);
+    auto sv = raft::resource::get_cuda_stream(res);
+
+    if (params.batch) {
+      // Batch mode: keep all pointers alive, then release them in reverse allocation order.
+      std::vector<void*> ptrs(params.num_allocs);
+      for (auto _ : state) {
+        auto t0 = std::chrono::high_resolution_clock::now();
+        for (int i = 0; i < params.num_allocs; i++)
+          ptrs[i] = mr.allocate(sv, params.alloc_size);
+        for (int i = params.num_allocs - 1; i >= 0; i--)
+          mr.deallocate(sv, ptrs[i], params.alloc_size);
+        state.SetIterationTime(
+          std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count());
+      }
+    } else {
+      // Ping-pong mode: every allocation is released before the next one is requested.
+      for (auto _ : state) {
+        auto t0 = std::chrono::high_resolution_clock::now();
+        for (int i = 0; i < params.num_allocs; i++) {
+          void* p = mr.allocate(sv, params.alloc_size);
+          mr.deallocate(sv, p, params.alloc_size);
+        }
+        state.SetIterationTime(
+          std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count());
+      }
+    }
+  }
+
+  // Parameters of this benchmark case.
+  tracking_inputs params;
+  // Path of the temporary file given to the tracking wrapper; empty when tracking is disabled.
+  std::string tmp_path_;
+  // Tracking wrapper around `handle`; engaged only when sample_rate_us >= 0.
+  std::optional<raft::memory_tracking_resources> tracked_res_ = std::nullopt;
+};
+
+// Benchmark cases. Each entry is {num_allocs, alloc_size, sample_rate_us, batch}.
+// sample_rate_us == -1 runs on the plain handle (no tracking wrapper); values >= 0 enable the
+// wrapper with that sampling period in microseconds.
+// Sizes: 256 B, 1 << 20 = 1 MiB, 1 << 26 = 64 MiB.
+const std::vector<tracking_inputs> inputs{
+  // ping-pong (isolates per-call overhead, pool recycles same block)
+  {10000, 256, -1, false},
+  {10000, 256, 0, false},
+  {10000, 256, 1, false},
+  {10000, 256, 10, false},
+  {10000, 256, 100, false},
+  {10000, 1 << 20, -1, false},
+  {10000, 1 << 20, 0, false},
+  {10000, 1 << 20, 1, false},
+  {10000, 1 << 20, 10, false},
+  {10000, 1 << 20, 100, false},
+  {1000, 1 << 26, -1, false},
+  {1000, 1 << 26, 0, false},
+  {1000, 1 << 26, 1, false},
+  {1000, 1 << 26, 10, false},
+  {1000, 1 << 26, 100, false},
+  // batch (allocate all, then deallocate all)
+  {10000, 256, -1, true},
+  {10000, 256, 0, true},
+  {10000, 256, 1, true},
+  {10000, 256, 10, true},
+  {10000, 256, 100, true},
+  {1000, 1 << 20, -1, true},
+  {1000, 1 << 20, 0, true},
+  {1000, 1 << 20, 1, true},
+  {1000, 1 << 20, 10, true},
+  {1000, 1 << 20, 100, true},
+};
+
+// Presumably registers one benchmark instance per entry of `inputs` (macro defined in
+// common/benchmark.hpp, not visible here - verify).
+RAFT_BENCH_REGISTER(tracking_overhead, "", inputs);
+
+}  // namespace raft::bench::core
@@ -1,10 +1,12 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #pragma once
 
+#include <raft/core/detail/nvtx_range_stack.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 
 #ifdef NVTX_ENABLED
@@ -146,6 +148,7 @@ inline void push_range_name(const char* name)
   event_attrib.messageType           = NVTX_MESSAGE_TYPE_ASCII;
   event_attrib.message.ascii         = name;
   nvtxDomainRangePushEx(domain_store<Domain>::value(), &event_attrib);
+  detail::range_name_stack_instance.push(name);
 }
 
 template <typename Domain, typename... Args>
@@ -168,12 +171,13 @@ inline void push_range(const char* format, Args... args)
 template <typename Domain>
 inline void pop_range()
 {
+  detail::range_name_stack_instance.pop();
   nvtxDomainRangePop(domain_store<Domain>::value());
 }
 
 }  // namespace raft::common::nvtx::detail
 
-#else  // NVTX_ENABLED
+#else   // NVTX_ENABLED
 
 namespace raft::common::nvtx::detail {
 
@@ -188,5 +192,4 @@ inline void pop_range()
 }
 
 }  // namespace raft::common::nvtx::detail
-
 #endif  // NVTX_ENABLED
@@ -0,0 +1,90 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <mutex>
+#include <stack>
+#include <string>
+#include <utility>
+
+namespace raft::common::nvtx {
+
+namespace detail {
+struct nvtx_range_name_stack;
+}  // namespace detail
+
+/**
+ * Mutex-protected holder of one thread's current NVTX range name and nesting depth.
+ * Only detail::nvtx_range_name_stack (a friend) can modify it; any thread holding a reference may
+ * read it through the public accessors.
+ */
+class current_range {
+  friend detail::nvtx_range_name_stack;
+
+ public:
+  /** Return a copy of the current range name together with the stack depth, taken under the lock. */
+  auto get() const -> std::pair<std::string, std::size_t>
+  {
+    std::lock_guard<std::mutex> guard{mu_};
+    return std::make_pair(value_, depth_);
+  }
+
+  /** Return a copy of the current range name only, taken under the lock. */
+  operator std::string() const
+  {
+    std::lock_guard<std::mutex> guard{mu_};
+    return value_;
+  }
+
+ private:
+  mutable std::mutex mu_;
+  std::string value_;
+  std::size_t depth_{0};
+
+  /** Replace the stored name (a null `name` yields an empty string) and the depth. */
+  void set(const char* name, std::size_t depth)
+  {
+    std::lock_guard<std::mutex> guard{mu_};
+    if (name != nullptr) {
+      value_ = name;
+    } else {
+      value_ = "";
+    }
+    depth_ = depth;
+  }
+};
+
+namespace detail {
+
+/**
+ * Stack of NVTX range names that mirrors push/pop calls and publishes the innermost name and the
+ * current depth through a shared `current_range` object.
+ */
+struct nvtx_range_name_stack {
+  /**
+   * Record a newly opened range and publish it as the current one.
+   *
+   * @param name range name; a null pointer is treated as an empty name
+   */
+  void push(const char* name)
+  {
+    // Constructing std::string from a null pointer is undefined behavior, so normalize first
+    // (current_range::set already tolerates null in the same way).
+    const char* safe_name = (name != nullptr) ? name : "";
+    stack_.emplace(safe_name);
+    current_->set(safe_name, stack_.size());
+  }
+
+  /**
+   * Drop the innermost range (no-op on an empty stack) and publish the new innermost name,
+   * or an empty name with depth 0 when nothing is left.
+   */
+  void pop()
+  {
+    if (!stack_.empty()) { stack_.pop(); }
+    current_->set(stack_.empty() ? nullptr : stack_.top().c_str(), stack_.size());
+  }
+
+  /** Read-only shared handle to the published name/depth; stays valid after this stack is gone. */
+  auto current() const -> std::shared_ptr<const current_range> { return current_; }
+
+ private:
+  // Names of all currently open ranges, innermost on top.
+  std::stack<std::string> stack_{};
+  // Shared so that other threads can keep reading after obtaining the handle via current().
+  std::shared_ptr<current_range> current_{std::make_shared<current_range>()};
+};
+
+inline thread_local nvtx_range_name_stack range_name_stack_instance{};
+
+}  // namespace detail
+
+/**
+ * Obtain a shared, read-only handle to the calling thread's current NVTX range name.
+ * The returned shared_ptr may be handed to another thread, which can then query this thread's
+ * current NVTX range name at any time.
+ */
+inline auto thread_local_current_range() -> std::shared_ptr<const current_range>
+{
+  const auto& this_thread_stack = detail::range_name_stack_instance;
+  return this_thread_stack.current();
+}
+
+}  // namespace raft::common::nvtx
@@ -1,6 +1,6 @@
 /*
  * SPDX-FileCopyrightText: Copyright (2019) Sandia Corporation
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
  */
 /*
@@ -134,7 +134,7 @@ class device_uvector {
  * @brief A container policy for device mdarray.
  */
 template <typename ElementType>
-class device_uvector_policy {
+class device_container_policy {
  public:
   using element_type   = ElementType;
   using container_type = device_uvector<element_type>;
@@ -153,8 +153,8 @@ class device_uvector_policy {
     return container_type(n, resource::get_cuda_stream(res), mr_);
   }
 
-  constexpr device_uvector_policy() = default;
-  explicit device_uvector_policy(rmm::device_async_resource_ref mr) noexcept : mr_(mr) {}
+  constexpr device_container_policy() = default;
+  explicit device_container_policy(rmm::device_async_resource_ref mr) noexcept : mr_(mr) {}
 
   [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference
   {
@@ -170,7 +170,7 @@ class device_uvector_policy {
   [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; }
 
  private:
-  rmm::device_async_resource_ref mr_{rmm::mr::get_current_device_resource()};
+  rmm::device_async_resource_ref mr_{rmm::mr::get_current_device_resource_ref()};
 };
 
 }  // namespace raft
@@ -189,7 +189,7 @@ template <typename T>
 using device_uvector = detail::fail_container<T>;
 
 template <typename ElementType>
-using device_uvector_policy = detail::fail_container_policy<ElementType>;
+using device_container_policy = detail::fail_container_policy<ElementType>;
 
 }  // namespace raft
 #endif
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -29,7 +29,7 @@ using device_coordinate_structure_view = coordinate_structure_view<RowType, ColT
 template <typename RowType,
           typename ColType,
           typename NZType,
-          template <typename T> typename ContainerPolicy = device_uvector_policy>
+          template <typename T> typename ContainerPolicy = device_container_policy>
 using device_coordinate_structure =
   coordinate_structure<RowType, ColType, NZType, true, ContainerPolicy>;
 
@@ -43,7 +43,7 @@ template <typename ElementType,
           typename RowType,
           typename ColType,
           typename NZType,
-          template <typename T> typename ContainerPolicy = device_uvector_policy,
+          template <typename T> typename ContainerPolicy = device_container_policy,
           SparsityType sparsity_type                     = SparsityType::OWNING>
 using device_coo_matrix =
   coo_matrix<ElementType, RowType, ColType, NZType, true, ContainerPolicy, sparsity_type>;
@@ -55,15 +55,15 @@ template <typename ElementType,
           typename RowType,
           typename ColType,
           typename NZType,
-          template <typename T> typename ContainerPolicy = device_uvector_policy>
+          template <typename T> typename ContainerPolicy = device_container_policy>
 using device_sparsity_owning_coo_matrix =
   coo_matrix<ElementType, RowType, ColType, NZType, true, ContainerPolicy>;
 
 template <typename ElementType,
           typename RowType,
           typename ColType,
           typename NZType,
-          template <typename T> typename ContainerPolicy = device_uvector_policy>
+          template <typename T> typename ContainerPolicy = device_container_policy>
 using device_sparsity_preserving_coo_matrix = coo_matrix<ElementType,
                                                          RowType,
                                                          ColType,