[STF] Use cudaStreamGetId instead of manually assigning unique IDs to streams (#7899)

caugonnet · web-flow · commit c0b9df640e72 · 2026-03-06T14:41:53.000Z
* Stop manually assigning unique IDs to CUDA streams: we have had cu(da)StreamGetId since CUDA 12.0 so use it instead

* (un)register_stream is not needed at all

* clang-format

* remove redundant ctors

* Added a paranoid assertion that cuStreamGetId never returns k_no_stream_id
diff --git a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh
@@ -37,33 +37,59 @@
 #include <mutex>
 #include <unordered_map>
 
+#include <cuda.h>
+
 namespace cuda::experimental::stf
 {
 class green_context_helper;
 
 // Needed to set/get affinity
 class exec_place;
 
+/** Sentinel for "no stream" / empty slot; assumed never returned by cuStreamGetId (get_stream_id asserts this). */
+inline constexpr unsigned long long k_no_stream_id = static_cast<unsigned long long>(-1);
+
 /**
- * @brief A class to store a CUDA stream along with a few information to avoid CUDA queries
+ * @brief Returns the unique stream ID from the CUDA driver (cuStreamGetId).
+ * @param stream A valid CUDA stream, or nullptr.
+ * @return The stream's ID from cuStreamGetId (never k_no_stream_id); nullptr is forwarded as the NULL stream.
+ */
+inline unsigned long long get_stream_id(cudaStream_t stream)
+{
+  unsigned long long id = 0;
+  cuda_safe_call(cuStreamGetId(reinterpret_cast<CUstream>(stream), &id));
+  _CCCL_ASSERT(id != k_no_stream_id, "Internal error: cuStreamGetId returned k_no_stream_id");
+  return id;
+}
+
+/**
+ * @brief A class to store a CUDA stream along with metadata
  *
  * It contains
  *  - the stream itself,
- *  - a unique id (proper to CUDASTF, and only valid for streams in our pool, or equal to -1),
- *  - the pool associated to the unique ID, when valid
- *  - the device index in which the stream is
+ *  - the stream's unique ID from the CUDA driver (cuStreamGetId), or k_no_stream_id if no stream,
+ *  - the device index in which the stream resides
  */
 struct decorated_stream
 {
-  decorated_stream(cudaStream_t stream = nullptr, ::std::ptrdiff_t id = -1, int dev_id = -1)
+  decorated_stream() = default;
+
+  decorated_stream(cudaStream_t stream, unsigned long long id, int dev_id = -1)
       : stream(stream)
       , id(id)
       , dev_id(dev_id)
   {}
 
+  /** Construct from stream only; id is from cuStreamGetId, dev_id is -1 (filled lazily when needed). */
+  explicit decorated_stream(cudaStream_t stream)
+      : stream(stream)
+      , id(get_stream_id(stream))
+      , dev_id(-1)
+  {}
+
   cudaStream_t stream = nullptr;
-  // Unique ID (-1 if this is not part of our pool)
-  ::std::ptrdiff_t id = -1;
+  // Unique ID from cuStreamGetId (k_no_stream_id if no stream)
+  unsigned long long id = k_no_stream_id;
   // Device in which this stream resides
   int dev_id = -1;
 };
@@ -90,10 +116,10 @@ struct stream_pool
    * @brief stream_pool constructor taking a number of slots.
    *
    * Streams are created lazily only via next(place), which activates the place and calls place.create_stream().
-   * Slot dev_id is set from the created stream; the pool does not store a device id.
+   * Slot dev_id and id are set when the stream is created in next().
    */
   explicit stream_pool(size_t n)
-      : payload(n, decorated_stream(nullptr, -1, -1))
+      : payload(n, decorated_stream(nullptr, k_no_stream_id, -1))
   {}
 
   stream_pool(stream_pool&& rhs)
@@ -156,36 +182,6 @@ public:
   static constexpr size_t data_pool_size = 4;
 
 private:
-  /**
-   * @brief A helper class to maintain a set of available IDs, and attributes IDs
-   */
-  class id_pool
-  {
-  public:
-    ~id_pool()
-    {
-      assert(released.load() == current.load());
-    }
-
-    ::std::ptrdiff_t get_unique_id(size_t cnt = 1)
-    {
-      // Use fetch_add to atomically increment current and return the previous value
-      return current.fetch_add(cnt);
-    }
-
-    void release_unique_id(::std::ptrdiff_t /* id */, size_t cnt = 1)
-    {
-      // Use fetch_add to atomically increment released
-      released.fetch_add(cnt);
-    }
-
-  private:
-    // next available ID
-    ::std::atomic<::std::ptrdiff_t> current{0};
-    // Number of IDs released, for bookkeeping
-    ::std::atomic<::std::ptrdiff_t> released{0};
-  };
-
   /**
    * @brief This class implements a matrix to keep track of the previous
    * synchronization that occurred between each pair of streams in our pools.
@@ -195,7 +191,7 @@ private:
    * ID) is implied by the previous synchronization, so it can be skipped thanks
    * to stream-ordering of operations.
    *
-   * This is implemented as a hash table where keys are pairs of IDs.
+   * Keys are pairs of stream IDs from cuStreamGetId.
    */
   class last_event_per_stream
   {
@@ -204,10 +200,10 @@ private:
     // located on stream "from" to stream "dst" (stream dst waits for the
     // event)
     // Returned value : boolean indicating if we can skip the synchronization
-    bool validate_sync_and_update(::std::ptrdiff_t dst, ::std::ptrdiff_t src, int event_id)
+    bool validate_sync_and_update(unsigned long long dst, unsigned long long src, int event_id)
     {
-      // If either of the streams is not from the pool, do not skip
-      if (dst == -1 || src == -1)
+      // If either of the streams has no valid id, do not skip
+      if (dst == k_no_stream_id || src == k_no_stream_id)
       {
         return false;
       }
@@ -232,10 +228,10 @@ private:
     }
 
   private:
-    // For each pair of unique IDs, we keep the last event id
-    ::std::unordered_map<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>,
+    // For each pair of stream IDs (from cuStreamGetId), we keep the last event id
+    ::std::unordered_map<::std::pair<unsigned long long, unsigned long long>,
                          int,
-                         cuda::experimental::stf::hash<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>>>
+                         cuda::experimental::stf::hash<::std::pair<unsigned long long, unsigned long long>>>
       interactions;
 
     ::std::mutex mtx;
@@ -295,7 +291,7 @@ private:
       for (auto i : each(n))
       {
         ::std::ignore = i;
-        new_payload.emplace_back(nullptr, ids.get_unique_id(), dev_id);
+        new_payload.emplace_back(nullptr, k_no_stream_id, dev_id);
       }
 
       ::std::lock_guard<::std::mutex> locker(p.mtx);
@@ -312,7 +308,6 @@ private:
       // Clean up outside the critical section
       for (auto& e : goner)
       {
-        ids.release_unique_id(e.id);
         if (e.stream)
         {
           cuda_safe_call(cudaStreamDestroy(e.stream));
@@ -321,9 +316,6 @@ private:
     }
 
   public:
-    // These are constructed and destroyed in reversed order
-    id_pool ids;
-
     // This memorize what was the last event used to synchronize a pair of streams
     last_event_per_stream cached_syncs;
 
@@ -359,19 +351,7 @@ public:
     return pimpl->get_device_stream_pool(dev_id, for_computation);
   }
 
-  ::std::ptrdiff_t get_unique_id(size_t cnt = 1)
-  {
-    assert(pimpl);
-    return pimpl->ids.get_unique_id(cnt);
-  }
-
-  void release_unique_id(::std::ptrdiff_t id, size_t cnt = 1)
-  {
-    assert(pimpl);
-    return pimpl->ids.release_unique_id(id, cnt);
-  }
-
-  bool validate_sync_and_update(::std::ptrdiff_t dst, ::std::ptrdiff_t src, int event_id)
+  bool validate_sync_and_update(unsigned long long dst, unsigned long long src, int event_id)
   {
     assert(pimpl);
     return pimpl->cached_syncs.validate_sync_and_update(dst, src, event_id);
@@ -446,54 +426,6 @@ public:
   }
 };
 
-//! @brief Registers a user-provided CUDA stream with asynchronous resources
-//!
-//! @details This optimization records a CUDA stream in the provided asynchronous resources handle,
-//! creating a decorated_stream object that encapsulates:
-//! - The original stream handle
-//! - A unique identifier for stream tracking
-//! - The associated device ID
-//!
-//! @param[in,out] async_resources Handle to asynchronous resources manager
-//! @param[in] user_stream Raw CUDA stream to register. Must be a valid stream.
-//!
-//! @return decorated_stream Object containing:
-//!         - Original stream handle
-//!         - Unique ID from async_resources
-//!         - Device ID associated with the stream
-//!
-//! @pre `user_stream` must be a valid CUDA stream created with `cudaStreamCreate` or equivalent
-//! @note This registration is an optimization to avoid repeated stream metadata lookups
-//!       in performance-critical code paths
-inline decorated_stream register_stream(async_resources_handle& async_resources, cudaStream_t user_stream)
-{
-  // Get a unique ID
-  const auto id    = async_resources.get_unique_id();
-  const int dev_id = get_device_from_stream(user_stream);
-
-  return decorated_stream(user_stream, id, dev_id);
-}
-
-//! @brief Unregisters a decorated CUDA stream from asynchronous resources
-//!
-//! @details Performs cleanup operations to release resources associated with a previously
-//! registered stream. This includes:
-//! - Releasing the unique ID back to the resource manager
-//! - Invalidating the decorated stream's internal ID
-//!
-//! @param[in,out] async_resources Handle to asynchronous resources manager
-//! @param[in,out] dstream Decorated stream to unregister. Its `id` will be set to -1.
-//!
-//! @pre `dstream.id` must be valid (≥ 0) before calling this function
-//! @post `dstream.id == -1` and associated resources are released
-//! @note Should be paired with register_stream() for proper resource management
-inline void unregister_stream(async_resources_handle& async_resources, decorated_stream& dstream)
-{
-  async_resources.release_unique_id(dstream.id);
-  // reset the decorated stream
-  dstream.id = -1;
-}
-
 #ifdef UNITTESTED_FILE
 /*
  * This test ensures that the async_resources_handle type is default
diff --git a/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh b/cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh
@@ -114,7 +114,7 @@ public:
 inline exec_place_cuda_stream exec_place::cuda_stream(cudaStream_t stream)
 {
   int devid = get_device_from_stream(stream);
-  return exec_place_cuda_stream(decorated_stream(stream, -1, devid));
+  return exec_place_cuda_stream(decorated_stream(stream, get_stream_id(stream), devid));
 }
 
 inline exec_place_cuda_stream exec_place::cuda_stream(const decorated_stream& dstream)
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -1218,6 +1218,7 @@ inline decorated_stream stream_pool::next(const exec_place& place)
   {
     exec_place_guard guard(place);
     result.stream = place.create_stream();
+    result.id     = get_stream_id(result.stream);
     result.dev_id = get_device_from_stream(result.stream);
   }
 
diff --git a/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/stream/internal/event_types.cuh
@@ -255,7 +255,7 @@ public:
     return dstream;
   }
 
-  ::std::ptrdiff_t get_stream_id() const
+  unsigned long long get_stream_id() const
   {
     return dstream.id;
   }
@@ -397,7 +397,7 @@ private:
     for (const auto& e : prereq_in)
     {
       cudaStream_t stream;
-      ::std::ptrdiff_t stream_id = -1;
+      unsigned long long stream_id = 0;
       auto se   = reserved::handle<stream_and_event, reserved::handle_flags::non_null>(e, reserved::use_static_cast);
       stream    = se->get_stream();
       stream_id = se->get_stream_id();
@@ -415,7 +415,7 @@ private:
       if (stream_dev == devid)
       {
         //    fprintf(stderr, "Found matching device %d with stream %p\n", devid, stream);
-        return decorated_stream(stream, stream_id, devid);
+        return decorated_stream(stream, stream_id, static_cast<int>(stream_dev));
       }
     }
 
diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
@@ -156,7 +156,7 @@ public:
               auto se                    = reserved::handle<stream_and_event>(e, reserved::use_static_cast);
               decorated_stream candidate = se->get_decorated_stream();
 
-              if (candidate.id != -1)
+              if (candidate.id != k_no_stream_id)
               {
                 for (const decorated_stream& pool_s : pool)
                 {
diff --git a/cudax/test/stf/places/cuda_stream_place.cu b/cudax/test/stf/places/cuda_stream_place.cu
@@ -53,30 +53,16 @@ int main()
   auto lX = ctx.logical_data(X);
   auto lY = ctx.logical_data(Y);
 
-  /* Compute Y = Y + alpha X */
+  /* Compute Y = Y + alpha X on the user stream */
   auto where = exec_place::cuda_stream(stream);
 
-  for (size_t iter = 0; iter < 10; iter++)
+  for (size_t iter = 0; iter < 20; iter++)
   {
     ctx.parallel_for(where, lX.shape(), lX.read(), lY.rw())->*[alpha] __device__(size_t i, auto x, auto y) {
       y(i) += alpha * x(i);
     };
   }
 
-  /* Associate the CUDA stream with a unique internal ID to speed up synchronizations */
-  auto rstream = register_stream(ctx.async_resources(), stream);
-  auto where2  = exec_place::cuda_stream(rstream);
-
-  for (size_t iter = 0; iter < 10; iter++)
-  {
-    ctx.parallel_for(where2, lX.shape(), lX.read(), lY.rw())->*[alpha] __device__(size_t i, auto x, auto y) {
-      y(i) += alpha * x(i);
-    };
-  }
-
-  // Remove the association
-  unregister_stream(ctx.async_resources(), rstream);
-
   ctx.finalize();
 
   for (size_t i = 0; i < N; i++)

Original file line number	Diff line number	Diff line change
`@@ -114,7 +114,7 @@ public:`
`114`	`114`	`inline exec_place_cuda_stream exec_place::cuda_stream(cudaStream_t stream)`
`115`	`115`	`{`
`116`	`116`	`int devid = get_device_from_stream(stream);`
`117`		`- return exec_place_cuda_stream(decorated_stream(stream, -1, devid));`
	`117`	`+ return exec_place_cuda_stream(decorated_stream(stream, get_stream_id(stream), devid));`
`118`	`118`	`}`
`119`	`119`
`120`	`120`	`inline exec_place_cuda_stream exec_place::cuda_stream(const decorated_stream& dstream)`
Original file line number	Diff line number	Diff line change
`@@ -1218,6 +1218,7 @@ inline decorated_stream stream_pool::next(const exec_place& place)`
`1218`	`1218`	`{`
`1219`	`1219`	`exec_place_guard guard(place);`
`1220`	`1220`	`result.stream = place.create_stream();`
	`1221`	`+ result.id = get_stream_id(result.stream);`
`1221`	`1222`	`result.dev_id = get_device_from_stream(result.stream);`
`1222`	`1223`	`}`
`1223`	`1224`
Original file line number	Diff line number	Diff line change
`@@ -255,7 +255,7 @@ public:`
`255`	`255`	`return dstream;`
`256`	`256`	`}`
`257`	`257`
`258`		`- ::std::ptrdiff_t get_stream_id() const`
	`258`	`+ unsigned long long get_stream_id() const`
`259`	`259`	`{`
`260`	`260`	`return dstream.id;`
`261`	`261`	`}`
`@@ -397,7 +397,7 @@ private:`
`397`	`397`	`for (const auto& e : prereq_in)`
`398`	`398`	`{`
`399`	`399`	`cudaStream_t stream;`
`400`		`- ::std::ptrdiff_t stream_id = -1;`
	`400`	`+ unsigned long long stream_id = 0;`
`401`	`401`	`auto se = reserved::handle<stream_and_event, reserved::handle_flags::non_null>(e, reserved::use_static_cast);`
`402`	`402`	`stream = se->get_stream();`
`403`	`403`	`stream_id = se->get_stream_id();`
`@@ -415,7 +415,7 @@ private:`
`415`	`415`	`if (stream_dev == devid)`
`416`	`416`	`{`
`417`	`417`	`// fprintf(stderr, "Found matching device %d with stream %p\n", devid, stream);`
`418`		`- return decorated_stream(stream, stream_id, devid);`
	`418`	`+ return decorated_stream(stream, stream_id, static_cast<int>(stream_dev));`
`419`	`419`	`}`
`420`	`420`	`}`
`421`	`421`
Original file line number	Diff line number	Diff line change
`@@ -156,7 +156,7 @@ public:`
`156`	`156`	`auto se = reserved::handle<stream_and_event>(e, reserved::use_static_cast);`
`157`	`157`	`decorated_stream candidate = se->get_decorated_stream();`
`158`	`158`
`159`		`- if (candidate.id != -1)`
	`159`	`+ if (candidate.id != k_no_stream_id)`
`160`	`160`	`{`
`161`	`161`	`for (const decorated_stream& pool_s : pool)`
`162`	`162`	`{`