Skip to content

Commit 5f2e26f

Browse files
authored
[STF] Make green context places independent from async_resources_handle (#7937)
* Introduce a PIMPL idiom in the stream_pool class
* Extract stream related utilities from async_resources_handle.cuh, and from utility/stream_to_dev.cuh, to put them in a header located in places/
* Restore some methods which were erased by mistake
* Fix compilation
* Stream pools no longer belong to the async_resources_handle but are stored directly along with the execution places; create an actual exec_place_device class instead of relying on the base exec_place type
* Remove get_stream_from_pool, which is unnecessary
* Fix where the pool_size variable gets read from
* async_resources_handle.cuh is not needed in places.cuh anymore
* Use the appropriate (sufficient) header
* Make the green context place implementation independent from async_resources_handle
* Only include green context methods with CUDA 12.4+
* Add missing header
1 parent 09b7323 commit 5f2e26f

File tree

12 files changed

+119
-155
lines changed

12 files changed

+119
-155
lines changed

cudax/include/cuda/experimental/__stf/graph/graph_task.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ public:
103103
if (is_capture_enabled())
104104
{
105105
// Select a stream from the pool
106-
capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream;
106+
capture_stream = get_exec_place().getStream(true).stream;
107107
// Use relaxed capture mode to allow capturing workloads that lazily initialize
108108
// resources (e.g., set up memory pools)
109109
cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeRelaxed));
@@ -366,7 +366,7 @@ public:
366366
//
367367

368368
// Get a stream from the pool associated to the execution place
369-
capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream;
369+
capture_stream = get_exec_place().getStream(true).stream;
370370

371371
cudaGraph_t childGraph = nullptr;
372372
// Use relaxed capture mode to allow capturing workloads that lazily initialize
@@ -628,7 +628,7 @@ public:
628628
auto lock = lock_ctx_graph();
629629

630630
// Get a stream from the pool associated to the execution place
631-
cudaStream_t capture_stream = get_exec_place().getStream(ctx.async_resources(), true).stream;
631+
cudaStream_t capture_stream = get_exec_place().getStream(true).stream;
632632

633633
cudaGraph_t childGraph = nullptr;
634634
// Use relaxed capture mode to allow capturing workloads that lazily initialize

cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh

Lines changed: 19 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
#include <cuda/experimental/__stf/internal/exec_affinity.cuh>
2929
#include <cuda/experimental/__stf/internal/executable_graph_cache.cuh>
30-
#include <cuda/experimental/__stf/places/stream_pool.cuh>
30+
#include <cuda/experimental/__stf/places/exec/green_context.cuh>
3131
#include <cuda/experimental/__stf/utility/core.cuh>
3232
#include <cuda/experimental/__stf/utility/cuda_safe_call.cuh>
3333
#include <cuda/experimental/__stf/utility/hash.cuh> // for ::std::hash<::std::pair<::std::ptrdiff_t, ::std::ptrdiff_t>>
@@ -40,8 +40,6 @@
4040

4141
namespace cuda::experimental::stf
4242
{
43-
class green_context_helper;
44-
4543
/**
4644
* @brief A handle which stores resources useful for an efficient asynchronous
4745
* execution. For example this will store the pools of CUDA streams.
@@ -52,12 +50,6 @@ class green_context_helper;
5250
*/
5351
class async_resources_handle
5452
{
55-
// TODO: optimize based on measurements
56-
57-
public:
58-
static constexpr size_t pool_size = 4;
59-
static constexpr size_t data_pool_size = 4;
60-
6153
private:
6254
/**
6355
* @brief This class implements a matrix to keep track of the previous
@@ -118,43 +110,25 @@ private:
118110
class impl
119111
{
120112
public:
113+
#if _CCCL_CTK_AT_LEAST(12, 4)
121114
impl()
122115
{
123116
const int ndevices = cuda_try<cudaGetDeviceCount>();
124-
assert(ndevices > 0);
125-
assert(pool_size > 0);
126-
assert(data_pool_size > 0);
127-
117+
_CCCL_ASSERT(ndevices > 0, "invalid device count");
128118
per_device_gc_helper.resize(ndevices, nullptr);
129-
/* For every device, we keep two pools, one dedicated to computation,
130-
* the other for auxiliary methods such as data transfers. This is intended to
131-
* improve overlapping of transfers and computation, for example. */
132-
pool.reserve(ndevices);
133-
for (auto d : each(ndevices))
134-
{
135-
::std::ignore = d;
136-
pool.emplace_back(stream_pool(pool_size), stream_pool(data_pool_size));
137-
}
138-
}
139-
140-
stream_pool& get_device_stream_pool(int dev_id, bool for_computation)
141-
{
142-
assert(dev_id < int(pool.size()));
143-
return for_computation ? pool[dev_id].first : pool[dev_id].second;
144119
}
120+
#endif // _CCCL_CTK_AT_LEAST(12, 4)
145121

146122
public:
147123
// This memorize what was the last event used to synchronize a pair of streams
148124
last_event_per_stream cached_syncs;
149125

150-
// For each device, a pair of stream_pool objects, each stream_pool objects
151-
// stores a pool of streams on this device
152-
::std::vector<::std::pair<stream_pool, stream_pool>> pool;
153-
154126
/* Store previously instantiated graphs, indexed by the number of edges and nodes */
155127
executable_graph_cache cached_graphs;
156128

129+
#if _CCCL_CTK_AT_LEAST(12, 4)
157130
::std::vector<::std::shared_ptr<green_context_helper>> per_device_gc_helper;
131+
#endif // _CCCL_CTK_AT_LEAST(12, 4)
158132

159133
mutable exec_affinity affinity;
160134
};
@@ -173,12 +147,6 @@ public:
173147
return pimpl != nullptr;
174148
}
175149

176-
stream_pool& get_device_stream_pool(int dev_id, bool for_computation) const
177-
{
178-
assert(pimpl);
179-
return pimpl->get_device_stream_pool(dev_id, for_computation);
180-
}
181-
182150
bool validate_sync_and_update(unsigned long long dst, unsigned long long src, int event_id)
183151
{
184152
assert(pimpl);
@@ -192,6 +160,7 @@ public:
192160
return pimpl->cached_graphs.query(nnodes, nedges, mv(g));
193161
}
194162

163+
#if _CCCL_CTK_AT_LEAST(12, 4)
195164
// Get the green context helper cached for this device (or let the user initialize it)
196165
auto& gc_helper(int dev_id)
197166
{
@@ -201,7 +170,17 @@ public:
201170
}
202171

203172
// Get green context helper with lazy initialization
204-
::std::shared_ptr<green_context_helper> get_gc_helper(int dev_id, int sm_count);
173+
::std::shared_ptr<green_context_helper> get_gc_helper(int dev_id, int sm_count)
174+
{
175+
assert(pimpl);
176+
assert(dev_id < int(pimpl->per_device_gc_helper.size()));
177+
auto& h = pimpl->per_device_gc_helper[dev_id];
178+
if (!h)
179+
{
180+
h = ::std::make_shared<green_context_helper>(sm_count, dev_id);
181+
}
182+
return h;
183+
}
205184

206185
// Register an external green context helper
207186
void register_gc_helper(int dev_id, ::std::shared_ptr<green_context_helper> helper)
@@ -210,6 +189,7 @@ public:
210189
assert(dev_id < int(pimpl->per_device_gc_helper.size()));
211190
pimpl->per_device_gc_helper[dev_id] = ::std::move(helper);
212191
}
192+
#endif // _CCCL_CTK_AT_LEAST(12, 4)
213193

214194
exec_affinity& get_affinity()
215195
{

cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -910,7 +910,7 @@ public:
910910
auto pick_dstream()
911911
{
912912
exec_place p = default_exec_place();
913-
return p.get_stream_pool(async_resources(), true).next(p);
913+
return p.get_stream_pool(true).next(p);
914914
}
915915
cudaStream_t pick_stream()
916916
{

cudax/include/cuda/experimental/__stf/places/exec/cuda_stream.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
# pragma system_header
2626
#endif // no system header
2727

28-
#include <cuda/experimental/__stf/internal/backend_ctx.cuh>
28+
#include <cuda/experimental/__stf/places/places.cuh>
2929

3030
namespace cuda::experimental::stf
3131
{
@@ -56,7 +56,7 @@ public:
5656
return exec_place::device(dstream.dev_id).deactivate(prev);
5757
}
5858

59-
stream_pool& get_stream_pool(async_resources_handle&, bool) const override
59+
stream_pool& get_stream_pool(bool) const override
6060
{
6161
return dummy_pool;
6262
}

0 commit comments

Comments (0)