rahulsingh-intel
diff --git a/‎aten/src/ATen/cuda/CUDAGeneratorImpl.cpp‎
Lines changed: 1 addition & 1 deletion b/‎aten/src/ATen/cuda/CUDAGeneratorImpl.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎aten/src/ATen/cuda/CUDAGraph.cpp‎
Lines changed: 317 additions & 8 deletions b/‎aten/src/ATen/cuda/CUDAGraph.cpp‎
Lines changed: 317 additions & 8 deletions
diff --git a/‎aten/src/ATen/cuda/CUDAGraph.cu‎
Lines changed: 30 additions & 0 deletions b/‎aten/src/ATen/cuda/CUDAGraph.cu‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎aten/src/ATen/cuda/CUDAGraph.h‎
Lines changed: 57 additions & 1 deletion b/‎aten/src/ATen/cuda/CUDAGraph.h‎
Lines changed: 57 additions & 1 deletion
diff --git a/‎docs/source/notes/cuda.rst‎
Lines changed: 49 additions & 1 deletion b/‎docs/source/notes/cuda.rst‎
Lines changed: 49 additions & 1 deletion
diff --git a/‎docs/source/torch.compiler_cudagraph_trees.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/torch.compiler_cudagraph_trees.rst‎
Lines changed: 1 addition & 1 deletion
@@ -347,7 +347,7 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
  */
 void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
   at::cuda::assertNotCapturing(
-      "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing.");
+      "Please ensure to utilize the CUDAGeneratorImpl::graphsafe_set_state method during capturing.");
   static const size_t seed_size = sizeof(uint64_t);
   static const size_t offset_size = sizeof(int64_t);
   static const size_t total_size = seed_size + offset_size;
 
@@ -0,0 +1,30 @@
+#include <ATen/cuda/CUDAGraph.h>
+#include <ATen/cuda/Exceptions.h>
+
+namespace at::cuda {
+
+namespace {
+
+#if !(defined(USE_ROCM)) && (defined(CUDA_VERSION) && CUDA_VERSION >= 12040)
+// Device-side helper: dereferences the boolean predicate pointer and stores
+// the result as the value of the graph conditional `handle` through
+// cudaGraphSetConditional. It does no thread indexing, so it is meant to be
+// launched with a single thread.
+__global__ void set_conditional_handle_kernel(
+    cudaGraphConditionalHandle handle,
+    const bool* value) {
+  const bool predicate = *value;
+  cudaGraphSetConditional(handle, predicate);
+}
+#endif
+}
+
+// Enqueues a single-thread kernel on the current CUDA stream that reads the
+// boolean predicate stored in `scalar_cuda_pred_tensor` and writes it as the
+// value of the conditional `handle`.
+//
+// The tensor's data pointer is dereferenced on the device, so the tensor must
+// live on a CUDA device and hold exactly one element; both are validated up
+// front so a bad argument fails here instead of as an illegal memory access
+// when the kernel (or a replay of the captured graph) runs.
+//
+// Only available on non-ROCm builds with CUDA >= 12.4; every other build
+// throws unconditionally.
+void CUDAGraph::set_conditional_handle(
+    cudaGraphConditionalHandle handle,
+    const Tensor& scalar_cuda_pred_tensor) {
+#if !(defined(USE_ROCM)) && (defined(CUDA_VERSION) && CUDA_VERSION >= 12040)
+  TORCH_CHECK(
+      scalar_cuda_pred_tensor.is_cuda() &&
+          scalar_cuda_pred_tensor.numel() == 1,
+      "CUDAGraph::set_conditional_handle expects the predicate to be a CUDA tensor with exactly one element");
+  set_conditional_handle_kernel<<<1, 1, 0, getCurrentCUDAStream()>>>(
+      handle, scalar_cuda_pred_tensor.const_data_ptr<bool>());
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+#else
+  // TORCH_CHECK(false, ...) replaces the deprecated AT_ERROR and states the
+  // actual requirement; it always throws, so no trailing return is needed.
+  TORCH_CHECK(
+      false,
+      "CUDAGraph::set_conditional_handle requires a non-ROCm build with CUDA >= 12.4");
+#endif
+}
+
+} // namespace at::cuda
@@ -3,9 +3,20 @@
 #include <ATen/Tensor.h>
 #include <c10/core/Device.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/flat_hash_map.h>
 
+#include <limits>
+#include <stack>
+
+#if defined(USE_ROCM) || !(defined(CUDA_VERSION) && CUDA_VERSION >= 12040)
+// this type is not defined until CUDA 12.4, but we use it as a
+// parameter type and return type in some below functions, so we give
+// it the same definition as in CUDA 12.4.
+typedef unsigned long long cudaGraphConditionalHandle;
+#endif // defined(USE_ROCM) || !(defined(CUDA_VERSION) && CUDA_VERSION >= 12040)
+
 namespace at {
 
 struct Generator;
@@ -14,6 +25,9 @@ struct CUDAGeneratorState;
 
 namespace cuda {
 
+using UniquePtrExternalCudaStream =
+    std::unique_ptr<cudaStream_t, void (*)(cudaStream_t*)>;
+
 // Standalone way to get a unique mempool id usable as a pool=... argument
 // to CUDAGraph::capture_begin
 TORCH_CUDA_CPP_API MempoolId_t graph_pool_handle();
@@ -22,6 +36,26 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
   CUDAGraph();
   ~CUDAGraph();
 
+  // Copy and move constructors and assignments are disabled. These
+  // were disabled because pybind11 believed that CUDAGraph was copy
+  // constructible because
+  // pybind11::is_copy_constructible<CUDAGraph>::value originally
+  // evaluated to true. However, it cannot generate a copy constructor
+  // because CUDAGeneratorState, one of CUDAGraph's members, is an
+  // incomplete type unless CUDAGeneratorImpl.h is included. However,
+  // that would create a circular dependency between
+  // CUDAGeneratorImpl.h and CUDAGraph.h. Disabling the copy and move
+  // constructors is the most straightforward way to prevent pybind11
+  // from trying to generate default implementations of them.
+  //
+  // We needed pybind11 to return a reference to a CUDAGraph as part
+  // of wrapping CUDAGraph::get_currently_capturing_graph, which
+  // unearthed the above problem.
+  CUDAGraph(const CUDAGraph&) = delete;
+  CUDAGraph& operator=(const CUDAGraph&) = delete;
+  CUDAGraph(CUDAGraph&& other) = delete;
+  CUDAGraph& operator=(CUDAGraph&& other) = delete;
+
   static void inc_pending_event_queries();
   static void dec_pending_event_queries();
   static int num_pending_event_queries();
@@ -38,6 +72,19 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
   void enable_debug_mode();
   void debug_dump(const std::string& debug_path);
 
+  static CUDAGraph* get_currently_capturing_graph();
+  void begin_capture_to_if_node(const Tensor& scalar_cuda_pred_tensor);
+  cudaGraphConditionalHandle begin_capture_to_while_loop_node(
+      const Tensor& scalar_cuda_pred_tensor);
+  void end_capture_to_conditional_node();
+  static void set_conditional_handle(
+      cudaGraphConditionalHandle handle,
+      const Tensor& scalar_cuda_pred_tensor);
+
+ private:
+  std::function<bool(cudaStream_t)> create_allocate_filter();
+  std::function<bool(cudaStream_t)> create_child_allocate_filter();
+
  protected:
   cudaGraph_t graph_ = nullptr;
   cudaGraphExec_t graph_exec_ = nullptr;
@@ -54,7 +101,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
 
   // the ID assigned by cuda during graph capture,
   // used to identify when a stream is participating in capture
-  CaptureId_t capture_id_ = -1;
+  CaptureId_t capture_id_ = std::numeric_limits<CaptureId_t>::max();
 
   // uuid used to request a particular private mempool from CUDACachingAllocator.
   // By default, this will be set to {id_, 0}.
@@ -85,6 +132,15 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
   // init capture_dev_ as UNDEFINED_DEVICE to check that it stores the real device id in the destructor
   static constexpr c10::DeviceIndex UNDEFINED_DEVICE = -1;
   c10::DeviceIndex capture_dev_{UNDEFINED_DEVICE};
+
+  cudaStreamCaptureMode capture_mode_{};
+
+#if !defined(USE_ROCM) && (defined(CUDA_VERSION) && CUDA_VERSION >= 12040)
+  std::stack<std::pair<at::cuda::CUDAStreamGuard, UniquePtrExternalCudaStream>>
+      conditional_node_streams_;
+  std::stack<CaptureId_t> conditional_graph_capture_streams_ids_;
+  std::vector<cudaGraph_t> descendent_graphs_;
+#endif // !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12040
 };
 
 } // namespace cuda
 
@@ -929,6 +929,10 @@ and you suspect its runtime is at least somewhat CPU-limited.
     https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture
 .. _cudaGraphLaunch:
     https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597
+.. _issue 144787:
+    https://github.com/pytorch/pytorch/issues/144787#issuecomment-2606480564
+.. _conditional nodes:
+    https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
 
 PyTorch API
 ^^^^^^^^^^^
@@ -1017,6 +1021,9 @@ Violating any of these will likely cause a runtime error:
   Avoid using :meth:`Generator.get_state<torch.get_state>` and :meth:`Generator.set_state<torch.set_state>` during capture;
   instead, utilize :meth:`Generator.graphsafe_set_state<torch.Generator.graphsafe_set_state>` and :meth:`Generator.graphsafe_get_state<torch.Generator.graphsafe_get_state>`
   for managing generator states safely within the graph context. This ensures proper RNG operation and generator management within CUDA graphs.
+* Dynamic control flow (based on CPU or GPU data) is prohibited, unless it is based on GPU data and implemented via higher order operators
+  torch.cond() and torch.while_loop(). See :ref:`Data Dependent Control Flow<graph-data-dependent-control-flow>`.
+
 
 
 Violating any of these will likely cause silent numerical errors or undefined behavior:
@@ -1025,7 +1032,6 @@ Violating any of these will likely cause silent numerical errors or undefined be
 * No non-captured CUDA work may run in this process (on any thread) while capture is underway.
 * CPU work is not captured. If the captured ops include CPU work, that work will be elided during replay.
 * Every replay reads from and writes to the same (virtual) memory addresses.
-* Dynamic control flow (based on CPU or GPU data) is prohibited.
 * Dynamic shapes are prohibited. The graph assumes every tensor in the captured op sequence
   has the same size and layout in every replay.
 * Using multiple streams in a capture is allowed, but there are :ref:`restrictions<multistream-capture>`.
@@ -1334,3 +1340,45 @@ If, in the live workload, your callables will run in an order that occasionally
 or if they'll run concurrently, passing them as a tuple to a single invocation of
 :func:`~torch.cuda.make_graphed_callables` is not allowed. Instead, you must call
 :func:`~torch.cuda.make_graphed_callables` separately for each one.
+
+.. _graph-data-dependent-control-flow:
+
+Data Dependent Control Flow
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Data-dependent control flow can be used with CUDA graphs in limited cases if
+the control flow is implemented using torch.cond() or
+torch.while_loop(). If your function uses these functions, compiling
+it with the "cudagraphs" backend will enable control flow in the
+resulting cuda graph via `conditional nodes`_.
+
+Unfortunately, eager mode execution does not work due to reasons
+described in `issue 144787`_.
+Support for the inductor backend of torch.compile is not available yet, but there is no fundamental blocker.
+
+An example of using the cudagraphs backend to torch.compile on code
+using torch.cond is demonstrated below::
+
+    import torch
+
+    def true_fn(x):
+        return x.sin()
+
+    def false_fn(x):
+        return x.cos()
+
+    x = torch.randn(4, device="cuda", requires_grad=False)
+    pred = torch.tensor(False, device="cuda", requires_grad=False)
+    def foo(pred, x):
+        with torch.inference_mode():
+            return torch.cond(pred, true_fn, false_fn, [x])
+
+    # First call will run eager for warmup, second call will do graph
+    # capture followed by graph replay, third call and beyond will do
+    # just graph replay.
+    compiled_foo = torch.compile(foo, backend="cudagraphs")
+    for i in range(3):
+        y = compiled_foo(pred, x)
+
+    # will output x.sin()
+    y = compiled_foo(~pred, x)
@@ -13,7 +13,7 @@ For a longer background on CUDAGraphs, read `accelerating pytorch with CUDAGraph
 
 CUDA Graphs can give large speedups, especially for models with high CPU overhead or small compute. There are a number of limitations from requiring the same kernels to be run with the same arguments and dependencies, and memory addresses.
 
-- Control Flow is not possible
+- Arbitrary Control Flow is not possible (However, control flow expressed via torch.cond() and torch.while_loop() can be captured in a CUDA Graph. See :ref:`Data Dependent Control Flow<graph-data-dependent-control-flow>`.)
 - Kernels which trigger host to device syncs (such as .item()) errors
 - All input arguments to kernels are fixed to what they were recorded
 - CUDA Memory addresses are fixed, however the values of the memory at those addresses can change