
Commit 67dc159

zyfncg and SigureMo authored
Add cuda_graph op and pass (#73393)
* add cuda_graph op and pass
* add cuda_graph instruction
* [Dy2St] Support pass cuda graph state and dispatch key in run program op
* prepare all needed names at run program beginning
* fix bug
* mark override
* pass names to run program op
* save code
* add missing op_function_common.cc
* fix bug
* optimize code
* fix compile bug
* fix compile bug
* refine code
* add hip define
* fix add cuda_graph_dispatch_key to scope cache key
* add more log info
* delete useless include

---------

Co-authored-by: SigureMo <[email protected]>
1 parent 6a5494b commit 67dc159

File tree

15 files changed: +666 -3 lines changed

paddle/common/flags.cc

Lines changed: 13 additions & 0 deletions

@@ -1195,6 +1195,19 @@ PHI_DEFINE_EXPORTED_bool(
     "cudaGraphInstantiateFlagAutoFreeOnLaunch so it would automatically "
     "release graph-owned blocks that have not freed before relaunching.");
 
+/*
+ * CUDA Graph related FLAG
+ * Name: FLAGS_cuda_graph_blacklist
+ * Since Version: 3.1
+ * Value Range: string, default=""
+ * Example: FLAGS_cuda_graph_blacklist="op1,op2,op3" would
+ * blacklist op1, op2, op3 from being captured in CUDA Graph.
+ */
+PHI_DEFINE_EXPORTED_string(
+    cuda_graph_blacklist,
+    "",
+    "CUDA Graph blacklist, split by ',', e.g., 'op1,op2,op3'");
+
 /*
  * Executor related FLAG
  * Name: FLAGS_executor_log_deps_every_microseconds
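
For context, the flag's value is a plain comma-separated list of op names. A minimal illustrative sketch (not the commit's actual parsing code; ParseBlacklist is a hypothetical helper) of turning such a value into a lookup set:

#include <iostream>
#include <sstream>
#include <string>
#include <unordered_set>

// Hypothetical helper (illustration only): split the documented
// comma-separated flag value into a set for O(1) blacklist lookups.
std::unordered_set<std::string> ParseBlacklist(const std::string& flag_value) {
  std::unordered_set<std::string> ops;
  std::stringstream ss(flag_value);
  std::string item;
  while (std::getline(ss, item, ',')) {
    if (!item.empty()) ops.insert(item);
  }
  return ops;
}

int main() {
  const auto blacklist = ParseBlacklist("op1,op2,op3");
  std::cout << (blacklist.count("op2") ? "op2 is blacklisted"
                                       : "op2 is allowed")
            << std::endl;
  return 0;
}

Like other exported PHI flags, this one can typically be set from the environment, e.g. export FLAGS_cuda_graph_blacklist="op1,op2,op3".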

paddle/fluid/eager/to_static/run_program_op_node.h

Lines changed: 20 additions & 0 deletions

@@ -25,6 +25,7 @@
 #include "paddle/fluid/ir_adaptor/translator/program_translator.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/transforms/cuda_graph_extract_pass.h"
 #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h"
 #include "paddle/fluid/pir/utils/name_analysis.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -35,6 +36,7 @@
 #include "paddle/pir/include/core/builtin_attribute.h"
 #include "paddle/pir/include/core/program.h"
 #include "paddle/pir/include/core/value.h"
+#include "paddle/pir/include/pass/pass_manager.h"
 
 #ifdef PADDLE_WITH_DNNL
 #include "paddle/fluid/platform/onednn_helper.h"
@@ -494,6 +496,18 @@ inline void PirRunProgramAPI(
       }
     }
   }
+
+  auto program = forward_program;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (details::is_use_cuda_graph(cuda_graph_state)) {
+    pir::PassManager pass_pm(::pir::IrContext::Instance(), 3);
+    pass_pm.AddPass(pir::CreateCudaGraphExtractPass());
+    pir::IrMapping ir_mapping;
+    program = forward_program->Clone(ir_mapping);
+    pass_pm.Run(program.get());
+  }
+#endif
+
   auto passed_kernel_program = paddle::framework::ApplyIrPass(
       forward_program.get(), place, no_need_buffer_name_set);
   const auto &new_block = passed_kernel_program->block();
@@ -505,6 +519,9 @@
         global_inner_scope,
         cache_key,
         in_sot_mode);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    interpreter_core->SetCUDAGraphState(static_cast<uint8_t>(cuda_graph_state));
+#endif
    // Step 4. get all eager gc vars (skip_names = backward_inputs -
    // no_need_buffers + outputs)
    std::vector<std::string> skip_names;
@@ -529,6 +546,9 @@
    // Step 1. get cache interpretercore
    auto &cached_value = cache.GetMutable(cache_key);
    interpreter_core = cached_value.core_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    interpreter_core->SetCUDAGraphState(static_cast<uint8_t>(cuda_graph_state));
+#endif
    // Step 2. update scope for cache interpretercore
    details::ShareTensorsIntoScopeWithName(x, input_names, global_inner_scope);
    details::ShareTensorsIntoScopeWithName(
paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.cc

Lines changed: 240 additions & 0 deletions

@@ -0,0 +1,240 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.h"
+
+#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
+#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
+#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
+#include "paddle/fluid/framework/new_executor/pir_interpreter.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
+#include "paddle/phi/core/platform/collective_helper.h"
+#include "paddle/phi/core/platform/cuda_graph_with_memory_pool.h"
+#include "paddle/phi/core/platform/device_context.h"
+#include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/core/type_defs.h"
+
+#include "paddle/pir/include/core/builtin_attribute.h"
+#include "paddle/pir/include/core/operation.h"
+#include "paddle/pir/include/core/value.h"
+
+#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
+#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h"
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+namespace paddle::framework {
+
+CudaGraphInstruction::CudaGraphInstruction(
+    size_t id,
+    const phi::Place& place,
+    pir::Operation* op,
+    uint8_t* cuda_graph_state_ref,
+    int64_t cuda_graph_capture_pool_id,
+    ValueExecutionInfo* value_exec_info,
+    interpreter::ExecutionConfig execution_config)
+    : InstructionBase(id, place),
+      op_(op),
+      place_(place),
+      cuda_graph_state_ref_(cuda_graph_state_ref),
+      cuda_graph_capture_pool_id_(cuda_graph_capture_pool_id),
+      name_("cuda_graph_instruction"),
+      input_vars_(),
+      output_vars_(),
+      interpreter_(nullptr),
+      skip_gc_names_() {
+  PADDLE_ENFORCE(op->isa<paddle::dialect::CudaGraphOp>(),
+                 common::errors::PreconditionNotMet(
+                     "CudaGraph instruction only supports cuda_graph op"));
+  op_ = op;
+
+  SetKernelType(OpFuncType::kGpuAsync);
+  VLOG(6) << "finish process analyse kernel type";
+
+  auto cuda_graph_op = op->dyn_cast<paddle::dialect::CudaGraphOp>();
+
+  std::unordered_map<pir::Value, std::vector<int>> inputs;
+  GetInputIds(op, *value_exec_info, &inputs);
+  const auto outside_inputs =
+      GetExternalInputs(cuda_graph_op.block(), *value_exec_info, &inputs);
+  for (size_t i = 0; i < outside_inputs.size(); ++i) {
+    input_vars_.push_back(value_exec_info->GetScope()->GetVar(
+        value_exec_info->GetValue2VarName().at(outside_inputs.at(i))));
+  }
+  VLOG(6) << "finish process input_vars";
+
+  for (size_t i = 0; i < cuda_graph_op.num_results(); ++i) {
+    output_vars_.push_back(value_exec_info->GetScope()->GetVar(
+        value_exec_info->GetValue2VarName().at(cuda_graph_op.result(i))));
+  }
+  VLOG(6) << "finish process output_vars";
+
+  for (auto& item : inputs) {
+    auto& var_vec = item.second;
+    for (auto it = var_vec.begin(); it != var_vec.end();) {
+      if (*it == -1) {
+        it = var_vec.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+  SetInputs(inputs);
+
+  std::unordered_map<pir::Value, std::vector<int>> outputs;
+  bool is_last_op = true;
+  for (size_t i = 0; i < op->num_results(); i++) {
+    pir::Value value = op->result(i);
+    if (value && value.type()) {
+      PADDLE_ENFORCE_EQ(
+          value_exec_info->HasValue(value),
+          true,
+          common::errors::PreconditionNotMet(
+              "input should be in name map, [%d] 'th input of [%s] op",
+              i,
+              "cuda_graph op"));
+      outputs.emplace(value, GetValueIds(value, *value_exec_info));
+    }
+    if (value.use_count() > 0) {
+      VLOG(6) << "value " << i << " use count != 0";
+      is_last_op = false;
+    }
+  }
+
+  InsertInplacedExternalInputsToOuts(
+      cuda_graph_op.block(), outside_inputs, *value_exec_info, &outputs);
+
+  for (auto& item : outputs) {
+    auto& var_vec = item.second;
+    for (auto it = var_vec.begin(); it != var_vec.end();) {
+      if (*it == -1) {
+        it = var_vec.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+  SetOutputs(outputs);
+  VLOG(6) << "finish process inputs outputs index";
+
+  Scope* scope = &(value_exec_info->GetScope()->NewScope());
+  auto skip_gc_vars = execution_config.skip_gc_vars;
+  execution_config.skip_gc_vars.clear();
+  execution_config.create_local_scope = true;
+  interpreter_ = new PirInterpreter(place,
+                                    {},
+                                    cuda_graph_op.block(),
+                                    scope,
+                                    value_exec_info->NewChild(scope),
+                                    execution_config);
+
+  std::set<std::string> skip_gc_names_set;
+  for (auto value : outside_inputs) {
+    skip_gc_names_.push_back(interpreter_->GetNameByValue(value));
+    skip_gc_names_set.insert(interpreter_->GetNameByValue(value));
+  }
+  for (const auto& var_name : skip_gc_vars) {
+    skip_gc_names_.push_back(var_name);
+    skip_gc_names_set.insert(var_name);
+  }
+  interpreter_->SetSkipGcVars(skip_gc_names_set);
+  VLOG(6) << "finish process interpreter";
+}
+
+CudaGraphInstruction::~CudaGraphInstruction() { delete interpreter_; }
+
+void CudaGraphInstruction::SetOutputHooks(
+    const std::vector<PirHookFunc>& hookfuncs) {
+  interpreter_->SetOutputHooks(hookfuncs);
+}
+
+void CudaGraphInstruction::SetInputHooks(
+    const std::vector<PirHookFunc>& hookfuncs) {
+  interpreter_->SetInputHooks(hookfuncs);
+}
+
+void CudaGraphInstruction::Run() {
+  if (cuda_graph_ != nullptr && *cuda_graph_state_ref_ == 3) {
+    VLOG(4) << "Start replaying cuda graph @" << cuda_graph_.get();
+    for (size_t i = 0; i < input_vars_.size(); ++i) {
+      if (input_vars_[i]->IsType<phi::DenseTensor>()) {
+        auto* tensor = input_vars_[i]->GetMutable<phi::DenseTensor>();
+        if (tensor->data() != input_tensors_.at(i).data()) {
+          LOG(WARNING) << "The input [" << i << "] tensor addr for "
+                       << "cuda graph is changed. Pay attention to this!";
+          if (phi::is_gpu_place(tensor->place())) {
+            const auto* dev_ctx =
+                phi::DeviceContextPool::Instance().Get(place_);
+            phi::Copy(*dev_ctx, *tensor, place_, false, &input_tensors_.at(i));
+          }
+        }
+      }
+    }
+
+    cuda_graph_->Replay();
+
+    // set the output tensors into scope
+    for (size_t i = 0; i < output_vars_.size(); ++i) {
+      *(output_vars_[i]->GetMutable<phi::DenseTensor>()) =
+          output_tensors_.at(i);
+    }
+    VLOG(4) << "Finish replaying cuda graph";
+    return;
+  }
+  if (*cuda_graph_state_ref_ == 2 && cuda_graph_ == nullptr) {
+    VLOG(4) << "Warmup before capturing";
+    interpreter_->Run({}, false);
+    VLOG(4) << "Start capturing cuda graph ...";
+    platform::BeginCUDAGraphCapture(
+        place_, cudaStreamCaptureModeRelaxed, cuda_graph_capture_pool_id_);
+
+    auto RecordTensorsForReplay = [&](const std::vector<Variable*>& vars) {
+      std::vector<phi::DenseTensor> record_tensors;
+      record_tensors.reserve(vars.size());
+      for (auto& var : vars) {
+        auto& tensor = var->Get<phi::DenseTensor>();
+        const auto& holder = tensor.Holder();
+        // Note: new_holder only records the memory address of the tensor for
+        // cuda graph; the original tensor memory will be freed to the
+        // allocator after graph capture.
+        auto new_holder = std::make_shared<phi::Allocation>(
+            holder->ptr(), holder->size(), holder->place());
+        record_tensors.emplace_back(new_holder, tensor.meta());
+      }
+      return record_tensors;
+    };
+
+    // record the input tensors for replay
+    input_tensors_ = RecordTensorsForReplay(input_vars_);
+
+    interpreter_->Run({}, false);
+
+    // record the output tensors for replay
+    output_tensors_ = RecordTensorsForReplay(output_vars_);
+
+    cuda_graph_ = platform::EndCUDAGraphCapture();
+    VLOG(4) << "Finish capturing cuda graph @" << cuda_graph_.get();
+
+    // compute the right result
+    cuda_graph_->Replay();
+  } else {
+    VLOG(4) << "Run interpreter without cuda graph";
+    interpreter_->Run({}, false);
+  }
+}
+
+}  // namespace paddle::framework
+
+#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
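
For context, platform::BeginCUDAGraphCapture / EndCUDAGraphCapture and CUDAGraph::Replay above are Paddle wrappers over CUDA's stream-capture workflow. A minimal standalone sketch of that raw workflow, assuming the pre-CUDA-12 five-argument cudaGraphInstantiate signature, with error handling elided:

#include <cuda_runtime.h>

__global__ void scale(float* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= 2.0f;
}

int main() {
  const int n = 1 << 20;
  float* x;
  cudaMalloc(&x, n * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Capture: kernels launched on the stream are recorded, not executed.
  cudaGraph_t graph;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
  scale<<<(n + 255) / 256, 256, 0, stream>>>(x, n);
  cudaStreamEndCapture(stream, &graph);

  // Instantiate once, then replay cheaply. Buffer addresses baked into the
  // graph must stay stable between replays.
  // (CUDA 12 uses a three-argument cudaGraphInstantiate overload instead.)
  cudaGraphExec_t exec;
  cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
  for (int step = 0; step < 10; ++step) {
    cudaGraphLaunch(exec, stream);  // replay
  }
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(exec);
  cudaGraphDestroy(graph);
  cudaFree(x);
  cudaStreamDestroy(stream);
  return 0;
}

Replay is cheap because launch work is paid once at instantiation, but every buffer address baked into the graph must stay valid across replays; this is why the instruction records the captured tensors' holders and copies fresh inputs into the recorded addresses whenever an input's address changes.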
paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.h

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h"
+#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"
+#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
+
+namespace ir {
+class Operation;
+}  // namespace ir
+
+namespace paddle {
+namespace framework {
+class Scope;
+class Value;
+class PirInterpreter;
+class ValueExecutionInfo;
+
+class CudaGraphInstruction : public InstructionBase {
+ public:
+  CudaGraphInstruction(size_t id,
+                       const phi::Place& place,
+                       ::pir::Operation* op,
+                       uint8_t* cuda_graph_state_ref,
+                       int64_t cuda_graph_capture_pool_id,
+                       ValueExecutionInfo* value_exe_info,
+                       interpreter::ExecutionConfig execution_config);
+
+  ~CudaGraphInstruction();
+
+  void Run() override;
+
+  const std::string& Name() const override { return name_; }
+
+  ::pir::Operation* Operation() const override { return op_; }
+
+  PirInterpreter* interpreter() const { return interpreter_; }
+
+  void SetOutputHooks(const std::vector<PirHookFunc>& hookfuncs);
+
+  void SetInputHooks(const std::vector<PirHookFunc>& hookfuncs);
+
+ private:
+  const phi::Place& place_;
+  pir::Operation* op_;
+  uint8_t* cuda_graph_state_ref_ = nullptr;
+  int64_t cuda_graph_capture_pool_id_ = -1;
+
+  std::string name_{"cuda_graph_instruction"};
+
+  std::vector<Variable*> input_vars_;
+  std::vector<Variable*> output_vars_;
+
+  PirInterpreter* interpreter_ = nullptr;
+
+  std::vector<std::string> skip_gc_names_;
+
+  std::unique_ptr<phi::backends::gpu::CUDAGraph> cuda_graph_ = nullptr;
+  std::vector<phi::DenseTensor> input_tensors_;
+  std::vector<phi::DenseTensor> output_tensors_;
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
