Commit 6996029

Support set_grad_in_dtype_consistent for pylayer (PaddlePaddle#76537)
* support dtype consistent for pylayer
* fix
* fix for static check
1 parent 626981e commit 6996029
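In short: calling `ctx.set_grad_in_dtype_consistent(False)` inside `forward` tells the PyLayer's grad node not to cast incoming backward gradients to the dtype of the corresponding forward outputs. A minimal usage sketch (the class names are illustrative, not part of this commit; the behavior mirrors the docstring and test added below):

import paddle
from paddle.autograd import PyLayer

class KeepGradDtype(PyLayer):  # illustrative name, not part of this commit
    @staticmethod
    def forward(ctx, x):
        # Opt out of the default cast: backward will see the gradient
        # in whatever dtype the downstream layer produced.
        ctx.set_grad_in_dtype_consistent(False)
        return x * 2.0

    @staticmethod
    def backward(ctx, dy):
        print(dy.dtype)  # paddle.float16 here; paddle.float32 with the default setting
        return paddle.cast(dy, paddle.float32) * 2.0

class CastGradToFP16(PyLayer):  # illustrative name, not part of this commit
    @staticmethod
    def forward(ctx, x):
        return x * 1.0

    @staticmethod
    def backward(ctx, dy):
        # Hand a float16 gradient to the upstream layer on purpose.
        return paddle.cast(dy, paddle.float16)

x = paddle.randn([3, 3], dtype="float32")
x.stop_gradient = False
z = CastGradToFP16.apply(KeepGradDtype.apply(x))
z.sum().backward()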

File tree

7 files changed: 167 additions, 7 deletions

paddle/fluid/eager/backward.cc
paddle/fluid/eager/grad_tensor_holder.h
paddle/fluid/eager/pylayer/py_layer_node.h
paddle/fluid/pybind/eager.h
paddle/fluid/pybind/eager_py_layer.cc
python/paddle/autograd/py_layer.py
test/legacy_test/test_pylayer_op.py

paddle/fluid/eager/backward.cc
Lines changed: 25 additions & 5 deletions

@@ -15,6 +15,7 @@
 #include "paddle/fluid/eager/backward.h"

 #include "paddle/fluid/eager/general_grad.h"
+#include "paddle/fluid/eager/pylayer/py_layer_node.h"
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/inference/analysis/dot.h"
 #include "paddle/phi/core/memory/stats.h"

@@ -321,8 +322,15 @@ std::vector<paddle::Tensor> RunBackward(
       VLOG(4) << "RunBackward: Create Value for grad input tensor " << i
               << " of grad node: " << grad_node->name() << "(" << grad_node
               << ")";
-      node_input_buffers_dict[grad_node] =
-          std::make_unique<GradTensorHolder>(grad_node->InputMeta());
+
+      if (typeid(*grad_node) == typeid(GradNodePyLayer)) {
+        auto pylayer_gradnode = dynamic_cast<GradNodePyLayer*>(grad_node);
+        node_input_buffers_dict[grad_node] = std::make_unique<GradTensorHolder>(
+            grad_node->InputMeta(), pylayer_gradnode->GradInDtypeConsistent());
+      } else {
+        node_input_buffers_dict[grad_node] =
+            std::make_unique<GradTensorHolder>(grad_node->InputMeta());
+      }
     }

     // copy grad tensor since we should totally run grad without affect forward

@@ -589,11 +597,23 @@ std::vector<paddle::Tensor> RunBackward(

       if (!node_input_buffers_dict.count(next_node)) {
         const auto& input_meta = next_node->InputMeta();
-        auto grad_tensor_holder =
-            std::make_unique<GradTensorHolder>(input_meta);
+
         VLOG(6) << "RunBackward: Construct GradTensorHolder for grad node: "
                 << next_node->name() << "(" << next_node << ") ";
-        node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
+
+        if (typeid(*next_node) == typeid(GradNodePyLayer)) {
+          auto pylayer_gradnode = dynamic_cast<GradNodePyLayer*>(next_node);
+          auto grad_tensor_holder = std::make_unique<GradTensorHolder>(
+              next_node->InputMeta(),
+              pylayer_gradnode->GradInDtypeConsistent());
+          node_input_buffers_dict[next_node] =
+              std::move(grad_tensor_holder);
+        } else {
+          auto grad_tensor_holder =
+              std::make_unique<GradTensorHolder>(input_meta);
+          node_input_buffers_dict[next_node] =
+              std::move(grad_tensor_holder);
+        }
       }

       VLOG(7) << "RunBackward: Sum or Move grad inputs for edge slot: "

paddle/fluid/eager/grad_tensor_holder.h
Lines changed: 4 additions & 2 deletions

@@ -29,7 +29,8 @@ class GradTensorHolder {
  public:
   explicit GradTensorHolder(
       const paddle::small_vector<std::vector<GradSlotMeta>,
-                                 kSlotSmallVectorSize>& metas) {
+                                 kSlotSmallVectorSize>& metas,
+      bool record_input_dtypes = true) {
     VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size();
     buffer_.resize(metas.size());
     input_dtypes_.resize(metas.size());

@@ -41,12 +42,13 @@ class GradTensorHolder {
       // Extract only dtype information from metas
       for (size_t j = 0; j < metas[i].size(); j++) {
         const auto& meta = metas[i][j];
-        if (meta.HasTensorMeta()) {
+        if (meta.HasTensorMeta() && record_input_dtypes) {
           const auto& tensor_meta = meta.GetTensorMeta();
           input_dtypes_[i][j] = tensor_meta.dtype;
           VLOG(7) << "Init GradTensorHolder with dtype: "
                   << phi::DataTypeToString(tensor_meta.dtype);
         } else {
+          VLOG(7) << "Init GradTensorHolder with UNDEFINED";
           input_dtypes_[i][j] = phi::DataType::UNDEFINED;
         }
       }

paddle/fluid/eager/pylayer/py_layer_node.h
Lines changed: 5 additions & 0 deletions

@@ -106,6 +106,10 @@ class GradNodePyLayer : public GradNodeBase {
         std::shared_ptr<GradNodePyLayer>(new GradNodePyLayer(*this));
     return copied_node;
   }
+  bool GradInDtypeConsistent() { return grad_in_dtype_consistent_; }
+  void SetGradInDtypeConsistent(bool value) {
+    grad_in_dtype_consistent_ = value;
+  }

  private:
   PyObject* ctx_{nullptr};

@@ -116,6 +120,7 @@ class GradNodePyLayer : public GradNodeBase {
       forward_outputs_dist_attr_;
   std::vector<std::vector<phi::DDim>> forward_outputs_global_dims_;
   std::vector<std::vector<bool>> forward_outputs_is_dist_meta_;
+  bool grad_in_dtype_consistent_;
 };

 }  // namespace egr

paddle/fluid/pybind/eager.h
Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@ typedef struct {
   PyObject* non_differentiable;
   PyObject* not_inplace_tensors;
   bool materialize_grads;
+  bool grad_in_dtype_consistent;
   std::vector<bool> forward_input_tensor_is_duplicable;
   std::vector<bool> forward_output_tensor_is_duplicable;
   std::weak_ptr<egr::GradNodePyLayer> grad_node;

paddle/fluid/pybind/eager_py_layer.cc
Lines changed: 15 additions & 0 deletions

@@ -85,6 +85,7 @@ PyObject* PyLayerNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) {
   v->container = nullptr;
   v->materialize_grads = true;
   v->container_be_packed = false;
+  v->grad_in_dtype_consistent = true;
   new (&v->grad_node) std::weak_ptr<egr::GradNodePyLayer>();
   new (&v->forward_input_tensor_is_duplicable) std::vector<bool>();
   new (&v->forward_output_tensor_is_duplicable) std::vector<bool>();

@@ -575,6 +576,7 @@ PyObject* pylayer_method_apply(PyObject* cls,
   if (ctx->materialize_grads) {
     grad_node->SaveForwardOutputsMeta(outputs_tensor);
   }
+  grad_node->SetGradInDtypeConsistent(ctx->grad_in_dtype_consistent);

   for (size_t i = 0; i < inputs_autograd_meta.size(); i++) {
     if (ctx->forward_input_tensor_is_duplicable[i]) {

@@ -858,6 +860,14 @@ int tensor_properties_set_materialize_grads(PyLayerObject* self,
   return 0;
   EAGER_CATCH_AND_THROW_RETURN_NEG
 }
+int tensor_properties_set_grad_in_dtype_consistent(PyLayerObject* self,
+                                                   PyObject* value,
+                                                   void* closure) {
+  EAGER_TRY
+  self->grad_in_dtype_consistent = CastPyArg2AttrBoolean(value, 0);
+  return 0;
+  EAGER_CATCH_AND_THROW_RETURN_NEG
+}

 PyMethodDef pylayer_methods[] = {{"name",  // NOLINT
                                   (PyCFunction)(void (*)())pylayer_method_name,

@@ -890,6 +900,11 @@ struct PyGetSetDef pylayer_properties[] {  // NOLINT
      (setter)tensor_properties_set_materialize_grads,
      nullptr,
      nullptr},
+    {"grad_in_dtype_consistent",
+     nullptr,
+     (setter)tensor_properties_set_grad_in_dtype_consistent,
+     nullptr,
+     nullptr},
     {
         nullptr, nullptr, nullptr, nullptr, nullptr
     }
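Note on the plumbing above: `PyLayerNew` defaults the flag to `true`, `pylayer_method_apply` copies it onto the `GradNodePyLayer`, and the new setter exposes it to Python as a write-only `grad_in_dtype_consistent` property on the context object. The public helper added in python/paddle/autograd/py_layer.py (next file) simply assigns to that property. A minimal sketch, with an illustrative class name:

import paddle
from paddle.autograd import PyLayer

class Identity(PyLayer):  # illustrative name, not part of this commit
    @staticmethod
    def forward(ctx, x):
        # Both lines write the same pybind-backed field registered above;
        # the helper's body is just `self.grad_in_dtype_consistent = flag`.
        ctx.grad_in_dtype_consistent = False
        # ctx.set_grad_in_dtype_consistent(False)  # equivalent
        return x * 1.0

    @staticmethod
    def backward(ctx, dy):
        return dy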

python/paddle/autograd/py_layer.py
Lines changed: 70 additions & 0 deletions

@@ -63,6 +63,76 @@ class PyLayerContext:
     not_inplace_tensors: tuple[Tensor, ...]
     non_differentiable: tuple[Tensor, ...]
     materialize_grads: bool
+    grad_in_dtype_consistent: bool
+
+    def set_grad_in_dtype_consistent(self, flag: bool) -> None:
+        """
+        Set whether to keep the gradient input dtype in backward consistent with the dtype of the corresponding forward output.
+
+        Note:
+            This API should be called only inside `forward`.
+            By default, backward input gradients are automatically cast to match the dtype of the forward outputs.
+            Set this to `False` to disable the automatic cast and keep the original gradient dtypes in backward.
+
+        Args:
+            flag (bool): Whether to enable automatic dtype casting in backward.
+                - `True`: Cast the backward input gradient to match the forward output dtype (default behavior).
+                - `False`: Preserve the original dtype of the backward input gradient.
+
+        Returns:
+            None
+
+        Examples:
+            .. code-block:: python
+
+                >>> import paddle
+                >>> from paddle.autograd import PyLayer
+                >>> paddle.seed(2025)
+                >>> class cus_tanh(PyLayer):
+                ...     @staticmethod
+                ...     def forward(ctx, x):
+                ...         y = paddle.tanh(x)
+                ...         # Pass tensors to backward.
+                ...         ctx.save_for_backward(y)
+                ...         # The gradient input in the backward process
+                ...         # will not be automatically cast to the dtype of the forward output.
+                ...         ctx.set_grad_in_dtype_consistent(False)
+                ...         return y
+                ...
+                ...     @staticmethod
+                ...     def backward(ctx, dy):
+                ...         # Get the tensors passed by forward.
+                ...         y, = ctx.saved_tensor()
+                ...         grad = dy * (1 - paddle.square(y))
+                ...         return grad
+                ...
+                >>> class cus_tanh_cast_grad(PyLayer):
+                ...     @staticmethod
+                ...     def forward(ctx, x):
+                ...         y = paddle.tanh(x)
+                ...         # Pass tensors to backward.
+                ...         ctx.save_for_backward(y)
+                ...         return y
+                ...
+                ...     @staticmethod
+                ...     def backward(ctx, dy):
+                ...         # Get the tensors passed by forward.
+                ...         y, = ctx.saved_tensor()
+                ...         grad = dy * (1 - paddle.square(y))
+                ...         # The gradient is cast to float16 manually here; it becomes the
+                ...         # backward input of cus_tanh, which will not cast it back to the
+                ...         # dtype of its forward output.
+                ...         grad = paddle.cast(grad, paddle.float16)
+                ...         return grad
+                ...
+                >>> x = paddle.randn([3, 3]).astype("float32")
+                >>> x.stop_gradient = False
+                >>> y = cus_tanh.apply(x)
+                >>> z = cus_tanh_cast_grad.apply(y)
+                >>> z.sum().backward()
+
+        """
+        self.grad_in_dtype_consistent = flag

     def save_for_backward(self, *tensors: Tensor) -> None:
         """

test/legacy_test/test_pylayer_op.py
Lines changed: 47 additions & 0 deletions

@@ -732,6 +732,53 @@ def test_nest_backward_error(self):
             expect_msg, err_msg, expect_msg + " should in error message "
         )

+    def test_set_grad_in_dtype_consistent(self):
+        paddle.seed(2025)
+        cus_tanh_backward_input = paddle.empty([])
+
+        class cus_tanh(PyLayer):
+            @staticmethod
+            def forward(ctx, x):
+                y = paddle.tanh(x)
+                # Pass tensors to backward.
+                ctx.save_for_backward(y)
+                # The gradient input in the backward process
+                # will not be automatically cast to the dtype of the forward output.
+                ctx.set_grad_in_dtype_consistent(False)
+                return y
+
+            @staticmethod
+            def backward(ctx, dy):
+                nonlocal cus_tanh_backward_input
+                cus_tanh_backward_input = dy
+                # Get the tensors passed by forward.
+                (y,) = ctx.saved_tensor()
+                grad = dy * (1 - paddle.square(y))
+                return grad
+
+        class cus_tanh_cast_grad(PyLayer):
+            @staticmethod
+            def forward(ctx, x):
+                y = paddle.tanh(x)
+                # Pass tensors to backward.
+                ctx.save_for_backward(y)
+                return y
+
+            @staticmethod
+            def backward(ctx, dy):
+                # Get the tensors passed by forward.
+                (y,) = ctx.saved_tensor()
+                grad = dy * (1 - paddle.square(y))
+                grad = paddle.cast(grad, paddle.float16)
+                return grad
+
+        x = paddle.randn([3, 3]).astype("float32")
+        x.stop_gradient = False
+        y = cus_tanh.apply(x)
+        z = cus_tanh_cast_grad.apply(y)
+        z.backward()
+        self.assertEqual(cus_tanh_backward_input.dtype, paddle.float16)
+

 if __name__ == '__main__':
     unittest.main()
