
Commit b5efb98

Fix MultiPrecisionAdd 0size (PaddlePaddle#76512)
* fix MultiPrecisionAdd 0size
* add 0size test
* fix add grad mixed precision in xpu
* add head
* fix
* fix
* add MixedPrecisionAddGradKernel
* refine
* fix
1 parent 43f16a6 commit b5efb98
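
For context, a minimal Python sketch (not part of the diff) of the user-visible scenario this commit fixes, mirroring the new test added below; it assumes the current device registers the mixed-precision add path and supports float16:

    import paddle

    # A [0, 3] tensor has zero elements but a well-defined shape and dtype.
    x = paddle.randn([0, 3], dtype=paddle.float32)
    y = paddle.randn([0, 3], dtype=paddle.float16)  # mixed-precision pair
    x.stop_gradient = False
    y.stop_gradient = False

    out = paddle.add(x, y)  # forward must allocate an empty float32 output
    out.backward()          # backward must produce zero-size grads with the input dtypes

    assert out.shape == [0, 3]
    assert out.dtype == paddle.float32
    assert x.grad.dtype == paddle.float32
    assert y.grad.dtype == paddle.float16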

File tree

4 files changed: +190 -24 lines changed


paddle/phi/kernels/kps/elementwise_kernel.cu

Lines changed: 8 additions & 4 deletions
@@ -95,17 +95,21 @@ void AddKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const DenseTensor& y,
                DenseTensor* out) {
-  if (x.numel() == 0 || y.numel() == 0) {
-    dev_ctx.template Alloc<T>(out);
-    return;
-  }
 #ifdef PADDLE_WITH_CUDA
   if (x.dtype() == DataType::FLOAT32 &&
       (y.dtype() == DataType::FLOAT16 || y.dtype() == DataType::BFLOAT16)) {
+    if (x.numel() == 0 || y.numel() == 0) {
+      dev_ctx.template Alloc<float>(out);
+      return;
+    }
     MultiPrecisionAddKernelImpl<float, Context>(dev_ctx, x, y, out);
     return;
   }
 #endif
+  if (x.numel() == 0 || y.numel() == 0) {
+    dev_ctx.template Alloc<T>(out);
+    return;
+  }
   phi::AddRawKernel<T, Context>(dev_ctx, x, y, -1, out);
 }
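
The reason the early return is duplicated rather than kept at the top of the function: inside the mixed-precision branch the empty output has to be allocated as float32 (Alloc<float>), while the generic path after #endif keeps the kernel's own T. A small sketch (not part of the diff) of the observable difference, assuming a CUDA device with float16 support:

    import paddle

    empty_f32 = paddle.zeros([0, 3], dtype=paddle.float32)
    empty_f16 = paddle.zeros([0, 3], dtype=paddle.float16)

    # Mixed-precision branch: float32 + float16 with zero elements -> float32 output.
    assert paddle.add(empty_f32, empty_f16).dtype == paddle.float32

    # Generic branch: same-dtype inputs keep their dtype, even when empty.
    assert paddle.add(empty_f16, empty_f16).dtype == paddle.float16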

paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc

Lines changed: 147 additions & 16 deletions
@@ -24,10 +24,120 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/complex_kernel.h"
-#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 
 namespace phi {
+template <typename YType, typename Context>
+void MixedPrecisionAddGradKernel(const Context& dev_ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& y,
+                                 const DenseTensor& dout,
+                                 int axis,
+                                 DenseTensor* dx,
+                                 DenseTensor* dy) {
+  using T = float;
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  using XPUYType = typename XPUTypeTrait<YType>::Type;
+
+  if (dout.numel() == 0) {
+    if (dx) {
+      dev_ctx.template Alloc<T>(dx);
+      if (dx->numel() > 0) {
+        int ret =
+            xpu::constant<XPUType>(dev_ctx.x_context(),
+                                   reinterpret_cast<XPUType*>(dx->data<T>()),
+                                   dx->numel(),
+                                   static_cast<XPUType>(0));
+        PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant");
+      }
+    }
+    if (dy) {
+      dev_ctx.template Alloc<YType>(dy);
+      if (dy->numel() > 0) {
+        int ret = xpu::constant<XPUYType>(
+            dev_ctx.x_context(),
+            reinterpret_cast<XPUYType*>(dy->data<YType>()),
+            dy->numel(),
+            static_cast<XPUYType>(0));
+        PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant");
+      }
+    }
+    return;
+  }
+
+  funcs::ElementwiseGradPreProcess(dout, dx);
+  auto* dz = &dout;
+  const DDim& dz_dims = dz->dims();
+  const T* dz_data = dz->data<T>();
+
+  if (dx != nullptr) {
+    T* dx_data = dev_ctx.template Alloc<T>(dx);
+    if (dx->dims() == dz_dims) {
+      if (dx_data != dz_data) {
+        int ret = xpu::copy(dev_ctx.x_context(),
+                            reinterpret_cast<const XPUType*>(dz_data),
+                            reinterpret_cast<XPUType*>(dx_data),
+                            dx->numel());
+        PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
+      }
+    } else {
+      // For inplace strategy, dx will be stored in addr of dz, which makes
+      // the result of dy wrong.
+      if (dx->IsSharedBufferWith(*dz)) {
+        dx->clear();
+        dx->Resize(x.dims());
+        dev_ctx.template Alloc<T>(dx);
+      }
+      std::vector<int> reduce_dims =
+          funcs::GetReduceDim(dx->dims(), dz_dims, axis);
+      std::vector<int64_t> dz_vector = common::vectorize<int64_t>(dz_dims);
+
+      int ret = xpu::reduce_sum<XPUType>(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUType*>(dz_data),
+          reinterpret_cast<XPUType*>(dx_data),
+          dz_vector,
+          std::vector<int64_t>(reduce_dims.begin(), reduce_dims.end()));
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");
+    }
+  }
+
+  if (dy != nullptr) {
+    YType* dy_data = dev_ctx.template Alloc<YType>(dy);
+    if (dy->dims() == dz_dims) {
+      int ret = xpu::cast<XPUType, XPUYType>(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUType*>(dz_data),
+          reinterpret_cast<XPUYType*>(dy_data),
+          dout.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "cast");
+    } else {
+      std::vector<int> reduce_dims =
+          funcs::GetReduceDim(dy->dims(), dz_dims, axis);
+      std::vector<int64_t> dz_vector = common::vectorize<int64_t>(dz_dims);
+
+      DenseTensor casted_dz;
+      casted_dz.Resize(dz_dims);
+      YType* casted_dz_data = dev_ctx.template Alloc<YType>(&casted_dz);
+
+      int ret_cast = xpu::cast<XPUType, XPUYType>(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUType*>(dz_data),
+          reinterpret_cast<XPUYType*>(casted_dz_data),
+          dout.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret_cast, "cast");
+
+      int ret_reduce = xpu::reduce_sum<XPUYType>(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUYType*>(casted_dz_data),
+          reinterpret_cast<XPUYType*>(dy_data),
+          dz_vector,
+          std::vector<int64_t>(reduce_dims.begin(), reduce_dims.end()));
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret_reduce, "reduce_sum");
+    }
+  }
+}
+
 template <typename T, typename Context>
 void AddGradKernel(const Context& dev_ctx,
                    const DenseTensor& x,

@@ -36,30 +146,50 @@ void AddGradKernel(const Context& dev_ctx,
                    int axis,
                    DenseTensor* dx,
                    DenseTensor* dy) {
+  // special case for "float32 + bfloat16", or "float32 + float16"
+  if (x.dtype() == DataType::FLOAT32) {
+    if (y.dtype() == DataType::FLOAT16) {
+      MixedPrecisionAddGradKernel<phi::float16>(
+          dev_ctx, x, y, dout, axis, dx, dy);
+      return;
+    }
+    if (y.dtype() == DataType::BFLOAT16) {
+      MixedPrecisionAddGradKernel<phi::bfloat16>(
+          dev_ctx, x, y, dout, axis, dx, dy);
+      return;
+    }
+  }
+
+  using XPUType = typename XPUTypeTrait<T>::Type;
   if (dout.numel() == 0) {
     if (dx) {
-      if (dx->numel() == 0) {
-        dev_ctx.template Alloc<T>(dx);
-      } else {
-        phi::Full<T, Context>(
-            dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx);
+      dev_ctx.template Alloc<T>(dx);
+      if (dx->numel() > 0) {
+        int ret =
+            xpu::constant<XPUType>(dev_ctx.x_context(),
+                                   reinterpret_cast<XPUType*>(dx->data<T>()),
+                                   dx->numel(),
+                                   static_cast<XPUType>(0));
+        PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant");
       }
     }
     if (dy) {
-      if (dy->numel() == 0) {
-        dev_ctx.template Alloc<T>(dy);
-      } else {
-        phi::Full<T, Context>(
-            dev_ctx, phi::IntArray(common::vectorize(dy->dims())), 0, dy);
+      dev_ctx.template Alloc<T>(dy);
+      if (dy->numel() > 0) {
+        int ret =
+            xpu::constant<XPUType>(dev_ctx.x_context(),
+                                   reinterpret_cast<XPUType*>(dy->data<T>()),
+                                   dy->numel(),
+                                   static_cast<XPUType>(0));
+        PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant");
       }
     }
     return;
   }
-  using XPUType = typename XPUTypeTrait<T>::Type;
+
   funcs::ElementwiseGradPreProcess(dout, dx);
   auto* dz = &dout;
   const DDim& dz_dims = dz->dims();
-
   const T* dz_data = dz->data<T>();
 
   if (dx != nullptr) {

@@ -68,7 +198,7 @@ void AddGradKernel(const Context& dev_ctx,
       if (dx_data != dz_data) {
         int ret = xpu::copy(dev_ctx.x_context(),
                             reinterpret_cast<const XPUType*>(dz_data),
-                            reinterpret_cast<XPUType*>(dx->data<T>()),
+                            reinterpret_cast<XPUType*>(dx_data),
                             dx->numel());
         PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
       }

@@ -87,7 +217,7 @@ void AddGradKernel(const Context& dev_ctx,
       int ret = xpu::reduce_sum<XPUType>(
          dev_ctx.x_context(),
          reinterpret_cast<const XPUType*>(dz_data),
-          reinterpret_cast<XPUType*>(dx->data<T>()),
+          reinterpret_cast<XPUType*>(dx_data),
          dz_vector,
          std::vector<int64_t>(reduce_dims.begin(), reduce_dims.end()));
       PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");

@@ -100,7 +230,7 @@ void AddGradKernel(const Context& dev_ctx,
       if (dy_data != dz_data) {
         int ret = xpu::copy(dev_ctx.x_context(),
                             reinterpret_cast<const XPUType*>(dz_data),
-                            reinterpret_cast<XPUType*>(dy->data<T>()),
+                            reinterpret_cast<XPUType*>(dy_data),
                             dy->numel());
         PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
       }

@@ -118,6 +248,7 @@ void AddGradKernel(const Context& dev_ctx,
     }
   }
 }
+
 #ifdef PADDLE_WITH_XPU_FFT
 template <>
 void AddGradKernel<phi::complex64, XPUContext>(const XPUContext& dev_ctx,
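
To illustrate what the new MixedPrecisionAddGradKernel computes: dx stays float32, while dy is obtained by casting dout to y's dtype and, when y was broadcast, reduce-summing over the broadcast dimensions. A hedged Python sketch (not part of the diff; shapes are illustrative and it assumes the device in use supports the float32 + float16 add path):

    import paddle

    x = paddle.randn([2, 3], dtype=paddle.float32)
    y = paddle.randn([3], dtype=paddle.float16)  # broadcast along dim 0
    x.stop_gradient = False
    y.stop_gradient = False

    out = paddle.add(x, y)  # float32 result, shape [2, 3]
    out.sum().backward()

    # dx keeps x's dtype and shape.
    assert x.grad.dtype == paddle.float32 and x.grad.shape == [2, 3]
    # dout is cast to float16 and reduced over the broadcast dim to form dy.
    assert y.grad.dtype == paddle.float16 and y.grad.shape == [3]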

paddle/phi/kernels/xpu/elementwise_add_kernel.cc

Lines changed: 8 additions & 4 deletions
@@ -35,14 +35,14 @@ void AddKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const DenseTensor& y,
                DenseTensor* out) {
-  if (out->numel() == 0) {
-    dev_ctx.template Alloc<T>(out);
-    return;
-  }
   if (x.dtype() == phi::DataType::FLOAT32 &&
       (y.dtype() == phi::DataType::BFLOAT16 ||
        y.dtype() == phi::DataType::FLOAT16)) {
     // special case for "float32 + bfloat16", or "float32 + float16"
+    if (out->numel() == 0) {
+      dev_ctx.template Alloc<float>(out);
+      return;
+    }
     auto dev_version =
         phi::backends::xpu::get_xpu_version(dev_ctx.GetPlace().GetDeviceId());
     if (dev_version >= phi::backends::xpu::XPUVersion::XPU3 &&

@@ -82,6 +82,10 @@ void AddKernel(const Context& dev_ctx,
       XPUElementwise<Type, XPUType>(dev_ctx, x, casted_y, -1, out, f);
     }
   } else {
+    if (out->numel() == 0) {
+      dev_ctx.template Alloc<T>(out);
+      return;
+    }
     using XPUType = typename XPUTypeTrait<T>::Type;
 
     auto f = [](xpu::Context* xpu_ctx,
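
This kernel keys its zero-size early return off out->numel(), which also covers broadcasts whose result is empty because one operand has a zero dimension. A hypothetical sketch (not part of the diff; it assumes an XPU or other device that registers the mixed-precision path and supports bfloat16):

    import paddle

    x = paddle.randn([0, 3], dtype=paddle.float32)  # zero rows
    y = paddle.randn([3], dtype=paddle.bfloat16)    # non-empty, broadcast along dim 0

    out = paddle.add(x, y)  # broadcast result [0, 3] has zero elements
    assert out.shape == [0, 3]
    assert out.dtype == paddle.float32  # the empty output is still allocated as float32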

test/legacy_test/test_add_op.py

Lines changed: 27 additions & 0 deletions
@@ -20,6 +20,33 @@
 from paddle.base import core
 
 
+class TestPaddleAddZeroSize(unittest.TestCase):
+    def setUp(self):
+        self.place = get_device_place()
+        self.shape = [0, 3]
+        self.dtype_pairs = [(paddle.float32, paddle.float32)]
+        if core.is_float16_supported(self.place):
+            self.dtype_pairs.append((paddle.float32, paddle.float16))
+        if core.is_bfloat16_supported(self.place):
+            self.dtype_pairs.append((paddle.float32, paddle.bfloat16))
+
+    def test_0size(self):
+        for x_dtype, y_dtype in self.dtype_pairs:
+            with self.subTest(msg=f"{x_dtype} + {y_dtype}"):
+                x = paddle.randn(self.shape, dtype=x_dtype)
+                y = paddle.randn(self.shape, dtype=y_dtype)
+                x.stop_gradient = False
+                y.stop_gradient = False
+
+                out = paddle.add(x, y)
+                out.backward()
+
+                self.assertEqual(out.shape, self.shape)
+                self.assertEqual(out.dtype, x_dtype)
+                self.assertEqual(x.grad.dtype, x_dtype)
+                self.assertEqual(y.grad.dtype, y_dtype)
+
+
 class TestPaddleAddBackward(unittest.TestCase):
     def setUp(self):
         self.place = get_device_place()
