Skip to content

Commit 0135ca0

Browse files
committed
[API-Compat] Correct min/max_with index gradient behavior
1 parent 89f2d92 commit 0135ca0

File tree

9 files changed

+195
-166
lines changed

9 files changed

+195
-166
lines changed
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/phi/backends/gpu/gpu_context.h"
16+
#include "paddle/phi/common/place.h"
17+
#include "paddle/phi/core/kernel_registry.h"
18+
#include "paddle/phi/core/utils/data_type.h"
19+
#include "paddle/phi/kernels/funcs/gather_scatter_functor.h"
20+
#include "paddle/phi/kernels/funcs/math_function.h"
21+
22+
namespace phi {
23+
24+
template <typename T>
25+
using EnableIfInteger =
26+
typename std::enable_if<std::is_integral<T>::value, int>::type;
27+
28+
template <typename T>
29+
using EnableIfNonInteger =
30+
typename std::enable_if<!std::is_integral<T>::value, int>::type;
31+
32+
// Here if keepdim=True, this will fallback to a simplified version of
33+
// take_along_axis. However, if keepdim=False (by default), indices will
34+
// not have equal rank will the input values (and values_grad), therefore
35+
// needs an unsqueeze operation by shallow copying indices and Resize
36+
#define DEFINE_WITH_INDEX_GRAD_KERNEL(OpType) \
37+
template <typename T, typename Context, EnableIfNonInteger<T> = 0> \
38+
void OpType##WithIndexGradKernel(const Context& dev_ctx, \
39+
const DenseTensor& x, \
40+
const DenseTensor& values, \
41+
const DenseTensor& indices, \
42+
const DenseTensor& values_grad, \
43+
const Scalar& dim, \
44+
bool keepdim, \
45+
DenseTensor* x_grad) { \
46+
x_grad->Resize(x.dims()); \
47+
dev_ctx.template Alloc<T>(x_grad); \
48+
if (x_grad->numel() == 0) { \
49+
return; \
50+
} \
51+
int64_t dim_val = dim.to<int64_t>(); \
52+
if (dim_val < 0) { \
53+
dim_val += x.dims().size(); \
54+
} \
55+
DenseTensor shallow_copied_inds(indices); \
56+
if (!keepdim) { \
57+
auto indices_dim = x.dims(); \
58+
indices_dim[dim_val] = 1; \
59+
shallow_copied_inds.Resize(indices_dim); \
60+
} \
61+
phi::funcs::SetConstant<Context, T> functor; \
62+
functor(dev_ctx, x_grad, static_cast<T>(0)); \
63+
phi::funcs::gpu_scatter_add_kernel<T, int64_t>( \
64+
*x_grad, dim_val, shallow_copied_inds, values_grad, true, dev_ctx); \
65+
} \
66+
template <typename T, typename Context, EnableIfInteger<T> = 0> \
67+
void OpType##WithIndexGradKernel(const Context& dev_ctx, \
68+
const DenseTensor& x, \
69+
const DenseTensor& values, \
70+
const DenseTensor& indices, \
71+
const DenseTensor& values_grad, \
72+
const Scalar& dim, \
73+
bool keepdim, \
74+
DenseTensor* x_grad) { \
75+
std::string dtype_name = phi::DataTypeToString(values.dtype()); \
76+
PADDLE_ENFORCE_EQ( \
77+
0, \
78+
1, \
79+
phi::errors::InvalidArgument( \
80+
"Integer type '%s' is not allowed to have stop_gradient=False.", \
81+
dtype_name.c_str())); \
82+
}
83+
84+
DEFINE_WITH_INDEX_GRAD_KERNEL(Max)
85+
DEFINE_WITH_INDEX_GRAD_KERNEL(Min)
86+
87+
#undef DEFINE_WITH_INDEX_GRAD_KERNEL
88+
89+
} // namespace phi
90+
91+
// Integer dtypes are registered on purpose: they dispatch to the integer
// overload of MaxWithIndexGradKernel, which raises a clear InvalidArgument
// error instead of a confusing "kernel not found" failure.
PD_REGISTER_KERNEL(max_with_index_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::MaxWithIndexGradKernel,
                   float,
                   double,
                   uint8_t,
                   int,
                   int16_t,
                   int64_t,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}
103+
104+
// Integer dtypes are registered on purpose: they dispatch to the integer
// overload of MinWithIndexGradKernel, which raises a clear InvalidArgument
// error instead of a confusing "kernel not found" failure.
PD_REGISTER_KERNEL(min_with_index_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::MinWithIndexGradKernel,
                   float,
                   double,
                   uint8_t,
                   int,
                   int16_t,
                   int64_t,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}

paddle/phi/kernels/gpu/min_max_with_index_kernel.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -254,25 +254,25 @@ void MinMaxWithIndexOpCUDAKernel(const Context& dev_ctx,
254254
template <typename T, typename Context>
255255
void MinWithIndexKernel(const Context& dev_ctx,
256256
const DenseTensor& x,
257-
const Scalar& axis,
258-
bool keepdims,
257+
const Scalar& dim,
258+
bool keepdim,
259259
bool flatten,
260260
DenseTensor* val_out,
261261
DenseTensor* ind_out) {
262262
MinMaxWithIndexOpCUDAKernel<Context, T, cub::ArgMin>(
263-
dev_ctx, x, axis, keepdims, flatten, val_out, ind_out);
263+
dev_ctx, x, dim, keepdim, flatten, val_out, ind_out);
264264
}
265265

266266
template <typename T, typename Context>
267267
void MaxWithIndexKernel(const Context& dev_ctx,
268268
const DenseTensor& x,
269-
const Scalar& axis,
270-
bool keepdims,
269+
const Scalar& dim,
270+
bool keepdim,
271271
bool flatten,
272272
DenseTensor* val_out,
273273
DenseTensor* ind_out) {
274274
MinMaxWithIndexOpCUDAKernel<Context, T, cub::ArgMax>(
275-
dev_ctx, x, axis, keepdims, flatten, val_out, ind_out);
275+
dev_ctx, x, dim, keepdim, flatten, val_out, ind_out);
276276
}
277277

278278
#endif

paddle/phi/kernels/gpu/reduce_kernel.cu

Lines changed: 0 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -160,80 +160,6 @@ void ReduceAMaxGradKernel(const Context& dev_ctx,
160160
dev_ctx, x, out, out_grad, dims, keep_dim, reduce_all, x_grad);
161161
}
162162

163-
template <typename T>
164-
using EnableIfInteger =
165-
typename std::enable_if<std::is_integral<T>::value, int>::type;
166-
167-
template <typename T>
168-
using EnableIfNonInteger =
169-
typename std::enable_if<!std::is_integral<T>::value, int>::type;
170-
171-
template <typename T, typename Context, EnableIfNonInteger<T> = 0>
172-
void MinWithIndexGradKernel(const Context& dev_ctx,
173-
const DenseTensor& x,
174-
const DenseTensor& values,
175-
const DenseTensor& values_grad,
176-
const Scalar& dim,
177-
bool keepdims,
178-
bool flatten,
179-
DenseTensor* x_grad) {
180-
int64_t dim_val = dim.to<int64_t>();
181-
flatten = recompute_reduce_all(x, {dim_val}, flatten);
182-
ReduceCudaAMaxAMinGrad<T, Context>(
183-
dev_ctx, x, values, values_grad, {dim_val}, keepdims, flatten, x_grad);
184-
}
185-
186-
template <typename T, typename Context, EnableIfInteger<T> = 0>
187-
void MinWithIndexGradKernel(const Context& dev_ctx,
188-
const DenseTensor& x,
189-
const DenseTensor& values,
190-
const DenseTensor& values_grad,
191-
const Scalar& dim,
192-
bool keepdims,
193-
bool flatten,
194-
DenseTensor* x_grad) {
195-
std::string dtype_name = phi::DataTypeToString(x.dtype());
196-
PADDLE_ENFORCE_EQ(
197-
0,
198-
1,
199-
phi::errors::InvalidArgument(
200-
"Integer type '%s' is not allowed to have stop_gradient=False.",
201-
dtype_name.c_str()));
202-
}
203-
204-
template <typename T, typename Context, EnableIfNonInteger<T> = 0>
205-
void MaxWithIndexGradKernel(const Context& dev_ctx,
206-
const DenseTensor& x,
207-
const DenseTensor& values,
208-
const DenseTensor& values_grad,
209-
const Scalar& dim,
210-
bool keepdims,
211-
bool flatten,
212-
DenseTensor* x_grad) {
213-
int64_t dim_val = dim.to<int64_t>();
214-
flatten = recompute_reduce_all(x, {dim_val}, flatten);
215-
ReduceCudaAMaxAMinGrad<T, Context>(
216-
dev_ctx, x, values, values_grad, {dim_val}, keepdims, flatten, x_grad);
217-
}
218-
219-
template <typename T, typename Context, EnableIfInteger<T> = 0>
220-
void MaxWithIndexGradKernel(const Context& dev_ctx,
221-
const DenseTensor& x,
222-
const DenseTensor& values,
223-
const DenseTensor& values_grad,
224-
const Scalar& dim,
225-
bool keepdims,
226-
bool flatten,
227-
DenseTensor* x_grad) {
228-
std::string dtype_name = phi::DataTypeToString(x.dtype());
229-
PADDLE_ENFORCE_EQ(
230-
0,
231-
1,
232-
phi::errors::InvalidArgument(
233-
"Integer type '%s' is not allowed to have stop_gradient=False.",
234-
dtype_name.c_str()));
235-
}
236-
237163
template <typename T, typename Context>
238164
void ReduceMaxGradKernel(const Context& dev_ctx,
239165
const DenseTensor& x,
@@ -359,19 +285,6 @@ PD_REGISTER_KERNEL(max_grad,
359285
phi::dtype::float16,
360286
phi::dtype::bfloat16) {}
361287

362-
PD_REGISTER_KERNEL(max_with_index_grad,
363-
GPU,
364-
ALL_LAYOUT,
365-
phi::MaxWithIndexGradKernel,
366-
float,
367-
double,
368-
uint8_t,
369-
int,
370-
int16_t,
371-
int64_t,
372-
phi::dtype::float16,
373-
phi::dtype::bfloat16) {}
374-
375288
PD_REGISTER_KERNEL(mean_grad,
376289
GPU,
377290
ALL_LAYOUT,
@@ -398,19 +311,6 @@ PD_REGISTER_KERNEL(min_grad,
398311
phi::dtype::float16,
399312
phi::dtype::bfloat16) {}
400313

401-
PD_REGISTER_KERNEL(min_with_index_grad,
402-
GPU,
403-
ALL_LAYOUT,
404-
phi::MinWithIndexGradKernel,
405-
float,
406-
double,
407-
uint8_t,
408-
int,
409-
int16_t,
410-
int64_t,
411-
phi::dtype::float16,
412-
phi::dtype::bfloat16) {}
413-
414314
PD_REGISTER_KERNEL(sum_grad,
415315
GPU,
416316
ALL_LAYOUT,

paddle/phi/kernels/min_max_with_index_grad_kernel.h

Lines changed: 0 additions & 42 deletions
This file was deleted.

paddle/phi/kernels/min_max_with_index_kernel.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,17 @@ namespace phi {
2222
template <typename T, typename Context>
2323
void MinWithIndexKernel(const Context& dev_ctx,
2424
const DenseTensor& x,
25-
const Scalar& axis,
26-
bool keepdims,
25+
const Scalar& dim,
26+
bool keepdim,
2727
bool flatten,
2828
DenseTensor* val_out,
2929
DenseTensor* ind_out);
3030

3131
template <typename T, typename Context>
3232
void MaxWithIndexKernel(const Context& dev_ctx,
3333
const DenseTensor& x,
34-
const Scalar& axis,
35-
bool keepdims,
34+
const Scalar& dim,
35+
bool keepdim,
3636
bool flatten,
3737
DenseTensor* val_out,
3838
DenseTensor* ind_out);

paddle/phi/ops/yaml/backward.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2278,8 +2278,8 @@
22782278
func : max_pool3d_with_index_grad
22792279

22802280
- backward_op : max_with_index_grad
2281-
forward : max_with_index (Tensor x, Scalar axis, bool keepdims, bool flatten) -> Tensor(values), Tensor(indices)
2282-
args : (Tensor x, Tensor values, Tensor values_grad, Scalar axis, bool keepdims, bool flatten)
2281+
forward : max_with_index (Tensor x, Scalar dim, bool keepdim, bool flatten) -> Tensor(values), Tensor(indices)
2282+
args : (Tensor x, Tensor values, Tensor indices, Tensor values_grad, Scalar dim, bool keepdim)
22832283
output : Tensor(x_grad)
22842284
infer_meta :
22852285
func : UnchangedInferMeta
@@ -2351,8 +2351,8 @@
23512351
data_type : out_grad
23522352

23532353
- backward_op : min_with_index_grad
2354-
forward : min_with_index (Tensor x, Scalar axis, bool keepdims, bool flatten) -> Tensor(values), Tensor(indices)
2355-
args : (Tensor x, Tensor values, Tensor values_grad, Scalar axis, bool keepdims, bool flatten)
2354+
forward : min_with_index (Tensor x, Scalar dim, bool keepdim, bool flatten) -> Tensor(values), Tensor(indices)
2355+
args : (Tensor x, Tensor values, Tensor indices, Tensor values_grad, Scalar dim, bool keepdim)
23562356
output : Tensor(x_grad)
23572357
infer_meta :
23582358
func : UnchangedInferMeta

paddle/phi/ops/yaml/ops.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3554,7 +3554,7 @@
35543554
interfaces : paddle::dialect::InferSymbolicShapeInterface
35553555

35563556
- op : max_with_index
3557-
args : (Tensor x, Scalar(int64_t) axis, bool keepdims = false, bool flatten = false)
3557+
args : (Tensor x, Scalar(int64_t) dim, bool keepdim = false, bool flatten = false)
35583558
output : Tensor(values), Tensor(indices)
35593559
infer_meta :
35603560
func : MinMaxWithIndexInferMeta
@@ -3674,7 +3674,7 @@
36743674
interfaces : paddle::dialect::InferSymbolicShapeInterface
36753675

36763676
- op : min_with_index
3677-
args : (Tensor x, Scalar(int64_t) axis, bool keepdims = false, bool flatten = false)
3677+
args : (Tensor x, Scalar(int64_t) dim, bool keepdim = false, bool flatten = false)
36783678
output : Tensor(values), Tensor(indices)
36793679
infer_meta :
36803680
func : MinMaxWithIndexInferMeta

0 commit comments

Comments
 (0)