
Commit d2f4afd

[Precision Depth Alignment] fix eps of paddle.logit from float to double (PaddlePaddle#75816)
* accuracy_stable_logit
* add LogitOpTranscriber
* fix coverage
* fix 0.yaml
1 parent 8e58cb9 commit d2f4afd
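The practical motivation, as a sketch: paddle.logit clamps x to [eps, 1 - eps] before computing ln(x / (1 - x)). When eps is carried through the stack as a float, a value written as 1e-6 is first rounded to the nearest representable float, so double-precision kernels clamp against a slightly perturbed bound. A minimal standalone illustration (plain C++, not Paddle code):

#include <cmath>
#include <cstdio>

// logit with clamping to [eps, 1 - eps], evaluated entirely in double.
double logit_clamped(double x, double eps) {
  const double t = std::fmin(std::fmax(x, eps), 1.0 - eps);
  return std::log(t / (1.0 - t));
}

int main() {
  const double x = 1.0;  // forces the clamp at 1 - eps
  std::printf("eps as double: %.17g\n", logit_clamped(x, 1e-6));
  std::printf("eps via float: %.17g\n",
              logit_clamped(x, static_cast<double>(1e-6f)));
  return 0;
}

The two outputs differ in the trailing digits; keeping eps as double end to end removes exactly that drift when results are compared against a double-precision reference.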

17 files changed (+146, -30 lines changed)

paddle/fluid/ir_adaptor/translator/op_translator.cc

Lines changed: 39 additions & 0 deletions
@@ -3958,6 +3958,43 @@ struct SoftPlusOpTranscriber : public OpTranscriber {
   }
 };
 
+struct LogitOpTranscriber : public OpTranscriber {
+  pir::AttributeMap TranslateOpAttribute(
+      pir::IrContext* ctx,
+      const std::string& normalized_op_name,
+      const OpAttributeInfoList& op_attr_infos,
+      const OpDesc& op_desc) override {
+    auto& attribute_translator = AttributeTranslator::instance();
+    auto& op_normalizer = OpNameNormalizer::instance();
+    pir::AttributeMap attribute_map = {};
+
+    for (const auto& info : op_attr_infos) {
+      auto legacy_attr_name =
+          op_normalizer.GetLegacyAttrName(op_desc.Type(), info.name);
+      VLOG(10) << "[op: " << op_desc.Type()
+               << "][attr] from: " << legacy_attr_name << " to: " << info.name;
+      if (op_desc.HasAttr(legacy_attr_name)) {
+        paddle::framework::Attribute legacy_attr =
+            op_desc.GetAttr(legacy_attr_name);
+        VLOG(10) << "attribute in " << op_desc.Type()
+                 << " name: " << legacy_attr_name << " " << legacy_attr.index();
+        pir::Attribute new_attr =
+            attribute_translator(info.type_name, legacy_attr);
+        if (legacy_attr_name == "eps") {
+          new_attr = pir::DoubleAttribute::get(
+              ctx,
+              static_cast<double>(
+                  new_attr.dyn_cast<pir::FloatAttribute>().data()));
+        }
+        attribute_map[info.name] = new_attr;
+      } else {
+        this->HandleNonexistentAttribute(ctx, &attribute_map, info);
+      }
+    }
+    return attribute_map;
+  }
+};
+
 OpTranslator::OpTranslator() {
   pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
@@ -4072,5 +4109,7 @@ OpTranslator::OpTranslator() {
   special_handlers["c_sync_comm_stream"] = SyncCommStreamOpTranscriber();
   special_handlers["softplus"] = SoftPlusOpTranscriber();
   special_handlers["softplus_grad"] = SoftPlusOpTranscriber();
+  special_handlers["logit"] = LogitOpTranscriber();
+  special_handlers["logit_grad"] = LogitOpTranscriber();
 }
 }  // namespace paddle::translator
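A note on the eps branch above: the generic attribute_translator still yields a pir::FloatAttribute for the legacy attribute, and the transcriber immediately rebuilds it as a pir::DoubleAttribute. The widening cast is exact, because every finite float value is representable as a double; a quick standalone check of that claim (illustrative, not Paddle code):

#include <cassert>

int main() {
  const float eps_f = 1e-6f;                        // legacy float attribute
  const double eps_d = static_cast<double>(eps_f);  // widened, as in the transcriber
  assert(static_cast<float>(eps_d) == eps_f);       // round trip loses nothing
  return 0;
}

So the translated program carries exactly the value the old float attribute stored, merely at double width; it does not recover the decimal literal the user originally wrote.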

paddle/fluid/pir/serialize_deserialize/0.yaml renamed to paddle/fluid/pir/serialize_deserialize/patch/0.yaml

Lines changed: 5 additions & 6 deletions
@@ -4,26 +4,25 @@ op_patches:
     - action : modify_attr
       object : beta
       type : pir::DoubleAttribute
-      data : 1.0
     - action : modify_attr
       object : threshold
       type : pir::DoubleAttribute
-      data : 20.0
 - op_name : onednn_op.fused_softplus
   actions:
     - action : modify_attr
       object : beta
       type : pir::DoubleAttribute
-      data : 1.0
     - action : modify_attr
       object : threshold
       type : pir::DoubleAttribute
-      data : 20.0
     - action : modify_attr
       object : fuse_alpha
       type : pir::DoubleAttribute
-      data : 0.0
     - action : modify_attr
       object : fuse_beta
       type : pir::DoubleAttribute
-      data : 0.0
+- op_name : pd_op.logit
+  actions:
+    - action : modify_attr
+      object : eps
+      type : pir::DoubleAttribute
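My reading of this patch (the modify_attr schema is not documented in this diff, so this is an interpretation): old serialized programs store eps of pd_op.logit as a pir::FloatAttribute, and the new entry tells the deserializer to re-type it to pir::DoubleAttribute on load; the softplus entries likewise drop their data fields, presumably so the patch only re-types attributes instead of also overwriting their values. A hypothetical sketch of such a re-type step, reusing only API seen in the transcriber above (names are illustrative):

// Hypothetical helper: widen a deserialized FloatAttribute to DoubleAttribute.
pir::Attribute RetypeToDouble(pir::IrContext* ctx, pir::Attribute attr) {
  if (auto f = attr.dyn_cast<pir::FloatAttribute>()) {
    return pir::DoubleAttribute::get(ctx, static_cast<double>(f.data()));
  }
  return attr;  // already the target type; leave untouched
}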

paddle/phi/infermeta/spmd_rules/elementwise.cc

Lines changed: 2 additions & 2 deletions
@@ -746,13 +746,13 @@ SpmdInfo ThresholdedReluGradInfoSpmd(const DistMetaTensor& x,
 }
 
 // logit
-SpmdInfo LogitInfoSpmd(const DistMetaTensor& x, const float eps) {
+SpmdInfo LogitInfoSpmd(const DistMetaTensor& x, const double eps) {
   return ElementwiseUnaryInferSpmd(x);
 }
 
 SpmdInfo LogitGradInfoSpmd(const DistMetaTensor& x,
                            const DistMetaTensor& out_grad,
-                           const float eps) {
+                           const double eps) {
   return ElementwiseUnaryGradInferSpmd(x, out_grad);
 }

paddle/phi/infermeta/spmd_rules/elementwise.h

Lines changed: 2 additions & 2 deletions
@@ -124,9 +124,9 @@ SpmdInfo ThresholdedReluGradInfoSpmd(const DistMetaTensor& x,
                                      const float threshold,
                                      const float value);
 
-SpmdInfo LogitInfoSpmd(const DistMetaTensor& x, const float eps);
+SpmdInfo LogitInfoSpmd(const DistMetaTensor& x, const double eps);
 SpmdInfo LogitGradInfoSpmd(const DistMetaTensor& x,
                            const DistMetaTensor& out_grad,
-                           const float eps);
+                           const double eps);
 }  // namespace distributed
 }  // namespace phi

paddle/phi/kernels/activation_grad_kernel.h

Lines changed: 18 additions & 2 deletions
@@ -36,6 +36,14 @@ namespace phi {
                         float attr,                                     \
                         DenseTensor* dx);
 
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX(name, attr) \
+  template <typename T, typename Context>                              \
+  void name##GradKernel(const Context& dev_ctx,                        \
+                        const DenseTensor& x,                          \
+                        const DenseTensor& dout,                       \
+                        double attr,                                   \
+                        DenseTensor* dx);
+
 #define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \
   template <typename T, typename Context>                               \
   void name##GradKernel(const Context& dev_ctx,                         \
@@ -74,6 +82,14 @@ namespace phi {
                         float attr,                                       \
                         DenseTensor* dx);
 
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(name, attr) \
+  template <typename T, typename Context>                                \
+  void name##GradKernel(const Context& dev_ctx,                          \
+                        const DenseTensor& out,                          \
+                        const DenseTensor& dout,                         \
+                        double attr,                                     \
+                        DenseTensor* dx);
+
 #define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \
   template <typename T, typename Context>                                 \
   void name##GradKernel(const Context& dev_ctx,                           \
@@ -318,10 +334,10 @@ DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Ceil);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX(Logit, eps);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, alpha);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(LogitCUDA, eps);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(LogitCUDA, eps);
 
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, t_min, t_max);
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b);
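For concreteness, the new DEPX macro is the float variant with the attribute re-typed, so DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX(Logit, eps) expands to:

template <typename T, typename Context>
void LogitGradKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& dout,
                     double eps,
                     DenseTensor* dx);

and the DEPOUT variant declares the same signature for LogitCUDAGradKernel, with the saved out tensor in place of x.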

paddle/phi/kernels/activation_kernel.h

Lines changed: 8 additions & 1 deletion
@@ -32,6 +32,13 @@ namespace phi {
                     float attr,                                     \
                     DenseTensor* out);
 
+#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_DOUBLE_ATTRS(name, attr) \
+  template <typename T, typename Context>                           \
+  void name##Kernel(const Context& dev_ctx,                         \
+                    const DenseTensor& x,                           \
+                    double attr,                                    \
+                    DenseTensor* out);
+
 #define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \
   template <typename T, typename Context>                            \
   void name##Kernel(const Context& dev_ctx,                          \
@@ -87,7 +94,7 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha)
-DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Logit, eps)
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_DOUBLE_ATTRS(Logit, eps)
 
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardTanh, t_min, t_max)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b)
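Likewise, DECLARE_ACTIVATION_KERNEL_WITH_ONE_DOUBLE_ATTRS(Logit, eps) expands to the forward declaration:

template <typename T, typename Context>
void LogitKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 double eps,
                 DenseTensor* out);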

paddle/phi/kernels/funcs/activation_functor.h

Lines changed: 8 additions & 8 deletions
@@ -554,7 +554,7 @@ struct CosFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct LogitFunctor {
   template <typename Device, typename X, typename Out, typename P>
-  void operator()(Device d, X x, Out out, P p, float eps) const {
+  void operator()(Device d, X x, Out out, P p, double eps) const {
     // logit(x) = ln(x/(1-x))
     auto tmp_x =
         (x.cwiseMin(static_cast<T>(1.0 - eps))).cwiseMax(static_cast<T>(eps));
@@ -1268,7 +1268,7 @@ struct AtanGradFunctor<ComplexType<T>>
 template <typename T>
 struct LogitGradFunctor {
   template <typename Device, typename X, typename dOut, typename dX, typename P>
-  void operator()(Device d, X x, dOut dout, dX dx, P p, float eps) const {
+  void operator()(Device d, X x, dOut dout, dX dx, P p, double eps) const {
     // logit(x)' = 1/(x*(1-x))
     if (!eps) {
       dx.device(d) = (x < static_cast<T>(0.0) || x > static_cast<T>(1.0))
@@ -3422,15 +3422,14 @@ struct SquareGradGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaLogitFunctor : public BaseActivationFunctor<T> {
+  using AttrPair = std::vector<std::pair<const char*, double*>>;
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
 
   MT zero = static_cast<MT>(0.0f);
   MT one = static_cast<MT>(1.0f);
-  float eps;
+  double eps;
 
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"eps", &eps}};
-  }
+  typename CudaLogitFunctor<T>::AttrPair GetAttrs() { return {{"eps", &eps}}; }
 
   // logit(x) = ln(x/(1-x))
   __device__ __forceinline__ T operator()(const T arg_x) const {
@@ -3449,13 +3448,14 @@ struct CudaLogitFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaLogitGradFunctor : public BaseActivationFunctor<T> {
+  using AttrPair = std::vector<std::pair<const char*, double*>>;
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
 
-  float eps;
+  double eps;
   MT zero = static_cast<MT>(0.0f);
   MT one = static_cast<MT>(1.0f);
 
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+  typename CudaLogitGradFunctor<T>::AttrPair GetAttrs() {
     return {{"eps", &eps}};
   }
   // logit(x)' = 1/(x*(1-x))
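The `using AttrPair` lines are the subtle part of this hunk. BaseActivationFunctor<T>::AttrPair is (judging by the code being replaced) a vector of (name, float*) pairs, so a functor whose eps member is now double can no longer hand out its address through the inherited alias. Each CUDA logit functor therefore shadows AttrPair with a double* version and returns its own type from GetAttrs(), which keeps the kernel-side write *(attrs[0].second) = attr; a plain double store. A reduced sketch of the pattern (illustrative, outside Paddle):

#include <utility>
#include <vector>

struct Base {
  // Float-typed by default, as the base class presumably defines it.
  using AttrPair = std::vector<std::pair<const char*, float*>>;
};

struct LogitLike : Base {
  // Shadow the alias so GetAttrs() exposes a double* instead of a float*.
  using AttrPair = std::vector<std::pair<const char*, double*>>;
  double eps;
  AttrPair GetAttrs() { return {{"eps", &eps}}; }
};

int main() {
  LogitLike f;
  auto attrs = f.GetAttrs();
  *(attrs[0].second) = 1e-6;  // stores a full-precision double
  return 0;
}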

paddle/phi/kernels/gpu/activation_grad_kernel.cu

Lines changed: 18 additions & 3 deletions
@@ -163,6 +163,21 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
         dev_ctx, nullptr, &out, &dout, dx, functor);              \
   }
 
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT( \
+    name, functor_class, attr)                                   \
+  template <typename T, typename Context>                        \
+  void name##GradKernel(const Context& dev_ctx,                  \
+                        const DenseTensor& out,                  \
+                        const DenseTensor& dout,                 \
+                        double attr,                             \
+                        DenseTensor* dx) {                       \
+    funcs::functor_class<T> functor;                             \
+    auto attrs = functor.GetAttrs();                             \
+    *(attrs[0].second) = attr;                                   \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>(  \
+        dev_ctx, nullptr, &out, &dout, dx, functor);             \
+  }
+
 #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \
     name, functor_class, attr1, attr2)                    \
   template <typename T, typename Context>                 \
@@ -242,9 +257,9 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu,
                                                CudaCELUGradFunctor,
                                                alpha);
-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(LogitCUDA,
-                                                 CudaLogitGradFunctor,
-                                                 eps);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(LogitCUDA,
+                                                        CudaLogitGradFunctor,
+                                                        eps);
 
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh,
                                                CudaHardTanhGradFunctor,
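Expanding the new macro at its single use site, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(LogitCUDA, CudaLogitGradFunctor, eps) defines:

template <typename T, typename Context>
void LogitCUDAGradKernel(const Context& dev_ctx,
                         const DenseTensor& out,
                         const DenseTensor& dout,
                         double eps,
                         DenseTensor* dx) {
  funcs::CudaLogitGradFunctor<T> functor;
  auto attrs = functor.GetAttrs();  // (name, double*) pairs after the functor change
  *(attrs[0].second) = eps;         // stores eps at full double precision
  ActivationGradGPUImpl<T, Context, funcs::CudaLogitGradFunctor<T>>(
      dev_ctx, nullptr, &out, &dout, dx, functor);
}

The forward-kernel macro in activation_kernel.cu below expands the same way for LogitCUDAKernel.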

paddle/phi/kernels/gpu/activation_kernel.cu

Lines changed: 14 additions & 1 deletion
@@ -74,6 +74,19 @@ void ActivationGPUImpl(const Context& dev_ctx,
         dev_ctx, x, out, functor);                                             \
   }
 
+#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(name, functor_class, attr) \
+  template <typename T, typename Context>                                      \
+  void name##Kernel(const Context& dev_ctx,                                    \
+                    const DenseTensor& x,                                      \
+                    double attr,                                               \
+                    DenseTensor* out) {                                        \
+    funcs::functor_class<T> functor;                                           \
+    auto attrs = functor.GetAttrs();                                           \
+    *(attrs[0].second) = attr;                                                 \
+    ActivationGPUImpl<T, Context, funcs::functor_class<T>>(                    \
+        dev_ctx, x, out, functor);                                             \
+  }
+
 #define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS( \
     name, functor_class, attr1, attr2)        \
   template <typename T, typename Context>     \
@@ -140,7 +153,7 @@ DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Exp, CudaExpFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Expm1, CudaExpm1Functor)
 
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
-DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LogitCUDA, CudaLogitFunctor, eps)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(LogitCUDA, CudaLogitFunctor, eps)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink,
                                      CudaHardShrinkFunctor,
                                      threshold)

paddle/phi/kernels/impl/activation_grad_impl.h

Lines changed: 1 addition & 1 deletion
@@ -235,7 +235,7 @@ template <typename T, typename Context>
 void LogitGradKernel(const Context& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& out_grad,
-                     float eps,
+                     double eps,
                      DenseTensor* x_grad) {
   dev_ctx.template Alloc<T>(x_grad);
