
Commit d2f4afd

[Precision Depth Alignment] fix eps of paddle.logit from float to double (PaddlePaddle#75816)
* accuracy_stable_logit
* add LogitOpTranscriber
* fix coverage
* fix 0.yaml
1 parent 8e58cb9 commit d2f4afd
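The practical motivation, as a sketch: paddle.logit clamps x to [eps, 1 - eps] before computing ln(x / (1 - x)). When eps is carried through the stack as a float, a value written as 1e-6 is first rounded to the nearest representable float, so double-precision kernels clamp against a slightly perturbed bound. A minimal standalone illustration (plain C++, not Paddle code):

#include <cmath>
#include <cstdio>

// logit with clamping to [eps, 1 - eps], evaluated entirely in double.
double logit_clamped(double x, double eps) {
  const double t = std::fmin(std::fmax(x, eps), 1.0 - eps);
  return std::log(t / (1.0 - t));
}

int main() {
  const double x = 1.0;  // forces the clamp at 1 - eps
  std::printf("eps as double: %.17g\n", logit_clamped(x, 1e-6));
  std::printf("eps via float: %.17g\n",
              logit_clamped(x, static_cast<double>(1e-6f)));
  return 0;
}

The two outputs differ in the trailing digits; keeping eps as double end to end removes exactly that drift when results are compared against a double-precision reference.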

17 files changed (+146, -30 lines changed)

paddle/fluid/ir_adaptor/translator/op_translator.cc

Lines changed: 39 additions & 0 deletions
@@ -3958,6 +3958,43 @@ struct SoftPlusOpTranscriber : public OpTranscriber {
   }
 };
 
+struct LogitOpTranscriber : public OpTranscriber {
+  pir::AttributeMap TranslateOpAttribute(
+      pir::IrContext* ctx,
+      const std::string& normalized_op_name,
+      const OpAttributeInfoList& op_attr_infos,
+      const OpDesc& op_desc) override {
+    auto& attribute_translator = AttributeTranslator::instance();
+    auto& op_normalizer = OpNameNormalizer::instance();
+    pir::AttributeMap attribute_map = {};
+
+    for (const auto& info : op_attr_infos) {
+      auto legacy_attr_name =
+          op_normalizer.GetLegacyAttrName(op_desc.Type(), info.name);
+      VLOG(10) << "[op: " << op_desc.Type()
+               << "][attr] from: " << legacy_attr_name << " to: " << info.name;
+      if (op_desc.HasAttr(legacy_attr_name)) {
+        paddle::framework::Attribute legacy_attr =
+            op_desc.GetAttr(legacy_attr_name);
+        VLOG(10) << "attribute in " << op_desc.Type()
+                 << " name: " << legacy_attr_name << " " << legacy_attr.index();
+        pir::Attribute new_attr =
+            attribute_translator(info.type_name, legacy_attr);
+        if (legacy_attr_name == "eps") {
+          new_attr = pir::DoubleAttribute::get(
+              ctx,
+              static_cast<double>(
+                  new_attr.dyn_cast<pir::FloatAttribute>().data()));
+        }
+        attribute_map[info.name] = new_attr;
+      } else {
+        this->HandleNonexistentAttribute(ctx, &attribute_map, info);
+      }
+    }
+    return attribute_map;
+  }
+};
+
 OpTranslator::OpTranslator() {
   pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
@@ -4072,5 +4109,7 @@ OpTranslator::OpTranslator() {
   special_handlers["c_sync_comm_stream"] = SyncCommStreamOpTranscriber();
   special_handlers["softplus"] = SoftPlusOpTranscriber();
   special_handlers["softplus_grad"] = SoftPlusOpTranscriber();
+  special_handlers["logit"] = LogitOpTranscriber();
+  special_handlers["logit_grad"] = LogitOpTranscriber();
 }
 }  // namespace paddle::translator
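A note on the eps branch above: the generic attribute_translator still yields a pir::FloatAttribute for the legacy attribute, and the transcriber immediately rebuilds it as a pir::DoubleAttribute. The widening cast is exact, because every finite float value is representable as a double; a quick standalone check of that claim (illustrative, not Paddle code):

#include <cassert>

int main() {
  const float eps_f = 1e-6f;                        // legacy float attribute
  const double eps_d = static_cast<double>(eps_f);  // widened, as in the transcriber
  assert(static_cast<float>(eps_d) == eps_f);       // round trip loses nothing
  return 0;
}

So the translated program carries exactly the value the old float attribute stored, merely at double width; it does not recover the decimal literal the user originally wrote.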

paddle/fluid/pir/serialize_deserialize/0.yaml renamed to paddle/fluid/pir/serialize_deserialize/patch/0.yaml

Lines changed: 5 additions & 6 deletions
@@ -4,26 +4,25 @@ op_patches:
     - action : modify_attr
       object : beta
       type : pir::DoubleAttribute
-      data : 1.0
     - action : modify_attr
       object : threshold
       type : pir::DoubleAttribute
-      data : 20.0
 - op_name : onednn_op.fused_softplus
   actions:
     - action : modify_attr
       object : beta
       type : pir::DoubleAttribute
-      data : 1.0
     - action : modify_attr
       object : threshold
       type : pir::DoubleAttribute
-      data : 20.0
     - action : modify_attr
       object : fuse_alpha
       type : pir::DoubleAttribute
-      data : 0.0
     - action : modify_attr
       object : fuse_beta
       type : pir::DoubleAttribute
-      data : 0.0
+- op_name : pd_op.logit
+  actions:
+    - action : modify_attr
+      object : eps
+      type : pir::DoubleAttribute
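My reading of this patch (the modify_attr schema is not documented in this diff, so this is an interpretation): old serialized programs store eps of pd_op.logit as a pir::FloatAttribute, and the new entry tells the deserializer to re-type it to pir::DoubleAttribute on load; the softplus entries likewise drop their data fields, presumably so the patch only re-types attributes instead of also overwriting their values. A hypothetical sketch of such a re-type step, reusing only API seen in the transcriber above (names are illustrative):

// Hypothetical helper: widen a deserialized FloatAttribute to DoubleAttribute.
pir::Attribute RetypeToDouble(pir::IrContext* ctx, pir::Attribute attr) {
  if (auto f = attr.dyn_cast<pir::FloatAttribute>()) {
    return pir::DoubleAttribute::get(ctx, static_cast<double>(f.data()));
  }
  return attr;  // already the target type; leave untouched
}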

paddle/phi/infermeta/spmd_rules/elementwise.cc

Lines changed: 2 additions & 2 deletions
@@ -746,13 +746,13 @@ SpmdInfo ThresholdedReluGradInfoSpmd(const DistMetaTensor& x,
 }
 
 // logit
-SpmdInfo LogitInfoSpmd(const DistMetaTensor& x, const float eps) {
+SpmdInfo LogitInfoSpmd(const DistMetaTensor& x, const double eps) {
   return ElementwiseUnaryInferSpmd(x);
 }
 
 SpmdInfo LogitGradInfoSpmd(const DistMetaTensor& x,
                            const DistMetaTensor& out_grad,
-                           const float eps) {
+                           const double eps) {
   return ElementwiseUnaryGradInferSpmd(x, out_grad);
 }

paddle/phi/infermeta/spmd_rules/elementwise.h

Lines changed: 2 additions & 2 deletions
@@ -124,9 +124,9 @@ SpmdInfo ThresholdedReluGradInfoSpmd(const DistMetaTensor& x,
                                      const float threshold,
                                      const float value);
 
-SpmdInfo LogitInfoSpmd(const DistMetaTensor& x, const float eps);
+SpmdInfo LogitInfoSpmd(const DistMetaTensor& x, const double eps);
 SpmdInfo LogitGradInfoSpmd(const DistMetaTensor& x,
                            const DistMetaTensor& out_grad,
-                           const float eps);
+                           const double eps);
 }  // namespace distributed
 }  // namespace phi

paddle/phi/kernels/activation_grad_kernel.h

Lines changed: 18 additions & 2 deletions
@@ -36,6 +36,14 @@ namespace phi {
                         float attr,                                     \
                         DenseTensor* dx);
 
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX(name, attr) \
+  template <typename T, typename Context>                              \
+  void name##GradKernel(const Context& dev_ctx,                        \
+                        const DenseTensor& x,                          \
+                        const DenseTensor& dout,                       \
+                        double attr,                                   \
+                        DenseTensor* dx);
+
 #define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \
   template <typename T, typename Context>                               \
   void name##GradKernel(const Context& dev_ctx,                         \
@@ -74,6 +82,14 @@ namespace phi {
                         float attr,                                       \
                         DenseTensor* dx);
 
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(name, attr) \
+  template <typename T, typename Context>                                \
+  void name##GradKernel(const Context& dev_ctx,                          \
+                        const DenseTensor& out,                          \
+                        const DenseTensor& dout,                         \
+                        double attr,                                     \
+                        DenseTensor* dx);
+
 #define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \
   template <typename T, typename Context>                                 \
   void name##GradKernel(const Context& dev_ctx,                           \
@@ -318,10 +334,10 @@ DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Ceil);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX(Logit, eps);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, alpha);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(LogitCUDA, eps);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(LogitCUDA, eps);
 
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, t_min, t_max);
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b);
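For concreteness, the new DEPX macro is the float variant with the attribute re-typed, so DECLARE_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX(Logit, eps) expands to:

template <typename T, typename Context>
void LogitGradKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& dout,
                     double eps,
                     DenseTensor* dx);

and the DEPOUT variant declares the same signature for LogitCUDAGradKernel, with the saved out tensor in place of x.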

paddle/phi/kernels/activation_kernel.h

Lines changed: 8 additions & 1 deletion
@@ -32,6 +32,13 @@ namespace phi {
                     float attr,                                     \
                     DenseTensor* out);
 
+#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_DOUBLE_ATTRS(name, attr) \
+  template <typename T, typename Context>                           \
+  void name##Kernel(const Context& dev_ctx,                         \
+                    const DenseTensor& x,                           \
+                    double attr,                                    \
+                    DenseTensor* out);
+
 #define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \
   template <typename T, typename Context>                            \
   void name##Kernel(const Context& dev_ctx,                          \
@@ -87,7 +94,7 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha)
-DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Logit, eps)
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_DOUBLE_ATTRS(Logit, eps)
 
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardTanh, t_min, t_max)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b)
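Likewise, DECLARE_ACTIVATION_KERNEL_WITH_ONE_DOUBLE_ATTRS(Logit, eps) expands to the forward declaration:

template <typename T, typename Context>
void LogitKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 double eps,
                 DenseTensor* out);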

paddle/phi/kernels/funcs/activation_functor.h

Lines changed: 8 additions & 8 deletions
@@ -554,7 +554,7 @@ struct CosFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct LogitFunctor {
   template <typename Device, typename X, typename Out, typename P>
-  void operator()(Device d, X x, Out out, P p, float eps) const {
+  void operator()(Device d, X x, Out out, P p, double eps) const {
     // logit(x) = ln(x/(1-x))
     auto tmp_x =
         (x.cwiseMin(static_cast<T>(1.0 - eps))).cwiseMax(static_cast<T>(eps));
@@ -1268,7 +1268,7 @@ struct AtanGradFunctor<ComplexType<T>>
 template <typename T>
 struct LogitGradFunctor {
   template <typename Device, typename X, typename dOut, typename dX, typename P>
-  void operator()(Device d, X x, dOut dout, dX dx, P p, float eps) const {
+  void operator()(Device d, X x, dOut dout, dX dx, P p, double eps) const {
     // logit(x)' = 1/(x*(1-x))
     if (!eps) {
       dx.device(d) = (x < static_cast<T>(0.0) || x > static_cast<T>(1.0))
@@ -3422,15 +3422,14 @@ struct SquareGradGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaLogitFunctor : public BaseActivationFunctor<T> {
+  using AttrPair = std::vector<std::pair<const char*, double*>>;
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
 
   MT zero = static_cast<MT>(0.0f);
   MT one = static_cast<MT>(1.0f);
-  float eps;
+  double eps;
 
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"eps", &eps}};
-  }
+  typename CudaLogitFunctor<T>::AttrPair GetAttrs() { return {{"eps", &eps}}; }
 
   // logit(x) = ln(x/(1-x))
   __device__ __forceinline__ T operator()(const T arg_x) const {
@@ -3449,13 +3448,14 @@ struct CudaLogitFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaLogitGradFunctor : public BaseActivationFunctor<T> {
+  using AttrPair = std::vector<std::pair<const char*, double*>>;
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
 
-  float eps;
+  double eps;
   MT zero = static_cast<MT>(0.0f);
   MT one = static_cast<MT>(1.0f);
 
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+  typename CudaLogitGradFunctor<T>::AttrPair GetAttrs() {
     return {{"eps", &eps}};
   }
   // logit(x)' = 1/(x*(1-x))
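The `using AttrPair` lines are the subtle part of this hunk. BaseActivationFunctor<T>::AttrPair is (judging by the code being replaced) a vector of (name, float*) pairs, so a functor whose eps member is now double can no longer hand out its address through the inherited alias. Each CUDA logit functor therefore shadows AttrPair with a double* version and returns its own type from GetAttrs(), which keeps the kernel-side write *(attrs[0].second) = attr; a plain double store. A reduced sketch of the pattern (illustrative, outside Paddle):

#include <utility>
#include <vector>

struct Base {
  // Float-typed by default, as the base class presumably defines it.
  using AttrPair = std::vector<std::pair<const char*, float*>>;
};

struct LogitLike : Base {
  // Shadow the alias so GetAttrs() exposes a double* instead of a float*.
  using AttrPair = std::vector<std::pair<const char*, double*>>;
  double eps;
  AttrPair GetAttrs() { return {{"eps", &eps}}; }
};

int main() {
  LogitLike f;
  auto attrs = f.GetAttrs();
  *(attrs[0].second) = 1e-6;  // stores a full-precision double
  return 0;
}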

paddle/phi/kernels/gpu/activation_grad_kernel.cu

Lines changed: 18 additions & 3 deletions
@@ -163,6 +163,21 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
         dev_ctx, nullptr, &out, &dout, dx, functor);              \
   }
 
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT( \
+    name, functor_class, attr)                                   \
+  template <typename T, typename Context>                        \
+  void name##GradKernel(const Context& dev_ctx,                  \
+                        const DenseTensor& out,                  \
+                        const DenseTensor& dout,                 \
+                        double attr,                             \
+                        DenseTensor* dx) {                       \
+    funcs::functor_class<T> functor;                             \
+    auto attrs = functor.GetAttrs();                             \
+    *(attrs[0].second) = attr;                                   \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>(  \
+        dev_ctx, nullptr, &out, &dout, dx, functor);             \
+  }
+
 #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \
     name, functor_class, attr1, attr2)                    \
   template <typename T, typename Context>                 \
@@ -242,9 +257,9 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu,
                                                CudaCELUGradFunctor,
                                                alpha);
-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(LogitCUDA,
-                                                 CudaLogitGradFunctor,
-                                                 eps);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(LogitCUDA,
+                                                        CudaLogitGradFunctor,
+                                                        eps);
 
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh,
                                                CudaHardTanhGradFunctor,
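Expanding the new macro at its single use site, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(LogitCUDA, CudaLogitGradFunctor, eps) defines:

template <typename T, typename Context>
void LogitCUDAGradKernel(const Context& dev_ctx,
                         const DenseTensor& out,
                         const DenseTensor& dout,
                         double eps,
                         DenseTensor* dx) {
  funcs::CudaLogitGradFunctor<T> functor;
  auto attrs = functor.GetAttrs();  // (name, double*) pairs after the functor change
  *(attrs[0].second) = eps;         // stores eps at full double precision
  ActivationGradGPUImpl<T, Context, funcs::CudaLogitGradFunctor<T>>(
      dev_ctx, nullptr, &out, &dout, dx, functor);
}

The forward-kernel macro in activation_kernel.cu below expands the same way for LogitCUDAKernel.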

paddle/phi/kernels/gpu/activation_kernel.cu

Lines changed: 14 additions & 1 deletion
@@ -74,6 +74,19 @@ void ActivationGPUImpl(const Context& dev_ctx,
         dev_ctx, x, out, functor);                                             \
   }
 
+#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(name, functor_class, attr) \
+  template <typename T, typename Context>                                      \
+  void name##Kernel(const Context& dev_ctx,                                    \
+                    const DenseTensor& x,                                      \
+                    double attr,                                               \
+                    DenseTensor* out) {                                        \
+    funcs::functor_class<T> functor;                                           \
+    auto attrs = functor.GetAttrs();                                           \
+    *(attrs[0].second) = attr;                                                 \
+    ActivationGPUImpl<T, Context, funcs::functor_class<T>>(                    \
+        dev_ctx, x, out, functor);                                             \
+  }
+
 #define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS( \
     name, functor_class, attr1, attr2)        \
   template <typename T, typename Context>     \
@@ -140,7 +153,7 @@ DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Exp, CudaExpFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Expm1, CudaExpm1Functor)
 
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
-DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LogitCUDA, CudaLogitFunctor, eps)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(LogitCUDA, CudaLogitFunctor, eps)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink,
                                      CudaHardShrinkFunctor,
                                      threshold)

paddle/phi/kernels/impl/activation_grad_impl.h

Lines changed: 1 addition & 1 deletion
@@ -235,7 +235,7 @@ template <typename T, typename Context>
 void LogitGradKernel(const Context& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& out_grad,
-                     float eps,
+                     double eps,
                      DenseTensor* x_grad) {
   dev_ctx.template Alloc<T>(x_grad);
