Skip to content

Commit fa1feb4

Browse files
authored
Revert ""cherry picked operators changes" (#12184)" (#12810)
This reverts commit bf3c349.
1 parent a530497 commit fa1feb4

30 files changed

+108
-328
lines changed

paddle/fluid/operators/activation_op.cu

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@ namespace plat = paddle::platform;
2626
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
2727
ops::grad_functor<float>>, \
2828
ops::ActivationGradKernel<plat::CUDADeviceContext, \
29-
ops::grad_functor<double>>, \
30-
ops::ActivationGradKernel<plat::CUDADeviceContext, \
31-
ops::grad_functor<plat::float16>>);
29+
ops::grad_functor<double>>);
3230

3331
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);

paddle/fluid/operators/activation_op.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
333333
template <typename Device, typename X, typename Out, typename dOut,
334334
typename dX>
335335
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
336-
dx.device(d) = static_cast<T>(0.5) * dout / out;
336+
const Out out_conj = Eigen::numext::conj(out);
337+
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
337338
}
338339
};
339340

@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
739740
typename dX>
740741
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
741742
dx.device(d) = dout * static_cast<T>(factor) *
742-
x.pow(static_cast<T>(factor) - static_cast<T>(1));
743+
x.pow(static_cast<T>(factor - static_cast<T>(1)));
743744
}
744745
};
745746

@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
862863
template <typename Device, typename X, typename Out, typename dOut,
863864
typename dX>
864865
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
865-
T b = static_cast<T>(beta);
866866
auto temp1 = static_cast<T>(1) /
867-
(static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
868-
auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
869-
dx.device(d) = dout * ((b * out) + temp2);
867+
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
868+
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
869+
dx.device(d) = dout * ((beta * out) + temp2);
870870
}
871871
};
872872

paddle/fluid/operators/assign_value_op.cu.cc

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/operators/assign_value_op.h"
16-
#include "paddle/fluid/platform/float16.h"
1716

1817
namespace ops = paddle::operators;
19-
namespace plat = paddle::platform;
2018
REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
21-
ops::AssignValueKernel<float>,
22-
ops::AssignValueKernel<plat::float16>);
19+
ops::AssignValueKernel<float>);

paddle/fluid/operators/conv_cudnn_op.cu.cc

Lines changed: 20 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
3939
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
4040
static_cast<size_t>(1024) * 1024 * 1024;
4141

42-
template <typename T, typename DeviceContext>
43-
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
44-
bool EnableFp16(const DeviceContext& dev_ctx,
45-
cudnnConvolutionDescriptor_t cudnn_conv_desc) {
46-
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
47-
// Tensor core is supported since the volta GPU and
48-
// is only enabled when input and filter data are float16
49-
if (dev_ctx.GetComputeCapability() >= 70 &&
50-
std::type_index(typeid(T)) ==
51-
std::type_index(typeid(platform::float16))) {
52-
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
53-
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
54-
return true;
55-
} else {
56-
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
57-
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
58-
}
59-
#endif
60-
return false;
61-
}
62-
6342
template <typename T>
6443
class CUDNNConvOpKernel : public framework::OpKernel<T> {
6544
public:
@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
149128
cudnnConvolutionFwdAlgo_t algo;
150129
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
151130
auto handle = dev_ctx.cudnn_handle();
152-
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
131+
132+
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
133+
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
134+
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
135+
workspace_size_limit, &algo));
136+
137+
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
138+
// Tensor core is supported since the volta GPU and
139+
// is only enabled when input and filter data are float16
140+
if (dev_ctx.GetComputeCapability() >= 70 &&
141+
std::type_index(typeid(T)) ==
142+
std::type_index(typeid(platform::float16))) {
143+
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
144+
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
145+
// Currently tensor core is only enabled using this algo
153146
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
154147
} else {
155-
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
156-
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
157-
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
158-
workspace_size_limit, &algo));
148+
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
149+
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
159150
}
151+
#endif
160152

161153
// get workspace size able to allocate
162154
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
296288
} else {
297289
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
298290
}
299-
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
300-
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
301-
}
302291

303292
CUDNN_ENFORCE(
304293
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
318307
} else {
319308
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
320309
}
321-
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
322-
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
323-
}
324310

325311
CUDNN_ENFORCE(
326312
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@@ -376,14 +362,12 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
376362
paddle::operators::CUDNNConvOpKernel<plat::float16>);
377363
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
378364
paddle::operators::CUDNNConvGradOpKernel<float>,
379-
paddle::operators::CUDNNConvGradOpKernel<double>,
380-
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
365+
paddle::operators::CUDNNConvGradOpKernel<double>);
381366

382367
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
383368
paddle::operators::CUDNNConvOpKernel<float>,
384369
paddle::operators::CUDNNConvOpKernel<double>,
385370
paddle::operators::CUDNNConvOpKernel<plat::float16>);
386371
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
387372
paddle::operators::CUDNNConvGradOpKernel<float>,
388-
paddle::operators::CUDNNConvGradOpKernel<double>,
389-
paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
373+
paddle::operators::CUDNNConvGradOpKernel<double>);

paddle/fluid/operators/cross_entropy_op.cu

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/operators/cross_entropy_op.h"
16-
#include "paddle/fluid/platform/float16.h"
1716

1817
namespace ops = paddle::operators;
19-
namespace plat = paddle::platform;
2018
using CUDACtx = paddle::platform::CUDADeviceContext;
2119
REGISTER_OP_CUDA_KERNEL(cross_entropy,
2220
ops::CrossEntropyOpKernel<CUDACtx, float>,
23-
ops::CrossEntropyOpKernel<CUDACtx, double>,
24-
ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
25-
REGISTER_OP_CUDA_KERNEL(
26-
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
27-
ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
28-
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
21+
ops::CrossEntropyOpKernel<CUDACtx, double>);
22+
REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
23+
ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
24+
ops::CrossEntropyGradientOpKernel<CUDACtx, double>);

paddle/fluid/operators/elementwise_add_op.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL(
3030
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
3131
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
3232
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
33-
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
34-
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
33+
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);

paddle/fluid/operators/elementwise_div_op.cu

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,24 +14,19 @@ limitations under the License. */
1414

1515
#define EIGEN_USE_GPU
1616
#include "paddle/fluid/operators/elementwise_div_op.h"
17-
#include "paddle/fluid/platform/float16.h"
1817

1918
namespace ops = paddle::operators;
20-
namespace plat = paddle::platform;
2119

2220
REGISTER_OP_CUDA_KERNEL(
2321
elementwise_div,
2422
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
2523
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
2624
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
27-
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
28-
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
29-
plat::float16>);
25+
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
3026
REGISTER_OP_CUDA_KERNEL(
3127
elementwise_div_grad,
3228
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
3329
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
3430
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
35-
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
3631
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
37-
plat::float16>);
32+
int64_t>);

paddle/fluid/operators/elementwise_mul_op.cu

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,25 +14,19 @@ limitations under the License. */
1414

1515
#define EIGEN_USE_GPU
1616
#include "paddle/fluid/operators/elementwise_mul_op.h"
17-
#include "paddle/fluid/platform/float16.h"
1817

1918
namespace ops = paddle::operators;
20-
namespace plat = paddle::platform;
2119

2220
REGISTER_OP_CUDA_KERNEL(
2321
elementwise_mul,
2422
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
2523
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
2624
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
27-
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
28-
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
29-
plat::float16>);
25+
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
3026
REGISTER_OP_CUDA_KERNEL(
3127
elementwise_mul_grad,
3228
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
3329
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
3430
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
35-
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
36-
plat::float16>,
3731
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
3832
int64_t>);

paddle/fluid/operators/elementwise_op_function.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
350350
int j = blockIdx.x;
351351
int i = threadIdx.x;
352352
int tid = threadIdx.x;
353-
T val(0);
353+
T val = 0;
354354

355355
do {
356356
int x_offset = i * w + j;
@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
418418
int tid = threadIdx.x;
419419
int j = blockIdx.x;
420420

421-
T val(0);
421+
T val = 0;
422422
int ttid = tid;
423423

424424
while (true) {

paddle/fluid/operators/elementwise_sub_op.cu

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,25 +14,19 @@ limitations under the License. */
1414

1515
#define EIGEN_USE_GPU
1616
#include "paddle/fluid/operators/elementwise_sub_op.h"
17-
#include "paddle/fluid/platform/float16.h"
1817

1918
namespace ops = paddle::operators;
20-
namespace plat = paddle::platform;
2119

2220
REGISTER_OP_CUDA_KERNEL(
2321
elementwise_sub,
2422
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
2523
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
2624
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
27-
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
28-
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
29-
plat::float16>);
25+
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
3026
REGISTER_OP_CUDA_KERNEL(
3127
elementwise_sub_grad,
3228
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
3329
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
3430
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
35-
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
36-
plat::float16>,
3731
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
3832
int64_t>);

0 commit comments

Comments
 (0)