PaddlePaddle
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
Lines changed: 15 additions & 4 deletions
diff --git a/paddle/operators/batch_norm_op.cu b/paddle/operators/batch_norm_op.cu.cc
diff --git a/paddle/operators/concat_op.cu b/paddle/operators/concat_op.cu.cc
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cu b/paddle/operators/conv2d_transpose_cudnn_op.cu.cc
Lines changed: 3 additions & 6 deletions
diff --git a/paddle/operators/conv_cudnn_op.cu b/paddle/operators/conv_cudnn_op.cu.cc
diff --git a/paddle/operators/conv_op.cu b/paddle/operators/conv_op.cu.cc
diff --git a/paddle/operators/conv_transpose_op.cu b/paddle/operators/conv_transpose_op.cu.cc
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
Lines changed: 0 additions & 2 deletions
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
Lines changed: 1 addition & 1 deletion
diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu.cc
Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,7 @@ function(op_library TARGET)
     set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
     set(cc_srcs)
     set(cu_srcs)
+    set(cu_cc_srcs)
     set(op_common_deps operator op_registry math_function)
     set(options "")
     set(oneValueArgs "")
@@ -22,13 +23,18 @@ function(op_library TARGET)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
             list(APPEND cc_srcs ${TARGET}.cc)
         endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+            list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
+        endif()
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
             list(APPEND cu_srcs ${TARGET}.cu)
         endif()
     else()
         foreach(src ${op_library_SRCS})
             if (${src} MATCHES ".*\\.cu$")
                 list(APPEND cu_srcs ${src})
+            elseif(${src} MATCHES ".*\\.cu\\.cc$")  # both dots escaped: only a literal ".cu.cc" suffix matches
+                list(APPEND cu_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cc$")
                 list(APPEND cc_srcs ${src})
             else()
@@ -43,7 +49,7 @@ function(op_library TARGET)
     endif()
 
     if (WITH_GPU)
-        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
     else()
         cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
@@ -140,7 +146,9 @@ function(op_library TARGET)
 
     # pybind USE_CPU_ONLY_OP
     list(LENGTH cu_srcs cu_srcs_len)
-    if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0)
+    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
+
+    if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
     endif()
@@ -160,11 +168,12 @@ set(DEPS_OPS
     recurrent_op
     dynamic_recurrent_op
     softmax_with_cross_entropy_op
+    softmax_op
+    sequence_softmax_op
     sum_op
     pool_op
     pool_with_index_op
     conv_op
-    lstm_op
     conv_transpose_op
     nccl_op
     sequence_conv_op
@@ -182,6 +191,8 @@ set(DEPS_OPS
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
+op_library(softmax_op DEPS softmax)
+op_library(sequence_softmax_op DEPS softmax)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
 op_library(adagrad_op DEPS selected_rows_functor)
@@ -225,6 +236,6 @@ cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
         rnn/recurrent_op_utils.cc
         DEPS dynamic_recurrent_op)
 if(WITH_GPU)
-  nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context)
+  cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
@@ -200,9 +200,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      math::set_constant(ctx.device_context(), input_grad, 0);
 
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_output_desc, output_grad_data,
@@ -214,9 +212,8 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*filter_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      math::set_constant(ctx.device_context(), filter_grad, 0);
+
       // Gradient with respect to the filter
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
           handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
 
@@ -23,8 +23,6 @@ template <typename T>
 __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                            const int64_t* label, const int N,
                                            const int D) {
-  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
-  // CUDA_1D_KERNEL_LOOP(i, N) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     int idx = i * D + label[i];
 
@@ -12,8 +12,8 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_constant_batch_size_like_op.h"
+#include "paddle/framework/op_registry.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
 
@@ -12,8 +12,8 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_zeros_like_op.h"
+#include "paddle/framework/op_registry.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(