
Commit 1d91a49

Author: chengduo

Some trivial optimization (#13530)

* some trivial opt
* remove the fix of lod_tensor and shrink_rnn_memory_op
* refine ShrinkRNNMemoryOp

test=develop
1 parent 5093afc commit 1d91a49

File tree: 10 files changed, +116 -44 lines


paddle/fluid/framework/op_info.h

Lines changed: 11 additions & 6 deletions
@@ -38,27 +38,31 @@ struct OpInfo {
   OpAttrChecker* checker_{nullptr};
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
+  std::string op_type_;
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
   }
 
   const proto::OpProto& Proto() const {
-    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered",
+                            op_type_);
     PADDLE_ENFORCE(proto_->IsInitialized(),
-                   "Operator Proto must be initialized in op info");
+                   "Operator %s Proto must be initialized in op info",
+                   op_type_);
     return *proto_;
   }
 
   const OpCreator& Creator() const {
-    PADDLE_ENFORCE_NOT_NULL(creator_,
-                            "Operator Creator has not been registered");
+    PADDLE_ENFORCE_NOT_NULL(
+        creator_, "Operator %s Creator has not been registered", op_type_);
     return creator_;
   }
 
   const GradOpMakerFN& GradOpMaker() const {
     PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
-                            "Operator GradOpMaker has not been registered.");
+                            "Operator %s GradOpMaker has not been registered.",
+                            op_type_);
     return grad_op_maker_;
   }
 
@@ -73,8 +77,9 @@ class OpInfoMap {
     return map_.find(op_type) != map_.end();
   }
 
-  void Insert(const std::string& type, const OpInfo& info) {
+  void Insert(const std::string& type, OpInfo info) {
     PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    info.op_type_ = type;
     map_.insert({type, info});
   }
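Reduced to a hedged standalone sketch (illustrative names, not PaddlePaddle types): the registry takes the info struct by value and stamps the key into it at Insert time, so accessors can name the offending operator in their errors without the caller passing the type again.

// Minimal sketch of the Insert-time stamping pattern; Info and Registry
// are hypothetical stand-ins, not PaddlePaddle APIs.
#include <map>
#include <stdexcept>
#include <string>

struct Info {
  const int* proto = nullptr;
  std::string op_type;  // filled in by the registry, like op_type_ above

  const int& Proto() const {
    if (proto == nullptr) {
      // The stored key makes the failure self-identifying.
      throw std::runtime_error("Operator " + op_type +
                               " Proto has not been registered");
    }
    return *proto;
  }
};

class Registry {
 public:
  // Pass-by-value (as in the new Insert signature) lets the registry
  // mutate the stored copy without touching the caller's object.
  void Insert(const std::string& type, Info info) {
    info.op_type = type;
    map_.insert({type, info});
  }

 private:
  std::map<std::string, Info> map_;
};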

paddle/fluid/operators/read_op.cc

Lines changed: 2 additions & 0 deletions
@@ -45,10 +45,12 @@ class ReadInferVarType : public framework::VarTypeInference {
     framework::VarDesc* reader = block->FindVarRecursive(reader_name);
     auto dtypes = reader->GetDataTypes();
     PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
+    auto lod_levels = reader->GetLoDLevels();
     for (size_t i = 0; i < dtypes.size(); ++i) {
       framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
       out.SetType(framework::proto::VarType::LOD_TENSOR);
       out.SetDataType(dtypes[i]);
+      out.SetLoDLevel(lod_levels[i]);
     }
   }
 };

paddle/fluid/operators/sgd_op.cu

Lines changed: 21 additions & 20 deletions
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
+#include <algorithm>
 #include "paddle/fluid/operators/sgd_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
   }
 }
 
-template <typename T, int block_size>
+template <typename T>
 __global__ void SparseSGDFunctorKernel(const T* selected_rows,
                                        const int64_t* rows,
                                        const T* learning_rate, T* tensor_out,
-                                       int64_t row_numel) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  selected_rows += ty * row_numel;
-  tensor_out += rows[ty] * row_numel;
-
-  for (int index = tid; index < row_numel; index += block_size) {
-    // Since index in rows of SelectedRows can be duplicate, we have to use
-    // Atomic Operation to avoid concurrent write error.
-    paddle::platform::CudaAtomicAdd(
-        tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
+                                       int64_t row_numel, int64_t limit) {
+  for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) {
+    const T* selected_rows_ptr = selected_rows + i * row_numel;
+    T* tensor_out_ptr = tensor_out + rows[i] * row_numel;
+    for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) {
+      // Since index in rows of SelectedRows can be duplicate, we have to use
+      // Atomic Operation to avoid concurrent write error.
+      paddle::platform::CudaAtomicAdd(
+          tensor_out_ptr + index,
+          -1.0 * learning_rate[0] * selected_rows_ptr[index]);
+    }
   }
 }
 }  // namespace
@@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
     auto* in_data = in_value.data<T>();
     auto* out_data = param_out->data<T>();
 
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(1, in_rows.size());
-    SparseSGDFunctorKernel<
-        T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+    const int kThreadsPerBlock = 256;
+    int thread_x = kThreadsPerBlock;
+    int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount();
+    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+
+    SparseSGDFunctorKernel<<<max_blocks, thread_x, 0,
+                             ctx.cuda_device_context().stream()>>>(
        in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
-        out_data, in_row_numel);
+        out_data, in_row_numel, in_rows.size());
 
   } else {
     PADDLE_THROW("Unsupported Variable Type of Grad");
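The rewrite replaces the old one-block-per-row launch (dim3 grid(1, in_rows.size()), which is bounded by the grid's maximum y dimension of 65535 rows) with a loop where blocks stride over rows and threads stride over the elements of a row, so the block count can be capped at the device's physical thread capacity. A minimal standalone CUDA sketch of that pattern, with illustrative names rather than the Paddle kernel itself:

// Blocks stride over rows (blockIdx.x, step gridDim.x); threads stride over
// elements within a row (threadIdx.x, step blockDim.x). Any grid size works.
__global__ void ScatterScaledRows(const float* src, const int64_t* row_ids,
                                  const float* lr, float* dst,
                                  int64_t row_numel, int64_t num_rows) {
  for (int64_t i = blockIdx.x; i < num_rows; i += gridDim.x) {
    const float* src_row = src + i * row_numel;
    float* dst_row = dst + row_ids[i] * row_numel;
    for (int64_t j = threadIdx.x; j < row_numel; j += blockDim.x) {
      // row_ids may contain duplicates, so the update must be atomic.
      atomicAdd(dst_row + j, -lr[0] * src_row[j]);
    }
  }
}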

paddle/fluid/operators/shrink_rnn_memory_op.cc

Lines changed: 21 additions & 8 deletions
@@ -52,16 +52,26 @@ class ShrinkRNNMemoryOp : public ArrayOp {
     size_t height = dst_num_rows;
 
     // do shrink for the top level LoD
+
     if (x_tensor.lod().size() > 0 &&
         x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
-      auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0,
-                                                              dst_num_rows, 0);
-      height = lod_offset.second.second;
-      auto out_lod = out_tensor.mutable_lod();
-      framework::AppendLoD(out_lod, lod_offset.first);
+      if (x_tensor.lod().size() > 1) {  // MultiLevel LoD
+        auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x_tensor.lod(), 0, dst_num_rows, 0);
+        height = lod_offset.second.second;
+        auto out_lod = out_tensor.mutable_lod();
+        framework::AppendLoD(out_lod, lod_offset.first);
+      } else {
+        // Shrink LoD
+        auto lod_item = x_tensor.lod()[0];
+        lod_item.resize(dst_num_rows + 1);
+        out_tensor.set_lod({lod_item});
+        const auto &const_lod_item = lod_item;
+        height = const_lod_item.back();
+      }
     }
 
-    if (dst_num_rows != 0) {
+    if (height != 0) {
       out_tensor.mutable_data(place, x_tensor.type());
       auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx,
@@ -134,8 +144,11 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
     } else {
       auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
       auto height = dout_tensor.dims()[0];
-      auto slice = dx_tensor.Slice(0, static_cast<int>(height));
-      framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
+      if (height != 0) {
+        auto slice = dx_tensor.Slice(0, static_cast<int>(height));
+        framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx,
+                              &slice);
+      }
       if (dx_tensor.dims()[0] > height) {
         auto rest_tensor = dx_tensor.Slice(
            static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
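For the new single-level branch, a hedged worked example of what "Shrink LoD" computes, using a plain std::vector in place of framework::LoD:

// Offsets {0, 3, 5, 9} describe 3 sequences covering 9 rows. Keeping the
// first dst_num_rows = 2 sequences truncates the offsets to {0, 3, 5},
// and the new height (rows to copy) is the last offset, 5.
#include <cassert>
#include <vector>

int main() {
  std::vector<size_t> lod_item = {0, 3, 5, 9};
  size_t dst_num_rows = 2;
  lod_item.resize(dst_num_rows + 1);  // -> {0, 3, 5}
  size_t height = lod_item.back();    // 5 rows survive the shrink
  assert(height == 5);
  return 0;
}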

paddle/fluid/platform/device_context.cc

Lines changed: 5 additions & 0 deletions
@@ -201,6 +201,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   compute_capability = GetCUDAComputeCapability(place_.device);
   multi_process = GetCUDAMultiProcessors(place_.device);
   max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
+  grid_max_dims_ = GpuMaxGridDim(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);
@@ -239,6 +240,10 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
   return multi_process * max_threads_per_mp;
 }
 
+std::tuple<int, int, int> CUDADeviceContext::GetMaxGridDims() const {
+  return grid_max_dims_;
+}
+
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
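The context queries the device limit once at construction and serves it from a member afterwards, so GetMaxGridDims is a cheap cached-tuple accessor rather than a runtime call per launch. A hedged sketch of that caching shape (QueryGridLimits and DeviceLimits are hypothetical stand-ins, not Paddle types):

#include <tuple>

// Hypothetical stand-in for the cudaDeviceGetAttribute queries in gpu_info.cc.
static std::tuple<int, int, int> QueryGridLimits(int /*device_id*/) {
  return std::make_tuple(2147483647, 65535, 65535);  // typical modern-GPU grid limits
}

class DeviceLimits {
 public:
  // Query once; the attribute never changes for a given device.
  explicit DeviceLimits(int device_id)
      : grid_max_dims_(QueryGridLimits(device_id)) {}

  std::tuple<int, int, int> GetMaxGridDims() const { return grid_max_dims_; }

 private:
  std::tuple<int, int, int> grid_max_dims_;
};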

paddle/fluid/platform/device_context.h

Lines changed: 5 additions & 0 deletions
@@ -13,6 +13,7 @@ limitations under the License. */
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
+#include <tuple>
 #include <unordered_map>
 #include <vector>
 
@@ -91,6 +92,8 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief Return the max physical thread count in the device context */
   int GetMaxPhysicalThreadCount() const;
 
+  std::tuple<int, int, int> GetMaxGridDims() const;
+
   /*! \brief Return eigen device in the device context. */
   Eigen::GpuDevice* eigen_device() const;
 
@@ -135,6 +138,8 @@ class CUDADeviceContext : public DeviceContext {
   cudaStream_t stream_;
   cublasHandle_t cublas_handle_;
 
+  std::tuple<int, int, int> grid_max_dims_;
+
   int compute_capability;
   int multi_process;
   int max_threads_per_mp;

paddle/fluid/platform/for_range.h

Lines changed: 29 additions & 10 deletions
@@ -48,35 +48,54 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
 }
 
 template <typename Function>
-__global__ static void ForRangeElemwiseOp(Function func, int limit) {
+__global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
   size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
   if (idx < limit) {
     func(idx);
   }
 }
 
+template <typename Function>
+__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit,
+                                                   int grid_dim) {
+  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+  while (idx < limit) {
+    func(idx);
+    idx += grid_dim;
+  }
+}
+
 template <>
 struct ForRange<CUDADeviceContext> {
   ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
+      : dev_ctx_(dev_ctx), limit_(limit) {}
 
   template <typename Function>
   inline void operator()(Function func) const {
     constexpr int num_threads = 1024;
     int block_size = limit_ <= num_threads ? limit_ : num_threads;
-    int grid_size = (limit_ + num_threads - 1) / num_threads;
-
-    if (grid_size == 1) {
-      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
-          func);
+    size_t grid_size = (limit_ + num_threads - 1) / num_threads;
+
+    int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims());
+
+    if (grid_size < max_grid_dim) {
+      int grid_size_int = static_cast<int>(grid_size);
+      if (grid_size == 1) {
+        ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
+            func);
+      } else {
+        ForRangeElemwiseOp<<<grid_size_int, block_size, 0, dev_ctx_.stream()>>>(
+            func, limit_);
+      }
     } else {
-      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
-          func, limit_);
+      ForRangeElemwiseOpGridLarge<<<max_grid_dim, block_size, 0,
+                                    dev_ctx_.stream()>>>(func, limit_,
+                                                         max_grid_dim);
    }
   }
 
   const CUDADeviceContext& dev_ctx_;
-  int limit_;
+  size_t limit_;
 };
 
 #endif
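ForRangeElemwiseOpGridLarge advances each thread by grid_dim, the number of launched blocks. For reference, the widely used grid-stride convention advances by the total number of launched threads (gridDim.x * blockDim.x), so every index in [0, limit) is visited exactly once; a minimal sketch of that variant, with an illustrative body standing in for func(idx):

// Canonical grid-stride loop: stride = total threads in the launch.
__global__ void ForRangeGridStride(float* data, size_t limit) {
  size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < limit; idx += stride) {
    data[idx] *= 2.0f;  // stand-in for func(idx)
  }
}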

paddle/fluid/platform/gpu_info.cc

Lines changed: 17 additions & 0 deletions
@@ -152,5 +152,22 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
   PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
                  "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
 }
+
+std::tuple<int, int, int> GpuMaxGridDim(int id) {
+  std::tuple<int, int, int> result;
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&std::get<0>(result), cudaDevAttrMaxBlockDimX, id),
+      "cudaDeviceGetAttribute failed in "
+      "cudaDevAttrMaxBlockDim");
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&std::get<1>(result), cudaDevAttrMaxBlockDimY, id),
+      "cudaDeviceGetAttribute failed in "
+      "cudaDevAttrMaxBlockDim");
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&std::get<2>(result), cudaDevAttrMaxBlockDimZ, id),
+      "cudaDeviceGetAttribute failed in "
+      "cudaDevAttrMaxBlockDim");
+  return result;
+}
 }  // namespace platform
 }  // namespace paddle
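cudaDeviceGetAttribute also exposes the grid-dimension limits, through cudaDevAttrMaxGridDimX/Y/Z, alongside the block-dimension attributes queried above. A standalone sketch using the plain CUDA runtime API (no PADDLE_ENFORCE) that reads one attribute of each family for device 0:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int block_x = 0, grid_x = 0;
  cudaDeviceGetAttribute(&block_x, cudaDevAttrMaxBlockDimX, 0);
  cudaDeviceGetAttribute(&grid_x, cudaDevAttrMaxGridDimX, 0);
  // On most modern GPUs this prints 1024 and 2147483647 respectively.
  std::printf("max block dim x: %d, max grid dim x: %d\n", block_x, grid_x);
  return 0;
}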

paddle/fluid/platform/gpu_info.h

Lines changed: 3 additions & 0 deletions
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <cuda_runtime.h>
 #include <stddef.h>
 #include <string>
+#include <tuple>
 
 namespace paddle {
 namespace platform {
@@ -72,6 +73,8 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
 //! Set memory dst with value count size asynchronously
 void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
 
+std::tuple<int, int, int> GpuMaxGridDim(int id);
+
 }  // namespace platform
 }  // namespace paddle

python/paddle/fluid/layers/io.py

Lines changed: 2 additions & 0 deletions
@@ -311,6 +311,7 @@ def _copy_reader_var_(block, var):
     new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
     new_var.desc.set_shapes(var.desc.shapes())
     new_var.desc.set_dtypes(var.desc.dtypes())
+    new_var.desc.set_lod_levels(var.desc.lod_levels())
     new_var.persistable = True
     return new_var
 
@@ -632,6 +633,7 @@ def py_reader(capacity,
         })
 
     startup_var.desc.set_dtypes(dtypes)
+    startup_var.desc.set_lod_levels(lod_levels)
     startup_var.persistable = True
 
     main_prog_var = _copy_reader_var_(default_main_program().current_block(),
