bugfix lod cpu performance (#12297)

Superjomn · web-flow · commit 02cf54d331f4 · 2018-07-23T15:02:31.000+08:00
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
@@ -26,6 +26,7 @@
 namespace paddle {
 namespace framework {
 
+#if defined(PADDLE_WITH_CUDA)
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -37,11 +38,11 @@ class Vector {
   Vector() { InitEmpty(); }
 
   // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T& value = T()) {
+  explicit Vector(size_t count, const T &value = T()) {
     InitEmpty();
     if (count != 0) {
       resize(count);
-      T* ptr = begin();
+      T *ptr = begin();
       for (size_t i = 0; i < count; ++i) {
         ptr[i] = value;
       }
@@ -59,7 +60,7 @@ class Vector {
 
   // implicit cast from std::vector.
   template <typename U>
-  Vector(const std::vector<U>& dat) {  // NOLINT
+  Vector(const std::vector<U> &dat) {  // NOLINT
     if (dat.size() == 0) {
       InitEmpty();
     } else {
@@ -68,10 +69,10 @@ class Vector {
   }
 
   // Copy ctor
-  Vector(const Vector<T>& other) { this->operator=(other); }
+  Vector(const Vector<T> &other) { this->operator=(other); }
 
   // Copy operator
-  Vector<T>& operator=(const Vector<T>& other) {
+  Vector<T> &operator=(const Vector<T> &other) {
     if (other.size() != 0) {
       this->InitByIter(other.size(), other.begin(), other.end());
     } else {
@@ -81,7 +82,7 @@ class Vector {
   }
 
   // Move ctor
-  Vector(Vector<T>&& other) {
+  Vector(Vector<T> &&other) {
     this->size_ = other.size_;
     this->flag_ = other.flag_;
     if (other.cuda_vec_.memory_size()) {
@@ -93,57 +94,57 @@ class Vector {
   }
 
   // CPU data access method. Mutable.
-  T& operator[](size_t i) {
+  T &operator[](size_t i) {
     MutableCPU();
-    return const_cast<T*>(cpu_vec_.data<T>())[i];
+    return const_cast<T *>(cpu_vec_.data<T>())[i];
   }
 
   // CPU data access method. Immutable.
-  const T& operator[](size_t i) const {
+  const T &operator[](size_t i) const {
     ImmutableCPU();
     return cpu_vec_.data<T>()[i];
   }
 
   // std::vector iterator methods. Based on CPU data access method
   size_t size() const { return size_; }
 
-  T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
 
-  T* end() {
+  T *end() {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
   }
 
-  T& front() { return *begin(); }
+  T &front() { return *begin(); }
 
-  T& back() {
+  T &back() {
     auto it = end();
     --it;
     return *it;
   }
 
-  const T* begin() const {
+  const T *begin() const {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
   }
 
-  const T* end() const {
+  const T *end() const {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
   }
 
-  const T* cbegin() const { return begin(); }
+  const T *cbegin() const { return begin(); }
 
-  const T* cend() const { return end(); }
+  const T *cend() const { return end(); }
 
-  const T& back() const {
+  const T &back() const {
     auto it = end();
     --it;
     return *it;
   }
 
-  T* data() { return begin(); }
+  T *data() { return begin(); }
 
-  const T* data() const { return begin(); }
+  const T *data() const { return begin(); }
 
-  const T& front() const { return *begin(); }
+  const T &front() const { return *begin(); }
   // end of std::vector iterator methods
 
   // assign this from iterator.
@@ -169,7 +170,7 @@ class Vector {
   void Extend(It begin, It end) {
     size_t pre_size = size_;
     resize(pre_size + (end - begin));
-    T* ptr = this->begin() + pre_size;
+    T *ptr = this->begin() + pre_size;
     for (; begin < end; ++begin, ++ptr) {
       *ptr = *begin;
     }
@@ -183,9 +184,9 @@ class Vector {
       MutableCPU();
       Tensor cpu_tensor;
       platform::Place cpu = platform::CPUPlace();
-      T* ptr = cpu_tensor.mutable_data<T>(
+      T *ptr = cpu_tensor.mutable_data<T>(
           framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T* old_ptr =
+      const T *old_ptr =
           cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
       if (old_ptr != nullptr) {
         std::copy(old_ptr, old_ptr + size_, ptr);
@@ -196,18 +197,18 @@ class Vector {
   }
 
   // get cuda ptr. immutable
-  const T* CUDAData(platform::Place place) const {
+  const T *CUDAData(platform::Place place) const {
     PADDLE_ENFORCE(platform::is_gpu_place(place),
                    "CUDA Data must on CUDA place");
     ImmutableCUDA(place);
     return cuda_vec_.data<T>();
   }
 
   // get cuda ptr. mutable
-  T* CUDAMutableData(platform::Place place) {
-    const T* ptr = CUDAData(place);
+  T *CUDAMutableData(platform::Place place) {
+    const T *ptr = CUDAData(place);
     flag_ = kDirty | kDataInCUDA;
-    return const_cast<T*>(ptr);
+    return const_cast<T *>(ptr);
   }
 
   // clear
@@ -228,7 +229,7 @@ class Vector {
   }
 
   // the unify method to access CPU or CUDA data. immutable.
-  const T* Data(platform::Place place) const {
+  const T *Data(platform::Place place) const {
     if (platform::is_gpu_place(place)) {
       return CUDAData(place);
     } else {
@@ -237,7 +238,7 @@ class Vector {
   }
 
   // the unify method to access CPU or CUDA data. mutable.
-  T* MutableData(platform::Place place) {
+  T *MutableData(platform::Place place) {
     if (platform::is_gpu_place(place)) {
       return CUDAMutableData(place);
     } else {
@@ -253,7 +254,7 @@ class Vector {
     return result;
   }
 
-  bool operator==(const Vector<T>& other) const {
+  bool operator==(const Vector<T> &other) const {
     if (size() != other.size()) return false;
     auto it1 = cbegin();
     auto it2 = other.cbegin();
@@ -274,7 +275,7 @@ class Vector {
   template <typename Iter>
   void InitByIter(size_t size, Iter begin, Iter end) {
     platform::Place cpu = platform::CPUPlace();
-    T* ptr = this->cpu_vec_.template mutable_data<T>(
+    T *ptr = this->cpu_vec_.template mutable_data<T>(
         framework::make_ddim({static_cast<int64_t>(size)}), cpu);
     for (size_t i = 0; i < size; ++i) {
       *ptr++ = *begin++;
@@ -368,7 +369,7 @@ class Vector {
     }
   }
 
-  static T& EmptyDummy() {
+  static T &EmptyDummy() {
     static T dummy = T();
     return dummy;
   }
@@ -379,5 +380,53 @@ class Vector {
   size_t size_;
 };
 
-}  // namespace framework
+#else  // PADDLE_WITH_CUDA
+
+// CPU-only counterpart of Vector<T>, built directly on std::vector<T>.
+template <typename T>
+class CPUVector : public std::vector<T, std::allocator<T>> {
+ public:
+  CPUVector() : std::vector<T>() {}
+  CPUVector(size_t count, const T &value = T())
+      : std::vector<T>(count, value) {}
+  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
+  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}
+  explicit CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
+  CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
+  CPUVector(std::vector<T> &&other) : std::vector<T>(std::move(other)) {}
+  // Defer to std::vector::operator= (safe on self-assignment, unlike assign()).
+  CPUVector &operator=(const CPUVector &other) {
+    std::vector<T>::operator=(other);
+    return *this;
+  }
+  CPUVector &operator=(const std::vector<T> &other) {
+    std::vector<T>::operator=(other);
+    return *this;
+  }
+
+  // Writes every element followed by a single space.
+  friend std::ostream &operator<<(std::ostream &os, const CPUVector<T> &other) {
+    for (const auto &v : other) os << v << " ";
+    return os;
+  }
+
+  // The qualified call is required: this->resize(size) would recurse forever.
+  void resize(size_t size) { std::vector<T>::resize(size); }
+  // Bounds-checked access: at() throws std::out_of_range on a bad index.
+  T &operator[](size_t id) { return this->at(id); }
+  const T &operator[](size_t id) const { return this->at(id); }
+
+  // Appends [begin, end); insert() grows the storage as needed.
+  template <typename D>
+  void Extend(const D &begin, const D &end) {
+    this->insert(this->end(), begin, end);
+  }
+};
+
+template <typename T>
+using Vector = CPUVector<T>;  // CPU-only builds expose CPUVector as Vector.
+
+#endif  // PADDLE_WITH_CUDA
+
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
@@ -293,11 +293,18 @@ class AdamOpKernel : public framework::OpKernel<T> {
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
       int64_t* rows = nullptr;
+// CUDAMutableData() exists only on the CUDA build of Vector, so the GPU branch
+// is compiled out when PADDLE_WITH_CUDA is not defined.
+#if defined(PADDLE_WITH_CUDA)
       if (platform::is_gpu_place(ctx.GetPlace())) {
         rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
       } else {
+#endif
         rows = grad_merge.mutable_rows()->data();
+
+#if defined(PADDLE_WITH_CUDA)
       }
+#endif
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
 
       SparseAdamFunctor<T> functor(
diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h
@@ -106,7 +106,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
     int64_t k = x->dims()[2];
 
     auto x_lod = x->lod().back();
+#if defined(PADDLE_WITH_CUDA)
     size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
+#else
+    size_t* x_lod_data = x_lod.data();
+#endif
 
     TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
                                        mismatch_value, n, m, p, k, out_data,
@@ -121,7 +125,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
       const int* neg_idx_data = neg_indices->data<int>();
       auto neg_lod = neg_indices->lod().back();
+#if defined(PADDLE_WITH_CUDA)
       size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
+#else
+      size_t* neg_lod_data = neg_lod.data();
+#endif
       NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
       neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
                       mismatch_value, out_data, out_wt_data);
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
@@ -78,7 +78,7 @@ class LoDTensor2BatchFunctor {
     auto lods = lod_tensor.lod();
     PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
 
-    auto lod = lods[0];
+    const auto& lod = lods[0];
 
     std::vector<SeqInfo> seq_info;
     for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {