Skip to content

Commit 02cf54d

Browse files
authored
bugfix lod cpu performance (#12297)
1 parent b41f8b9 commit 02cf54d

File tree

4 files changed

+99
-35
lines changed

4 files changed

+99
-35
lines changed

paddle/fluid/framework/mixed_vector.h

Lines changed: 83 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
namespace paddle {
2727
namespace framework {
2828

29+
#if defined(PADDLE_WITH_CUDA)
2930
// Vector<T> implements the std::vector interface, and can get Data or
3031
// MutableData from any place. The data will be synced implicitly inside.
3132
template <typename T>
@@ -37,11 +38,11 @@ class Vector {
3738
Vector() { InitEmpty(); }
3839

3940
// Fill vector with value. The vector size is `count`.
40-
explicit Vector(size_t count, const T& value = T()) {
41+
explicit Vector(size_t count, const T &value = T()) {
4142
InitEmpty();
4243
if (count != 0) {
4344
resize(count);
44-
T* ptr = begin();
45+
T *ptr = begin();
4546
for (size_t i = 0; i < count; ++i) {
4647
ptr[i] = value;
4748
}
@@ -59,7 +60,7 @@ class Vector {
5960

6061
// implicit cast from std::vector.
6162
template <typename U>
62-
Vector(const std::vector<U>& dat) { // NOLINT
63+
Vector(const std::vector<U> &dat) { // NOLINT
6364
if (dat.size() == 0) {
6465
InitEmpty();
6566
} else {
@@ -68,10 +69,10 @@ class Vector {
6869
}
6970

7071
// Copy ctor
71-
Vector(const Vector<T>& other) { this->operator=(other); }
72+
Vector(const Vector<T> &other) { this->operator=(other); }
7273

7374
// Copy operator
74-
Vector<T>& operator=(const Vector<T>& other) {
75+
Vector<T> &operator=(const Vector<T> &other) {
7576
if (other.size() != 0) {
7677
this->InitByIter(other.size(), other.begin(), other.end());
7778
} else {
@@ -81,7 +82,7 @@ class Vector {
8182
}
8283

8384
// Move ctor
84-
Vector(Vector<T>&& other) {
85+
Vector(Vector<T> &&other) {
8586
this->size_ = other.size_;
8687
this->flag_ = other.flag_;
8788
if (other.cuda_vec_.memory_size()) {
@@ -93,57 +94,57 @@ class Vector {
9394
}
9495

9596
// CPU data access method. Mutable.
96-
T& operator[](size_t i) {
97+
T &operator[](size_t i) {
9798
MutableCPU();
98-
return const_cast<T*>(cpu_vec_.data<T>())[i];
99+
return const_cast<T *>(cpu_vec_.data<T>())[i];
99100
}
100101

101102
// CPU data access method. Immutable.
102-
const T& operator[](size_t i) const {
103+
const T &operator[](size_t i) const {
103104
ImmutableCPU();
104105
return cpu_vec_.data<T>()[i];
105106
}
106107

107108
// std::vector iterator methods. Based on CPU data access method
108109
size_t size() const { return size_; }
109110

110-
T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
111+
T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
111112

112-
T* end() {
113+
T *end() {
113114
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
114115
}
115116

116-
T& front() { return *begin(); }
117+
T &front() { return *begin(); }
117118

118-
T& back() {
119+
T &back() {
119120
auto it = end();
120121
--it;
121122
return *it;
122123
}
123124

124-
const T* begin() const {
125+
const T *begin() const {
125126
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
126127
}
127128

128-
const T* end() const {
129+
const T *end() const {
129130
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
130131
}
131132

132-
const T* cbegin() const { return begin(); }
133+
const T *cbegin() const { return begin(); }
133134

134-
const T* cend() const { return end(); }
135+
const T *cend() const { return end(); }
135136

136-
const T& back() const {
137+
const T &back() const {
137138
auto it = end();
138139
--it;
139140
return *it;
140141
}
141142

142-
T* data() { return begin(); }
143+
T *data() { return begin(); }
143144

144-
const T* data() const { return begin(); }
145+
const T *data() const { return begin(); }
145146

146-
const T& front() const { return *begin(); }
147+
const T &front() const { return *begin(); }
147148
// end of std::vector iterator methods
148149

149150
// assign this from iterator.
@@ -169,7 +170,7 @@ class Vector {
169170
void Extend(It begin, It end) {
170171
size_t pre_size = size_;
171172
resize(pre_size + (end - begin));
172-
T* ptr = this->begin() + pre_size;
173+
T *ptr = this->begin() + pre_size;
173174
for (; begin < end; ++begin, ++ptr) {
174175
*ptr = *begin;
175176
}
@@ -183,9 +184,9 @@ class Vector {
183184
MutableCPU();
184185
Tensor cpu_tensor;
185186
platform::Place cpu = platform::CPUPlace();
186-
T* ptr = cpu_tensor.mutable_data<T>(
187+
T *ptr = cpu_tensor.mutable_data<T>(
187188
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
188-
const T* old_ptr =
189+
const T *old_ptr =
189190
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
190191
if (old_ptr != nullptr) {
191192
std::copy(old_ptr, old_ptr + size_, ptr);
@@ -196,18 +197,18 @@ class Vector {
196197
}
197198

198199
// get cuda ptr. immutable
199-
const T* CUDAData(platform::Place place) const {
200+
const T *CUDAData(platform::Place place) const {
200201
PADDLE_ENFORCE(platform::is_gpu_place(place),
201202
"CUDA Data must on CUDA place");
202203
ImmutableCUDA(place);
203204
return cuda_vec_.data<T>();
204205
}
205206

206207
// get cuda ptr. mutable
207-
T* CUDAMutableData(platform::Place place) {
208-
const T* ptr = CUDAData(place);
208+
T *CUDAMutableData(platform::Place place) {
209+
const T *ptr = CUDAData(place);
209210
flag_ = kDirty | kDataInCUDA;
210-
return const_cast<T*>(ptr);
211+
return const_cast<T *>(ptr);
211212
}
212213

213214
// clear
@@ -228,7 +229,7 @@ class Vector {
228229
}
229230

230231
// the unified method to access CPU or CUDA data. immutable.
231-
const T* Data(platform::Place place) const {
232+
const T *Data(platform::Place place) const {
232233
if (platform::is_gpu_place(place)) {
233234
return CUDAData(place);
234235
} else {
@@ -237,7 +238,7 @@ class Vector {
237238
}
238239

239240
// the unified method to access CPU or CUDA data. mutable.
240-
T* MutableData(platform::Place place) {
241+
T *MutableData(platform::Place place) {
241242
if (platform::is_gpu_place(place)) {
242243
return CUDAMutableData(place);
243244
} else {
@@ -253,7 +254,7 @@ class Vector {
253254
return result;
254255
}
255256

256-
bool operator==(const Vector<T>& other) const {
257+
bool operator==(const Vector<T> &other) const {
257258
if (size() != other.size()) return false;
258259
auto it1 = cbegin();
259260
auto it2 = other.cbegin();
@@ -274,7 +275,7 @@ class Vector {
274275
template <typename Iter>
275276
void InitByIter(size_t size, Iter begin, Iter end) {
276277
platform::Place cpu = platform::CPUPlace();
277-
T* ptr = this->cpu_vec_.template mutable_data<T>(
278+
T *ptr = this->cpu_vec_.template mutable_data<T>(
278279
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
279280
for (size_t i = 0; i < size; ++i) {
280281
*ptr++ = *begin++;
@@ -368,7 +369,7 @@ class Vector {
368369
}
369370
}
370371

371-
static T& EmptyDummy() {
372+
static T &EmptyDummy() {
372373
static T dummy = T();
373374
return dummy;
374375
}
@@ -379,5 +380,53 @@ class Vector {
379380
size_t size_;
380381
};
381382

382-
} // namespace framework
383+
#else // PADDLE_WITH_CUDA
384+
385+
// CPUVector<T> is a thin wrapper over std::vector<T>; it is aliased as
// Vector<T> when PADDLE_WITH_CUDA is not defined. On top of the std::vector
// interface it provides Extend(), stream output and bounds-checked indexing.
template <typename T>
class CPUVector : public std::vector<T, std::allocator<T>> {
 public:
  // Creates an empty vector.
  CPUVector() : std::vector<T>() {}

  // Creates a vector holding `count` copies of `value`.
  CPUVector(size_t count, const T &value = T())
      : std::vector<T>(count, value) {}

  // Creates a vector from a braced list of elements.
  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}

  // Implicit conversion from std::vector (copies the elements).
  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}

  // Copy ctor. Deliberately NOT explicit: an explicit copy constructor is
  // skipped during copy-initialization (`CPUVector<T> b = a;`, pass by value),
  // which then silently falls back to the std::vector converting constructor.
  CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}

  // Move ctors. noexcept so that containers of CPUVector move elements instead
  // of copying them when they reallocate.
  CPUVector(CPUVector<T> &&other) noexcept : std::vector<T>(std::move(other)) {}
  CPUVector(std::vector<T> &&other) noexcept
      : std::vector<T>(std::move(other)) {}

  // Copy assignment. Delegates to std::vector::operator=, which is safe for
  // self-assignment; assign(other.begin(), other.end()) is not, because the
  // iterators would point into *this.
  CPUVector &operator=(const CPUVector &other) {
    std::vector<T>::operator=(other);
    return *this;
  }

  // Assignment from a plain std::vector. Also self-assignment safe when
  // `other` is the base sub-object of *this.
  CPUVector &operator=(const std::vector<T> &other) {
    std::vector<T>::operator=(other);
    return *this;
  }

  // Writes every element followed by a single space to `os`.
  friend std::ostream &operator<<(std::ostream &os, const CPUVector<T> &other) {
    for (const auto &v : other) {
      os << v << " ";
    }
    return os;
  }

  // Resizes to `size` elements. Must name the base class explicitly:
  // `this->resize(size)` resolves to this very function and never returns.
  void resize(size_t size) { std::vector<T>::resize(size); }

  // Bounds-checked element access; throws std::out_of_range when
  // `id >= size()`.
  T &operator[](size_t id) { return this->at(id); }

  const T &operator[](size_t id) const { return this->at(id); }

  // Appends the range [begin, end) to the end of this vector. `D` must support
  // `end - begin`.
  template <typename D>
  void Extend(const D &begin, const D &end) {
    this->reserve(this->size() + static_cast<size_t>(end - begin));
    this->insert(this->end(), begin, end);
  }
};
425+
426+
template <typename T>
427+
using Vector = CPUVector<T>;
428+
429+
#endif // PADDLE_WITH_CUDA
430+
431+
}; // namespace framework
383432
} // namespace paddle

paddle/fluid/operators/adam_op.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,11 +293,18 @@ class AdamOpKernel : public framework::OpKernel<T> {
293293
auto& grad_tensor = grad_merge.value();
294294
const T* grad_data = grad_tensor.template data<T>();
295295
int64_t* rows = nullptr;
296+
// When compiled without CUDA, the CUDAMutableData() interface should not be
297+
// provided.
298+
#if defined(PADDLE_WITH_CUDA)
296299
if (platform::is_gpu_place(ctx.GetPlace())) {
297300
rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
298301
} else {
302+
#endif
299303
rows = grad_merge.mutable_rows()->data();
304+
305+
#if defined(PADDLE_WITH_CUDA)
300306
}
307+
#endif
301308
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
302309

303310
SparseAdamFunctor<T> functor(

paddle/fluid/operators/detection/target_assign_op.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
106106
int64_t k = x->dims()[2];
107107

108108
auto x_lod = x->lod().back();
109+
#if defined(PADDLE_WITH_CUDA)
109110
size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
111+
#else
112+
size_t* x_lod_data = x_lod.data();
113+
#endif
110114

111115
TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
112116
mismatch_value, n, m, p, k, out_data,
@@ -121,7 +125,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
121125
PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
122126
const int* neg_idx_data = neg_indices->data<int>();
123127
auto neg_lod = neg_indices->lod().back();
128+
#if defined(PADDLE_WITH_CUDA)
124129
size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
130+
#else
131+
size_t* neg_lod_data = neg_lod.data();
132+
#endif
125133
NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
126134
neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
127135
mismatch_value, out_data, out_wt_data);

paddle/fluid/operators/math/sequence2batch.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ class LoDTensor2BatchFunctor {
7878
auto lods = lod_tensor.lod();
7979
PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
8080

81-
auto lod = lods[0];
81+
const auto& lod = lods[0];
8282

8383
std::vector<SeqInfo> seq_info;
8484
for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {

0 commit comments

Comments
 (0)