@@ -17,6 +17,7 @@
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
+#include <mutex>  // NOLINT
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/details/cow_ptr.h"
@@ -51,6 +52,7 @@ struct CUDABuffer {
     ClearMemory();
     place_ = boost::get<platform::CUDAPlace>(place);
     data_ = memory::Alloc(place_, size);
+    PADDLE_ENFORCE_NOT_NULL(data_);
     size_ = size;
   }
 
@@ -62,7 +64,7 @@ struct CUDABuffer {
 
  private:
   void ClearMemory() const {
-    if (data_) {
+    if (data_ != nullptr) {
       memory::Free(place_, data_);
     }
   }
@@ -89,6 +91,7 @@ class Vector {
     template <typename U>
     explicit VectorData(const std::vector<U> &dat)
         : cpu_(dat), flag_(kDataInCPU) {}
+    ~VectorData() {}
 
     VectorData(const VectorData &o) {
       o.ImmutableCPU();
@@ -215,7 +218,7 @@ class Vector {
     size_t capacity() const { return cpu_.capacity(); }
 
     // reserve data
-    void reserve(size_t size) { cpu_.reserve(size); }
+    void reserve(size_t size) const { cpu_.reserve(size); }
 
     // implicit cast operator. Vector can be cast to std::vector implicitly.
     operator std::vector<T>() const {
@@ -229,6 +232,17 @@ class Vector {
       return cpu_ == other.cpu_;
     }
 
+    std::mutex &Mutex() const { return mtx_; }
+
+    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
+      if (gpu_.data_ == nullptr) {
+        return nullptr;
+      } else {
+        return std::unique_ptr<platform::CUDAPlace>(
+            new platform::CUDAPlace(gpu_.place_));
+      }
+    }
+
    private:
     enum DataFlag {
       kDataInCPU = 0x01,
@@ -239,10 +253,15 @@ class Vector {
 
     void CopyToCPU() const {
       // COPY GPU Data To CPU
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(
+              platform::Place(gpu_.place_)));
+      auto stream = dev_ctx->stream();
       void *src = gpu_.data_;
       void *dst = cpu_.data();
       memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
-                   nullptr);
+                   stream);
+      dev_ctx->Wait();
     }
 
     void MutableCPU() {
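The hunk above changes `CopyToCPU()` to issue the device-to-host copy on the device context's stream and then block with `dev_ctx->Wait()`, rather than handing `memory::Copy` a null stream. A minimal sketch of the same enqueue-then-synchronize pattern against the raw CUDA runtime API (the wrapper name and error handling are illustrative, not Paddle's):

```cpp
#include <cassert>
#include <cuda_runtime.h>

// Copy `size` bytes from device to host on `stream`, then block until the
// copy has finished -- mirroring memory::Copy(..., stream) followed by
// dev_ctx->Wait() in the hunk above.
void CopyToHostSync(void *dst, const void *src, size_t size,
                    cudaStream_t stream) {
  // Enqueue on the given stream so the copy is ordered after the kernels
  // already queued there, instead of racing on the default stream.
  cudaError_t err =
      cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream);
  assert(err == cudaSuccess);
  // The host is about to read `dst`, so drain the stream first.
  err = cudaStreamSynchronize(stream);
  assert(err == cudaSuccess);
}
```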
@@ -260,7 +279,7 @@ class Vector {
         SetFlag(kDataInCUDA);
       } else if (IsInCUDA() &&
                  !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-        CopyCUDADataToAnotherPlace(place);
+        PADDLE_THROW("This situation should not happen");
         // Still dirty
       } else {
         // Dirty && DataInCUDA && Device is same
@@ -272,28 +291,21 @@ class Vector {
         CopyCPUDataToCUDA(place);
         SetFlag(kDataInCUDA);
       } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-        CopyCUDADataToAnotherPlace(place);
+        PADDLE_THROW("This situation should not happen.");
       } else {
         // Not Dirty && DataInCUDA && Device is same
         // Do nothing.
       }
     }
     }
-    void CopyCUDADataToAnotherPlace(const platform::Place &place) const {
-      details::CUDABuffer tmp(place, gpu_.size_);
-      const void *src = gpu_.data_;
-      void *dst = tmp.data_;
 
-      memory::Copy(tmp.place_, dst, gpu_.place_, src, gpu_.size_, nullptr);
-      gpu_.Swap(tmp);
-    }
     void CopyCPUDataToCUDA(const platform::Place &place) const {
       void *src = cpu_.data();
       gpu_.Resize(place, cpu_.size() * sizeof(T));
       void *dst = gpu_.data_;
-      auto stream = static_cast<platform::CUDADeviceContext *>(
-                        platform::DeviceContextPool::Instance().Get(place))
-                        ->stream();
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place));
+      auto stream = dev_ctx->stream();
       memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
                    stream);
     }
@@ -319,6 +331,8 @@ class Vector {
     mutable std::vector<T> cpu_;
     mutable details::CUDABuffer gpu_;
     mutable int flag_;
+
+    mutable std::mutex mtx_;
   };
 
  public:
@@ -350,81 +364,103 @@ class Vector {
   Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
 
   // CPU data access method. Mutable.
-  T &operator[](size_t i) { return (*m_)[i]; }
+  T &operator[](size_t i) { return (*m_.MutableData())[i]; }
 
   // CPU data access method. Immutable.
-  const T &operator[](size_t i) const { return (*m_)[i]; }
+  const T &operator[](size_t i) const { return m_.Data()[i]; }
 
   // std::vector iterator methods. Based on CPU data access method
-  size_t size() const { return m_->size(); }
+  size_t size() const { return m_.Data().size(); }
 
-  iterator begin() { return m_->begin(); }
+  iterator begin() { return m_.MutableData()->begin(); }
 
-  iterator end() { return m_->end(); }
+  iterator end() { return m_.MutableData()->end(); }
 
-  T &front() { return m_->front(); }
+  T &front() { return m_.MutableData()->front(); }
 
-  T &back() { return m_->back(); }
+  T &back() { return m_.MutableData()->back(); }
 
-  const_iterator begin() const { return m_->begin(); }
+  const_iterator begin() const { return m_.Data().begin(); }
 
-  const_iterator end() const { return m_->end(); }
+  const_iterator end() const { return m_.Data().end(); }
 
   const_iterator cbegin() const { return begin(); }
 
   const_iterator cend() const { return end(); }
 
-  const T &back() const { return m_->back(); }
+  const T &back() const { return m_.Data().back(); }
 
-  T *data() { return m_->data(); }
+  T *data() { return m_.MutableData()->data(); }
 
-  const T *data() const { return m_->data(); }
+  const T *data() const { return m_.Data().data(); }
 
-  const T &front() const { return m_->front(); }
+  const T &front() const { return m_.Data().front(); }
   // end of std::vector iterator methods
 
   // assign this from iterator.
   // NOTE: the iterator must support `end-begin`
   template <typename Iter>
   void assign(Iter begin, Iter end) {
-    m_->assign(begin, end);
+    m_.MutableData()->assign(begin, end);
   }
 
   // push_back. If the previous capacity is not enough, the memory will
   // double.
-  void push_back(T elem) { m_->push_back(elem); }
+  void push_back(T elem) { m_.MutableData()->push_back(elem); }
 
   // extend a vector by iterator.
   // NOTE: the iterator must support end-begin
   template <typename It>
   void Extend(It begin, It end) {
-    m_->Extend(begin, end);
+    m_.MutableData()->Extend(begin, end);
   }
 
   // resize the vector
   void resize(size_t size) {
     if (m_.Data().size() != size) {
-      m_->resize(size);
+      m_.MutableData()->resize(size);
     }
   }
 
   // get cuda ptr. immutable
   const T *CUDAData(platform::Place place) const {
-    return m_.Data().CUDAData(place);
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.Data().CUDAData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place, detach manually.
+    m_.Detach();
+    return CUDAData(place);
   }
 
   // get cuda ptr. mutable
   T *CUDAMutableData(platform::Place place) {
-    return m_->CUDAMutableData(place);
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.MutableData()->CUDAMutableData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place, detach manually.
+    m_.Detach();
+    return CUDAMutableData(place);
   }
 
   // clear
-  void clear() { m_->clear(); }
+  void clear() { m_.MutableData()->clear(); }
 
-  size_t capacity() const { return m_->capacity(); }
+  size_t capacity() const { return m_.Data().capacity(); }
 
   // reserve data
-  void reserve(size_t size) { m_->reserve(size); }
+  void reserve(size_t size) { m_.Data().reserve(size); }
 
   // the unify method to access CPU or CUDA data. immutable.
   const T *Data(platform::Place place) const {
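`CUDAData()` and `CUDAMutableData()` now follow a check-under-lock, detach-and-retry shape: while holding the shared `VectorData`'s mutex they test whether the cached GPU buffer is absent or already on the requested place, and only on a mismatch do they release the lock, split this `Vector` off the shared state with `m_.Detach()`, and recurse. The sketch below reproduces that control flow with a toy copy-on-write handle; `CowPtr`, `Payload`, and `EnsureOnDevice` are illustrative stand-ins, not Paddle's `COWPtr`/`VectorData`:

```cpp
#include <iostream>
#include <memory>
#include <mutex>

// Toy copy-on-write handle; a stand-in for details::COWPtr<VectorData>.
template <typename T>
class CowPtr {
 public:
  CowPtr() : p_(std::make_shared<T>()) {}
  const T &Data() const { return *p_; }
  T *MutableData() {
    Detach();
    return p_.get();
  }
  // Simplified: always split off a private copy (the real COWPtr would
  // only copy when the payload is shared).
  void Detach() { p_ = std::make_shared<T>(*p_); }

 private:
  std::shared_ptr<T> p_;
};

// Toy payload: `device` models the cached GPU buffer's place (-1 = none).
// Like VectorData's copy constructor, copying drops the device-side cache.
struct Payload {
  Payload() = default;
  Payload(const Payload &) {}  // fresh mutex, no cached device buffer
  int device = -1;
  std::mutex &Mutex() const { return mtx_; }

 private:
  mutable std::mutex mtx_;
};

// Same control flow as the new CUDAData(): check under the payload's
// mutex; on a device mismatch, unlock, detach, and try again.
int EnsureOnDevice(CowPtr<Payload> &m, int device) {
  for (;;) {
    {
      std::lock_guard<std::mutex> guard(m.Data().Mutex());
      int cur = m.Data().device;
      if (cur == -1 || cur == device) {
        // Fast path: nothing cached yet, or cached on the right device.
        return device;
      }
    }  // the lock must be released before detaching
    m.Detach();  // private copy; its cache is empty, so the retry succeeds
  }
}

int main() {
  CowPtr<Payload> a;
  a.MutableData()->device = 0;  // pretend the buffer was cached on GPU 0
  CowPtr<Payload> b = a;        // b shares a's payload
  std::cout << EnsureOnDevice(b, 1) << "\n";  // prints 1; a is untouched
}
```

The retry terminates because, as in `VectorData`'s copy constructor (which copies only the CPU side), the detached copy starts with no cached device buffer, so the second pass takes the fast path.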
@@ -445,7 +481,7 @@ class Vector {
   }
 
   // implicit cast operator. Vector can be cast to std::vector implicitly.
-  operator std::vector<T>() const { return *m_; }
+  operator std::vector<T>() const { return m_.Data(); }
 
   bool operator==(const Vector<T> &other) const {
     if (size() != other.size()) return false;
@@ -463,7 +499,7 @@ class Vector {
 
  private:
   // Vector is a COW object.
-  details::COWPtr<VectorData> m_;
+  mutable details::COWPtr<VectorData> m_;
 };
 
 #else  // PADDLE_WITH_CUDA
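Marking `m_` as `mutable` (together with the `mutable std::mutex mtx_` added to `VectorData` earlier in the diff) is the usual C++ idiom for caches behind a logically const interface: const accessors such as `CUDAData()` must still take the lock and migrate data between internal caches. A self-contained sketch of that idiom, with illustrative names:

```cpp
#include <cctype>
#include <mutex>
#include <string>

// The `mutable` members mirror VectorData's cpu_/gpu_/flag_/mtx_: a
// logically const accessor may still shuffle data between caches.
class CachedUpper {
 public:
  explicit CachedUpper(std::string s) : source_(std::move(s)) {}

  // Logically const: every call returns the same value, but the first
  // call populates the cache while holding the lock.
  const std::string &Upper() const {
    std::lock_guard<std::mutex> guard(mtx_);
    if (!cached_) {
      upper_ = source_;
      for (char &c : upper_) {
        c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
      }
      cached_ = true;
    }
    return upper_;
  }

 private:
  std::string source_;
  mutable std::string upper_;    // lazily filled cache
  mutable bool cached_ = false;  // like VectorData's flag_
  mutable std::mutex mtx_;       // guards the cache, like VectorData's mtx_
};
```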