Skip to content

Commit f1a392a

Browse files
authored
Merge pull request #13804 from sneaxiy/rewrite_allocation
Rewrite allocation
2 parents fd7e643 + 98bbfc1 commit f1a392a

File tree

88 files changed

+3531
-874
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

88 files changed

+3531
-874
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ paddle/operators/tensor.save
44
python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
55
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
66
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
7+
paddle/fluid/operators/distributed/send_recv.proto
78
*.DS_Store
89
*.vs
910
build/
@@ -28,4 +29,5 @@ third_party/
2829
build_*
2930
# clion workspace.
3031
cmake-build-*
32+
paddle/fluid/operators/distributed/send_recv.proto
3133
model_test

paddle/fluid/framework/details/exception_holder.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class ExceptionHolder {
3030
Catch(exp);
3131
} catch (platform::EnforceNotMet exp) {
3232
Catch(exp);
33+
} catch (std::exception& ex) {
34+
LOG(FATAL) << "std::exception caught, " << ex.what();
3335
} catch (...) {
3436
LOG(FATAL) << "Unknown exception caught";
3537
}

paddle/fluid/framework/executor.cc

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -418,11 +418,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
418418
DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
419419
&(ctx->cur_ref_cnts_));
420420
}
421-
422-
if (FLAGS_benchmark) {
423-
VLOG(20) << "Memory used after operator " + op->Type() + " running: "
424-
<< memory::memory_usage(place_);
425-
}
426421
}
427422

428423
if (gc != nullptr) {
@@ -444,13 +439,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
444439
scope->DropKids();
445440
}
446441
}
447-
448-
if (FLAGS_benchmark) {
449-
VLOG(20) << "-------------------------------------------------------";
450-
VLOG(20) << "Memory used after deleting local scope: "
451-
<< memory::memory_usage(place_);
452-
VLOG(20) << "-------------------------------------------------------";
453-
}
454442
}
455443

456444
void Executor::RunPreparedContext(

paddle/fluid/framework/lod_tensor.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,6 @@ class LoDTensor : public Tensor {
111111
public:
112112
LoDTensor() : Tensor() {}
113113

114-
/* Constructor with place should only be used in pybind */
115-
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
116-
117114
explicit LoDTensor(const LoD& lod) : lod_(lod) {}
118115

119116
void set_lod(const LoD& lod) { lod_ = lod; }

paddle/fluid/framework/mixed_vector.h

Lines changed: 22 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "paddle/fluid/framework/details/cow_ptr.h"
2424
#include "paddle/fluid/framework/tensor.h"
2525
#include "paddle/fluid/framework/tensor_util.h"
26+
#include "paddle/fluid/memory/malloc.h"
2627
#include "paddle/fluid/memory/memcpy.h"
2728

2829
#include "glog/logging.h"
@@ -31,46 +32,6 @@ namespace paddle {
3132
namespace framework {
3233

3334
#if defined(PADDLE_WITH_CUDA)
34-
namespace details {
35-
struct CUDABuffer {
36-
void *data_{nullptr};
37-
size_t size_{0};
38-
platform::CUDAPlace place_;
39-
40-
CUDABuffer() {}
41-
CUDABuffer(platform::Place place, size_t size)
42-
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
43-
data_ = memory::Alloc(place_, size);
44-
}
45-
46-
~CUDABuffer() { ClearMemory(); }
47-
48-
CUDABuffer(const CUDABuffer &o) = delete;
49-
CUDABuffer &operator=(const CUDABuffer &o) = delete;
50-
51-
void Resize(platform::Place place, size_t size) {
52-
ClearMemory();
53-
place_ = boost::get<platform::CUDAPlace>(place);
54-
data_ = memory::Alloc(place_, size);
55-
PADDLE_ENFORCE_NOT_NULL(data_);
56-
size_ = size;
57-
}
58-
59-
void Swap(CUDABuffer &o) {
60-
std::swap(data_, o.data_);
61-
std::swap(place_, o.place_);
62-
std::swap(size_, o.size_);
63-
}
64-
65-
private:
66-
void ClearMemory() const {
67-
if (data_ != nullptr) {
68-
memory::Free(place_, data_);
69-
}
70-
}
71-
};
72-
} // namespace details
73-
7435
// Vector<T> implements the std::vector interface, and can get Data or
7536
// MutableData from any place. The data will be synced implicitly inside.
7637
template <typename T>
@@ -103,8 +64,6 @@ class Vector {
10364
o.ImmutableCPU();
10465
cpu_ = o.cpu_;
10566
flag_ = kDataInCPU;
106-
details::CUDABuffer null;
107-
gpu_.Swap(null);
10867
return *this;
10968
}
11069

@@ -199,7 +158,7 @@ class Vector {
199158
PADDLE_ENFORCE(platform::is_gpu_place(place),
200159
"CUDA Data must on CUDA place");
201160
ImmutableCUDA(place);
202-
return reinterpret_cast<T *>(gpu_.data_);
161+
return reinterpret_cast<T *>(gpu_->ptr());
203162
}
204163

205164
// get cuda ptr. mutable
@@ -234,13 +193,11 @@ class Vector {
234193

235194
std::mutex &Mutex() const { return mtx_; }
236195

237-
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
238-
if (gpu_.data_ == nullptr) {
239-
return nullptr;
240-
} else {
241-
return std::unique_ptr<platform::CUDAPlace>(
242-
new platform::CUDAPlace(gpu_.place_));
243-
}
196+
boost::optional<platform::CUDAPlace> CUDAPlace() const {
197+
return gpu_ == nullptr
198+
? boost::none
199+
: boost::optional<platform::CUDAPlace>(
200+
boost::get<platform::CUDAPlace>(gpu_->place()));
244201
}
245202

246203
private:
@@ -254,13 +211,12 @@ class Vector {
254211
void CopyToCPU() const {
255212
// COPY GPU Data To CPU
256213
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
257-
platform::DeviceContextPool::Instance().Get(
258-
platform::Place(gpu_.place_)));
214+
platform::DeviceContextPool::Instance().Get(gpu_->place()));
259215
auto stream = dev_ctx->stream();
260-
void *src = gpu_.data_;
216+
void *src = gpu_->ptr();
261217
void *dst = cpu_.data();
262-
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
263-
stream);
218+
memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
219+
gpu_->size(), stream);
264220
dev_ctx->Wait();
265221
}
266222

@@ -277,8 +233,7 @@ class Vector {
277233
CopyCPUDataToCUDA(place);
278234
UnsetFlag(kDirty);
279235
SetFlag(kDataInCUDA);
280-
} else if (IsInCUDA() &&
281-
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
236+
} else if (IsInCUDA() && !(place == gpu_->place())) {
282237
PADDLE_THROW("This situation should not happen");
283238
// Still dirty
284239
} else {
@@ -290,7 +245,7 @@ class Vector {
290245
// Even data is not dirty. However, data is not in CUDA. Copy data.
291246
CopyCPUDataToCUDA(place);
292247
SetFlag(kDataInCUDA);
293-
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
248+
} else if (!(place == gpu_->place())) {
294249
PADDLE_THROW("This situation should not happen.");
295250
} else {
296251
// Not Dirty && DataInCUDA && Device is same
@@ -301,13 +256,13 @@ class Vector {
301256

302257
void CopyCPUDataToCUDA(const platform::Place &place) const {
303258
void *src = cpu_.data();
304-
gpu_.Resize(place, cpu_.size() * sizeof(T));
305-
void *dst = gpu_.data_;
259+
gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T));
260+
void *dst = gpu_->ptr();
306261
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
307262
platform::DeviceContextPool::Instance().Get(place));
308263
auto stream = dev_ctx->stream();
309-
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
310-
stream);
264+
memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
265+
gpu_->size(), stream);
311266
}
312267

313268
void ImmutableCPU() const {
@@ -329,7 +284,7 @@ class Vector {
329284
bool IsInCPU() const { return flag_ & kDataInCPU; }
330285

331286
mutable std::vector<T> cpu_;
332-
mutable details::CUDABuffer gpu_;
287+
mutable memory::AllocationPtr gpu_;
333288
mutable int flag_;
334289

335290
mutable std::mutex mtx_;
@@ -428,8 +383,8 @@ class Vector {
428383
auto &mtx = m_.Data().Mutex();
429384
std::lock_guard<std::mutex> guard(mtx);
430385
auto cuda_place = m_.Data().CUDAPlace();
431-
if (cuda_place == nullptr ||
432-
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
386+
if (cuda_place == boost::none ||
387+
cuda_place == boost::get<platform::CUDAPlace>(place)) {
433388
return m_.Data().CUDAData(place);
434389
}
435390
}
@@ -444,8 +399,8 @@ class Vector {
444399
auto &mtx = m_.Data().Mutex();
445400
std::lock_guard<std::mutex> guard(mtx);
446401
auto cuda_place = m_.Data().CUDAPlace();
447-
if (cuda_place == nullptr ||
448-
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
402+
if (cuda_place == boost::none ||
403+
cuda_place == boost::get<platform::CUDAPlace>(place)) {
449404
return m_.MutableData()->CUDAMutableData(place);
450405
}
451406
}

paddle/fluid/framework/tensor.cc

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,9 @@ size_t Tensor::memory_size() const {
3232
}
3333

3434
void* Tensor::mutable_data(platform::Place place, std::type_index type,
35+
memory::Allocator::Attr attr,
3536
size_t requested_size) {
36-
if (holder_ != nullptr) {
37-
holder_->set_type(type);
38-
}
37+
type_ = type;
3938
PADDLE_ENFORCE_GE(numel(), 0,
4039
"When calling this method, the Tensor's numel must be "
4140
"equal or larger than zero. "
@@ -48,35 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
4847
/* some versions of boost::variant don't have operator!= */
4948
if (holder_ == nullptr || !(holder_->place() == place) ||
5049
holder_->size() < size + offset_) {
51-
if (platform::is_cpu_place(place)) {
52-
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
53-
boost::get<platform::CPUPlace>(place), size, type));
54-
} else if (platform::is_gpu_place(place) ||
55-
platform::is_cuda_pinned_place(place)) {
56-
#ifndef PADDLE_WITH_CUDA
57-
PADDLE_THROW(
58-
"CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
59-
}
60-
#else
61-
if (platform::is_gpu_place(place)) {
62-
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
63-
boost::get<platform::CUDAPlace>(place), size, type));
64-
} else if (platform::is_cuda_pinned_place(place)) {
65-
holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
66-
boost::get<platform::CUDAPinnedPlace>(place), size, type));
67-
}
68-
}
69-
#endif
50+
holder_ = memory::AllocShared(place, size, attr);
7051
offset_ = 0;
7152
}
7253
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
7354
offset_);
7455
}
7556

76-
void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
57+
void* Tensor::mutable_data(platform::Place place, memory::Allocator::Attr attr,
58+
size_t requested_size) {
7759
PADDLE_ENFORCE(this->holder_ != nullptr,
7860
"Cannot invoke mutable data if current hold nothing.");
79-
return mutable_data(place, holder_->type(), requested_size);
61+
return mutable_data(place, type_, attr, requested_size);
8062
}
8163

8264
Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -101,6 +83,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const {
10183
Tensor dst;
10284
dst.holder_ = holder_;
10385
dst.set_layout(layout_);
86+
dst.type_ = type_;
10487
DDim dst_dims = dims_;
10588
dst_dims[0] = end_idx - begin_idx;
10689
dst.Resize(dst_dims);

0 commit comments

Comments (0)