Commit fcb4844

Author: weixing02 committed:

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into img
Fix img links errors

2 parents 4d1fe5c + f2c0b88, commit fcb4844


47 files changed: +1211, -197 lines

paddle/fluid/framework/details/multi_devices_graph_builder.cc

Lines changed: 8 additions & 0 deletions
@@ -21,6 +21,9 @@
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #endif
 
+#include <string>
+#include <vector>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -168,6 +171,11 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    */
   PolishGraphToSupportDataHazards(&result);
 
+  /*
+   * Only variables should be the leaves of graph.
+   */
+  AddOutputToLeafOps(&result);
+
   if (VLOG_IS_ON(10)) {
     std::ostringstream sout;
     PrintGraphviz(*graph, sout);

paddle/fluid/framework/details/ssa_graph_builder.cc

Lines changed: 11 additions & 0 deletions
@@ -136,6 +136,17 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
 
   sout << "}\n";
 }
+
+void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
+  for (auto &op : graph->ops_) {
+    if (!op->outputs_.empty()) {
+      continue;
+    }
+    auto *dummy_leaf = new DummyVarHandle();
+    graph->dep_vars_.emplace(dummy_leaf);
+    op->AddOutput(dummy_leaf);
+  }
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
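
Note (not part of the commit): AddOutputToLeafOps establishes that every op in the SSAGraph ends up with at least one output variable, so no op handle is a leaf of the dependency graph. A minimal sketch of checking that invariant; CheckNoLeafOps is a hypothetical helper, not a Paddle function.

// Illustrative only: verifies the post-condition of AddOutputToLeafOps.
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace details {

inline void CheckNoLeafOps(const SSAGraph &graph) {
  for (auto &op : graph.ops_) {
    // Every op should own at least a DummyVarHandle output, so the executor
    // always has a variable it can mark as ready once the op finishes.
    PADDLE_ENFORCE(!op->outputs_.empty(),
                   "op without outputs found after AddOutputToLeafOps");
  }
}

}  // namespace details
}  // namespace framework
}  // namespace paddle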

paddle/fluid/framework/details/ssa_graph_builder.h

Lines changed: 5 additions & 3 deletions
@@ -14,13 +14,13 @@
 
 #pragma once
 
+#include <memory>
+#include <string>
+
 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/place.h"
 
-#include <memory>
-#include <string>
-
 namespace paddle {
 namespace framework {
 namespace details {
@@ -52,6 +52,8 @@ class SSAGraphBuilder {
                                const std::string &each_var_name,
                                const platform::Place &place, size_t place_offset);
 
+  static void AddOutputToLeafOps(SSAGraph *graph);
+
   static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
 };
 }  // namespace details

paddle/fluid/framework/details/threaded_ssa_graph_executor.cc

Lines changed: 6 additions & 2 deletions
@@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
   // Step 2. Insert FetchOps
   std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
-  std::vector<DummyVarHandle> dummy_vars;
   FeedFetchList fetch_data(fetch_tensors.size());
 
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
@@ -101,20 +100,25 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     }
   }
 
+  std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
   for (size_t i = 0; i < fetch_tensors.size(); ++i) {
     auto &var_name = fetch_tensors[i];
     auto &vars = fetched_vars.at(var_name);
     auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_);
     fetch_ops.emplace_back(op);
 
-    // FIXME: Use new device context
     for (auto &p : places_) {
       op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
     }
 
     for (auto *var : vars) {
      op->AddInput(var);
    }
+
+    auto *fetch_dummy = new DummyVarHandle();
+    op->AddOutput(fetch_dummy);
+    fetch_dependencies.emplace(fetch_dummy);
+    InsertPendingVar(*fetch_dummy);
     InsertPendingOp(*op);
   }

paddle/fluid/framework/tensor.h

Lines changed: 10 additions & 22 deletions
@@ -45,11 +45,10 @@ class Tensor {
   friend struct EigenVector;
 
  public:
-  Tensor() : offset_(0), is_pinned_(false) {}
+  Tensor() : offset_(0) {}
 
   /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place)
-      : offset_(0), is_pinned_(false) {
+  explicit Tensor(const platform::Place& place) : offset_(0) {
     holder_->set_place(place);
   }
 
@@ -70,12 +69,11 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(platform::Place place, bool is_pinned = false);
+  inline T* mutable_data(platform::Place place);
 
-  inline void* mutable_data(platform::Place place, std::type_index type,
-                            bool is_pinned = false);
+  inline void* mutable_data(platform::Place place, std::type_index type);
 
-  inline void* mutable_data(platform::Place place, bool is_pinned = false);
+  inline void* mutable_data(platform::Place place);
 
   /**
    * @brief Return a pointer to mutable memory block.
@@ -86,18 +84,14 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place,
-                         bool is_pinned = false);
+  inline T* mutable_data(DDim dims, platform::Place place);
 
   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;
 
   /*! Return the numel of the memory block. */
   inline int64_t numel() const;
 
-  /*! Return the numel of the memory block. */
-  inline bool isPinned() const;
-
   /*! Resize the dimensions of the memory block. */
   inline Tensor& Resize(const DDim& dims);
 
@@ -152,14 +146,12 @@ class Tensor {
 
   template <typename Place>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type,
-                    bool is_pinned = false)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
-               memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+               memory::PODDeleter<uint8_t, Place>(place)),
          place_(place),
          size_(size),
-          type_(type),
-          is_pinned_(is_pinned) {
+          type_(type) {
      PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                              (is_cpu_place(place_) ? "CPU" : "GPU"));
    }
@@ -182,9 +174,6 @@ class Tensor {
 
     /* the current type of memory */
     std::type_index type_;
-
-    /*! use pinned memory or not. */
-    bool is_pinned_;
   };
 
   /*! holds the memory block if allocated. */
@@ -219,7 +208,6 @@ class Tensor {
    * PlaceHolder::ptr_ and where the tensor data really begins.
    */
   size_t offset_;
-  bool is_pinned_;
 };
 
 inline void Tensor::switch_place(platform::Place new_place) {
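
Usage note (not part of the diff): after this change, callers pick memory purely by Place; the is_pinned flag is gone and pinned host memory is handled in the allocator layer. A minimal sketch below; ExampleAlloc and the shape values are illustrative only.

// Illustrative only: the simplified Tensor::mutable_data API.
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"

void ExampleAlloc() {
  paddle::framework::Tensor t;
  // Allocate a 2 x 3 float block on the CPU; no is_pinned argument anymore.
  float* ptr = t.mutable_data<float>(paddle::framework::make_ddim({2, 3}),
                                     paddle::platform::CPUPlace());
  ptr[0] = 1.0f;  // plain host memory, usable immediately
}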

paddle/fluid/framework/tensor_impl.h

Lines changed: 9 additions & 14 deletions
@@ -101,21 +101,19 @@ inline T* Tensor::data() {
 }
 
 template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place,
-                               bool is_pinned) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   Resize(dims);
-  return mutable_data<T>(place, is_pinned);
+  return mutable_data<T>(place);
 }
 
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline T* Tensor::mutable_data(platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }
 
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
-                                  bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
   if (holder_ != nullptr) {
     holder_->set_type(type);
   }
@@ -129,27 +127,26 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
       holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type, is_pinned));
+          boost::get<platform::CPUPlace>(place), size, type));
     } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
     }
 #else
       holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
+          boost::get<platform::CUDAPlace>(place), size, type));
     }
 #endif
     offset_ = 0;
-    is_pinned_ = is_pinned;
   }
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
 }
 
-inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place) {
   PADDLE_ENFORCE(this->holder_ != nullptr,
                  "Cannot invoke mutable data if current hold nothing");
-  return mutable_data(place, holder_->type(), is_pinned);
+  return mutable_data(place, holder_->type());
 }
 
 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -191,8 +188,6 @@ inline const DDim& Tensor::dims() const { return dims_; }
 
 inline int64_t Tensor::numel() const { return product(dims_); }
 
-inline bool Tensor::isPinned() const { return is_pinned_; }
-
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
   res.ShareDataWith(src);

paddle/fluid/framework/tensor_util.cc

Lines changed: 5 additions & 0 deletions
@@ -148,6 +148,11 @@ struct AnyVisitor : public boost::static_visitor<bool> {
                  const platform::CPUPlace& cpu) const {
     return *out.data<bool>();
   }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CUDAPinnedPlace& cpu) const {
+    return *out.data<bool>();
+  }
 };
 
 template <typename Predicate>

paddle/fluid/memory/CMakeLists.txt

Lines changed: 12 additions & 8 deletions
@@ -4,13 +4,17 @@ cc_library(memory SRCS memory.cc DEPS place enforce)
 cc_library(memcpy SRCS memcpy.cc DEPS place)
 
 cc_library(paddle_memory
-        DEPS
-        memory
-        memcpy
-        meta_data
-        meta_cache
-        memory_block
-        buddy_allocator
-        system_allocator)
+    DEPS
+    memory
+    memcpy
+    meta_data
+    meta_cache
+    memory_block
+    buddy_allocator
+    system_allocator)
 
 cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
+
+#if (WITH_GPU)
+#  nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place paddle_memory)
+#endif()

paddle/fluid/memory/detail/system_allocator.cc

Lines changed: 21 additions & 10 deletions
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
@@ -134,21 +135,31 @@ bool GPUAllocator::UseGpu() const { return true; }
 // memory. It’s locked to a physical address.
 void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
   if (size <= 0) return nullptr;
-  void* p;
-  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+
+  // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
   // of host pinned allocation. Allocates too much would reduce
   // the amount of memory available to the underlying system for paging.
+  size_t usable =
+      paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
 
-  size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
-
-  if (size > usable) return nullptr;
+  if (size > usable) {
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB pinned memory."
+                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
+    return nullptr;
+  }
 
+  void* p;
   // PINNED memory is visible to all CUDA contexts.
   cudaError_t result = cudaMallocHost(&p, size);
+
   if (result == cudaSuccess) {
-    index = 1;
-    fallback_alloc_size_ += size;
+    index = 1;  // PINNED memory
+    cuda_pinnd_alloc_size_ += size;
     return p;
+  } else {
+    LOG(WARNING) << "cudaMallocHost failed.";
+    return nullptr;
   }
 
   return nullptr;
@@ -158,8 +169,8 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   cudaError_t err;
   PADDLE_ASSERT(index == 1);
 
-  PADDLE_ASSERT(fallback_alloc_size_ >= size);
-  fallback_alloc_size_ -= size;
+  PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
+  cuda_pinnd_alloc_size_ -= size;
   err = cudaFreeHost(p);
 
   // Purposefully allow cudaErrorCudartUnloading, because
@@ -172,7 +183,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   }
 }
 
-bool CUDAPinnedAllocator::UseGpu() const { return true; }
+bool CUDAPinnedAllocator::UseGpu() const { return false; }
 
 #endif
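
Note (not part of the commit): a minimal sketch of the Alloc/Free contract shown above, assuming a CUDA build. Index 1 tags pinned host memory, and the allocator keeps its running total under CUDAPinnedMaxAllocSize(); PinnedRoundTrip is an illustrative name.

// Illustrative only: exercises CUDAPinnedAllocator as changed in this commit.
#include "paddle/fluid/memory/detail/system_allocator.h"

void PinnedRoundTrip() {
  paddle::memory::detail::CUDAPinnedAllocator allocator;
  size_t index = 0;
  const size_t bytes = 1 << 20;  // 1 MB of pinned host memory
  void* p = allocator.Alloc(index, bytes);  // nullptr if over the pinned cap
  if (p != nullptr) {
    // Alloc set index to 1; Free asserts on it and decrements the allocator's
    // pinned byte counter before calling cudaFreeHost.
    allocator.Free(p, bytes, index);
  }
}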

paddle/fluid/memory/detail/system_allocator.h

Lines changed: 4 additions & 5 deletions
@@ -21,8 +21,9 @@ namespace memory {
 namespace detail {
 
 /**
- * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
- *        A BuddyAllocator object uses a SystemAllocator* pointing to the
+ * \brief SystemAllocator is the parent class of CPUAllocator,
+ *        CUDAPinnedAllocator and GPUAllocator. A BuddyAllocator
+ *        object uses a SystemAllocator* pointing to the
  *        underlying system allocator.
  */
 class SystemAllocator {
@@ -62,9 +63,7 @@ class CUDAPinnedAllocator : public SystemAllocator {
   virtual bool UseGpu() const;
 
  private:
-  size_t gpu_alloc_size_ =
-      0;  // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
-  size_t fallback_alloc_size_ = 0;
+  size_t cuda_pinnd_alloc_size_ = 0;
 };
 #endif
