
Commit 0264ec3

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-async-listen-and-serv-op
2 parents 63bd38b + c02ba51 commit 0264ec3

29 files changed: +1247 −128 lines
Lines changed: 26 additions & 10 deletions
@@ -1,19 +1,35 @@
 Use different clusters
 ======================
 
-PaddlePaddle supports running jobs on several platforms including:
-- `Kubernetes <http://kubernetes.io>`_ open-source system for automating deployment, scaling, and management of containerized applications from Google.
-- `OpenMPI <https://www.open-mpi.org>`_ Mature high performance parallel computing framework.
-- `Fabric <http://www.fabfile.org>`_ A cluster management tool. Write scripts to submit jobs or manage the cluster.
+Users' cluster environments differ, so we provide several deployment methods for submitting cluster training jobs. They are introduced below.
 
-We'll introduce cluster job management on these platforms. The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
+`Kubernetes <http://kubernetes.io>`_ is Google's open-source container cluster scheduling framework and offers a complete cluster solution for large-scale production environments. The following guides describe PaddlePaddle's support for Kubernetes:
 
-These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
+.. toctree::
+  :maxdepth: 1
+
+  k8s_cn.md
+  k8s_distributed_cn.md
+
+`OpenMPI <https://www.open-mpi.org>`_ is a mature high-performance parallel computing framework that is widely used in HPC. The following guide describes how to use OpenMPI to build a PaddlePaddle cluster training job:
 
 .. toctree::
   :maxdepth: 1
 
-  fabric_en.md
-  openmpi_en.md
-  k8s_en.md
-  k8s_aws_en.md
+  openmpi_cn.md
+
+`Fabric <http://www.fabfile.org>`_ is a convenient tool for program deployment and management. We provide a way to deploy and manage the cluster with Fabric. To learn more, please read the following guide:
+
+.. toctree::
+  :maxdepth: 1
+
+  fabric_cn.md
+
+We also support deploying PaddlePaddle on AWS. Learn more in:
+
+.. toctree::
+  :maxdepth: 1
+
+  k8s_aws_cn.md
+
+The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .

paddle/fluid/framework/executor.cc

Lines changed: 13 additions & 13 deletions
@@ -226,15 +226,15 @@ static bool has_fetch_operators(
 }
 
 void Executor::Run(const ProgramDesc& program, Scope* scope,
-                   std::map<std::string, const LoDTensor*>& feed_targets,
-                   std::map<std::string, LoDTensor*>& fetch_targets,
+                   std::map<std::string, const LoDTensor*>* feed_targets,
+                   std::map<std::string, LoDTensor*>* fetch_targets,
                    bool create_vars, const std::string& feed_holder_name,
                    const std::string& fetch_holder_name) {
   platform::RecordBlock b(kProgramId);
   bool has_feed_ops =
-      has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
+      has_feed_operators(program.Block(0), *feed_targets, feed_holder_name);
   bool has_fetch_ops =
-      has_fetch_operators(program.Block(0), fetch_targets, fetch_holder_name);
+      has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name);
 
   ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
   if (!has_feed_ops || !has_fetch_ops) {
@@ -250,7 +250,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     feed_holder->SetPersistable(true);
 
     int i = 0;
-    for (auto& feed_target : feed_targets) {
+    for (auto& feed_target : (*feed_targets)) {
       std::string var_name = feed_target.first;
       VLOG(3) << "feed target's name: " << var_name;
 
@@ -273,7 +273,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     fetch_holder->SetPersistable(true);
 
     int i = 0;
-    for (auto& fetch_target : fetch_targets) {
+    for (auto& fetch_target : (*fetch_targets)) {
       std::string var_name = fetch_target.first;
       VLOG(3) << "fetch target's name: " << var_name;
 
@@ -361,25 +361,25 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 
 void Executor::RunPreparedContext(
     ExecutorPrepareContext* ctx, Scope* scope,
-    std::map<std::string, const LoDTensor*>& feed_targets,
-    std::map<std::string, LoDTensor*>& fetch_targets, bool create_vars,
+    std::map<std::string, const LoDTensor*>* feed_targets,
+    std::map<std::string, LoDTensor*>* fetch_targets, bool create_vars,
    const std::string& feed_holder_name, const std::string& fetch_holder_name) {
   auto& global_block = ctx->prog_.Block(ctx->block_id_);
 
   PADDLE_ENFORCE(
-      has_feed_operators(global_block, feed_targets, feed_holder_name),
+      has_feed_operators(global_block, *feed_targets, feed_holder_name),
       "Program in ExecutorPrepareContext should has feed_ops.");
   PADDLE_ENFORCE(
-      has_fetch_operators(global_block, fetch_targets, fetch_holder_name),
+      has_fetch_operators(global_block, *fetch_targets, fetch_holder_name),
       "Program in the prepared context should has fetch_ops.");
 
   // map the data of feed_targets to feed_holder
   for (auto* op : global_block.AllOps()) {
     if (op->Type() == kFeedOpType) {
       std::string feed_target_name = op->Output("Out")[0];
       int idx = boost::get<int>(op->GetAttr("col"));
-      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
-                      idx);
+      SetFeedVariable(scope, *(*feed_targets)[feed_target_name],
+                      feed_holder_name, idx);
     }
   }
 
@@ -390,7 +390,7 @@ void Executor::RunPreparedContext(
     if (op->Type() == kFetchOpType) {
       std::string fetch_target_name = op->Input("X")[0];
       int idx = boost::get<int>(op->GetAttr("col"));
-      *fetch_targets[fetch_target_name] =
+      *(*fetch_targets)[fetch_target_name] =
           GetFetchVariable(*scope, fetch_holder_name, idx);
     }
   }

paddle/fluid/framework/executor.h

Lines changed: 4 additions & 4 deletions
@@ -55,8 +55,8 @@ class Executor {
            bool create_local_scope = true, bool create_vars = true);
 
   void Run(const ProgramDesc& program, Scope* scope,
-           std::map<std::string, const LoDTensor*>& feed_targets,
-           std::map<std::string, LoDTensor*>& fetch_targets,
+           std::map<std::string, const LoDTensor*>* feed_targets,
+           std::map<std::string, LoDTensor*>* fetch_targets,
            bool create_vars = true,
            const std::string& feed_holder_name = "feed",
            const std::string& fetch_holder_name = "fetch");
@@ -74,8 +74,8 @@ class Executor {
                           bool create_vars = true);
 
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
-                          std::map<std::string, const LoDTensor*>& feed_targets,
-                          std::map<std::string, LoDTensor*>& fetch_targets,
+                          std::map<std::string, const LoDTensor*>* feed_targets,
+                          std::map<std::string, LoDTensor*>* fetch_targets,
                           bool create_vars = true,
                           const std::string& feed_holder_name = "feed",
                           const std::string& fetch_holder_name = "fetch");
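
For reference, callers of the updated API now pass the feed/fetch maps by pointer rather than by non-const reference. Below is a minimal caller sketch, assuming a Paddle fluid build; the function name RunOnce and the target names "x" and "y" are illustrative, not part of this commit.

// Sketch of a caller adapted to the new pointer-based Executor::Run signature.
// RunOnce, "x", and "y" are hypothetical; only the Executor API is from this diff.
#include <map>
#include <string>

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"

void RunOnce(const paddle::framework::ProgramDesc& program,
             paddle::framework::Scope* scope,
             const paddle::framework::LoDTensor& input,
             paddle::framework::LoDTensor* output) {
  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
  feed_targets["x"] = &input;   // variable fed through the "feed" holder
  fetch_targets["y"] = output;  // variable read back from the "fetch" holder

  paddle::framework::Executor executor(paddle::platform::CPUPlace());
  // The maps are now passed by address instead of by reference.
  executor.Run(program, scope, &feed_targets, &fetch_targets);
}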

paddle/fluid/inference/engine.h

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/framework.pb.h"
+
+namespace paddle {
+namespace inference {
+
+/*
+ * EngineBase is the base class of all inference engines. An inference engine
+ * takes a paddle program as input and outputs the result in fluid Tensor
+ * format. It can be used to optimize the performance of computation
+ * sub-blocks, for example, by breaking down the original block into
+ * sub-blocks and executing each sub-block in a different engine.
+ *
+ * For example:
+ * During inference, most of the ResNet50 model can be put into a subgraph
+ * and run on a TensorRT engine.
+ *
+ * There are several engines such as TensorRT and other frameworks, so
+ * EngineBase is put forward to give a unified interface for all the
+ * different engine implementations.
+ */
+class EngineBase {
+ public:
+  using DescType = ::paddle::framework::proto::BlockDesc;
+
+  // Build the model and do some preparation, for example, in TensorRT, run
+  // createInferBuilder, buildCudaEngine.
+  virtual void Build(const DescType& paddle_model) = 0;
+
+  // Execute the engine, that will run the inference network.
+  virtual void Execute(int batch_size) = 0;
+
+  virtual ~EngineBase() {}
+
+};  // class EngineBase
+
+}  // namespace inference
+}  // namespace paddle
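
As a quick illustration of the interface, a do-nothing engine could derive from EngineBase as sketched below. NaiveEngine is hypothetical and only exercises the two pure virtual methods; the real implementation added by this commit is the TensorRT engine further down.

// Minimal sketch of a concrete engine built on the EngineBase interface.
// NaiveEngine is hypothetical and exists only to show the override points.
#include <glog/logging.h>

#include "paddle/fluid/inference/engine.h"

namespace paddle {
namespace inference {

class NaiveEngine : public EngineBase {
 public:
  // Receive the BlockDesc of the sub-block this engine is responsible for.
  void Build(const DescType& paddle_model) override {
    LOG(INFO) << "building an engine for a block with "
              << paddle_model.ops_size() << " ops";
  }

  // Run the prepared network for one batch.
  void Execute(int batch_size) override {
    LOG(INFO) << "running inference with batch size " << batch_size;
  }
};

}  // namespace inference
}  // namespace paddle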
Lines changed: 4 additions & 1 deletion
@@ -1 +1,4 @@
-nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
+if(WITH_TESTING)
+  nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
+  nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
+endif()
Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/engine.h"
+
+#include <NvInfer.h>
+#include <cuda.h>
+#include <glog/logging.h>
+#include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+void TensorRTEngine::Build(const DescType& paddle_model) {
+  PADDLE_ENFORCE(false, "not implemented");
+}
+
+void TensorRTEngine::Execute(int batch_size) {
+  infer_context_->enqueue(batch_size, buffers_.data(), *stream_, nullptr);
+  cudaStreamSynchronize(*stream_);
+}
+
+TensorRTEngine::~TensorRTEngine() {
+  // clean buffer
+  for (auto& buffer : buffers_) {
+    if (buffer != nullptr) {
+      PADDLE_ENFORCE_EQ(0, cudaFree(buffer));
+      buffer = nullptr;
+    }
+  }
+}
+
+void TensorRTEngine::FreezeNetwork() {
+  PADDLE_ENFORCE(infer_builder_ != nullptr,
+                 "Call InitNetwork first to initialize network.");
+  PADDLE_ENFORCE(infer_network_ != nullptr,
+                 "Call InitNetwork first to initialize network.");
+  // build engine.
+  infer_builder_->setMaxBatchSize(max_batch_);
+  infer_builder_->setMaxWorkspaceSize(max_workspace_);
+
+  infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
+  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
+
+  infer_context_.reset(infer_engine_->createExecutionContext());
+
+  // allocate GPU buffers.
+  buffers_.resize(buffer_sizes_.size(), nullptr);
+  for (auto& item : buffer_sizes_) {
+    if (item.second == 0) {
+      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
+      item.second = kDataTypeSize[static_cast<int>(
+                        infer_engine_->getBindingDataType(slot_offset))] *
+                    AccumDims(infer_engine_->getBindingDimensions(slot_offset));
+    }
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buffer(item.first), item.second));
+  }
+}
+
+nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
+                                                nvinfer1::DataType dtype,
+                                                const nvinfer1::Dims& dim) {
+  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
+                    name);
+
+  PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
+  auto* input = infer_network_->addInput(name.c_str(), dtype, dim);
+  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
+
+  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * AccumDims(dim);
+  return input;
+}
+
+void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
+                                   const std::string& name) {
+  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
+                    name);
+
+  auto* output = layer->getOutput(offset);
+  PADDLE_ENFORCE(output != nullptr);
+  output->setName(name.c_str());
+  infer_network_->markOutput(*output);
+  // output buffer sizes can only be decided later; set zero here to mark this
+  // and it will be reset later.
+  buffer_sizes_[name] = 0;
+}
+
+void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
+  return buffer(name);
+}
+
+void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
+                                    size_t max_size) {
+  // determine data size
+  auto it = buffer_sizes_.find(name);
+  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  PADDLE_ENFORCE_GT(it->second, 0);
+  PADDLE_ENFORCE_GE(max_size, it->second);
+
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second,
+                                       cudaMemcpyDeviceToHost, *stream_));
+}
+
+void*& TensorRTEngine::buffer(const std::string& name) {
+  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
+  auto it = buffer_sizes_.find(name);
+  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
+  return buffers_[slot_offset];
+}
+
+void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
+                                     size_t size) {
+  void* buf = buffer(name);
+  PADDLE_ENFORCE_EQ(
+      0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
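
The call order implied by the implementation above is: construct the engine, initialize and populate the network, declare inputs and outputs, freeze the network, then feed, execute, and fetch. A rough lifecycle sketch follows. The constructor and InitNetwork() are not part of this diff, so their signatures here are assumptions, and the layer-building step is elided; read it as a call-order outline rather than a compilable test.

// Lifecycle outline for TensorRTEngine as implied by engine.cc above.
// Assumed: constructor (max_batch, max_workspace_bytes, stream) and an
// InitNetwork() that creates infer_builder_ / infer_network_.
#include <NvInfer.h>
#include <cuda_runtime.h>

#include "paddle/fluid/inference/tensorrt/engine.h"

void TensorRTEngineLifecycleSketch() {
  using paddle::inference::tensorrt::TensorRTEngine;

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Assumed constructor signature; not shown in this diff.
  TensorRTEngine engine(/*max_batch=*/1, /*max_workspace=*/1 << 20, &stream);
  engine.InitNetwork();  // assumed to set up the builder and the network

  // Declare an input slot; its buffer size is derived from dtype and dims.
  engine.DeclareInput("x", nvinfer1::DataType::kFLOAT,
                      nvinfer1::DimsCHW(1, 1, 1));

  // ... add TensorRT layers to the underlying network here, and register the
  //     final layer's output as "y" with engine.DeclareOutput(layer, 0, "y") ...

  engine.FreezeNetwork();  // builds the CUDA engine and allocates GPU buffers

  float input = 1.f, output = 0.f;
  engine.SetInputFromCPU("x", &input, sizeof(input));   // async H2D copy
  engine.Execute(/*batch_size=*/1);                     // enqueue + sync
  engine.GetOutputInCPU("y", &output, sizeof(output));  // async D2H copy
  cudaStreamSynchronize(stream);

  cudaStreamDestroy(stream);
}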

0 commit comments
