Skip to content

Commit a135fec

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into gen_nccl_id_op
2 parents 0f86397 + f43b71b commit a135fec

File tree

19 files changed

+435
-72
lines changed

19 files changed

+435
-72
lines changed

benchmark/cluster/vgg16/vgg16_fluid.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ def str2bool(v):
8080
type=str,
8181
default="",
8282
help="Comma-separated list of hostname:port pairs")
83+
parser.add_argument(
84+
"--profile", action='store_true', help="If set, profile a few steps.")
8385

8486
# Flags for defining the tf.train.Server
8587
parser.add_argument(
@@ -183,8 +185,8 @@ def train_loop(exe, trainer_prog):
183185
start_time = time.time()
184186
num_samples = 0
185187
train_pass_acc.reset()
186-
for batch_id, data in enumerate(train_reader()):
187-
ts = time.time()
188+
189+
def run_step(batch_id, data):
188190
img_data = np.array(
189191
map(lambda x: x[0].reshape(data_shape), data)).astype(
190192
"float32")
@@ -196,14 +198,28 @@ def train_loop(exe, trainer_prog):
196198
feed={"pixel": img_data,
197199
"label": y_data},
198200
fetch_list=[avg_cost, batch_acc, batch_size])
201+
return loss, acc, b_size
202+
203+
if args.profile and args.task_index == 0:
204+
# warmup.
205+
for batch_id, data in enumerate(train_reader()):
206+
if batch_id > 5: break
207+
run_step(batch_id, data)
208+
with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
209+
for batch_id, data in enumerate(train_reader()):
210+
if batch_id > 5: break
211+
run_step(batch_id, data)
212+
213+
for batch_id, data in enumerate(train_reader()):
214+
ts = time.time()
215+
loss, acc, b_size = run_step(batch_id, data)
199216
iters += 1
200217
num_samples += len(data)
201218
train_pass_acc.add(value=acc, weight=b_size)
202219
print(
203-
"Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
204-
"Speed = %.2f img/s " % (args.task_index, pass_id, iters,
205-
loss, acc,
206-
len(data) / (time.time() - ts))
220+
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
221+
"Speed = %.2f img/s" % (pass_id, iters, loss, acc,
222+
len(data) / (time.time() - ts))
207223
) # The accuracy is the accumulation of batches, but not the current batch.
208224

209225
pass_elapsed = time.time() - start_time
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
22
nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
3+
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
34
set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc)
45
add_subdirectory(convert)
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#include "paddle/fluid/inference/tensorrt/io_converter.h"
16+
#include <cuda.h>
17+
#include "paddle/fluid/platform/enforce.h"
18+
19+
namespace paddle {
20+
namespace inference {
21+
namespace tensorrt {
22+
23+
using platform::is_gpu_place;
24+
using platform::is_cpu_place;
25+
26+
class DefaultInputConverter : public EngineInputConverter {
27+
public:
28+
DefaultInputConverter() {}
29+
// NOTE out is GPU memory.
30+
virtual void operator()(const LoDTensor& in, void* out,
31+
size_t max_size) override {
32+
PADDLE_ENFORCE(out != nullptr);
33+
PADDLE_ENFORCE_LE(in.memory_size(), max_size);
34+
const auto& place = in.place();
35+
if (is_cpu_place(place)) {
36+
PADDLE_ENFORCE(stream_ != nullptr);
37+
PADDLE_ENFORCE_EQ(0,
38+
cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
39+
cudaMemcpyHostToDevice, *stream_));
40+
41+
} else if (is_gpu_place(place)) {
42+
PADDLE_ENFORCE_EQ(0,
43+
cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
44+
cudaMemcpyHostToHost, *stream_));
45+
46+
} else {
47+
PADDLE_THROW("Unknown device for converter");
48+
}
49+
cudaStreamSynchronize(*stream_);
50+
}
51+
};
52+
53+
REGISTER_TENSORRT_INPUT_CONVERTER(mul, DefaultInputConverter);
54+
55+
} // namespace tensorrt
56+
} // namespace inference
57+
} // namespace paddle
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#pragma once
16+
17+
#include <unordered_map>
18+
#include "paddle/fluid/framework/lod_tensor.h"
19+
#include "paddle/fluid/inference/utils/singleton.h"
20+
21+
namespace paddle {
22+
namespace inference {
23+
namespace tensorrt {
24+
25+
using framework::LoDTensor;
26+
27+
/*
28+
* Convert Input from Fluid to an Engine.
29+
* TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
30+
* most cases just need to copy the data.
31+
*/
32+
class EngineInputConverter {
33+
public:
34+
EngineInputConverter() {}
35+
36+
virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
37+
38+
void SetStream(cudaStream_t* stream) { stream_ = stream; }
39+
40+
static void Run(const std::string& in_op_type, const LoDTensor& in, void* out,
41+
size_t max_size, cudaStream_t* stream) {
42+
PADDLE_ENFORCE(stream != nullptr);
43+
auto* converter = Registry<EngineInputConverter>::Lookup(in_op_type);
44+
PADDLE_ENFORCE_NOT_NULL(converter);
45+
converter->SetStream(stream);
46+
(*converter)(in, out, max_size);
47+
}
48+
49+
virtual ~EngineInputConverter() {}
50+
51+
protected:
52+
cudaStream_t* stream_{nullptr};
53+
};
54+
55+
} // namespace tensorrt
56+
} // namespace inference
57+
} // namespace paddle
58+
59+
// Registers Converter__ as the input converter for op type in_op_type__.
//
// Expands to a helper struct whose constructor performs the registration,
// plus one object of that struct so the registration runs during static
// initialization of the translation unit that uses the macro.
//
// EngineInputConverter is spelled fully qualified so the macro expands
// correctly regardless of the namespace it is used in.
#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__)       \
  struct trt_input_##in_op_type__##_converter {                            \
    trt_input_##in_op_type__##_converter() {                               \
      ::paddle::inference::Registry<                                       \
          ::paddle::inference::tensorrt::EngineInputConverter>::           \
          Register<Converter__>(#in_op_type__);                            \
    }                                                                      \
  };                                                                       \
  trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#include "paddle/fluid/framework/lod_tensor.h"
16+
#include "paddle/fluid/inference/tensorrt/io_converter.h"
17+
18+
#include <gtest/gtest.h>
19+
20+
namespace paddle {
21+
namespace inference {
22+
namespace tensorrt {
23+
24+
class EngineInputConverterTester : public ::testing::Test {
25+
public:
26+
void SetUp() override { tensor.Resize({10, 10}); }
27+
28+
framework::LoDTensor tensor;
29+
};
30+
31+
// Runs the "mul" input converter on a tensor whose data lives on the CPU.
TEST_F(EngineInputConverterTester, DefaultCPU) {
  void* buffer = nullptr;
  tensor.mutable_data<float>(platform::CPUPlace());
  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);

  // The converter enqueues work on this stream and synchronizes it, so it
  // must be a created stream rather than an uninitialized handle.
  cudaStream_t stream;
  ASSERT_EQ(cudaStreamCreate(&stream), 0);
  EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
                            &stream);

  // Release the resources acquired by this test.
  ASSERT_EQ(cudaStreamDestroy(stream), 0);
  ASSERT_EQ(cudaFree(buffer), 0);
}
40+
41+
// Runs the "mul" input converter on a tensor whose data lives on the GPU.
TEST_F(EngineInputConverterTester, DefaultGPU) {
  void* buffer = nullptr;
  tensor.mutable_data<float>(platform::CUDAPlace());
  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);

  // The converter enqueues work on this stream and synchronizes it, so it
  // must be a created stream rather than an uninitialized handle.
  cudaStream_t stream;
  ASSERT_EQ(cudaStreamCreate(&stream), 0);
  EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
                            &stream);

  // Release the resources acquired by this test.
  ASSERT_EQ(cudaStreamDestroy(stream), 0);
  ASSERT_EQ(cudaFree(buffer), 0);
}
50+
51+
} // namespace tensorrt
52+
} // namespace inference
53+
} // namespace paddle

paddle/fluid/inference/tests/book/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ function(inference_test TARGET_NAME)
2424
endforeach()
2525
endfunction(inference_test)
2626

27+
####################
28+
# Inference tests here depend on fluid/tests/book. If users want to run an
29+
# individual test with ctest, they need to run the tests in fluid/tests/book
30+
# first to generate the saved model.
31+
####################
2732
# This unittest is buggy!
2833
#inference_test(fit_a_line)
2934
inference_test(image_classification ARGS vgg resnet)
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#pragma once
16+
17+
#include <unordered_map>
18+
#include "paddle/fluid/platform/enforce.h"
19+
20+
namespace paddle {
21+
namespace inference {
22+
23+
// Process-wide holder of a single default-constructed T.
// The instance is created on the first call to Global() and is never deleted.
// NOTE not thread-safe.
template <typename T>
struct Singleton {
  // Singleton is a pure static accessor; it is never instantiated.
  Singleton() = delete;
  Singleton& operator=(const Singleton&) = delete;

  // Returns the one shared instance of T.
  static T& Global() {
    static T* instance = new T;
    return *instance;
  }
};
34+
35+
/*
36+
* A registry for items of any type.
37+
* NOTE not thread-safe.
38+
*/
39+
// Name -> item registry, keyed by string. One independent table exists per
// ItemParent type. Items are heap-allocated by Register() and handed out as
// raw, non-owning ItemParent pointers by Lookup().
//
// The table lives in a function-local static (see Items()) rather than in a
// static data member: registration happens from constructors of
// namespace-scope objects, and a class template's static data member is not
// guaranteed to be constructed before those run.
template <typename ItemParent>
struct Registry {
  static Registry& Global() {
    static auto* x = new Registry<ItemParent>;
    return *x;
  }

  // Creates a default-constructed ItemChild and stores it under `name`.
  // Enforces that `name` has not been registered before.
  template <typename ItemChild>
  static void Register(const std::string& name) {
    auto& items = Items();
    PADDLE_ENFORCE_EQ(items.count(name), 0);
    items[name] = new ItemChild;
  }

  // Returns the item registered under `name`, or nullptr if there is none.
  static ItemParent* Lookup(const std::string& name) {
    auto& items = Items();
    auto it = items.find(name);
    if (it == items.end()) return nullptr;
    return it->second;
  }

  ~Registry() {
    auto& items = Items();
    for (auto& item : items) {
      delete item.second;
    }
    // The table is shared by all users of this Registry type; drop the
    // entries so no dangling pointers stay reachable through Lookup().
    items.clear();
  }

 private:
  Registry() = default;

  // Returns the table, constructing it on first use. It is intentionally
  // never destroyed so it stays valid for the whole program lifetime.
  static std::unordered_map<std::string, ItemParent*>& Items() {
    static auto* items = new std::unordered_map<std::string, ItemParent*>;
    return *items;
  }
};
71+
72+
} // namespace inference
73+
} // namespace paddle

paddle/fluid/operators/detail/send_recv.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ message VariableMessage {
7070
bytes rows = 9;
7171
// Look up table block execution output variable name.
7272
string out_varname = 10;
73+
// If true, the ps server starts profiling. When profile switches from
74+
// true to false, the ps server stops profiling and writes the profile
75+
// to /tmp/profile_ps_*.
76+
bool profile = 11;
7377
}
7478

7579
message VoidMessage {}

paddle/fluid/operators/detail/sendrecvop_utils.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ limitations under the License. */
2626
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
2727
#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
2828
#include "paddle/fluid/operators/detail/variable_response.h"
29+
#include "paddle/fluid/platform/profiler.h"
2930

3031
namespace paddle {
3132
namespace operators {
@@ -48,6 +49,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
4849
void* payload = nullptr;
4950
size_t payload_size = 0;
5051
ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
52+
// Note: normally the profiler is enabled in 1 trainer, hence only
53+
// 1 trainer returns true for ShouldSendProfileState(). It tells PS
54+
// servers the trainer's profiling state so that PS can follow the
55+
// trainer.
56+
if (platform::ShouldSendProfileState()) {
57+
e.WriteBool(VarMsg::kProfileFieldNumber, platform::IsProfileEnabled());
58+
}
5159
e.WriteString(VarMsg::kVarnameFieldNumber, name);
5260
if (var->IsType<framework::LoDTensor>()) {
5361
e.WriteUint64(VarMsg::kTypeFieldNumber, 0);

paddle/fluid/operators/detail/variable_response.cc

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#ifdef PADDLE_WITH_CUDA
2121
#include <nccl.h>
2222
#endif
23+
#include "paddle/fluid/platform/profiler.h"
2324

2425
#include "paddle/fluid/operators/detail/send_recv.pb.h"
2526
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
@@ -446,7 +447,26 @@ int VariableResponse::Parse(Source* source) {
446447
meta_.set_out_varname(temp);
447448
break;
448449
}
449-
450+
case sendrecv::VariableMessage::kProfileFieldNumber: {
451+
bool profiling;
452+
if (!input.ReadRaw(reinterpret_cast<void*>(&profiling), 1)) {
453+
return tag;
454+
}
455+
meta_.set_profile(profiling);
456+
int64_t listener_id = platform::ListenerId();
457+
if (listener_id <= 0) {
458+
break;
459+
}
460+
if (profiling && !platform::IsProfileEnabled()) {
461+
platform::EnableProfiler(platform::ProfilerState::kCPU);
462+
} else if (!profiling && platform::IsProfileEnabled()) {
463+
// TODO(panyx0718): Should we allow customizing the output file directory?
464+
platform::DisableProfiler(
465+
platform::EventSortingKey::kDefault,
466+
string::Sprintf("/tmp/profile_ps_%lld", listener_id));
467+
}
468+
break;
469+
}
450470
default: {
451471
// Unknown tag, return unknown error.
452472
return -1;

0 commit comments

Comments
 (0)