
Commit bcaa8a3

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_production_dockerfile
test=develop
2 parents 6274c0f + 6224e61

34 files changed (+1016, -125 lines)

cmake/operators.cmake

Lines changed: 2 additions & 1 deletion
@@ -109,7 +109,8 @@ function(op_library TARGET)
 
   # Define operators that don't need pybind here.
   foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
-    "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
+    "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
+    "fusion_transpose_flatten_concat_op")
     if ("${TARGET}" STREQUAL "${manual_pybind_op}")
       set(pybind_flag 1)
     endif()

paddle/fluid/framework/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 
 if (NOT WIN32)
-cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto)
+cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
   shape_inference data_transform lod_tensor profiler transfer_scope_cache)
 else()

paddle/fluid/framework/executor.cc

Lines changed: 2 additions & 2 deletions
@@ -392,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector<Tensor>> gc;
-  // WhileOp would set keep_kids to false
-  // WhileGradOp would need the scopes created in WhileOp
+  // WhileOp would set keep_kids to true,
+  // because WhileGradOp needs the scopes created in WhileOp.
   // Perhaps, we should not perform eager deletion in WhileOp
   // The scopes and variables created by WhileOp would be deleted
   // in WhileGradOp.

paddle/fluid/framework/transfer_scope_cache.cc

Lines changed: 14 additions & 24 deletions
@@ -17,16 +17,28 @@
 namespace paddle {
 namespace framework {
 
+// Holds all the transfer scope across the process.
 std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
-  thread_local auto* x = new std::unordered_map<size_t, Scope*>;
+  typedef std::unordered_map<size_t, Scope*> map_t;
+  thread_local std::unique_ptr<map_t> x(new map_t);
   return *x;
 }
 
+// Holds all the transfer scope for this thread.
 std::unordered_set<Scope*>& global_transfer_scope_cache() {
-  thread_local auto* x = new std::unordered_set<Scope*>;
+  typedef std::unordered_set<Scope*> set_t;
+  thread_local std::unique_ptr<set_t> x(new set_t);
   return *x;
 }
 
+// Try to create a transfer scope. If one cached scope has match the
+// requirement, just return that one.
+// Inputs:
+// @type0: the source kernel type.
+// @type1: the target kernel type.
+// @scope: the execution scope of this op.
+// Returns: A scope used to hold the transfer data across the different kernel
+// type.
 Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
                               const Scope* scope) {
   Scope* new_scope{nullptr};
@@ -46,27 +58,5 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
   return new_scope;
 }
 
-void RemoveKidsFromTransferScopeCache(Scope* scope) {
-  auto it = global_transfer_scope_cache().find(scope);
-  if (it != global_transfer_scope_cache().end()) {
-    global_transfer_scope_cache().erase(it);
-  }
-  for (auto* s : scope->kids()) {
-    auto it = global_transfer_scope_cache().find(s);
-    if (it != global_transfer_scope_cache().end()) {
-      global_transfer_scope_cache().erase(it);
-    }
-  }
-
-  // remove global transfer data cache
-  auto& cache = global_transfer_data_cache();
-  for (auto it = cache.begin(); it != cache.end();) {
-    if (it->second == scope)
-      it = cache.erase(it);
-    else
-      it++;
-  }
-}
-
 } // namespace framework
 } // namespace paddle
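
Note: the change above replaces thread_local raw pointers allocated with new by thread_local std::unique_ptr, so each thread's cache map and set are destroyed at thread exit instead of being leaked. A minimal standalone sketch of the same pattern, using a placeholder value type instead of Scope* and a hypothetical function name:

// Sketch of the thread-local cache pattern used above.
// Each thread gets its own map; the unique_ptr destroys it automatically at
// thread exit, unlike `thread_local auto* x = new map_t;`, which never frees.
#include <memory>
#include <string>
#include <unordered_map>

std::unordered_map<size_t, std::string>& ThreadLocalCache() {
  typedef std::unordered_map<size_t, std::string> map_t;
  thread_local std::unique_ptr<map_t> x(new map_t);
  return *x;
}

int main() {
  ThreadLocalCache()[42] = "cached on this thread";
  return 0;
}

The reference returned by *x stays valid for the lifetime of the calling thread, which is all the cache needs.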

paddle/fluid/inference/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ endif()
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.
 add_subdirectory(analysis)
+add_subdirectory(utils)
 if (TENSORRT_FOUND)
   add_subdirectory(tensorrt)
 endif()

paddle/fluid/inference/api/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
+    lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
+    analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)
 
 cc_test(test_paddle_inference_api
     SRCS api_tester.cc

paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 18 additions & 6 deletions
@@ -31,6 +31,7 @@
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #endif
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -174,7 +175,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   inference::Timer timer;
   timer.tic();
   // set feed variable
-  std::vector<framework::LoDTensor> feeds;
   framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
   if (!SetFeed(inputs, scope)) {
     LOG(ERROR) << "fail to set feed";
@@ -215,17 +215,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }
 
-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {
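
Note: in both SetFeed implementations the copy path now follows place_: the old std::memcpy is kept for CPU predictors, and feed buffers are copied host-to-device through memory::Copy when the predictor holds a CUDA place. The sketch below only illustrates that dispatch shape using the plain CUDA runtime; StageFeedBuffer is a hypothetical helper name and cudaMemcpy stands in for Paddle's memory::Copy on stream 0.

// Sketch only: CPU vs. GPU staging of a host feed buffer, mirroring the
// is_cpu_place / CUDAPlace branch above, expressed with the CUDA runtime.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstring>

bool StageFeedBuffer(void *dst, const void *src, size_t bytes, bool dst_on_gpu) {
  if (!dst_on_gpu) {
    // CPU predictor: the feed stays on the host, a plain memcpy is enough.
    std::memcpy(dst, src, bytes);
    return true;
  }
  // GPU predictor: synchronous host-to-device copy (the memory::Copy call
  // above achieves the same with stream 0).
  cudaError_t err = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "host-to-device copy failed: %s\n",
                 cudaGetErrorString(err));
    return false;
  }
  return true;
}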

paddle/fluid/inference/api/api_impl.cc

Lines changed: 19 additions & 6 deletions
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -138,7 +139,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   Timer timer;
   timer.tic();
   // set feed variable
-  std::vector<framework::LoDTensor> feeds;
   framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
   if (!SetFeed(inputs, scope)) {
     LOG(ERROR) << "fail to set feed";
@@ -194,17 +194,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }
 
-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
+
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {
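
Note: from the caller's side the feed contract is unchanged: inputs are still host-side PaddleTensor feeds (shape, dtype, lod, and a PaddleBuf), and with this change the predictor stages them onto the GPU itself. A hedged usage sketch, assuming the NativeConfig / CreatePaddlePredictor entry points and the PaddleBuf(void*, size_t) constructor from the public inference header of this period; the model path and input name are hypothetical:

// Illustrative only: config fields and entry points are assumed from the
// paddle_inference_api.h of this era, not verified against this commit.
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::NativeConfig config;
  config.model_dir = "./model";   // hypothetical model path
  config.use_gpu = true;          // SetFeed now copies the host buffer to GPU
  config.fraction_of_gpu_memory = 0.3;
  config.device = 0;

  auto predictor =
      paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);

  std::vector<float> buf(1 * 3 * 224 * 224, 0.f);  // host-side feed data
  paddle::PaddleTensor feed;
  feed.name = "image";                              // hypothetical input name
  feed.shape = {1, 3, 224, 224};
  feed.dtype = paddle::PaddleDType::FLOAT32;
  feed.data = paddle::PaddleBuf(buf.data(), buf.size() * sizeof(float));

  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({feed}, &outputs);  // feed is staged to the GPU internally
  return 0;
}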

paddle/fluid/inference/api/demo_ci/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
@@ -46,8 +46,6 @@ if(WITH_GPU)
   endif()
 endif(NOT WIN32)
 endif()
-
-include_directories("D:/Paddle/")
 include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")

paddle/fluid/inference/tests/api/CMakeLists.txt

Lines changed: 25 additions & 25 deletions
@@ -74,7 +74,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
-inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
+  inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
 
@@ -88,31 +88,31 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
 
 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
-# anakin rnn1
-set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
-set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
-inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
-inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
-cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
-        ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
-             --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
-        DEPS inference_anakin_api_shared SERIAL)
-# anakin mobilenet
-if(WITH_GPU)
-set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
-inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
-cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
-        ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
-        DEPS inference_anakin_api_shared dynload_cuda SERIAL)
-endif()
+  # anakin rnn1
+  set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
+  set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
+  inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
+  inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
+  cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
+          ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
+               --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
+          DEPS inference_anakin_api_shared SERIAL)
+  # anakin mobilenet
+  if(WITH_GPU)
+    set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
+    inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
+    cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
+            ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
+            DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+  endif()
 endif()
 
 if(WITH_GPU AND TENSORRT_FOUND)
-set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
-if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
-inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
-endif()
-inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
+  set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
+  if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+    inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
+  endif()
+  inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
+          EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+          ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()

0 commit comments
