Commit 39ec80d

Remove the memory copy of feeding data in C++ inference API (#14577)
* Remove the memory copy for feeding data in the C++ inference API
* Fix compiling dependencies
* Fix compiling in ONLY_CPU mode
1 parent cf5be6a commit 39ec80d
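
In short, `SetFeed` in both predictors now allocates the feed tensor directly on the predictor's `place_` and copies the caller's host buffer straight into that memory: `std::memcpy` when `place_` is a CPU place, a host-to-device `memory::Copy` when it is a CUDA place. A minimal standalone sketch of that dispatch pattern; `Place` and `CopyToPlace` below are illustrative stand-ins, not Paddle APIs:

```cpp
#include <cstddef>
#include <cstring>
#include <stdexcept>
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif

// Illustrative stand-in for paddle::platform::Place.
enum class Place { kCPU, kGPU };

// Copy a host buffer into tensor memory living on `place`, mirroring the
// branch SetFeed now performs after mutable_data(ddim, place_).
void CopyToPlace(Place place, void *dst, const void *src, std::size_t bytes) {
  if (place == Place::kCPU) {
    std::memcpy(dst, src, bytes);  // CPU destination: a plain memcpy suffices
  } else {
#ifdef PADDLE_WITH_CUDA
    // GPU destination: synchronous host-to-device copy.
    cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
#else
    // Mirrors the PADDLE_THROW branch: a GPU place without a CUDA build.
    throw std::runtime_error("not compiled with CUDA");
#endif
  }
}

int main() {
  float src[4] = {1.f, 2.f, 3.f, 4.f};
  float dst[4];
  CopyToPlace(Place::kCPU, dst, src, sizeof(src));  // CPU path
  return 0;
}
```

The net effect is that feed data for a GPU predictor no longer needs to be staged in a CPU-side tensor first, which is presumably the extra copy the commit title refers to.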

3 files changed: +40 −11 lines changed

paddle/fluid/inference/api/CMakeLists.txt

Lines changed: 3 additions & 1 deletion

@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
+    lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
+    analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)
 
 cc_test(test_paddle_inference_api
   SRCS api_tester.cc

paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 18 additions & 5 deletions

@@ -31,6 +31,7 @@
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #endif
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -214,17 +215,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }

-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {
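
Reading the `memory::Copy` call above: the destination place and pointer come first, then the source place and pointer, the byte count, and finally a CUDA stream; per the inline comment, stream 0 is passed to get a synchronous copy. A rough raw-CUDA-runtime equivalent of just that call; the buffer names are illustrative, and the synchronous-on-stream-0 behavior is taken from the diff's own comment:

```cpp
#include <cstddef>
#include <cuda_runtime.h>

int main() {
  const std::size_t bytes = 4 * sizeof(float);
  float host[4] = {1.f, 2.f, 3.f, 4.f};  // plays the role of inputs[i].data

  float *input_ptr = nullptr;  // tensor memory allocated on the GPU place_
  cudaMalloc(reinterpret_cast<void **>(&input_ptr), bytes);

  // memory::Copy(dst_gpu_place, input_ptr, CPUPlace(), data, length, 0)
  // boils down to a host-to-device copy; cudaMemcpy is the synchronous form.
  cudaMemcpy(input_ptr, host, bytes, cudaMemcpyHostToDevice);

  cudaFree(input_ptr);
  return 0;
}
```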

paddle/fluid/inference/api/api_impl.cc

Lines changed: 19 additions & 5 deletions

@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -193,17 +194,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     framework::DDim ddim = framework::make_ddim(inputs[i].shape);
     void *input_ptr;
     if (inputs[i].dtype == PaddleDType::INT64) {
-      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<int64_t>(ddim, place_);
     } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
-      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+      input_ptr = input.mutable_data<float>(ddim, place_);
     } else {
       LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
       return false;
     }

-    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
-                inputs[i].data.length());
+    if (platform::is_cpu_place(place_)) {
+      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                  inputs[i].data.length());
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
+      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(),
+                   0);  // stream 0 for sync copy
+#else
+      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+    }
+
     // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
     framework::LoD lod;
     for (auto &level : inputs[i].lod) {
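
Nothing changes for callers of either predictor: inputs are still described as `PaddleTensor`s whose `PaddleBuf` data lives in host memory, and `SetFeed` decides internally where that data lands. A hedged caller-side sketch against the C++ inference API of this era; the `NativeConfig` fields and model path are assumptions for illustration:

```cpp
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  // use_gpu selects the place_ that SetFeed now allocates on directly.
  paddle::NativeConfig config;
  config.model_dir = "./model";  // hypothetical model directory
  config.use_gpu = true;
  auto predictor = paddle::CreatePaddlePredictor(config);

  // One FLOAT32 feed; its backing buffer starts out in host memory.
  std::vector<float> data = {1.f, 2.f, 3.f, 4.f};
  paddle::PaddleTensor input;
  input.shape = {1, 4};
  input.dtype = paddle::PaddleDType::FLOAT32;
  input.data = paddle::PaddleBuf(data.data(), data.size() * sizeof(float));

  // SetFeed copies the host buffer straight onto place_: std::memcpy on
  // CPU, memory::Copy (host-to-device) under CUDA.
  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({input}, &outputs);
  return 0;
}
```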
