Skip to content

Commit 7796f65

Browse files
authored
fix inference on gpu out of mem (#14414)
* Fix inference on GPU out-of-memory: the transfer logic in operator.cc will keep creating new scopes.
1 parent 64f7516 commit 7796f65

File tree

6 files changed

+50
-2
lines changed

6 files changed

+50
-2
lines changed

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,6 @@ endif()
315315

316316
if (ON_INFER)
317317
message(STATUS "On inference mode, will take place some specific optimization.")
318-
add_definitions(-DPADDLE_ON_INFERENCE)
319318
else()
320319
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
321320
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")

cmake/configure.cmake

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,3 +218,7 @@ endif(WITH_GRPC)
218218
if(WITH_BRPC_RDMA)
219219
add_definitions(-DPADDLE_WITH_BRPC_RDMA)
220220
endif(WITH_BRPC_RDMA)
221+
222+
if(ON_INFER)
223+
add_definitions(-DPADDLE_ON_INFERENCE)
224+
endif(ON_INFER)

paddle/fluid/framework/naive_executor.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
7070
}
7171

7272
void NaiveExecutor::Run() {
73+
#ifndef PADDLE_ON_INFERENCE
74+
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
75+
"cmake flag ON_INFER is not set.";
76+
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
77+
"variables will be reused to save the allocation "
78+
"overhead.";
79+
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
80+
"setting the cmake flag ON_INFER=ON if you are "
81+
"running Paddle Inference";
82+
#endif // PADDLE_ON_INFERENCE
7383
for (auto &op : ops_) {
7484
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
7585
<< " on scope " << scope_;

paddle/fluid/framework/op_kernel_type.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ struct OpKernelType {
6363
place_(dev_ctx.GetPlace()),
6464
library_type_(library_type) {}
6565

66+
size_t hash_key() const { return Hash()(*this); }
67+
6668
bool operator==(const OpKernelType& o) const {
6769
return platform::places_are_same_class(place_, o.place_) &&
6870
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&

paddle/fluid/framework/operator.cc

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false,
3535
namespace paddle {
3636
namespace framework {
3737

38+
// Combine two hash values to a single hash.
39+
inline size_t CombineHash(size_t seed, size_t a) {
40+
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
41+
}
42+
3843
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
3944
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
4045
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
@@ -794,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack(
794799
Scope* OperatorWithKernel::TryTransferData(
795800
const Scope& scope, const OpKernelType& expected_kernel_key,
796801
std::vector<std::string>* transfered_inplace_vars) const {
802+
// In the inference scenario, the scopes will be reused across the batches, so
803+
// the `new_scope` here will result in GPU memory explosion over the running of
804+
// operators.
805+
// We use a thread_local cache to fix that issue; the key in the cache is the
806+
// combination of the `scope` argument, from_kernel_type, target_kernel_type.
807+
// Have a discussion with @Superjomn or the inference developers if some changes
808+
// on this logic for this macro might not be tested on the other scenarios.
809+
#ifdef PADDLE_ON_INFERENCE
810+
thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
811+
#endif
812+
797813
Scope* new_scope = nullptr;
798814
for (auto& var_name_item : Inputs()) {
799815
for (auto& var_name : var_name_item.second) {
@@ -824,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData(
824840
VLOG(30) << "Transform Variable " << var_name << " from "
825841
<< kernel_type_for_var << " to " << expected_kernel_key;
826842

843+
#ifdef PADDLE_ON_INFERENCE
844+
size_t infer_cache_key =
845+
CombineHash(OpKernelType::Hash()(kernel_type_for_var),
846+
OpKernelType::Hash()(expected_kernel_key));
847+
infer_cache_key =
848+
CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
849+
850+
auto it = infer_transfer_scope_cache.find(infer_cache_key);
851+
if (it != infer_transfer_scope_cache.end()) {
852+
new_scope = infer_transfer_scope_cache[infer_cache_key];
853+
} else {
854+
new_scope = &scope.NewScope();
855+
infer_transfer_scope_cache[infer_cache_key] = new_scope;
856+
}
857+
#endif
858+
827859
if (new_scope == nullptr) {
828860
new_scope = &scope.NewScope();
829861
}
830862

831863
auto* trans_var = new_scope->Var(var_name);
864+
832865
Tensor out;
833866
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
834867
SetTensorToVariable(*var, out, trans_var);

paddle/fluid/framework/scope.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ DEFINE_double(
4242
// a mean time, but a scope may be read by multiple threads concurrently, and
4343
// the mutex will cause serious performance issue.
4444
// So the mutex is disabled when `ON_INFER`.
45-
#ifdef ON_INFER
45+
#ifdef PADDLE_ON_INFERENCE
4646
#define SCOPE_LOCK_GUARD
4747
#else
4848
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);

0 commit comments

Comments
 (0)