Skip to content

Commit c0492f2

Browse files
authored
Merge pull request #15213 from velconia/accelerate_little_model_local_release_1_2_x
Accelerate little models
2 parents 19534da + b5baca1 commit c0492f2

File tree

7 files changed

+125
-117
lines changed

7 files changed

+125
-117
lines changed

paddle/fluid/framework/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ cc_test(variable_test SRCS variable_test.cc)
8080
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
8181
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
8282

83-
cc_library(scope SRCS scope.cc DEPS glog threadpool)
83+
cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash)
8484
cc_test(scope_test SRCS scope_test.cc DEPS scope)
8585

8686
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)

paddle/fluid/framework/details/execution_strategy.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
2525
size_t num_threads_{0};
2626
bool use_cuda_{true};
2727
bool allow_op_delay_{false};
28-
size_t num_iteration_per_drop_scope_{100};
28+
size_t num_iteration_per_drop_scope_{1};
2929
ExecutorType type_{kDefault};
3030
bool dry_run_{false};
3131
};

paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,6 @@
1818
#include <vector>
1919
#include "paddle/fluid/framework/variable_helper.h"
2020
#include "paddle/fluid/platform/profiler.h"
21-
#ifdef PADDLE_WITH_CUDA
22-
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
23-
#endif
2421

2522
namespace paddle {
2623
namespace framework {
@@ -67,35 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
6764
}
6865

6966
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
70-
drop_scope_counter_ += 1;
67+
++drop_scope_counter_;
7168

72-
#ifdef PADDLE_WITH_CUDA
73-
const std::string gc_name = "garbage_collector";
74-
DeviceGarbageCollectorMap *gc =
75-
Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
76-
: nullptr;
77-
#endif
69+
bool stream_end = false;
70+
if (!fetch_tensors.empty()) {
71+
WaitComputationalStreams();
72+
stream_end = true;
73+
}
7874

79-
if (!fetch_tensors.empty() ||
80-
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
81-
drop_scope_counter_ = 0;
82-
// Wait All computational streams
83-
for (auto p : places_) {
84-
platform::DeviceContextPool::Instance().Get(p)->Wait();
85-
#ifdef PADDLE_WITH_CUDA
86-
if (gc != nullptr && platform::is_gpu_place(p)) {
87-
auto gpu_place = boost::get<platform::CUDAPlace>(p);
88-
auto &gc_at_place = gc->at(gpu_place.device);
89-
gc_at_place->Wait();
90-
gc_at_place->Reset();
91-
}
92-
#endif
75+
if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
76+
if (!stream_end) {
77+
WaitComputationalStreams();
9378
}
79+
9480
for (auto &scope : local_scopes_) {
9581
auto &local_scope =
9682
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
9783
scope->DeleteScope(local_scope);
9884
}
85+
86+
drop_scope_counter_ = 0;
9987
}
10088
if (eptr) {
10189
std::rethrow_exception(eptr);

paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
2525
#include "paddle/fluid/framework/scope.h"
2626
#include "paddle/fluid/platform/place.h"
27+
#ifdef PADDLE_WITH_CUDA
28+
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
29+
#endif
30+
2731
namespace paddle {
2832
namespace framework {
2933
namespace details {
@@ -47,6 +51,30 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
4751

4852
FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
4953

54+
private:
55+
inline void WaitComputationalStreams() {
56+
#ifdef PADDLE_WITH_CUDA
57+
const std::string gc_name = "garbage_collector";
58+
DeviceGarbageCollectorMap* gc =
59+
Graph().Has(gc_name)
60+
? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
61+
: nullptr;
62+
#endif
63+
64+
// Wait All computational streams
65+
for (auto p : places_) {
66+
platform::DeviceContextPool::Instance().Get(p)->Wait();
67+
#ifdef PADDLE_WITH_CUDA
68+
if (gc != nullptr && platform::is_gpu_place(p)) {
69+
auto gpu_place = boost::get<platform::CUDAPlace>(p);
70+
auto& gc_at_place = gc->at(gpu_place.device);
71+
gc_at_place->Wait();
72+
gc_at_place->Reset();
73+
}
74+
#endif
75+
}
76+
}
77+
5078
private:
5179
size_t drop_scope_counter_{0};
5280

paddle/fluid/framework/rw_lock.h

Lines changed: 35 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ limitations under the License. */
1616

1717
#if !defined(_WIN32)
1818
#include <pthread.h>
19-
#endif // !_WIN32
19+
#else
20+
#include <mutex> // NOLINT
21+
#endif // !_WIN32
2022

2123
#include "paddle/fluid/platform/enforce.h"
2224

@@ -29,17 +31,17 @@ struct RWLock {
2931

3032
~RWLock() { pthread_rwlock_destroy(&lock_); }
3133

32-
void RDLock() {
34+
inline void RDLock() {
3335
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
3436
"acquire read lock failed");
3537
}
3638

37-
void WRLock() {
39+
inline void WRLock() {
3840
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
3941
"acquire write lock failed");
4042
}
4143

42-
void UNLock() {
44+
inline void UNLock() {
4345
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
4446
}
4547

@@ -51,81 +53,46 @@ struct RWLock {
5153
// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
5254
// On Windows, implementing rw_lock looks like a hack, so a simplified stand-in is used: it does not provide real reader/writer semantics.
5355
struct RWLock {
54-
void RDLock() {}
55-
void WRLock() {}
56-
void UNLock() {}
56+
// FIXME(minqiyang): a plain mutex stands in for a real reader/writer lock here, so read locks are exclusive too.
57+
inline void RDLock() { mutex_.lock(); }
58+
59+
inline void WRLock() { mutex_.lock(); }
60+
61+
inline void UNLock() { mutex_.unlock(); }
62+
63+
private:
64+
std::mutex mutex_;
5765
};
5866
#endif
5967

60-
class RWLockGuard {
68+
class AutoWRLock {
6169
public:
62-
enum Status { kUnLock, kWRLock, kRDLock };
63-
64-
RWLockGuard(RWLock* rw_lock, Status init_status)
65-
: lock_(rw_lock), status_(Status::kUnLock) {
66-
switch (init_status) {
67-
case Status::kRDLock: {
68-
RDLock();
69-
break;
70-
}
71-
case Status::kWRLock: {
72-
WRLock();
73-
break;
74-
}
75-
case Status::kUnLock: {
76-
break;
77-
}
78-
}
79-
}
70+
explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
8071

81-
void WRLock() {
82-
switch (status_) {
83-
case Status::kUnLock: {
84-
lock_->WRLock();
85-
status_ = Status::kWRLock;
86-
break;
87-
}
88-
case Status::kWRLock: {
89-
break;
90-
}
91-
case Status::kRDLock: {
92-
PADDLE_THROW(
93-
"Please unlock read lock first before invoking write lock.");
94-
break;
95-
}
96-
}
97-
}
72+
~AutoWRLock() { UnLock(); }
9873

99-
void RDLock() {
100-
switch (status_) {
101-
case Status::kUnLock: {
102-
lock_->RDLock();
103-
status_ = Status::kRDLock;
104-
break;
105-
}
106-
case Status::kRDLock: {
107-
break;
108-
}
109-
case Status::kWRLock: {
110-
PADDLE_THROW(
111-
"Please unlock write lock first before invoking read lock.");
112-
break;
113-
}
114-
}
115-
}
74+
private:
75+
inline void Lock() { lock_->WRLock(); }
11676

117-
void UnLock() {
118-
if (status_ != Status::kUnLock) {
119-
lock_->UNLock();
120-
status_ = Status::kUnLock;
121-
}
122-
}
77+
inline void UnLock() { lock_->UNLock(); }
78+
79+
private:
80+
RWLock* lock_;
81+
};
82+
83+
class AutoRDLock {
84+
public:
85+
explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
86+
87+
~AutoRDLock() { UnLock(); }
88+
89+
private:
90+
inline void Lock() { lock_->RDLock(); }
12391

124-
~RWLockGuard() { UnLock(); }
92+
inline void UnLock() { lock_->UNLock(); }
12593

12694
private:
12795
RWLock* lock_;
128-
Status status_;
12996
};
13097

13198
} // namespace framework

paddle/fluid/framework/scope.cc

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,15 @@ DEFINE_double(
4343
// the mutex will cause serious performance issue.
4444
// So the mutex is disabled when `ON_INFER`.
4545
#ifdef PADDLE_ON_INFERENCE
46-
#define SCOPE_LOCK_GUARD
46+
#define SCOPE_KIDS_READER_LOCK
47+
#define SCOPE_KIDS_WRITER_LOCK
48+
#define SCOPE_VARS_READER_LOCK
49+
#define SCOPE_VARS_WRITER_LOCK
4750
#else
48-
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
51+
#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
52+
#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
53+
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
54+
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
4955
#endif
5056

5157
namespace paddle {
@@ -61,64 +67,69 @@ int64_t GetEagerDeletionThreshold() {
6167
Scope::~Scope() { DropKids(); }
6268

6369
Scope& Scope::NewScope() const {
64-
SCOPE_LOCK_GUARD
65-
kids_.push_back(new Scope(this));
66-
return *kids_.back();
70+
Scope* child = new Scope(this);
71+
{
72+
SCOPE_KIDS_WRITER_LOCK
73+
kids_.push_back(child);
74+
}
75+
return *child;
6776
}
6877

6978
Variable* Scope::Var(const std::string& name) {
70-
SCOPE_LOCK_GUARD
79+
SCOPE_VARS_WRITER_LOCK
7180
return VarInternal(name);
7281
}
7382

7483
Variable* Scope::Var(std::string* name) {
75-
SCOPE_LOCK_GUARD
7684
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
7785
if (name != nullptr) {
7886
*name = new_name;
7987
}
88+
SCOPE_VARS_WRITER_LOCK
8089
return VarInternal(new_name);
8190
}
8291

8392
Variable* Scope::FindVar(const std::string& name) const {
84-
SCOPE_LOCK_GUARD
93+
SCOPE_VARS_READER_LOCK
8594
return FindVarInternal(name);
8695
}
8796

8897
Variable* Scope::FindLocalVar(const std::string& name) const {
89-
SCOPE_LOCK_GUARD
98+
SCOPE_VARS_READER_LOCK
9099
return FindVarLocally(name);
91100
}
92101

93102
const Scope* Scope::FindScope(const Variable* var) const {
94-
SCOPE_LOCK_GUARD
103+
SCOPE_VARS_READER_LOCK
95104
return FindScopeInternal(var);
96105
}
97106

98107
void Scope::DropKids() {
99-
SCOPE_LOCK_GUARD
108+
SCOPE_KIDS_WRITER_LOCK
100109
for (Scope* s : kids_) delete s;
101110
kids_.clear();
102111
}
103112

104113
bool Scope::HasKid(const Scope* scope) const {
105-
SCOPE_LOCK_GUARD
114+
SCOPE_KIDS_READER_LOCK
106115
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
107116
return it != this->kids_.end();
108117
}
109118

110119
std::vector<std::string> Scope::LocalVarNames() const {
111-
SCOPE_LOCK_GUARD
112120
std::vector<std::string> known_vars;
113-
known_vars.reserve(this->vars_.size());
114-
for (auto& p : vars_) {
115-
known_vars.emplace_back(p.first);
121+
{
122+
SCOPE_VARS_READER_LOCK
123+
known_vars.reserve(this->vars_.size());
124+
for (auto& p : vars_) {
125+
known_vars.emplace_back(p.first);
126+
}
116127
}
117128
return known_vars;
118129
}
119130

120131
void Scope::DeleteScope(Scope* scope) const {
121-
SCOPE_LOCK_GUARD
132+
SCOPE_KIDS_WRITER_LOCK
122133
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
123134
PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
124135
this, scope);
@@ -132,8 +143,8 @@ void Scope::DeleteScope(Scope* scope) const {
132143
}
133144

134145
void Scope::EraseVars(const std::vector<std::string>& var_names) {
135-
SCOPE_LOCK_GUARD
136146
std::set<std::string> var_set(var_names.begin(), var_names.end());
147+
SCOPE_VARS_WRITER_LOCK
137148
for (auto it = vars_.begin(); it != vars_.end();) {
138149
if (var_set.find(it->first) != var_set.end()) {
139150
it = vars_.erase(it);
@@ -145,12 +156,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
145156

146157
void Scope::Rename(const std::string& origin_name,
147158
const std::string& new_name) const {
148-
SCOPE_LOCK_GUARD
159+
SCOPE_VARS_WRITER_LOCK
149160
RenameInternal(origin_name, new_name);
150161
}
151162

152163
std::string Scope::Rename(const std::string& origin_name) const {
153-
SCOPE_LOCK_GUARD
164+
SCOPE_VARS_WRITER_LOCK
154165
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
155166
RenameInternal(origin_name, new_name);
156167
return new_name;

0 commit comments

Comments
 (0)