Skip to content

Commit 234a1d9

Browse files
committed
Merge remote-tracking branch 'origin/develop' into windows/debug
test=develop
2 parents 2835e04 + a270fdf commit 234a1d9

File tree

79 files changed

+1279
-1419
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

79 files changed

+1279
-1419
lines changed

paddle/fluid/framework/details/execution_strategy.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// limitations under the License.
1414

1515
#pragma once
16+
#include <cstddef> // for size_t
1617

1718
namespace paddle {
1819
namespace framework {
@@ -26,6 +27,7 @@ struct ExecutionStrategy {
2627
bool allow_op_delay_{false};
2728
size_t num_iteration_per_drop_scope_{100};
2829
ExecutorType type_{kDefault};
30+
bool dry_run_{false};
2931
};
3032

3133
} // namespace details

paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
128128
size_t complete = 0;
129129
while (op_to_run != nullptr) {
130130
try {
131-
op_to_run->Run(strategy_.use_cuda_);
131+
if (LIKELY(!strategy_.dry_run_)) {
132+
op_to_run->Run(strategy_.use_cuda_);
133+
}
132134
++complete;
133135
} catch (...) {
134136
exception_.Catch(std::current_exception());

paddle/fluid/framework/details/threaded_ssa_graph_executor.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
211211
if (VLOG_IS_ON(10)) {
212212
VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
213213
}
214-
op->Run(strategy_.use_cuda_);
214+
if (LIKELY(!strategy_.dry_run_)) {
215+
op->Run(strategy_.use_cuda_);
216+
}
215217
VLOG(10) << op << " " << op->Name() << " Done ";
216218
running_ops_--;
217219
ready_var_q->Extend(op->Outputs());

paddle/fluid/framework/details/threaded_ssa_graph_executor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
4848
// Use topological sort algorithm
4949
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
5050

51-
~ThreadedSSAGraphExecutor() {}
51+
~ThreadedSSAGraphExecutor() final = default;
5252

5353
private:
5454
void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,

paddle/fluid/framework/parallel_executor.cc

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
3838
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
3939
: places_(places) {}
4040

41+
~ParallelExecutorPrivate() {
42+
if (own_local_scope_) {
43+
for (size_t i = 1; i < local_scopes_.size(); ++i) {
44+
// Skip the first scope, since it is the global scope.
45+
Scope *local_scope = local_scopes_[i];
46+
if (global_scope_->HasKid(local_scope)) {
47+
global_scope_->DeleteScope(local_scope);
48+
}
49+
}
50+
}
51+
}
4152
std::vector<platform::Place> places_;
4253
std::vector<Scope *> local_scopes_;
43-
Scope *global_scope_;
54+
Scope *global_scope_; // not owned
4455
std::unique_ptr<details::SSAGraphExecutor> executor_;
4556

4657
#ifdef PADDLE_WITH_CUDA
@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
306317
for (auto &p : member_->places_) {
307318
platform::DeviceContextPool::Instance().Get(p)->Wait();
308319
}
309-
310-
if (member_->own_local_scope_) {
311-
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
312-
Scope *local_scope = member_->local_scopes_[i];
313-
if (member_->global_scope_->HasKid(local_scope)) {
314-
member_->global_scope_->DeleteScope(local_scope);
315-
}
316-
}
317-
}
318-
319320
// member_ must be destructed before gcs_ since the destructor of
320321
// ReferenceCountOpHandle use raw pointers of gcs_ inside.
321322
member_.reset();

paddle/fluid/framework/threadpool.cc

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
5757
ThreadPool::~ThreadPool() {
5858
{
5959
// notify all threads to stop running
60-
std::lock_guard<std::mutex> l(mutex_);
60+
std::unique_lock<std::mutex> l(mutex_);
6161
running_ = false;
62-
scheduled_.notify_all();
6362
}
63+
scheduled_.notify_all();
6464

6565
for (auto& t : threads_) {
6666
t->join();
@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {
7070

7171
void ThreadPool::TaskLoop() {
7272
while (true) {
73-
std::unique_lock<std::mutex> lock(mutex_);
73+
Task task;
7474

75-
scheduled_.wait(
76-
lock, [this] { return !this->tasks_.empty() || !this->running_; });
75+
{
76+
std::unique_lock<std::mutex> lock(mutex_);
77+
scheduled_.wait(
78+
lock, [this] { return !this->tasks_.empty() || !this->running_; });
7779

78-
if (!running_ || tasks_.empty()) {
79-
return;
80-
}
80+
if (!running_ && tasks_.empty()) {
81+
return;
82+
}
83+
84+
if (tasks_.empty()) {
85+
PADDLE_THROW("This thread has no task to Run");
86+
}
8187

82-
// pop a task from the task queue
83-
auto task = std::move(tasks_.front());
84-
tasks_.pop();
85-
lock.unlock();
88+
// pop a task from the task queue
89+
task = std::move(tasks_.front());
90+
tasks_.pop();
91+
}
8692

8793
// run the task
8894
task();

paddle/fluid/framework/threadpool.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class ThreadPool {
5858
~ThreadPool();
5959

6060
// Run pushes a function to the task queue and returns a std::future
61-
// object. To wait for the completion of the task, call
61+
// object. To wait for the completion of the task, call
6262
// std::future::wait().
6363
template <typename Callback>
6464
std::future<void> Run(Callback fn) {
@@ -69,7 +69,6 @@ class ThreadPool {
6969
template <typename Callback>
7070
std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
7171
Callback fn) {
72-
std::unique_lock<std::mutex> lock(mutex_);
7372
Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
7473
try {
7574
fn();
@@ -84,7 +83,13 @@ class ThreadPool {
8483
return nullptr;
8584
});
8685
std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
87-
tasks_.push(std::move(task));
86+
{
87+
std::unique_lock<std::mutex> lock(mutex_);
88+
if (!running_) {
89+
PADDLE_THROW("enqueue on stopped ThreadPool");
90+
}
91+
tasks_.push(std::move(task));
92+
}
8893
scheduled_.notify_one();
8994
return f;
9095
}

paddle/fluid/inference/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
if(WITH_TESTING)
2-
include(test.cmake) # some generic cmake function for inference
2+
include(tests/test.cmake) # some generic cmake function for inference
33
endif()
44
# analysis and tensorrt must be added before creating static library,
55
# otherwise, there would be undefined reference to them in static library.

paddle/fluid/inference/tensorrt/convert/conv2d_op.cc

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,21 @@ namespace paddle {
1818
namespace inference {
1919
namespace tensorrt {
2020

21+
bool to_skip_merging_optimize(TensorRTEngine* engine_,
22+
const std::vector<int>& filters,
23+
const std::vector<int>& strides,
24+
const std::vector<int>& paddings,
25+
std::string input_name) {
26+
if (engine_->itensor_quote_num[input_name] > 0) {
27+
return true;
28+
}
29+
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
30+
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
31+
engine_->itensor_quote_num[input_name] += 1;
32+
33+
return false;
34+
}
35+
2136
class Conv2dOpConverter : public OpConverter {
2237
public:
2338
void operator()(const framework::proto::OpDesc& op,
@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter {
3146
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
3247

3348
auto* X = engine_->GetITensor(op_desc.Input("Input").front());
49+
3450
// Declare weights
3551
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
3652
PADDLE_ENFORCE_NOT_NULL(Y_v);
@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter {
8399
std::move(weight_tensor);
84100
layer->getOutput(0)->setName(output_name.c_str());
85101
engine_->SetITensor(output_name, layer->getOutput(0));
86-
if (test_mode) {
102+
103+
if (test_mode ||
104+
to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
105+
paddings, op_desc.Input("Input").front())) {
87106
engine_->DeclareOutput(output_name);
88107
}
89108
}

paddle/fluid/inference/tensorrt/engine.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
133133
buffer_sizes_[name] = 0;
134134
}
135135

136+
bool TensorRTEngine::HasDeclared(const std::string &name) {
137+
return buffer_sizes_.count(name) > 0;
138+
}
139+
136140
void TensorRTEngine::DeclareOutput(const std::string &name) {
137141
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
138142
name);

0 commit comments

Comments
 (0)