
Commit caf10b4

Make the profiler use the thread_id from g_thread_id.
Add a few more RecordEvent annotations. Clean up.
1 parent baff71d commit caf10b4

10 files changed: +96, -36 lines

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 2 additions & 0 deletions

@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("all_reduce", nullptr);
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
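
The two added lines use the profiler's RAII idiom: constructing a platform::RecordEvent on the stack opens a timed region, and its destructor closes the region when RunImpl() returns. The same idiom appears in reduce_op_handle.cc and scope_buffered_ssa_graph_executor.cc below. A minimal sketch of the pattern with a simplified stand-in class (the real RecordEvent also pushes/pops a profiler event and reports to the device tracer; ScopedTimer here is purely illustrative):

// Illustrative stand-in for the RAII timing pattern above.
#include <chrono>
#include <cstdio>
#include <string>
#include <utility>

class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::printf("%s: %lld ns\n", name_.c_str(), static_cast<long long>(ns));
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

void RunImpl() {
  ScopedTimer r("all_reduce");  // the timed region opens here
  // ... all-reduce work ...
}  // ... and closes here, when r is destroyed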

paddle/fluid/framework/details/reduce_op_handle.cc

Lines changed: 2 additions & 0 deletions

@@ -16,12 +16,14 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("reduce", nullptr);
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
   auto in_var_handles = DynamicCast<VarHandle>(inputs_);

paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc

Lines changed: 2 additions & 0 deletions

@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     eptr = std::current_exception();
   }
 
+  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {

paddle/fluid/framework/details/threaded_ssa_graph_executor.cc

Lines changed: 4 additions & 0 deletions

@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
+  std::unique_ptr<platform::RecordEvent> event(
+      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   BlockingQueue<VarHandleBase *> ready_vars;
@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
+  event.reset(nullptr);
 
   // Step 3. Execution
   while (!pending_vars.empty()) {
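
Note the difference from the plain RAII usage above: only the preparation phase of Run() should be timed, not the whole function, so the event is held in a std::unique_ptr and ended early with event.reset(nullptr), which runs the destructor before Step 3 begins. A sketch of the idea, reusing the hypothetical ScopedTimer from the earlier example:

#include <memory>

void RunOnce() {
  std::unique_ptr<ScopedTimer> event(new ScopedTimer("prepare"));
  // ... preparation: build pending ops, pending vars, the ready queue ...
  event.reset(nullptr);  // destructor fires here: "prepare" ends now
  // ... execution loop runs outside the timed region ...
}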
Lines changed: 84 additions & 0 deletions (new file)

@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
+   the Apache License, Version 2.0 (the "License"); you may not use this file
+   except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+syntax = "proto3";
+package sendrecv;
+
+option cc_generic_services = false;
+
+service SendRecvService {
+  // For parameter server round-robin like hashing, do not split tensors.
+  // Send and recv only one tensor
+  // TODO(typhoonzero): add streaming API
+  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
+  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  // pre-fetch variable by given variable name and Ids
+  rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
+
+  rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
+}
+
+// VariableMessage is serialized paddle variable message.
+// It can be:
+// LoDTensor
+// SelectedRows
+enum VarType {
+  LOD_TENSOR = 0;
+  SELECTED_ROWS = 1;
+  NCCL_ID = 2;
+}
+
+// NOTICE(gongwb): don't modify this proto if you are not
+// familiar with how we serialize in sendrecvop_utils.h
+// and deserialize it in variable_response.h.
+message VariableMessage {
+  enum Type {
+    // Pod Types
+    BOOL = 0;
+    INT16 = 1;
+    INT32 = 2;
+    INT64 = 3;
+    FP16 = 4;
+    FP32 = 5;
+    FP64 = 6;
+  }
+
+  message LodData { repeated int64 lod_data = 1; }
+  string varname = 1;
+  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
+  VarType type = 2;
+  // bool persistable is not needed for sending.
+  // tensor info:
+  Type data_type = 3;
+  repeated int64 dims = 4;
+
+  // lod details:
+  int64 lod_level = 5;
+  repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
+  int64 slr_height = 7;
+  // tensor data
+  bytes serialized = 8;
+  // selected_rows data
+  bytes rows = 9;
+  // Look up table block execution output variable name.
+  string out_varname = 10;
+  // If 1, the ps server will start profiling; the ps
+  // server stops profiling and generates a profile to /tmp/profile_ps_*
+  // when profile switches from 1 to 2.
+  int64 profile = 11;
+}
+
+message VoidMessage {}
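
The profile field is what connects this file to the commit: per the comment above, a request with profile set to 1 starts profiling on the parameter server, and a later request with profile set to 2 stops it and dumps /tmp/profile_ps_*. As a hedged illustration (the message and field names come from the proto; the helper function, the generated header name, and the RPC plumbing around it are assumptions):

#include <string>

#include "send_recv.pb.h"  // assumed header name; protoc derives it from the .proto file name

// Hypothetical helper: build a request asking the ps server to start
// profiling; a later request carrying profile == 2 would stop it.
sendrecv::VariableMessage MakeProfilingRequest(const std::string& varname) {
  sendrecv::VariableMessage msg;
  msg.set_varname(varname);
  msg.set_type(sendrecv::LOD_TENSOR);
  msg.set_profile(1);  // 1 = start profiling on the server
  return msg;
}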

paddle/fluid/operators/parallel_do_op.cc

Lines changed: 0 additions & 5 deletions

@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase {
 
       workers.emplace_back(
           framework::Async([program, cur_scope, place, block, place_idx] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(place_idx) + 1);
             framework::Executor executor(place);
             executor.Run(*program, cur_scope, block->ID(),
                          false /*create_local_scope*/);
@@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase {
       // execute
      workers.emplace_back(
          framework::Async([program, cur_scope, place, block, i] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(i) + 1);
            framework::Executor executor(place);
            executor.Run(*program, cur_scope, block->ID(),
                         false /*create_local_scope*/);

paddle/fluid/platform/device_tracer.cc

Lines changed: 0 additions & 10 deletions

@@ -30,9 +30,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace {
-// Current thread's id. Note, we don't distinguish nested threads
-// for now.
-thread_local int cur_thread_id = 0;
 // Tracking the nested block stacks of each thread.
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
@@ -413,12 +410,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
 void ClearCurBlock() { block_id_stack.pop_back(); }
 
 int BlockDepth() { return block_id_stack.size(); }
-
-void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
-
-void ClearCurThread() { cur_thread_id = 0; }
-
-int CurThread() { return cur_thread_id; }
-
 }  // namespace platform
 }  // namespace paddle

paddle/fluid/platform/device_tracer.h

Lines changed: 0 additions & 4 deletions

@@ -99,9 +99,5 @@ std::string CurAnnotation();
 void SetCurBlock(int block_id);
 void ClearCurBlock();
 int BlockDepth();
-
-void SetCurThread(int thread_id);
-void ClearCurThread();
-int CurThread();
 }  // namespace platform
 }  // namespace paddle

paddle/fluid/platform/profiler.cc

Lines changed: 2 additions & 12 deletions

@@ -190,7 +190,7 @@ RecordEvent::~RecordEvent() {
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
     tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
-                          BlockDepth(), CurThread());
+                          BlockDepth(), g_thread_id);
   }
   ClearCurAnnotation();
   PopEvent(name_, dev_ctx_);
@@ -211,21 +211,11 @@ RecordBlock::~RecordBlock() {
     // We try to put all blocks at the same nested depth in the
     // same timeline lane. and distinguish the using thread_id.
     tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
-                          CurThread());
+                          g_thread_id);
   }
   ClearCurBlock();
 }
 
-RecordThread::RecordThread(int thread_id) {
-  if (g_state == ProfilerState::kDisabled) return;
-  SetCurThread(thread_id);
-}
-
-RecordThread::~RecordThread() {
-  if (g_state == ProfilerState::kDisabled) return;
-  ClearCurThread();
-}
-
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",

paddle/fluid/platform/profiler.h

Lines changed: 0 additions & 5 deletions

@@ -95,11 +95,6 @@ struct RecordBlock {
   uint64_t start_ns_;
 };
 
-struct RecordThread {
-  explicit RecordThread(int thread_id);
-  ~RecordThread();
-};
-
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();
