
Commit 750299f

Merge pull request #7576 from qingqing01/profiling_py

Refine profiler and expose to Python.

2 parents: 42549fb + f18016b

10 files changed (+190, -41 lines)


paddle/framework/CMakeLists.txt
Lines changed: 2 additions & 1 deletion

@@ -74,7 +74,8 @@ cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
+           framework_proto backward glog lod_rank_table profiler)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

paddle/framework/executor.cc
Lines changed: 5 additions & 0 deletions

@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/place.h"
+#include "paddle/platform/profiler.h"
 
 DECLARE_bool(do_memory_benchmark);
 DEFINE_bool(check_nan_inf, false,
@@ -117,6 +118,10 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
     VLOG(4) << op->DebugStringEx(local_scope);
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(op->Type(), pool.Get(place_));
+
     op->Run(*local_scope, place_);
     VLOG(3) << op->DebugStringEx(local_scope);
     if (FLAGS_do_memory_benchmark) {

paddle/platform/profiler.cc
Lines changed: 36 additions & 19 deletions

@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() {
 }
 
 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
-             DeviceContext* dev_ctx)
+             const DeviceContext* dev_ctx)
     : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
 #ifdef PADDLE_WITH_CUDA
-  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
-  if (cuda_dev_ctx) {
+  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
+  if (has_cuda_) {
+    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
     PADDLE_ENFORCE(cudaGetDevice(&device_));
     PADDLE_ENFORCE(cudaEventCreate(&event_));
     auto stream = cuda_dev_ctx->stream();
     PADDLE_ENFORCE(cudaEventRecord(event_, stream));
-    has_cuda_ = true;
   }
 #endif
   cpu_ns_ = GetTimeInNsec();
@@ -114,19 +114,20 @@ inline EventList& GetEventList() {
   return *g_event_list;
 }
 
-void Mark(const std::string& name, DeviceContext* dev_ctx) {
+void Mark(const std::string& name, const DeviceContext* dev_ctx) {
   GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
 }
 
-void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
   GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
 }
 
-void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
   GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }
 
-RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
+RecordEvent::RecordEvent(const std::string& name,
+                         const DeviceContext* dev_ctx) {
   if (g_state == ProfilerState::kDisabled) return;
   dev_ctx_ = dev_ctx;
   name_ = name;
@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) {
       DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
       Mark("_cuda_startup_", dev_ctx);
       dev_ctx->Wait();
+      delete dev_ctx;
     });
   }
 }
@@ -163,21 +165,36 @@ void EnableProfiler(ProfilerState state) {
   Mark("_start_profiler_", nullptr);
 }
 
-std::vector<std::vector<Event>> DisableProfiler() {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
-  // Mark the profiling stop.
-  Mark("_stop_profiler_", nullptr);
-  g_state = ProfilerState::kDisabled;
-  std::vector<std::vector<Event>> result;
+void ResetProfiler() {
   std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    (*it)->Clear();
+  }
+}
+
+std::vector<std::vector<Event>> GetAllEvents() {
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  std::vector<std::vector<Event>> result;
   for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
        ++it) {
     result.emplace_back((*it)->Reduce());
   }
   return result;
 }
 
+void DisableProfiler(EventSortingKey sorted_key) {
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
+                 "Can't disable profiling, since it hasn't been started.");
+  // Mark the profiling stop.
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
+
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
+}
+
 void ParseEvents(std::vector<std::vector<Event>>& events,
                  EventSortingKey sorted_by) {
   if (g_profiler_place == "") return;
@@ -291,12 +308,12 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
   }
 
   // Print report
-  PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
+  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
 }
 
-void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
-                          std::string& sorted_domain, const size_t name_width,
-                          const size_t data_width) {
+void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
+                   std::string& sorted_domain, const size_t name_width,
+                   const size_t data_width) {
   // Output header information
   std::cout << "\n------------------------->"
             << " Profiling Report "

paddle/platform/profiler.h
Lines changed: 21 additions & 13 deletions

@@ -29,7 +29,7 @@ class Event {
   // The DeviceContext is used to get the cuda stream.
   // If CPU profiling mode, can pass nullptr.
   Event(EventKind kind, std::string name, uint32_t thread_id,
-        DeviceContext* dev_ctx);
+        const DeviceContext* dev_ctx);
 
   std::string kind() const;
   std::string name() const { return name_; }
@@ -84,6 +84,8 @@ struct EventList {
     return result;
   }
 
+  void Clear() { event_blocks.clear(); }
+
   std::forward_list<std::vector<Event>> event_blocks;
 };
 
@@ -93,29 +95,26 @@ enum ProfilerState {
   kCUDA,  // GPU profiling state
 };
 
-void Mark(const std::string& name, DeviceContext* dev_ctx);
+void Mark(const std::string& name, const DeviceContext* dev_ctx);
 
-void PushEvent(const std::string& name, DeviceContext* dev_ctx);
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
 
-void PopEvent(const std::string& name, DeviceContext* dev_ctx);
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
 
 struct RecordEvent {
-  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
+  explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
 
   ~RecordEvent();
 
   // The device context is used by Event to get the current cuda stream.
-  DeviceContext* dev_ctx_;
+  const DeviceContext* dev_ctx_;
   // Event name
   std::string name_;
 };
 
-// Enable the profiling function.
-void EnableProfiler(ProfilerState state);
-
 // Return the event list of all threads. Assuming the returned value is called
 // event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
-std::vector<std::vector<Event>> DisableProfiler();
+std::vector<std::vector<Event>> GetAllEvents();
 
 // The information of each event given in the profiling report
 struct EventItem {
@@ -130,13 +129,22 @@ struct EventItem {
 // Candidate keys to sort the profiling report
 enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
 
+// Enable the profiling function.
+void EnableProfiler(ProfilerState state);
+
+// Clear the g_all_event_lists, which holds the event lists of all threads.
+void ResetProfiler();
+
+void DisableProfiler(EventSortingKey sorted_key);
+
 // Parse the event list and output the profiling report
 void ParseEvents(std::vector<std::vector<Event>>&,
                  EventSortingKey sorted_by = EventSortingKey::kDefault);
 
 // Print results
-void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
-                          std::string& sorted_domain, const size_t name_width,
-                          const size_t data_width);
+void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
+                   std::string& sorted_domain, const size_t name_width,
+                   const size_t data_width);
+
 }  // namespace platform
 }  // namespace paddle

paddle/platform/profiler_test.cc
Lines changed: 4 additions & 6 deletions

@@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) {
   // Bad Usage:
   PushEvent("event_without_pop", dev_ctx);
   PopEvent("event_without_push", dev_ctx);
-  std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
-  // Will remove parsing-related code from test later
-  ParseEvents(events, EventSortingKey::kTotal);
+  std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
 
   int cuda_startup_count = 0;
   int start_profiler_count = 0;
-  int stop_profiler_count = 0;
   for (size_t i = 0; i < events.size(); ++i) {
     for (size_t j = 0; j < events[i].size(); ++j) {
       if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
       if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
-      if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
       if (events[i][j].name() == "push") {
         EXPECT_EQ(events[i][j + 1].name(), "pop");
 #ifdef PADDLE_WITH_CUDA
@@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) {
   }
   EXPECT_EQ(cuda_startup_count % 5, 0);
   EXPECT_EQ(start_profiler_count, 1);
-  EXPECT_EQ(stop_profiler_count, 1);
+
+  // Will remove parsing-related code from test later
+  DisableProfiler(EventSortingKey::kTotal);
 }

paddle/pybind/CMakeLists.txt
Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 if(WITH_PYTHON)
   cc_library(paddle_pybind SHARED
     SRCS pybind.cc exception.cc protobuf.cc const_value.cc
-    DEPS pybind python backward proto_desc paddle_memory executor prune init
+    DEPS pybind python backward proto_desc paddle_memory executor prune init profiler
     ${GLOB_OP_LIB})
   if(NOT APPLE AND NOT ANDROID)
     target_link_libraries(paddle_pybind rt)

paddle/pybind/protobuf.h
Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@ limitations under the License. */
 #include <Python.h>
 #include <fstream>
 #include <vector>
+#include "paddle/platform/variant.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"

paddle/pybind/pybind.cc
Lines changed: 19 additions & 0 deletions

@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/platform/profiler.h"
 #include "paddle/pybind/const_value.h"
 #include "paddle/pybind/exception.h"
 #include "paddle/pybind/pybind.h"
@@ -476,6 +477,24 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("nvprof_stop", platform::CudaProfilerStop);
 #endif
 
+  py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
+      .value("kDisabled", platform::ProfilerState::kDisabled)
+      .value("kCPU", platform::ProfilerState::kCPU)
+      .value("kCUDA", platform::ProfilerState::kCUDA)
+      .export_values();
+
+  py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
+      .value("kDefault", platform::EventSortingKey::kDefault)
+      .value("kCalls", platform::EventSortingKey::kCalls)
+      .value("kTotal", platform::EventSortingKey::kTotal)
+      .value("kMin", platform::EventSortingKey::kMin)
+      .value("kMax", platform::EventSortingKey::kMax)
+      .value("kAve", platform::EventSortingKey::kAve)
+      .export_values();
+
+  m.def("enable_profiler", platform::EnableProfiler);
+  m.def("disable_profiler", platform::DisableProfiler);
+  m.def("reset_profiler", platform::ResetProfiler);
   return m.ptr();
 }
 }  // namespace pybind
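
With these bindings in place, the profiler can be driven directly from Python through the core extension module. A minimal sketch, assuming the module is imported the way fluid imports it; the two run_* helpers are hypothetical placeholders for real workload code:

    import paddle.v2.fluid.core as core

    core.enable_profiler(core.ProfilerState.kCPU)
    run_warmup_iterations()  # hypothetical helper; its events are recorded...
    core.reset_profiler()    # ...then discarded here
    run_timed_iterations()   # hypothetical helper; only these events remain
    # Prints the report sorted by total time, then clears all recorded events.
    core.disable_profiler(core.EventSortingKey.kTotal)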

python/paddle/v2/fluid/profiler.py
Lines changed: 55 additions & 0 deletions

@@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     # Disables profiler collection.
     core.nvprof_stop()
     os.remove(config_file)
+
+
+def reset_profiler():
+    """The profiler clear interface.
+    reset_profiler will clear the previous time records.
+    """
+    core.reset_profiler()
+
+
+@contextmanager
+def profiler(state, sorted_key=None):
+    """The profiler interface.
+    Different from cuda_profiler, this profiler can be used to profile both
+    CPU and GPU programs. By default, it records the CPU and GPU operator
+    kernels; to profile other parts of a program, refer to the profiling
+    tutorial on how to add more records.
+
+    Args:
+        state (string): The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use a CPU timer or GPU timer. Although
+            users may have already specified the execution place
+            (CPUPlace/CUDAPlace) in the beginning, for flexibility the
+            profiler does not inherit this place.
+        sorted_key (string): If None, the profiling results will be printed
+            in the order of the events' first end times. Otherwise, the
+            results will be sorted by this flag, which should be one of
+            'calls', 'total', 'max', 'min' or 'ave'.
+            `calls` means sorting by the number of calls.
+            `total` means sorting by the total execution time.
+            `max` means sorting by the maximum execution time.
+            `min` means sorting by the minimum execution time.
+            `ave` means sorting by the average execution time.
+    """
+
+    if state not in ['CPU', 'GPU']:
+        raise ValueError("The state must be 'CPU' or 'GPU'.")
+    prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU
+    core.enable_profiler(prof_state)
+    yield
+
+    # sorted_key may be None, which keeps the first-end-time ordering.
+    if sorted_key is not None and sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The sorted_key must be None or one of 'calls', "
+                         "'total', 'max', 'min' and 'ave'.")
+    sorted_key = 'default' if sorted_key is None else sorted_key
+    key_map = {
+        'default': core.EventSortingKey.kDefault,
+        'calls': core.EventSortingKey.kCalls,
+        'total': core.EventSortingKey.kTotal,
+        'max': core.EventSortingKey.kMax,
+        'min': core.EventSortingKey.kMin,
+        'ave': core.EventSortingKey.kAve,
+    }
+    # TODO(qingqing): redirect C++ ostream to Python stream.
+    # with core.ostream_redirect(stdout=True, stderr=True):
+    core.disable_profiler(key_map[sorted_key])
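
A minimal usage sketch of the new context manager, assuming an already-built fluid program; the executor setup lines are illustrative and not part of this change:

    import paddle.v2.fluid as fluid
    import paddle.v2.fluid.profiler as profiler

    exe = fluid.Executor(fluid.CPUPlace())    # assumed setup
    exe.run(fluid.default_startup_program())  # assumed setup
    with profiler.profiler('CPU', sorted_key='total'):
        for _ in range(10):
            # Each operator run is wrapped in a RecordEvent by the executor.
            exe.run(fluid.default_main_program())
    # On exit, the report is printed sorted by total operator time.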
