Skip to content

Commit 5e90f5e

Browse files
author
Yibing Liu
authored
Merge pull request #7043 from kuke/profiler_tool
Add the parsing part for the profiling tool
2 parents c62383e + 6416eee commit 5e90f5e

File tree

3 files changed

+238
-21
lines changed

3 files changed

+238
-21
lines changed

paddle/platform/profiler.cc

Lines changed: 170 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/platform/profiler.h"
16+
#include <iomanip>
17+
#include <map>
18+
#include "glog/logging.h"
1619

1720
namespace paddle {
1821
namespace platform {
1922

2023
// The profiler state, the initial value is ProfilerState::kDisabled
2124
static ProfilerState g_state = ProfilerState::kDisabled;
25+
// To record which timer the profiler used, CUDA or CPU.
26+
static std::string g_profiler_place = "";
2227
// The thread local event list only can be accessed by the specific thread
2328
// The thread index of each thread
2429
static thread_local int32_t g_thread_id;
@@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() {
4348

4449
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
4550
DeviceContext* dev_ctx)
46-
: kind_(kind),
47-
name_(std::move(name)),
48-
thread_id_(thread_id),
49-
has_cuda_(false) {
51+
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
5052
#ifdef PADDLE_WITH_CUDA
5153
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
5254
if (cuda_dev_ctx) {
@@ -72,19 +74,19 @@ std::string Event::kind() const {
7274
PADDLE_THROW("Unknown EventKind.");
7375
}
7476

75-
double Event::CpuElapsedUs(const Event& e) const {
76-
return (e.cpu_ns_ - cpu_ns_) / (1000.0);
77+
double Event::CpuElapsedMs(const Event& e) const {
78+
return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
7779
}
7880

79-
double Event::CudaElapsedUs(const Event& e) const {
81+
double Event::CudaElapsedMs(const Event& e) const {
8082
#ifdef PADDLE_WITH_CUDA
8183
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
8284
PADDLE_ENFORCE(e.device() == device());
8385
PADDLE_ENFORCE(cudaEventSynchronize(event_));
8486
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
8587
float ms;
8688
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
87-
return ms * 1000.0;
89+
return ms;
8890
#else
8991
PADDLE_THROW("CUDA is not enabled");
9092
#endif
@@ -113,21 +115,27 @@ inline EventList& GetEventList() {
113115
}
114116

115117
void Mark(const std::string& name, DeviceContext* dev_ctx) {
116-
GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
117-
dev_ctx);
118+
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
119+
}
120+
121+
void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
122+
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
123+
}
124+
125+
void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
126+
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
118127
}
119128

120129
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
121130
if (g_state == ProfilerState::kDisabled) return;
122131
dev_ctx_ = dev_ctx;
123-
GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
124-
dev_ctx_);
132+
name_ = name;
133+
PushEvent(name_, dev_ctx_);
125134
}
126135

127136
RecordEvent::~RecordEvent() {
128137
if (g_state == ProfilerState::kDisabled) return;
129-
GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
130-
dev_ctx_);
138+
PopEvent(name_, dev_ctx_);
131139
}
132140

133141
void EnableProfiler(ProfilerState state) {
@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) {
138146
"The profiling state should be disabled when calling ",
139147
"EnableProfiler.");
140148
g_state = state;
149+
g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
141150
#ifdef PADDLE_WITH_CUDA
142151
if (g_state == ProfilerState::kCUDA) {
143152
// Generate some dummy evenets first to reduce the startup overhead.
@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() {
169178
return result;
170179
}
171180

181+
void ParseEvents(std::vector<std::vector<Event>>& events,
182+
EventSortingKey sorted_by) {
183+
if (g_profiler_place == "") return;
184+
185+
std::string sorted_domain;
186+
std::function<bool(EventItem&, EventItem&)> sorted_func;
187+
switch (sorted_by) {
188+
case EventSortingKey::kCalls:
189+
sorted_domain = "number of calls";
190+
sorted_func = [](EventItem& a, EventItem& b) {
191+
return a.calls > b.calls;
192+
};
193+
break;
194+
case EventSortingKey::kTotal:
195+
sorted_domain = "total time";
196+
sorted_func = [](EventItem& a, EventItem& b) {
197+
return a.total_time > b.total_time;
198+
};
199+
break;
200+
case EventSortingKey::kMin:
201+
sorted_domain = "minimum time";
202+
sorted_func = [](EventItem& a, EventItem& b) {
203+
return a.min_time > b.min_time;
204+
};
205+
break;
206+
case EventSortingKey::kMax:
207+
sorted_domain = "maximum time";
208+
sorted_func = [](EventItem& a, EventItem& b) {
209+
return a.max_time > b.max_time;
210+
};
211+
break;
212+
case EventSortingKey::kAve:
213+
sorted_domain = "average time";
214+
sorted_func = [](EventItem& a, EventItem& b) {
215+
return a.ave_time > b.ave_time;
216+
};
217+
break;
218+
default:
219+
sorted_domain = "event end time";
220+
}
221+
222+
std::vector<std::vector<EventItem>> events_table;
223+
size_t max_name_width = 0;
224+
for (size_t i = 0; i < events.size(); i++) {
225+
std::list<Event> pushed_events;
226+
std::vector<EventItem> event_items;
227+
std::unordered_map<std::string, int> event_idx;
228+
229+
for (size_t j = 0; j < events[i].size(); j++) {
230+
if (events[i][j].kind() == "push") {
231+
pushed_events.push_back(events[i][j]);
232+
} else if (events[i][j].kind() == "pop") {
233+
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
234+
while (rit != pushed_events.rend() &&
235+
rit->name() != events[i][j].name()) {
236+
++rit;
237+
}
238+
239+
if (rit != pushed_events.rend()) {
240+
double event_time = (g_profiler_place == "CUDA")
241+
? rit->CudaElapsedMs(events[i][j])
242+
: rit->CpuElapsedMs(events[i][j]);
243+
std::string event_name =
244+
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
245+
max_name_width = std::max(max_name_width, event_name.size());
246+
247+
if (event_idx.find(event_name) == event_idx.end()) {
248+
event_idx[event_name] = event_items.size();
249+
EventItem event_item = {event_name, 1, event_time,
250+
event_time, event_time, event_time};
251+
event_items.push_back(event_item);
252+
} else {
253+
int index = event_idx[event_name];
254+
event_items[index].calls += 1;
255+
// total time
256+
event_items[index].total_time += event_time;
257+
// min time
258+
event_items[index].min_time =
259+
std::min(event_time, event_items[index].min_time);
260+
// max time
261+
event_items[index].max_time =
262+
std::max(event_time, event_items[index].max_time);
263+
}
264+
265+
// remove the push marker from the list
266+
pushed_events.erase((++rit).base());
267+
} else {
268+
LOG(WARNING) << "Cannot find the push marker of event \'"
269+
<< events[i][j].name()
270+
<< "\', which will be ignored in profiling report.";
271+
}
272+
}
273+
}
274+
// average time
275+
for (auto& item : event_items) {
276+
item.ave_time = item.total_time / item.calls;
277+
}
278+
// sort
279+
if (sorted_by != EventSortingKey::kDefault) {
280+
std::sort(event_items.begin(), event_items.end(), sorted_func);
281+
}
282+
283+
events_table.push_back(event_items);
284+
// log warning if there are events with `push` but without `pop`
285+
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
286+
while (rit != pushed_events.rend()) {
287+
LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
288+
<< "\', which will be ignored in profiling report.";
289+
++rit;
290+
}
291+
}
292+
293+
// Print report
294+
PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
295+
}
296+
297+
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
298+
std::string& sorted_domain, const size_t name_width,
299+
const size_t data_width) {
300+
// Output header information
301+
std::cout << "\n------------------------->"
302+
<< " Profiling Report "
303+
<< "<-------------------------\n\n";
304+
std::cout << "Place: " << g_profiler_place << std::endl;
305+
std::cout << "Time unit: ms" << std::endl;
306+
std::cout << "Sorted by " << sorted_domain
307+
<< " in descending order in the same thread\n\n";
308+
// Output events table
309+
std::cout.setf(std::ios::left);
310+
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
311+
<< "Calls" << std::setw(data_width) << "Total"
312+
<< std::setw(data_width) << "Min." << std::setw(data_width)
313+
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
314+
for (size_t i = 0; i < events_table.size(); ++i) {
315+
for (size_t j = 0; j < events_table[i].size(); ++j) {
316+
EventItem& event_item = events_table[i][j];
317+
std::cout << std::setw(name_width) << event_item.name
318+
<< std::setw(data_width) << event_item.calls
319+
<< std::setw(data_width) << event_item.total_time
320+
<< std::setw(data_width) << event_item.min_time
321+
<< std::setw(data_width) << event_item.max_time
322+
<< std::setw(data_width) << event_item.ave_time << std::endl;
323+
}
324+
}
325+
std::cout << std::endl;
326+
}
327+
172328
} // namespace platform
173329
} // namespace paddle

paddle/platform/profiler.h

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,16 @@ class Event {
3333

3434
std::string kind() const;
3535
std::string name() const { return name_; }
36+
uint32_t thread_id() const { return thread_id_; }
3637
bool has_cuda() const { return has_cuda_; }
3738

3839
#ifdef PADDLE_WITH_CUDA
3940
cudaEvent_t event() const { return event_; }
4041
int device() const { return device_; }
4142
#endif
4243

43-
double CpuElapsedUs(const Event& e) const;
44-
double CudaElapsedUs(const Event& e) const;
44+
double CpuElapsedMs(const Event& e) const;
45+
double CudaElapsedMs(const Event& e) const;
4546

4647
private:
4748
EventKind kind_;
@@ -94,13 +95,19 @@ enum ProfilerState {
9495

9596
void Mark(const std::string& name, DeviceContext* dev_ctx);
9697

98+
void PushEvent(const std::string& name, DeviceContext* dev_ctx);
99+
100+
void PopEvent(const std::string& name, DeviceContext* dev_ctx);
101+
97102
struct RecordEvent {
98103
explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
99104

100105
~RecordEvent();
101106

102107
// The device context is used by Event to get the current cuda stream.
103108
DeviceContext* dev_ctx_;
109+
// Event name
110+
std::string name_;
104111
};
105112

106113
// Enable the profiling function.
@@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state);
110117
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
111118
std::vector<std::vector<Event>> DisableProfiler();
112119

120+
// The information of each event given in the profiling report
121+
struct EventItem {
122+
std::string name;
123+
int calls;
124+
double total_time;
125+
double min_time;
126+
double max_time;
127+
double ave_time;
128+
};
129+
130+
// Candidate keys to sort the profiling report
131+
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
132+
133+
// Parse the event list and output the profiling report
134+
void ParseEvents(std::vector<std::vector<Event>>&,
135+
EventSortingKey sorted_by = EventSortingKey::kDefault);
136+
137+
// Print results
138+
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
139+
std::string& sorted_domain, const size_t name_width,
140+
const size_t data_width);
113141
} // namespace platform
114142
} // namespace paddle

paddle/platform/profiler_test.cc

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) {
2626
counter++;
2727
}
2828
Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
29-
EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
29+
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
3030
}
3131

3232
#ifdef PADDLE_WITH_CUDA
@@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) {
4545
counter++;
4646
}
4747
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
48-
EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0);
48+
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
4949
}
5050
#endif
5151

@@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) {
5555
using paddle::platform::EventKind;
5656
using paddle::platform::RecordEvent;
5757
using paddle::platform::ProfilerState;
58+
using paddle::platform::EventSortingKey;
5859

5960
ProfilerState state = ProfilerState::kCPU;
6061
DeviceContext* dev_ctx = nullptr;
@@ -67,13 +68,45 @@ TEST(RecordEvent, RecordEvent) {
6768
#endif
6869
EnableProfiler(state);
6970

71+
/* Usage 1:
72+
* PushEvent(evt_name, dev_ctx);
73+
* ...
74+
* code to be analyzed
75+
* ...
76+
* PopEvent(evt_name, dev_ctx);
77+
*/
78+
for (int loop = 0; loop < 3; ++loop) {
79+
for (int i = 1; i < 5; ++i) {
80+
std::string name = "op_" + std::to_string(i);
81+
PushEvent(name, dev_ctx);
82+
int counter = 1;
83+
while (counter != i * 1000) counter++;
84+
PopEvent(name, dev_ctx);
85+
}
86+
}
87+
88+
/* Usage 2:
89+
* {
90+
* RecordEvent record_event(name, dev_ctx);
91+
* ...
92+
* code to be analyzed
93+
* ...
94+
* }
95+
*/
7096
for (int i = 1; i < 5; ++i) {
71-
std::string name = "op_" + std::to_string(i);
97+
std::string name = "evs_op_" + std::to_string(i);
7298
RecordEvent record_event(name, dev_ctx);
7399
int counter = 1;
74100
while (counter != i * 1000) counter++;
75101
}
102+
103+
// Bad Usage:
104+
PushEvent("event_without_pop", dev_ctx);
105+
PopEvent("event_without_push", dev_ctx);
76106
std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
107+
// Will remove parsing-related code from test later
108+
ParseEvents(events, EventSortingKey::kTotal);
109+
77110
int cuda_startup_count = 0;
78111
int start_profiler_count = 0;
79112
int stop_profiler_count = 0;
@@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) {
85118
if (events[i][j].name() == "push") {
86119
EXPECT_EQ(events[i][j + 1].name(), "pop");
87120
#ifdef PADDLE_WITH_CUDA
88-
EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0);
121+
EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
89122
#else
90-
EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0);
123+
EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
91124
#endif
92125
}
93126
}

0 commit comments

Comments
 (0)