@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
13
13
limitations under the License. */
14
14
15
15
#include " paddle/platform/profiler.h"
16
+ #include < iomanip>
17
+ #include < map>
18
+ #include " glog/logging.h"
16
19
17
20
namespace paddle {
18
21
namespace platform {
19
22
20
23
// The profiler state, the initial value is ProfilerState::kDisabled
21
24
static ProfilerState g_state = ProfilerState::kDisabled ;
25
+ // To record which timer the profiler used, CUDA or CPU.
26
+ static std::string g_profiler_place = " " ;
22
27
// The thread-local event list can only be accessed by the thread that owns it.
23
28
// The thread index of each thread
24
29
static thread_local int32_t g_thread_id;
@@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() {
43
48
44
49
Event::Event (EventKind kind, std::string name, uint32_t thread_id,
45
50
DeviceContext* dev_ctx)
46
- : kind_(kind),
47
- name_ (std::move(name)),
48
- thread_id_(thread_id),
49
- has_cuda_(false ) {
51
+ : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false ) {
50
52
#ifdef PADDLE_WITH_CUDA
51
53
auto * cuda_dev_ctx = static_cast <const CUDADeviceContext*>(dev_ctx);
52
54
if (cuda_dev_ctx) {
@@ -72,19 +74,19 @@ std::string Event::kind() const {
72
74
PADDLE_THROW (" Unknown EventKind." );
73
75
}
74
76
75
- double Event::CpuElapsedUs (const Event& e) const {
76
- return (e.cpu_ns_ - cpu_ns_) / (1000 .0 );
77
+ double Event::CpuElapsedMs (const Event& e) const {
78
+ return (e.cpu_ns_ - cpu_ns_) / (1000000 .0 );
77
79
}
78
80
79
- double Event::CudaElapsedUs (const Event& e) const {
81
+ double Event::CudaElapsedMs (const Event& e) const {
80
82
#ifdef PADDLE_WITH_CUDA
81
83
PADDLE_ENFORCE (e.has_cuda () && has_cuda ());
82
84
PADDLE_ENFORCE (e.device () == device ());
83
85
PADDLE_ENFORCE (cudaEventSynchronize (event_));
84
86
PADDLE_ENFORCE (cudaEventSynchronize (e.event ()));
85
87
float ms;
86
88
PADDLE_ENFORCE (cudaEventElapsedTime (&ms, event_, e.event ()));
87
- return ms * 1000.0 ;
89
+ return ms;
88
90
#else
89
91
PADDLE_THROW (" CUDA is not enabled" );
90
92
#endif
@@ -113,21 +115,27 @@ inline EventList& GetEventList() {
113
115
}
114
116
115
117
void Mark (const std::string& name, DeviceContext* dev_ctx) {
116
- GetEventList ().Record (EventKind::kMark , std::move (name), g_thread_id,
117
- dev_ctx);
118
+ GetEventList ().Record (EventKind::kMark , name, g_thread_id, dev_ctx);
119
+ }
120
+
121
+ void PushEvent (const std::string& name, DeviceContext* dev_ctx) {
122
+ GetEventList ().Record (EventKind::kPushRange , name, g_thread_id, dev_ctx);
123
+ }
124
+
125
+ void PopEvent (const std::string& name, DeviceContext* dev_ctx) {
126
+ GetEventList ().Record (EventKind::kPopRange , name, g_thread_id, dev_ctx);
118
127
}
119
128
120
129
// Opens a profiling range: when the profiler is enabled, stores the event name
// and emits the matching push event. The destructor emits the pop event.
//
// name:    label of the range; copied into name_ only when profiling is on, so
//          a disabled profiler does not pay for the string copy.
// dev_ctx: device context to associate with both the push and the pop event.
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
  // Store the context unconditionally. The destructor re-checks g_state on its
  // own, so if the profiler is switched on while this object is alive it will
  // read dev_ctx_; assigning it here guarantees that read never sees a member
  // this constructor left unset.
  dev_ctx_ = dev_ctx;
  if (g_state == ProfilerState::kDisabled) return;
  name_ = name;
  PushEvent(name_, dev_ctx_);
}
126
135
127
136
// Closes the profiling range: while the profiler is enabled, emits the pop
// event for the name and device context stored on this object. Does nothing
// when the profiler is disabled at destruction time.
RecordEvent::~RecordEvent() {
  if (g_state != ProfilerState::kDisabled) {
    PopEvent(name_, dev_ctx_);
  }
}
132
140
133
141
void EnableProfiler (ProfilerState state) {
@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) {
138
146
" The profiling state should be disabled when calling " ,
139
147
" EnableProfiler." );
140
148
g_state = state;
149
+ g_profiler_place = (g_state == ProfilerState::kCUDA ) ? " CUDA" : " CPU" ;
141
150
#ifdef PADDLE_WITH_CUDA
142
151
if (g_state == ProfilerState::kCUDA ) {
143
152
// Generate some dummy evenets first to reduce the startup overhead.
@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() {
169
178
return result;
170
179
}
171
180
181
+ void ParseEvents (std::vector<std::vector<Event>>& events,
182
+ EventSortingKey sorted_by) {
183
+ if (g_profiler_place == " " ) return ;
184
+
185
+ std::string sorted_domain;
186
+ std::function<bool (EventItem&, EventItem&)> sorted_func;
187
+ switch (sorted_by) {
188
+ case EventSortingKey::kCalls :
189
+ sorted_domain = " number of calls" ;
190
+ sorted_func = [](EventItem& a, EventItem& b) {
191
+ return a.calls > b.calls ;
192
+ };
193
+ break ;
194
+ case EventSortingKey::kTotal :
195
+ sorted_domain = " total time" ;
196
+ sorted_func = [](EventItem& a, EventItem& b) {
197
+ return a.total_time > b.total_time ;
198
+ };
199
+ break ;
200
+ case EventSortingKey::kMin :
201
+ sorted_domain = " minimum time" ;
202
+ sorted_func = [](EventItem& a, EventItem& b) {
203
+ return a.min_time > b.min_time ;
204
+ };
205
+ break ;
206
+ case EventSortingKey::kMax :
207
+ sorted_domain = " maximum time" ;
208
+ sorted_func = [](EventItem& a, EventItem& b) {
209
+ return a.max_time > b.max_time ;
210
+ };
211
+ break ;
212
+ case EventSortingKey::kAve :
213
+ sorted_domain = " average time" ;
214
+ sorted_func = [](EventItem& a, EventItem& b) {
215
+ return a.ave_time > b.ave_time ;
216
+ };
217
+ break ;
218
+ default :
219
+ sorted_domain = " event end time" ;
220
+ }
221
+
222
+ std::vector<std::vector<EventItem>> events_table;
223
+ size_t max_name_width = 0 ;
224
+ for (size_t i = 0 ; i < events.size (); i++) {
225
+ std::list<Event> pushed_events;
226
+ std::vector<EventItem> event_items;
227
+ std::unordered_map<std::string, int > event_idx;
228
+
229
+ for (size_t j = 0 ; j < events[i].size (); j++) {
230
+ if (events[i][j].kind () == " push" ) {
231
+ pushed_events.push_back (events[i][j]);
232
+ } else if (events[i][j].kind () == " pop" ) {
233
+ std::list<Event>::reverse_iterator rit = pushed_events.rbegin ();
234
+ while (rit != pushed_events.rend () &&
235
+ rit->name () != events[i][j].name ()) {
236
+ ++rit;
237
+ }
238
+
239
+ if (rit != pushed_events.rend ()) {
240
+ double event_time = (g_profiler_place == " CUDA" )
241
+ ? rit->CudaElapsedMs (events[i][j])
242
+ : rit->CpuElapsedMs (events[i][j]);
243
+ std::string event_name =
244
+ " thread" + std::to_string (rit->thread_id ()) + " ::" + rit->name ();
245
+ max_name_width = std::max (max_name_width, event_name.size ());
246
+
247
+ if (event_idx.find (event_name) == event_idx.end ()) {
248
+ event_idx[event_name] = event_items.size ();
249
+ EventItem event_item = {event_name, 1 , event_time,
250
+ event_time, event_time, event_time};
251
+ event_items.push_back (event_item);
252
+ } else {
253
+ int index = event_idx[event_name];
254
+ event_items[index].calls += 1 ;
255
+ // total time
256
+ event_items[index].total_time += event_time;
257
+ // min time
258
+ event_items[index].min_time =
259
+ std::min (event_time, event_items[index].min_time );
260
+ // max time
261
+ event_items[index].max_time =
262
+ std::max (event_time, event_items[index].max_time );
263
+ }
264
+
265
+ // remove the push marker from the list
266
+ pushed_events.erase ((++rit).base ());
267
+ } else {
268
+ LOG (WARNING) << " Cannot find the push marker of event \' "
269
+ << events[i][j].name ()
270
+ << " \' , which will be ignored in profiling report." ;
271
+ }
272
+ }
273
+ }
274
+ // average time
275
+ for (auto & item : event_items) {
276
+ item.ave_time = item.total_time / item.calls ;
277
+ }
278
+ // sort
279
+ if (sorted_by != EventSortingKey::kDefault ) {
280
+ std::sort (event_items.begin (), event_items.end (), sorted_func);
281
+ }
282
+
283
+ events_table.push_back (event_items);
284
+ // log warning if there are events with `push` but without `pop`
285
+ std::list<Event>::reverse_iterator rit = pushed_events.rbegin ();
286
+ while (rit != pushed_events.rend ()) {
287
+ LOG (WARNING) << " Cannot find the pop marker of event \' " << rit->name ()
288
+ << " \' , which will be ignored in profiling report." ;
289
+ ++rit;
290
+ }
291
+ }
292
+
293
+ // Print report
294
+ PrintProfilingReport (events_table, sorted_domain, max_name_width + 4 , 12 );
295
+ }
296
+
297
+ void PrintProfilingReport (std::vector<std::vector<EventItem>>& events_table,
298
+ std::string& sorted_domain, const size_t name_width,
299
+ const size_t data_width) {
300
+ // Output header information
301
+ std::cout << " \n ------------------------->"
302
+ << " Profiling Report "
303
+ << " <-------------------------\n\n " ;
304
+ std::cout << " Place: " << g_profiler_place << std::endl;
305
+ std::cout << " Time unit: ms" << std::endl;
306
+ std::cout << " Sorted by " << sorted_domain
307
+ << " in descending order in the same thread\n\n " ;
308
+ // Output events table
309
+ std::cout.setf (std::ios::left);
310
+ std::cout << std::setw (name_width) << " Event" << std::setw (data_width)
311
+ << " Calls" << std::setw (data_width) << " Total"
312
+ << std::setw (data_width) << " Min." << std::setw (data_width)
313
+ << " Max." << std::setw (data_width) << " Ave." << std::endl;
314
+ for (size_t i = 0 ; i < events_table.size (); ++i) {
315
+ for (size_t j = 0 ; j < events_table[i].size (); ++j) {
316
+ EventItem& event_item = events_table[i][j];
317
+ std::cout << std::setw (name_width) << event_item.name
318
+ << std::setw (data_width) << event_item.calls
319
+ << std::setw (data_width) << event_item.total_time
320
+ << std::setw (data_width) << event_item.min_time
321
+ << std::setw (data_width) << event_item.max_time
322
+ << std::setw (data_width) << event_item.ave_time << std::endl;
323
+ }
324
+ }
325
+ std::cout << std::endl;
326
+ }
327
+
172
328
} // namespace platform
173
329
} // namespace paddle
0 commit comments