Skip to content

Commit 55b2d3d

Browse files
committed
Add CPU time to the timeline.
1 parent 9fbe2e3 commit 55b2d3d

File tree

6 files changed

+64
-14
lines changed

6 files changed

+64
-14
lines changed

paddle/fluid/platform/device_tracer.cc

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License. */
1818
#include <map>
1919
#include <mutex>
2020
#include <numeric>
21+
#include <thread>
2122
#include "glog/logging.h"
2223
#include "paddle/fluid/framework/block_desc.h"
2324
#include "paddle/fluid/string/printf.h"
@@ -140,6 +141,13 @@ class DeviceTracerImpl : public DeviceTracer {
140141
correlations_[id] = anno;
141142
}
142143

144+
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
145+
std::lock_guard<std::mutex> l(trace_mu_);
146+
cpu_records_.push_back(
147+
CPURecord{anno, start_ns, end_ns,
148+
std::hash<std::thread::id>{}(std::this_thread::get_id())});
149+
}
150+
143151
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
144152
uint32_t stream_id, uint32_t correlation_id) {
145153
std::lock_guard<std::mutex> l(trace_mu_);
@@ -185,7 +193,6 @@ class DeviceTracerImpl : public DeviceTracer {
185193
proto::Profile profile_pb;
186194
profile_pb.set_start_ns(start_ns_);
187195
profile_pb.set_end_ns(end_ns_);
188-
std::map<std::string, std::vector<uint64_t>> event_times;
189196
for (const KernelRecord &r : kernel_records_) {
190197
if (correlations_.find(r.correlation_id) == correlations_.end()) {
191198
fprintf(stderr, "cannot relate a kernel activity\n");
@@ -197,7 +204,15 @@ class DeviceTracerImpl : public DeviceTracer {
197204
event->set_end_ns(r.end_ns);
198205
event->set_stream_id(r.stream_id);
199206
event->set_device_id(r.device_id);
200-
event_times[event->name()].push_back(r.end_ns - r.start_ns);
207+
}
208+
209+
for (const CPURecord &r : cpu_records_) {
210+
auto *event = profile_pb.add_events();
211+
event->set_name(r.name);
212+
event->set_start_ns(r.start_ns);
213+
event->set_end_ns(r.end_ns);
214+
event->set_stream_id(r.thread_id);
215+
event->set_device_id(-1);
201216
}
202217
std::string profile_str;
203218
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
@@ -242,6 +257,7 @@ class DeviceTracerImpl : public DeviceTracer {
242257
uint64_t start_ns_;
243258
uint64_t end_ns_;
244259
std::vector<KernelRecord> kernel_records_;
260+
std::vector<CPURecord> cpu_records_;
245261
std::unordered_map<uint32_t, std::string> correlations_;
246262
CUpti_SubscriberHandle subscriber_;
247263
};
@@ -254,6 +270,8 @@ class DeviceTracerDummy : public DeviceTracer {
254270

255271
void AddAnnotation(uint64_t id, const std::string &anno) {}
256272

273+
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
274+
257275
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
258276
uint32_t stream_id, uint32_t correlation_id) {}
259277

@@ -285,5 +303,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
285303

286304
void ClearCurAnnotation() { cur_annotation = nullptr; }
287305

306+
const char *CurAnnotation() { return cur_annotation; }
307+
288308
} // namespace platform
289309
} // namespace paddle

paddle/fluid/platform/device_tracer.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ class DeviceTracer {
3636
uint32_t stream_id;
3737
uint32_t correlation_id;
3838
};
39+
struct CPURecord {
40+
std::string name;
41+
uint64_t start_ns;
42+
uint64_t end_ns;
43+
uint64_t thread_id;
44+
};
3945

4046
virtual ~DeviceTracer() {}
4147
// Needs to be called once before use.
@@ -48,6 +54,9 @@ class DeviceTracer {
4854
// human-readable annotations.
4955
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
5056

57+
virtual void AddCPURecords(const char* anno, uint64_t start_ns,
58+
uint64_t end_ns) = 0;
59+
5160
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
5261
// added before for human readability.
5362
virtual void AddKernelRecords(uint64_t start, uint64_t end,
@@ -67,6 +76,7 @@ DeviceTracer* GetDeviceTracer();
6776
void SetCurAnnotation(const char* anno);
6877
// Clear the name after the operation is done.
6978
void ClearCurAnnotation();
70-
79+
// Current name of the operation being run in the thread.
80+
const char* CurAnnotation();
7181
} // namespace platform
7282
} // namespace paddle

paddle/fluid/platform/profiler.cc

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/platform/profiler.h"
16+
#include <sys/time.h>
17+
#include <time.h>
1618
#include <iomanip>
1719
#include <map>
1820
#ifdef PADDLE_WITH_CUDA
@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
5254
.count();
5355
}
5456

57+
inline uint64_t PosixInNsec() {
58+
struct timeval tv;
59+
gettimeofday(&tv, nullptr);
60+
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
61+
}
62+
5563
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
5664
const DeviceContext* dev_ctx)
5765
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
132140
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
133141
}
134142

135-
RecordEvent::RecordEvent(const std::string& name,
136-
const DeviceContext* dev_ctx) {
143+
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
144+
: start_ns_(PosixInNsec()) {
137145
if (g_state == ProfilerState::kDisabled) return;
138146
dev_ctx_ = dev_ctx;
139147
name_ = name;
@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,
144152

145153
RecordEvent::~RecordEvent() {
146154
if (g_state == ProfilerState::kDisabled) return;
155+
DeviceTracer* tracer = GetDeviceTracer();
156+
if (tracer) {
157+
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
158+
}
147159
ClearCurAnnotation();
148160
PopEvent(name_, dev_ctx_);
149161
}
@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key,
207219
Mark("_stop_profiler_", nullptr);
208220
g_state = ProfilerState::kDisabled;
209221

222+
std::vector<std::vector<Event>> all_events = GetAllEvents();
223+
ParseEvents(all_events, sorted_key);
224+
ResetProfiler();
210225
DeviceTracer* tracer = GetDeviceTracer();
211226
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
212227
tracer->Disable();
213228
tracer->GenProfile(profile_path);
214229
}
215-
216-
std::vector<std::vector<Event>> all_events = GetAllEvents();
217-
ParseEvents(all_events, sorted_key);
218-
ResetProfiler();
219230
}
220231

221232
void ParseEvents(std::vector<std::vector<Event>>& events,

paddle/fluid/platform/profiler.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ struct RecordEvent {
108108

109109
~RecordEvent();
110110

111+
uint64_t start_ns_;
111112
// The device context is used by Event to get the current cuda stream.
112113
const DeviceContext* dev_ctx_;
113114
// Event name

paddle/fluid/platform/profiler.proto

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ message Event {
1919
optional string name = 1;
2020
optional uint64 start_ns = 2;
2121
optional uint64 end_ns = 3;
22-
optional uint32 device_id = 5;
22+
// When positive, it represents gpu id. When -1, it represents CPU.
23+
optional int32 device_id = 5;
2324
optional uint32 stream_id = 6;
2425
}
2526

tools/timeline.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,15 +124,22 @@ def _allocate_pids(self):
124124
if event.device_id not in self._devices:
125125
pid = self._allocate_pid()
126126
self._devices[event.device_id] = pid
127-
self._chrome_trace.emit_pid("device:%s" % pid, pid)
127+
if event.device_id >= 0:
128+
self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
129+
(pid, event.stream_id), pid)
130+
elif event.device_id == -1:
131+
self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
132+
event.stream_id, pid)
128133

129134
def _allocate_events(self):
130135
for event in self._profile_pb.events:
131136
pid = self._devices[event.device_id]
132137
args = {'name': event.name}
133-
self._chrome_trace.emit_region(
134-
event.start_ns, (event.end_ns - event.start_ns) / 1000000.0,
135-
pid, 0, 'Op', event.name, args)
138+
# TODO(panyx0718): Chrome tracing only handles ms. However, some
139+
# ops takes micro-seconds. Hence, we keep the ns here.
140+
self._chrome_trace.emit_region(event.start_ns,
141+
(event.end_ns - event.start_ns) /
142+
1.0, pid, 0, 'Op', event.name, args)
136143

137144
def generate_chrome_trace(self):
138145
self._allocate_pids()

0 commit comments

Comments
 (0)