Skip to content

Commit 92974d4

Browse files
authored
Merge pull request #8679 from panyx0718/test2
Add CPU time and MemCopy to the timeline.
2 parents 0240bb7 + f3cbfc0 commit 92974d4

File tree

7 files changed

+157
-16
lines changed

7 files changed

+157
-16
lines changed

paddle/fluid/platform/device_tracer.cc

Lines changed: 93 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License. */
1818
#include <map>
1919
#include <mutex>
2020
#include <numeric>
21+
#include <thread>
2122
#include "glog/logging.h"
2223
#include "paddle/fluid/framework/block_desc.h"
2324
#include "paddle/fluid/string/printf.h"
@@ -54,6 +55,36 @@ uint64_t kAlignSize = 8;
5455
} \
5556
} while (0)
5657

58+
std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
59+
switch (kind) {
60+
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
61+
return "MEMCPY_HtoD";
62+
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
63+
return "MEMCPY_DtoH";
64+
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
65+
return "MEMCPY_HtoA";
66+
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
67+
return "MEMCPY_AtoH";
68+
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
69+
return "MEMCPY_AtoA";
70+
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
71+
return "MEMCPY_AtoD";
72+
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
73+
return "MEMCPY_DtoA";
74+
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
75+
return "MEMCPY_DtoD";
76+
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
77+
return "MEMCPY_HtoH";
78+
case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
79+
return "MEMCPY_PtoP";
80+
case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
81+
return "MEMCPY_FORCE_INT";
82+
default:
83+
break;
84+
}
85+
return "MEMCPY";
86+
}
87+
5788
void EnableActivity() {
5889
// Device activity record is created when CUDA initializes, so we
5990
// want to enable it before cuInit() or any CUDA runtime call.
@@ -110,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
110141
kernel->correlationId);
111142
break;
112143
}
144+
case CUPTI_ACTIVITY_KIND_MEMCPY: {
145+
auto *memcpy =
146+
reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
147+
tracer->AddMemRecords(
148+
MemcpyKind(
149+
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
150+
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
151+
memcpy->correlationId, memcpy->bytes);
152+
break;
153+
}
154+
case CUPTI_ACTIVITY_KIND_MEMCPY2: {
155+
auto *memcpy =
156+
reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
157+
tracer->AddMemRecords(
158+
MemcpyKind(
159+
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
160+
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
161+
memcpy->correlationId, memcpy->bytes);
162+
break;
163+
}
113164
default: { break; }
114165
}
115166
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -140,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
140191
correlations_[id] = anno;
141192
}
142193

194+
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
195+
std::lock_guard<std::mutex> l(trace_mu_);
196+
cpu_records_.push_back(
197+
CPURecord{anno, start_ns, end_ns,
198+
std::hash<std::thread::id>{}(std::this_thread::get_id())});
199+
}
200+
201+
void AddMemRecords(const std::string &name, uint64_t start_ns,
202+
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
203+
uint32_t correlation_id, uint64_t bytes) {
204+
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
205+
stream_id, correlation_id, bytes});
206+
}
207+
143208
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
144209
uint32_t stream_id, uint32_t correlation_id) {
145210
std::lock_guard<std::mutex> l(trace_mu_);
@@ -175,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
175240
CUPTI_CALL(
176241
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
177242
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
178-
179243
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
180244
enabled_ = true;
181245
}
@@ -185,7 +249,6 @@ class DeviceTracerImpl : public DeviceTracer {
185249
proto::Profile profile_pb;
186250
profile_pb.set_start_ns(start_ns_);
187251
profile_pb.set_end_ns(end_ns_);
188-
std::map<std::string, std::vector<uint64_t>> event_times;
189252
for (const KernelRecord &r : kernel_records_) {
190253
if (correlations_.find(r.correlation_id) == correlations_.end()) {
191254
fprintf(stderr, "cannot relate a kernel activity\n");
@@ -197,7 +260,24 @@ class DeviceTracerImpl : public DeviceTracer {
197260
event->set_end_ns(r.end_ns);
198261
event->set_stream_id(r.stream_id);
199262
event->set_device_id(r.device_id);
200-
event_times[event->name()].push_back(r.end_ns - r.start_ns);
263+
}
264+
265+
for (const CPURecord &r : cpu_records_) {
266+
auto *event = profile_pb.add_events();
267+
event->set_name(r.name);
268+
event->set_start_ns(r.start_ns);
269+
event->set_end_ns(r.end_ns);
270+
event->set_stream_id(r.thread_id);
271+
event->set_device_id(-1);
272+
}
273+
for (const MemRecord &r : mem_records_) {
274+
auto *event = profile_pb.add_events();
275+
event->set_name(r.name);
276+
event->set_start_ns(r.start_ns);
277+
event->set_end_ns(r.end_ns);
278+
event->set_stream_id(r.stream_id);
279+
event->set_device_id(r.device_id);
280+
event->mutable_memcopy()->set_bytes(r.bytes);
201281
}
202282
std::string profile_str;
203283
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
@@ -242,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
242322
uint64_t start_ns_;
243323
uint64_t end_ns_;
244324
std::vector<KernelRecord> kernel_records_;
325+
std::vector<MemRecord> mem_records_;
326+
std::vector<CPURecord> cpu_records_;
245327
std::unordered_map<uint32_t, std::string> correlations_;
246328
CUpti_SubscriberHandle subscriber_;
247329
};
@@ -254,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {
254336

255337
void AddAnnotation(uint64_t id, const std::string &anno) {}
256338

339+
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
340+
341+
void AddMemRecords(const std::string &name, uint64_t start_ns,
342+
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
343+
uint32_t correlation_id, uint64_t bytes) {}
344+
257345
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
258346
uint32_t stream_id, uint32_t correlation_id) {}
259347

@@ -285,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
285373

286374
void ClearCurAnnotation() { cur_annotation = nullptr; }
287375

376+
const char *CurAnnotation() { return cur_annotation; }
377+
288378
} // namespace platform
289379
} // namespace paddle

paddle/fluid/platform/device_tracer.h

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,21 @@ class DeviceTracer {
3636
uint32_t stream_id;
3737
uint32_t correlation_id;
3838
};
39+
struct CPURecord {
40+
std::string name;
41+
uint64_t start_ns;
42+
uint64_t end_ns;
43+
uint64_t thread_id;
44+
};
45+
struct MemRecord {
46+
std::string name;
47+
uint64_t start_ns;
48+
uint64_t end_ns;
49+
uint32_t device_id;
50+
uint32_t stream_id;
51+
uint32_t correlation_id;
52+
uint64_t bytes;
53+
};
3954

4055
virtual ~DeviceTracer() {}
4156
// Needs to be called once before use.
@@ -48,6 +63,14 @@ class DeviceTracer {
4863
// human-readable annotations.
4964
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
5065

66+
virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
67+
uint64_t end_ns, uint32_t device_id,
68+
uint32_t stream_id, uint32_t correlation_id,
69+
uint64_t bytes) = 0;
70+
71+
virtual void AddCPURecords(const char* anno, uint64_t start_ns,
72+
uint64_t end_ns) = 0;
73+
5174
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
5275
// added before for human readability.
5376
virtual void AddKernelRecords(uint64_t start, uint64_t end,
@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
6790
void SetCurAnnotation(const char* anno);
6891
// Clear the name after the operation is done.
6992
void ClearCurAnnotation();
70-
93+
// Current name of the operation being run in the thread.
94+
const char* CurAnnotation();
7195
} // namespace platform
7296
} // namespace paddle

paddle/fluid/platform/dynload/cupti.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
7474
__macro(cuptiFinalize); \
7575
__macro(cuptiSubscribe); \
7676
__macro(cuptiUnsubscribe); \
77-
__macro(cuptiEnableCallback);
77+
__macro(cuptiEnableCallback); \
78+
__macro(cuptiEnableDomain);
7879

7980
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
8081

paddle/fluid/platform/profiler.cc

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/platform/profiler.h"
16+
#include <sys/time.h>
17+
#include <time.h>
1618
#include <iomanip>
1719
#include <map>
1820
#ifdef PADDLE_WITH_CUDA
@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
5254
.count();
5355
}
5456

57+
inline uint64_t PosixInNsec() {
58+
struct timeval tv;
59+
gettimeofday(&tv, nullptr);
60+
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
61+
}
62+
5563
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
5664
const DeviceContext* dev_ctx)
5765
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
132140
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
133141
}
134142

135-
RecordEvent::RecordEvent(const std::string& name,
136-
const DeviceContext* dev_ctx) {
143+
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
144+
: start_ns_(PosixInNsec()) {
137145
if (g_state == ProfilerState::kDisabled) return;
138146
dev_ctx_ = dev_ctx;
139147
name_ = name;
@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,
144152

145153
RecordEvent::~RecordEvent() {
146154
if (g_state == ProfilerState::kDisabled) return;
155+
DeviceTracer* tracer = GetDeviceTracer();
156+
if (tracer) {
157+
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
158+
}
147159
ClearCurAnnotation();
148160
PopEvent(name_, dev_ctx_);
149161
}
@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key,
207219
Mark("_stop_profiler_", nullptr);
208220
g_state = ProfilerState::kDisabled;
209221

222+
std::vector<std::vector<Event>> all_events = GetAllEvents();
223+
ParseEvents(all_events, sorted_key);
224+
ResetProfiler();
210225
DeviceTracer* tracer = GetDeviceTracer();
211226
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
212227
tracer->Disable();
213228
tracer->GenProfile(profile_path);
214229
}
215-
216-
std::vector<std::vector<Event>> all_events = GetAllEvents();
217-
ParseEvents(all_events, sorted_key);
218-
ResetProfiler();
219230
}
220231

221232
void ParseEvents(std::vector<std::vector<Event>>& events,

paddle/fluid/platform/profiler.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ struct RecordEvent {
108108

109109
~RecordEvent();
110110

111+
uint64_t start_ns_;
111112
// The device context is used by Event to get the current cuda stream.
112113
const DeviceContext* dev_ctx_;
113114
// Event name

paddle/fluid/platform/profiler.proto

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,17 @@ limitations under the License. */
1515
syntax = "proto2";
1616
package paddle.platform.proto;
1717

18+
message MemCopy { optional uint64 bytes = 3; }
19+
1820
message Event {
1921
optional string name = 1;
2022
optional uint64 start_ns = 2;
2123
optional uint64 end_ns = 3;
22-
optional uint32 device_id = 5;
24+
// When positive, it represents gpu id. When -1, it represents CPU.
25+
optional int64 device_id = 5;
2326
optional uint32 stream_id = 6;
27+
28+
optional MemCopy memcopy = 7;
2429
}
2530

2631
message Profile {

tools/timeline.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,15 +124,24 @@ def _allocate_pids(self):
124124
if event.device_id not in self._devices:
125125
pid = self._allocate_pid()
126126
self._devices[event.device_id] = pid
127-
self._chrome_trace.emit_pid("device:%s" % pid, pid)
127+
if event.device_id >= 0:
128+
self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
129+
(pid, event.stream_id), pid)
130+
elif event.device_id == -1:
131+
self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
132+
event.stream_id, pid)
128133

129134
def _allocate_events(self):
130135
for event in self._profile_pb.events:
131136
pid = self._devices[event.device_id]
132137
args = {'name': event.name}
133-
self._chrome_trace.emit_region(
134-
event.start_ns, (event.end_ns - event.start_ns) / 1000000.0,
135-
pid, 0, 'Op', event.name, args)
138+
if event.memcopy.bytes > 0:
139+
args = {'mem_bytes': event.memcopy.bytes}
140+
# TODO(panyx0718): Chrome tracing only handles ms. However, some
141+
# ops takes micro-seconds. Hence, we keep the ns here.
142+
self._chrome_trace.emit_region(event.start_ns,
143+
(event.end_ns - event.start_ns) /
144+
1.0, pid, 0, 'Op', event.name, args)
136145

137146
def generate_chrome_trace(self):
138147
self._allocate_pids()

0 commit comments

Comments
 (0)