Skip to content

Commit f3cbfc0

Browse files
committed
Add MEMCPY information
1 parent 55b2d3d commit f3cbfc0

File tree

5 files changed

+94
-3
lines changed

5 files changed

+94
-3
lines changed

paddle/fluid/platform/device_tracer.cc

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,36 @@ uint64_t kAlignSize = 8;
5555
} \
5656
} while (0)
5757

58+
// Translates a CUPTI memcpy kind into the event name used for that copy.
// Any kind without a dedicated case yields the generic "MEMCPY" label.
std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
  const char *label = "MEMCPY";
  switch (kind) {
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
      label = "MEMCPY_HtoD";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
      label = "MEMCPY_DtoH";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
      label = "MEMCPY_HtoA";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
      label = "MEMCPY_AtoH";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
      label = "MEMCPY_AtoA";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
      label = "MEMCPY_AtoD";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
      label = "MEMCPY_DtoA";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
      label = "MEMCPY_DtoD";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
      label = "MEMCPY_HtoH";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
      label = "MEMCPY_PtoP";
      break;
    case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
      label = "MEMCPY_FORCE_INT";
      break;
    default:
      // Unlisted kind: keep the generic label.
      break;
  }
  return label;
}
87+
5888
void EnableActivity() {
5989
// Device activity record is created when CUDA initializes, so we
6090
// want to enable it before cuInit() or any CUDA runtime call.
@@ -111,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
111141
kernel->correlationId);
112142
break;
113143
}
144+
case CUPTI_ACTIVITY_KIND_MEMCPY: {
145+
auto *memcpy =
146+
reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
147+
tracer->AddMemRecords(
148+
MemcpyKind(
149+
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
150+
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
151+
memcpy->correlationId, memcpy->bytes);
152+
break;
153+
}
154+
case CUPTI_ACTIVITY_KIND_MEMCPY2: {
155+
auto *memcpy =
156+
reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
157+
tracer->AddMemRecords(
158+
MemcpyKind(
159+
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
160+
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
161+
memcpy->correlationId, memcpy->bytes);
162+
break;
163+
}
114164
default: { break; }
115165
}
116166
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -148,6 +198,13 @@ class DeviceTracerImpl : public DeviceTracer {
148198
std::hash<std::thread::id>{}(std::this_thread::get_id())});
149199
}
150200

201+
void AddMemRecords(const std::string &name, uint64_t start_ns,
202+
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
203+
uint32_t correlation_id, uint64_t bytes) {
204+
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
205+
stream_id, correlation_id, bytes});
206+
}
207+
151208
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
152209
uint32_t stream_id, uint32_t correlation_id) {
153210
std::lock_guard<std::mutex> l(trace_mu_);
@@ -183,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
183240
CUPTI_CALL(
184241
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
185242
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
186-
187243
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
188244
enabled_ = true;
189245
}
@@ -214,6 +270,15 @@ class DeviceTracerImpl : public DeviceTracer {
214270
event->set_stream_id(r.thread_id);
215271
event->set_device_id(-1);
216272
}
273+
for (const MemRecord &r : mem_records_) {
274+
auto *event = profile_pb.add_events();
275+
event->set_name(r.name);
276+
event->set_start_ns(r.start_ns);
277+
event->set_end_ns(r.end_ns);
278+
event->set_stream_id(r.stream_id);
279+
event->set_device_id(r.device_id);
280+
event->mutable_memcopy()->set_bytes(r.bytes);
281+
}
217282
std::string profile_str;
218283
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
219284
std::ofstream profile_f;
@@ -257,6 +322,7 @@ class DeviceTracerImpl : public DeviceTracer {
257322
uint64_t start_ns_;
258323
uint64_t end_ns_;
259324
std::vector<KernelRecord> kernel_records_;
325+
std::vector<MemRecord> mem_records_;
260326
std::vector<CPURecord> cpu_records_;
261327
std::unordered_map<uint32_t, std::string> correlations_;
262328
CUpti_SubscriberHandle subscriber_;
@@ -272,6 +338,10 @@ class DeviceTracerDummy : public DeviceTracer {
272338

273339
// No-op: the dummy tracer discards CPU records.
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
}
274340

341+
// No-op: the dummy tracer discards memory-copy records.
void AddMemRecords(const std::string &name,
                   uint64_t start_ns,
                   uint64_t end_ns,
                   uint32_t device_id,
                   uint32_t stream_id,
                   uint32_t correlation_id,
                   uint64_t bytes) {
}
344+
275345
// No-op: the dummy tracer discards kernel records.
void AddKernelRecords(uint64_t start,
                      uint64_t end,
                      uint32_t device_id,
                      uint32_t stream_id,
                      uint32_t correlation_id) {
}
277347

paddle/fluid/platform/device_tracer.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@ class DeviceTracer {
4242
uint64_t end_ns;
4343
uint64_t thread_id;
4444
};
45+
// One memory-copy activity captured by the tracer.
// Field order matters: records are built with positional brace
// initialization, so members must not be reordered.
struct MemRecord {
  // Event label for the copy, e.g. "MEMCPY_HtoD".
  std::string name;

  // Start and end timestamps of the copy.
  uint64_t start_ns;
  uint64_t end_ns;

  // Device and stream the copy was reported for.
  uint32_t device_id;
  uint32_t stream_id;

  // Correlation id carried by the activity record.
  uint32_t correlation_id;

  // Number of bytes copied.
  uint64_t bytes;
};
4554

4655
virtual ~DeviceTracer() {}
4756
// Needs to be called once before use.
@@ -54,6 +63,11 @@ class DeviceTracer {
5463
// human-readable annotations.
5564
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
5665

66+
virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
67+
uint64_t end_ns, uint32_t device_id,
68+
uint32_t stream_id, uint32_t correlation_id,
69+
uint64_t bytes) = 0;
70+
5771
virtual void AddCPURecords(const char* anno, uint64_t start_ns,
5872
uint64_t end_ns) = 0;
5973

paddle/fluid/platform/dynload/cupti.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
7474
__macro(cuptiFinalize); \
7575
__macro(cuptiSubscribe); \
7676
__macro(cuptiUnsubscribe); \
77-
__macro(cuptiEnableCallback);
77+
__macro(cuptiEnableCallback); \
78+
__macro(cuptiEnableDomain);
7879

7980
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
8081

paddle/fluid/platform/profiler.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,17 @@ limitations under the License. */
1515
syntax = "proto2";
1616
package paddle.platform.proto;
1717

18+
// Memory-copy details attached to an Event.
message MemCopy {
  // Number of bytes copied.
  optional uint64 bytes = 3;
}
19+
1820
message Event {
1921
optional string name = 1;
2022
optional uint64 start_ns = 2;
2123
optional uint64 end_ns = 3;
2224
// When non-negative, it represents gpu id. When -1, it represents CPU.
23-
optional int32 device_id = 5;
25+
optional int64 device_id = 5;
2426
optional uint32 stream_id = 6;
27+
28+
optional MemCopy memcopy = 7;
2529
}
2630

2731
message Profile {

tools/timeline.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ def _allocate_events(self):
135135
for event in self._profile_pb.events:
136136
pid = self._devices[event.device_id]
137137
args = {'name': event.name}
if event.memcopy.bytes > 0:
    # Add the byte count to the existing dict rather than rebinding
    # `args`, so the 'name' entry is kept for memcpy events too.
    args['mem_bytes'] = event.memcopy.bytes
138140
# TODO(panyx0718): Chrome tracing only handles ms. However, some
139141
# ops takes micro-seconds. Hence, we keep the ns here.
140142
self._chrome_trace.emit_region(event.start_ns,

0 commit comments

Comments
 (0)