@@ -55,6 +55,36 @@ uint64_t kAlignSize = 8;
55
55
} \
56
56
} while (0 )
57
57
58
+ std::string MemcpyKind (CUpti_ActivityMemcpyKind kind) {
59
+ switch (kind) {
60
+ case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
61
+ return " MEMCPY_HtoD" ;
62
+ case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
63
+ return " MEMCPY_DtoH" ;
64
+ case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
65
+ return " MEMCPY_HtoA" ;
66
+ case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
67
+ return " MEMCPY_AtoH" ;
68
+ case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
69
+ return " MEMCPY_AtoA" ;
70
+ case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
71
+ return " MEMCPY_AtoD" ;
72
+ case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
73
+ return " MEMCPY_DtoA" ;
74
+ case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
75
+ return " MEMCPY_DtoD" ;
76
+ case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
77
+ return " MEMCPY_HtoH" ;
78
+ case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
79
+ return " MEMCPY_PtoP" ;
80
+ case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
81
+ return " MEMCPY_FORCE_INT" ;
82
+ default :
83
+ break ;
84
+ }
85
+ return " MEMCPY" ;
86
+ }
87
+
58
88
void EnableActivity () {
59
89
// Device activity record is created when CUDA initializes, so we
60
90
// want to enable it before cuInit() or any CUDA runtime call.
@@ -111,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
111
141
kernel->correlationId );
112
142
break ;
113
143
}
144
+ case CUPTI_ACTIVITY_KIND_MEMCPY: {
145
+ auto *memcpy =
146
+ reinterpret_cast <const CUpti_ActivityMemcpy *>(record);
147
+ tracer->AddMemRecords (
148
+ MemcpyKind (
149
+ static_cast <CUpti_ActivityMemcpyKind>(memcpy->copyKind )),
150
+ memcpy->start , memcpy->end , memcpy->deviceId , memcpy->streamId ,
151
+ memcpy->correlationId , memcpy->bytes );
152
+ break ;
153
+ }
154
+ case CUPTI_ACTIVITY_KIND_MEMCPY2: {
155
+ auto *memcpy =
156
+ reinterpret_cast <const CUpti_ActivityMemcpy2 *>(record);
157
+ tracer->AddMemRecords (
158
+ MemcpyKind (
159
+ static_cast <CUpti_ActivityMemcpyKind>(memcpy->copyKind )),
160
+ memcpy->start , memcpy->end , memcpy->deviceId , memcpy->streamId ,
161
+ memcpy->correlationId , memcpy->bytes );
162
+ break ;
163
+ }
114
164
default : { break ; }
115
165
}
116
166
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -148,6 +198,13 @@ class DeviceTracerImpl : public DeviceTracer {
148
198
std::hash<std::thread::id>{}(std::this_thread::get_id ())});
149
199
}
150
200
201
+ void AddMemRecords (const std::string &name, uint64_t start_ns,
202
+ uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
203
+ uint32_t correlation_id, uint64_t bytes) {
204
+ mem_records_.push_back (MemRecord{name, start_ns, end_ns, device_id,
205
+ stream_id, correlation_id, bytes});
206
+ }
207
+
151
208
void AddKernelRecords (uint64_t start, uint64_t end, uint32_t device_id,
152
209
uint32_t stream_id, uint32_t correlation_id) {
153
210
std::lock_guard<std::mutex> l (trace_mu_);
@@ -183,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
183
240
CUPTI_CALL (
184
241
dynload::cuptiEnableCallback (1 , subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
185
242
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
186
-
187
243
CUPTI_CALL (dynload::cuptiGetTimestamp (&start_ns_));
188
244
enabled_ = true ;
189
245
}
@@ -214,6 +270,15 @@ class DeviceTracerImpl : public DeviceTracer {
214
270
event->set_stream_id (r.thread_id );
215
271
event->set_device_id (-1 );
216
272
}
273
+ for (const MemRecord &r : mem_records_) {
274
+ auto *event = profile_pb.add_events ();
275
+ event->set_name (r.name );
276
+ event->set_start_ns (r.start_ns );
277
+ event->set_end_ns (r.end_ns );
278
+ event->set_stream_id (r.stream_id );
279
+ event->set_device_id (r.device_id );
280
+ event->mutable_memcopy ()->set_bytes (r.bytes );
281
+ }
217
282
std::string profile_str;
218
283
google::protobuf::TextFormat::PrintToString (profile_pb, &profile_str);
219
284
std::ofstream profile_f;
@@ -257,6 +322,7 @@ class DeviceTracerImpl : public DeviceTracer {
257
322
uint64_t start_ns_;
258
323
uint64_t end_ns_;
259
324
std::vector<KernelRecord> kernel_records_;
325
+ std::vector<MemRecord> mem_records_;
260
326
std::vector<CPURecord> cpu_records_;
261
327
std::unordered_map<uint32_t , std::string> correlations_;
262
328
CUpti_SubscriberHandle subscriber_;
@@ -272,6 +338,10 @@ class DeviceTracerDummy : public DeviceTracer {
272
338
273
339
void AddCPURecords (const char *anno, uint64_t start_ns, uint64_t end_ns) {}
274
340
341
// Dummy tracer: memcpy records are accepted and discarded.
void AddMemRecords(const std::string &name, uint64_t start_ns, uint64_t end_ns,
                   uint32_t device_id, uint32_t stream_id,
                   uint32_t correlation_id, uint64_t bytes) {
  // Intentionally empty.
}
344
+
275
345
void AddKernelRecords (uint64_t start, uint64_t end, uint32_t device_id,
276
346
uint32_t stream_id, uint32_t correlation_id) {}
277
347
0 commit comments