@@ -18,6 +18,7 @@ limitations under the License. */
18
18
#include < map>
19
19
#include < mutex>
20
20
#include < numeric>
21
+ #include < thread>
21
22
#include " glog/logging.h"
22
23
#include " paddle/fluid/framework/block_desc.h"
23
24
#include " paddle/fluid/string/printf.h"
@@ -54,6 +55,36 @@ uint64_t kAlignSize = 8;
54
55
} \
55
56
} while (0 )
56
57
58
+ std::string MemcpyKind (CUpti_ActivityMemcpyKind kind) {
59
+ switch (kind) {
60
+ case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
61
+ return " MEMCPY_HtoD" ;
62
+ case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
63
+ return " MEMCPY_DtoH" ;
64
+ case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
65
+ return " MEMCPY_HtoA" ;
66
+ case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
67
+ return " MEMCPY_AtoH" ;
68
+ case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
69
+ return " MEMCPY_AtoA" ;
70
+ case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
71
+ return " MEMCPY_AtoD" ;
72
+ case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
73
+ return " MEMCPY_DtoA" ;
74
+ case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
75
+ return " MEMCPY_DtoD" ;
76
+ case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
77
+ return " MEMCPY_HtoH" ;
78
+ case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
79
+ return " MEMCPY_PtoP" ;
80
+ case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
81
+ return " MEMCPY_FORCE_INT" ;
82
+ default :
83
+ break ;
84
+ }
85
+ return " MEMCPY" ;
86
+ }
87
+
57
88
void EnableActivity () {
58
89
// Device activity record is created when CUDA initializes, so we
59
90
// want to enable it before cuInit() or any CUDA runtime call.
@@ -110,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
110
141
kernel->correlationId );
111
142
break ;
112
143
}
144
+ case CUPTI_ACTIVITY_KIND_MEMCPY: {
145
+ auto *memcpy =
146
+ reinterpret_cast <const CUpti_ActivityMemcpy *>(record);
147
+ tracer->AddMemRecords (
148
+ MemcpyKind (
149
+ static_cast <CUpti_ActivityMemcpyKind>(memcpy->copyKind )),
150
+ memcpy->start , memcpy->end , memcpy->deviceId , memcpy->streamId ,
151
+ memcpy->correlationId , memcpy->bytes );
152
+ break ;
153
+ }
154
+ case CUPTI_ACTIVITY_KIND_MEMCPY2: {
155
+ auto *memcpy =
156
+ reinterpret_cast <const CUpti_ActivityMemcpy2 *>(record);
157
+ tracer->AddMemRecords (
158
+ MemcpyKind (
159
+ static_cast <CUpti_ActivityMemcpyKind>(memcpy->copyKind )),
160
+ memcpy->start , memcpy->end , memcpy->deviceId , memcpy->streamId ,
161
+ memcpy->correlationId , memcpy->bytes );
162
+ break ;
163
+ }
113
164
default : { break ; }
114
165
}
115
166
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -140,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
140
191
correlations_[id] = anno;
141
192
}
142
193
194
+ void AddCPURecords (const char *anno, uint64_t start_ns, uint64_t end_ns) {
195
+ std::lock_guard<std::mutex> l (trace_mu_);
196
+ cpu_records_.push_back (
197
+ CPURecord{anno, start_ns, end_ns,
198
+ std::hash<std::thread::id>{}(std::this_thread::get_id ())});
199
+ }
200
+
201
+ void AddMemRecords (const std::string &name, uint64_t start_ns,
202
+ uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
203
+ uint32_t correlation_id, uint64_t bytes) {
204
+ mem_records_.push_back (MemRecord{name, start_ns, end_ns, device_id,
205
+ stream_id, correlation_id, bytes});
206
+ }
207
+
143
208
void AddKernelRecords (uint64_t start, uint64_t end, uint32_t device_id,
144
209
uint32_t stream_id, uint32_t correlation_id) {
145
210
std::lock_guard<std::mutex> l (trace_mu_);
@@ -175,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
175
240
CUPTI_CALL (
176
241
dynload::cuptiEnableCallback (1 , subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
177
242
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
178
-
179
243
CUPTI_CALL (dynload::cuptiGetTimestamp (&start_ns_));
180
244
enabled_ = true ;
181
245
}
@@ -185,7 +249,6 @@ class DeviceTracerImpl : public DeviceTracer {
185
249
proto::Profile profile_pb;
186
250
profile_pb.set_start_ns (start_ns_);
187
251
profile_pb.set_end_ns (end_ns_);
188
- std::map<std::string, std::vector<uint64_t >> event_times;
189
252
for (const KernelRecord &r : kernel_records_) {
190
253
if (correlations_.find (r.correlation_id ) == correlations_.end ()) {
191
254
fprintf (stderr, " cannot relate a kernel activity\n " );
@@ -197,7 +260,24 @@ class DeviceTracerImpl : public DeviceTracer {
197
260
event->set_end_ns (r.end_ns );
198
261
event->set_stream_id (r.stream_id );
199
262
event->set_device_id (r.device_id );
200
- event_times[event->name ()].push_back (r.end_ns - r.start_ns );
263
+ }
264
+
265
+ for (const CPURecord &r : cpu_records_) {
266
+ auto *event = profile_pb.add_events ();
267
+ event->set_name (r.name );
268
+ event->set_start_ns (r.start_ns );
269
+ event->set_end_ns (r.end_ns );
270
+ event->set_stream_id (r.thread_id );
271
+ event->set_device_id (-1 );
272
+ }
273
+ for (const MemRecord &r : mem_records_) {
274
+ auto *event = profile_pb.add_events ();
275
+ event->set_name (r.name );
276
+ event->set_start_ns (r.start_ns );
277
+ event->set_end_ns (r.end_ns );
278
+ event->set_stream_id (r.stream_id );
279
+ event->set_device_id (r.device_id );
280
+ event->mutable_memcopy ()->set_bytes (r.bytes );
201
281
}
202
282
std::string profile_str;
203
283
google::protobuf::TextFormat::PrintToString (profile_pb, &profile_str);
@@ -242,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
242
322
uint64_t start_ns_;
243
323
uint64_t end_ns_;
244
324
std::vector<KernelRecord> kernel_records_;
325
+ std::vector<MemRecord> mem_records_;
326
+ std::vector<CPURecord> cpu_records_;
245
327
std::unordered_map<uint32_t , std::string> correlations_;
246
328
CUpti_SubscriberHandle subscriber_;
247
329
};
@@ -254,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {
254
336
255
337
// Dummy tracer: the annotation is accepted and discarded.
void AddAnnotation(uint64_t id, const std::string &anno) {
  // Intentionally empty.
}
256
338
339
// Dummy tracer: CPU records are accepted and discarded.
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
  // Intentionally empty.
}
340
+
341
// Dummy tracer: memcpy records are accepted and discarded.
void AddMemRecords(const std::string &name, uint64_t start_ns, uint64_t end_ns,
                   uint32_t device_id, uint32_t stream_id,
                   uint32_t correlation_id, uint64_t bytes) {
  // Intentionally empty.
}
344
+
257
345
// Dummy tracer: kernel records are accepted and discarded.
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                      uint32_t stream_id, uint32_t correlation_id) {
  // Intentionally empty.
}
259
347
@@ -285,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
285
373
286
374
// Resets the current annotation so that no annotation is set (nullptr).
void ClearCurAnnotation() {
  cur_annotation = nullptr;
}
287
375
376
+ const char *CurAnnotation () { return cur_annotation; }
377
+
288
378
} // namespace platform
289
379
} // namespace paddle
0 commit comments