
Commit 7235fd6

Authored by chengduo

Add Event for TensorCopy (#15953)
1 parent 46c5e37 · commit 7235fd6

8 files changed: +111 −23 lines changed


paddle/fluid/framework/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -38,10 +38,10 @@ if(WITH_GPU)
     nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
     add_dependencies(tensor tensor_util)
   else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
   endif(WIN32)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)

paddle/fluid/framework/tensor_util.cc

Lines changed: 7 additions & 0 deletions
@@ -14,8 +14,11 @@
 #include "paddle/fluid/framework/tensor_util.h"
 #include <algorithm>
 #include <limits>
+#include <memory>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -135,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:GPU->CPU");
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:CPU->GPU");
     auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:GPU->GPU");
     if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
       VLOG(3) << "Skip copy the same data from " << src_place << " to "
               << dst_place;
@@ -155,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cuda_pinned_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
    auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
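
Aside on the mechanism: platform::RecordEvent is an RAII scope guard, so each branch above is timed from the declaration of record_event to the end of the enclosing block, which covers exactly the memory::Copy call. A minimal self-contained sketch of the same guard pattern — ScopedRecordEvent and its printing body are illustrative, not Paddle's implementation, which reports to the profiler instead:

#include <chrono>
#include <iostream>
#include <string>
#include <utility>

// Illustrative RAII timer in the spirit of platform::RecordEvent: the
// constructor marks the start of a named event, the destructor closes it.
class ScopedRecordEvent {
 public:
  explicit ScopedRecordEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedRecordEvent() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << ": " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedRecordEvent record_event("TensorCopy:CPU->CPU");
  // ... the copy would happen here; the event closes at end of scope.
}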

paddle/fluid/memory/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 add_subdirectory(detail)
 add_subdirectory(allocation)
-cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
+cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler)
 cc_library(memcpy SRCS memcpy.cc DEPS place)
 
 cc_library(memory

paddle/fluid/memory/memcpy.cc

Lines changed: 20 additions & 0 deletions
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 
 #include <cstring>  // for memcpy
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace memory {
@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 #ifdef PADDLE_WITH_CUDA
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
 
+// NOTE(zcd): Do not use GpuMemcpySync as much as possible.
+// because GpuMemcpySync issues the copying command to the default stream,
+// which will make two commands from different streams cannot run concurrently.
+// Reference:
+// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
+
 template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
     const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(src_place.device);
+
   if (stream) {
+    platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
+    platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(dst_place.device);
   if (stream) {
+    platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
+    platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
   if (dst_place == src_place) {
     platform::SetDeviceId(src_place.device);
     if (stream) {
+      platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
       platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
     } else {
+      platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
       platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
     }
   } else {
     if (stream) {
+      platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
       platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
                                    num, stream);
     } else {
+      platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
       platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
                                   num);
     }
@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
     cudaStream_t stream) {
   platform::SetDeviceId(src_place.device);
   if (stream) {
+    platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
+    platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
   }
 }
@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
     cudaStream_t stream) {
   platform::SetDeviceId(dst_place.device);
   if (stream) {
+    platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
+    platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
   }
 }
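
The NOTE above is the heart of this change: a copy issued on the legacy default stream synchronizes with every other stream on the device, so labeling which copies take the sync path makes them easy to spot in the timeline. A minimal sketch of the two paths using the plain CUDA runtime API rather than Paddle's wrappers:

#include <cuda_runtime.h>

// Sync path: cudaMemcpy runs on the legacy default stream and serializes
// against work on other streams. Async path: cudaMemcpyAsync on a
// non-blocking user stream can overlap with other streams (the host
// buffer must be pinned for the copy to be truly asynchronous).
void CopyTwoWays(void* dst, const void* src, size_t bytes) {
  cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);  // blocks other streams

  cudaStream_t stream;
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // block the host only when needed
  cudaStreamDestroy(stream);
}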

paddle/fluid/operators/reader/buffered_reader.cc

Lines changed: 14 additions & 9 deletions
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reader/buffered_reader.h"
+#include <memory>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 
+#include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
 namespace reader {
@@ -49,9 +51,10 @@ BufferedReader::BufferedReader(
                        .Get(place_)))
                   ->stream();
     events.resize(buffer_size);
-    for (auto &event : events)
+    PADDLE_ENFORCE(cudaStreamCreate(&stream));
+    for (auto &event : events) {
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    }
   }
 #endif
   cpu_buffer_.resize(buffer_size);
@@ -83,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) {
 
 #ifdef PADDLE_WITH_CUDA
   // NOTE(liangdun): using async copy instead of TensorCopySync
-  // TensorCopySync would block other stream
+  // TensorCopySync would block other stream, because TensorCopySync
+  // issues the copying command to the default stream, it will make two
+  // commands from different streams cannot run concurrently.
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
     PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
     TensorVec &gpu = gpu_buffer_[i];
     gpu.resize(cpu.size());
+    platform::RecordEvent record_event("BufferedReader:MemoryCopy");
     for (size_t i = 0; i < cpu.size(); ++i) {
       gpu[i].Resize(cpu[i].dims());
       gpu[i].set_layout(cpu[i].layout());
@@ -97,20 +103,19 @@ void BufferedReader::ReadAsync(size_t i) {
       auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
       auto size =
           cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
-      if (platform::is_cuda_pinned_place(cpu_place))
+      if (platform::is_cuda_pinned_place(cpu_place)) {
         memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                      boost::get<platform::CUDAPinnedPlace>(cpu_place),
                      cpu_ptr, size, stream);
-      else if ((platform::is_gpu_place(cpu_place)))
+      } else if ((platform::is_gpu_place(cpu_place))) {
         memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                      boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
                      size, stream);
-      else
-        // if cpu place is not pinned, async copy is slower than sync copy,
-        // so we use sync copy instead.
+      } else {
         memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                      boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
-                     0);
+                     stream);
+      }
       gpu[i].set_lod(cpu[i].lod());
     }
     PADDLE_ENFORCE(cudaStreamSynchronize(stream));
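
The constructor change above (a plain cudaStreamCreate plus per-slot events) supports the ordering ReadAsync relies on: the copy stream waits on an event recorded after the work that produced the buffer, then all copies are issued asynchronously and the host blocks once at the end. A self-contained sketch of that event-fencing pattern, with illustrative names:

#include <cuda_runtime.h>

// Fence the copy stream behind work already enqueued on the compute
// stream, then issue the copy asynchronously and block the host once.
void OrderedAsyncCopy(cudaStream_t compute, cudaStream_t copy, void* dst,
                      const void* src, size_t bytes) {
  cudaEvent_t ready;
  cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
  cudaEventRecord(ready, compute);      // marks completion of prior work
  cudaStreamWaitEvent(copy, ready, 0);  // copy stream waits for the marker
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, copy);
  cudaStreamSynchronize(copy);          // single host-side wait per batch
  cudaEventDestroy(ready);
}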

paddle/fluid/platform/device_tracer.cc

Lines changed: 54 additions & 9 deletions
@@ -30,7 +30,6 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
@@ -222,19 +221,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
     }
     case CUPTI_ACTIVITY_KIND_DRIVER: {
       auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
-      if (api->start != 0 && api->end != 0)
-        // -1 device id represents CUDA api call
-        tracer->AddCPURecords(
+      if (api->start != 0 && api->end != 0) {
+        // -1 device id represents ActiveKind api call
+        tracer->AddActiveKindRecords(
             DriverKind(api->cbid), api->start, api->end, -1,
-            GetThreadIdFromSystemThreadId(api->threadId));
+            GetThreadIdFromSystemThreadId(api->threadId),
+            api->correlationId);
+      }
       break;
     }
     case CUPTI_ACTIVITY_KIND_RUNTIME: {
       auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
-      if (api->start != 0 && api->end != 0)
-        tracer->AddCPURecords(
+      if (api->start != 0 && api->end != 0) {
+        // -1 device id represents ActiveKind api call
+        tracer->AddActiveKindRecords(
             RuntimeKind(api->cbid), api->start, api->end, -1,
-            GetThreadIdFromSystemThreadId(api->threadId));
+            GetThreadIdFromSystemThreadId(api->threadId),
+            api->correlationId);
+      }
       break;
     }
     default: { break; }
@@ -313,6 +317,25 @@ class DeviceTracerImpl : public DeviceTracer {
                                       stream_id, correlation_id, bytes});
   }
 
+  void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
+                            uint64_t end_ns, int64_t device_id,
+                            int64_t thread_id, uint32_t correlation_id) {
+    if (anno.empty()) {
+      VLOG(1) << "Empty timeline annotation.";
+      return;
+    }
+    thread_local std::forward_list<ActiveKindRecord>
+        *local_active_kind_records = nullptr;
+    if (local_active_kind_records == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      active_kind_records_.emplace_front();
+      local_active_kind_records = &active_kind_records_.front();
+    }
+    // lock is not needed, only one thread call this function.
+    local_active_kind_records->push_front(ActiveKindRecord{
+        anno, start_ns, end_ns, device_id, thread_id, correlation_id});
+  }
+
   void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
                         int64_t device_id, int64_t stream_id,
                         uint32_t correlation_id) {
@@ -355,6 +378,7 @@ class DeviceTracerImpl : public DeviceTracer {
     }
     const std::vector<int> cbids {
       CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
+          CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
          CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
           CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
           CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
@@ -385,6 +409,7 @@ class DeviceTracerImpl : public DeviceTracer {
     correlations_.clear();
     for (auto &tmp : correlations_pairs) tmp.clear();
     for (auto &tmp : cpu_records_) tmp.clear();
+    for (auto &tmp : active_kind_records_) tmp.clear();
   }
 
   void GenEventKernelCudaElapsedTime() {
@@ -437,7 +462,7 @@ class DeviceTracerImpl : public DeviceTracer {
       event->set_device_id(r.device_id);
     }
     VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
-    for (auto &tmp : cpu_records_)
+    for (auto &tmp : cpu_records_) {
       for (const CPURecord &r : tmp) {
         auto *event = profile_pb.add_events();
         event->set_type(proto::Event::CPU);
@@ -447,6 +472,24 @@ class DeviceTracerImpl : public DeviceTracer {
         event->set_sub_device_id(r.thread_id);
         event->set_device_id(r.device_id);
       }
+    }
+    for (auto &tmp : active_kind_records_) {
+      for (const ActiveKindRecord &r : tmp) {
+        auto *event = profile_pb.add_events();
+        event->set_type(proto::Event::CPU);
+        auto c = correlations_.find(r.correlation_id);
+        if (c != correlations_.end() && c->second != nullptr) {
+          event->set_name(c->second->name());
+          event->set_detail_info(r.name);
+        } else {
+          event->set_name(r.name);
+        }
+        event->set_start_ns(r.start_ns);
+        event->set_end_ns(r.end_ns);
+        event->set_sub_device_id(r.thread_id);
+        event->set_device_id(r.device_id);
+      }
+    }
     miss = find = 0;
     for (const MemRecord &r : mem_records_) {
       auto *event = profile_pb.add_events();
@@ -510,6 +553,7 @@ class DeviceTracerImpl : public DeviceTracer {
   std::forward_list<KernelRecord> kernel_records_;
   std::forward_list<MemRecord> mem_records_;
   std::forward_list<std::forward_list<CPURecord>> cpu_records_;
+  std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
   std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
       correlations_pairs;
   std::unordered_map<uint32_t, Event *> correlations_;
@@ -613,6 +657,7 @@ void initCuptiCbidStr() {
   REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
   REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
   REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
+  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
 #if CUDA_VERSION >= 9000
   REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
   REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
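
AddActiveKindRecords above uses a common lock-amortizing pattern: each thread registers its own forward_list once under the mutex, then appends to it without locking, since only the owning thread ever pushes to its list. A stripped-down sketch of the pattern — the Record/Tracer names are illustrative, and like Paddle's DeviceTracerImpl it assumes a single tracer instance:

#include <forward_list>
#include <mutex>
#include <string>

struct Record {
  std::string name;
};

class Tracer {
 public:
  void Add(const std::string& name) {
    // One list per thread; the pointer is registered exactly once.
    thread_local std::forward_list<Record>* local = nullptr;
    if (local == nullptr) {
      std::lock_guard<std::mutex> guard(mu_);  // one-time registration
      lists_.emplace_front();
      local = &lists_.front();
    }
    local->push_front(Record{name});  // lock-free fast path
  }

 private:
  std::mutex mu_;
  std::forward_list<std::forward_list<Record>> lists_;
};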

paddle/fluid/platform/device_tracer.h

Lines changed: 12 additions & 1 deletion
@@ -63,7 +63,14 @@ class DeviceTracer {
     uint32_t correlation_id;
     uint64_t bytes;
   };
-
+  struct ActiveKindRecord {
+    std::string name;
+    uint64_t start_ns;
+    uint64_t end_ns;
+    int64_t device_id;
+    int64_t thread_id;
+    uint32_t correlation_id;
+  };
   virtual ~DeviceTracer() {}
   // Needs to be called once before use.
   virtual void Enable() = 0;
@@ -85,6 +92,10 @@ class DeviceTracer {
   virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
                              uint64_t end_ns, int64_t device_id,
                              int64_t thread_id) = 0;
+  virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
+                                    uint64_t end_ns, int64_t device_id,
+                                    int64_t thread_id,
+                                    uint32_t correlation_id) = 0;
 
   // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
   // added before for human readability.
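
The new correlation_id field is what lets the ToProto changes in device_tracer.cc rename a record after the fact: CUPTI assigns the same correlation id to a runtime/driver API call and to the device activity it triggers, so a raw API record can be matched to the operator annotation captured earlier. A hedged sketch of that lookup — Annotation and ResolveName are illustrative stand-ins for the correlations_ map logic:

#include <cstdint>
#include <string>
#include <unordered_map>

struct Annotation {
  std::string name;
};

// Prefer the human-readable annotation recorded under the same
// correlation id; otherwise keep the raw API name as a fallback.
std::string ResolveName(
    const std::unordered_map<uint32_t, Annotation*>& correlations,
    uint32_t correlation_id, const std::string& fallback) {
  auto it = correlations.find(correlation_id);
  if (it != correlations.end() && it->second != nullptr) {
    return it->second->name;
  }
  return fallback;
}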

tools/timeline.py

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ def _allocate_pids(self):
             if (k, event.device_id, "CPU") not in self._devices:
                 pid = self._allocate_pid()
                 self._devices[(k, event.device_id, "CPU")] = pid
-                # -1 device id represents CUDA api call
+                # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
                 if event.device_id == -1:
                     self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
                 else:
