Commit e2da3a5

Author: chengduo
Revert "Add Event for TensorCopy" (#16022)
* Revert "Add Event for TensorCopy (#15953)"
  This reverts commit 7235fd6. test=develop
* fix CI
  test=develop
1 parent cae6614 · commit e2da3a5

File tree

8 files changed, +23 −108 lines changed


paddle/fluid/framework/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -38,10 +38,10 @@ if(WITH_GPU)
     nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
     add_dependencies(tensor tensor_util)
   else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
   endif(WIN32)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)

paddle/fluid/framework/tensor_util.cc

Lines changed: 0 additions & 5 deletions
@@ -18,7 +18,6 @@
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -138,19 +137,16 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:GPU->CPU");
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:CPU->GPU");
     auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:GPU->GPU");
     if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
       VLOG(3) << "Skip copy the same data from " << src_place << " to "
               << dst_place;
@@ -161,7 +157,6 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cuda_pinned_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
     auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,

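For readers unfamiliar with what the deleted lines did: platform::RecordEvent is Paddle's RAII profiler scope; constructing it marks the start of a named region and its destructor marks the end, so each copy branch above was timed for the lifetime of the enclosing block. A minimal sketch of that RAII pattern, with illustrative names rather than Paddle's real implementation:

#include <chrono>
#include <iostream>
#include <string>
#include <utility>

class ScopedEvent {
 public:
  explicit ScopedEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedEvent() {
    auto end = std::chrono::steady_clock::now();
    auto us =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start_)
            .count();
    // A real profiler would record this into a trace; printing keeps the
    // sketch self-contained.
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

void CopyGpuToCpu() {
  ScopedEvent record_event("TensorCopy:GPU->CPU");  // timed until scope exit
  // ... issue the actual copy here ...
}

One local variable per branch is enough because the destructor fires at the closing brace, ending the timed region.
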
paddle/fluid/memory/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 add_subdirectory(detail)
 add_subdirectory(allocation)
-cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler)
+cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
 cc_library(memcpy SRCS memcpy.cc DEPS place)
 
 cc_library(memory

paddle/fluid/memory/memcpy.cc

Lines changed: 0 additions & 20 deletions
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 
 #include <cstring>  // for memcpy
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace memory {
@@ -30,23 +29,14 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 #ifdef PADDLE_WITH_CUDA
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
 
-// NOTE(zcd): Do not use GpuMemcpySync as much as possible.
-// because GpuMemcpySync issues the copying command to the default stream,
-// which will make two commands from different streams cannot run concurrently.
-// Reference:
-// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
-
 template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
     const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(src_place.device);
-
   if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
-    platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
@@ -61,10 +51,8 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(dst_place.device);
   if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
-    platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
@@ -80,19 +68,15 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
   if (dst_place == src_place) {
     platform::SetDeviceId(src_place.device);
     if (stream) {
-      platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
       platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
     } else {
-      platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
       platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
     }
   } else {
     if (stream) {
-      platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
       platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
                                    num, stream);
     } else {
-      platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
       platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
                                   num);
     }
@@ -127,10 +111,8 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
     cudaStream_t stream) {
   platform::SetDeviceId(src_place.device);
   if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
-    platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
   }
 }
@@ -142,10 +124,8 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
     cudaStream_t stream) {
   platform::SetDeviceId(dst_place.device);
   if (stream) {
-    platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
-    platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
    platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
   }
 }

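The NOTE(zcd) comment removed above still describes real CUDA behavior: commands issued to the legacy default stream, which is what a plain cudaMemcpy uses, synchronize with work on other blocking streams, so synchronous copies can serialize otherwise-independent streams. A minimal standalone sketch of the contrast, assuming a single device; the buffer size and names are illustrative:

#include <cuda_runtime.h>

int main() {
  const size_t n = 1 << 20;  // 1 MiB, illustrative
  void *dst = nullptr, *src_host = nullptr;
  cudaMalloc(&dst, n);
  cudaMallocHost(&src_host, n);  // pinned memory: required for real async H2D

  cudaStream_t s;
  cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);

  // Overlappable: enqueued on a non-blocking user stream.
  cudaMemcpyAsync(dst, src_host, n, cudaMemcpyHostToDevice, s);

  // Serializing: a plain cudaMemcpy runs on the legacy default stream and
  // synchronizes with other blocking streams, fencing their concurrent work.
  cudaMemcpy(dst, src_host, n, cudaMemcpyHostToDevice);

  cudaStreamSynchronize(s);
  cudaStreamDestroy(s);
  cudaFreeHost(src_host);
  cudaFree(dst);
  return 0;
}

The pinned host buffer (cudaMallocHost) matters here: an async copy from pageable memory falls back to an internal staging path and loses the overlap.
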
paddle/fluid/operators/reader/buffered_reader.cc

Lines changed: 9 additions & 13 deletions
@@ -17,7 +17,6 @@
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 
-#include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
 namespace reader {
@@ -51,10 +50,9 @@ BufferedReader::BufferedReader(
                 .Get(place_)))
             ->stream();
     events.resize(buffer_size);
-    PADDLE_ENFORCE(cudaStreamCreate(&stream));
-    for (auto &event : events) {
+    for (auto &event : events)
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-    }
+    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
   }
 #endif
   cpu_buffer_.resize(buffer_size);
@@ -86,15 +84,12 @@ void BufferedReader::ReadAsync(size_t i) {
 
 #ifdef PADDLE_WITH_CUDA
   // NOTE(liangdun): using async copy instead of TensorCopySync
-  // TensorCopySync would block other stream, because TensorCopySync
-  // issues the copying command to the default stream, it will make two
-  // commands from different streams cannot run concurrently.
+  // TensorCopySync would block other stream
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
     PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
     TensorVec &gpu = gpu_buffer_[i];
     gpu.resize(cpu.size());
-    platform::RecordEvent record_event("BufferedReader:MemoryCopy");
     for (size_t i = 0; i < cpu.size(); ++i) {
       gpu[i].Resize(cpu[i].dims());
       gpu[i].set_layout(cpu[i].layout());
@@ -103,19 +98,20 @@ void BufferedReader::ReadAsync(size_t i) {
       auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
       auto size =
           cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
-      if (platform::is_cuda_pinned_place(cpu_place)) {
+      if (platform::is_cuda_pinned_place(cpu_place))
         memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                      boost::get<platform::CUDAPinnedPlace>(cpu_place),
                      cpu_ptr, size, stream);
-      } else if ((platform::is_gpu_place(cpu_place))) {
+      else if ((platform::is_gpu_place(cpu_place)))
         memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                      boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
                      size, stream);
-      } else {
+      else
+        // if cpu place is not pinned, async copy is slower than sync copy,
+        // so we use sync copy instead.
         memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                      boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
-                     stream);
-      }
+                     0);
       gpu[i].set_lod(cpu[i].lod());
     }
     PADDLE_ENFORCE(cudaStreamSynchronize(stream));
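
Two CUDA details in this file's constructor hunk are easy to miss: the reader's stream is created with cudaStreamNonBlocking, so it never synchronizes with the legacy default stream, and its events are created with cudaEventDisableTiming, which makes them cheaper because they exist purely for ordering via cudaStreamWaitEvent. A minimal sketch of that ordering pattern on its own; the buffer count and names are illustrative:

#include <cuda_runtime.h>
#include <vector>

int main() {
  const int buffer_size = 2;  // illustrative
  cudaStream_t copy_stream;
  // Non-blocking: the copy stream does not synchronize with the legacy
  // default stream.
  cudaStreamCreateWithFlags(&copy_stream, cudaStreamNonBlocking);

  std::vector<cudaEvent_t> events(buffer_size);
  for (auto &event : events)
    // Timing disabled: these events exist purely for ordering.
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming);

  // Producer: mark a point in the default stream's work...
  cudaEventRecord(events[0], 0);
  // ...consumer: make the copy stream wait for that point before running
  // anything enqueued after this call.
  cudaStreamWaitEvent(copy_stream, events[0], 0);

  cudaStreamSynchronize(copy_stream);
  for (auto &event : events) cudaEventDestroy(event);
  cudaStreamDestroy(copy_stream);
  return 0;
}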

paddle/fluid/platform/device_tracer.cc

Lines changed: 9 additions & 54 deletions
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
@@ -221,24 +222,19 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
     }
     case CUPTI_ACTIVITY_KIND_DRIVER: {
       auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
-      if (api->start != 0 && api->end != 0) {
-        // -1 device id represents ActiveKind api call
-        tracer->AddActiveKindRecords(
+      if (api->start != 0 && api->end != 0)
+        // -1 device id represents CUDA api call
+        tracer->AddCPURecords(
             DriverKind(api->cbid), api->start, api->end, -1,
-            GetThreadIdFromSystemThreadId(api->threadId),
-            api->correlationId);
-      }
+            GetThreadIdFromSystemThreadId(api->threadId));
       break;
     }
     case CUPTI_ACTIVITY_KIND_RUNTIME: {
       auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
-      if (api->start != 0 && api->end != 0) {
-        // -1 device id represents ActiveKind api call
-        tracer->AddActiveKindRecords(
+      if (api->start != 0 && api->end != 0)
+        tracer->AddCPURecords(
             RuntimeKind(api->cbid), api->start, api->end, -1,
-            GetThreadIdFromSystemThreadId(api->threadId),
-            api->correlationId);
-      }
+            GetThreadIdFromSystemThreadId(api->threadId));
       break;
     }
     default: { break; }
@@ -317,25 +313,6 @@ class DeviceTracerImpl : public DeviceTracer {
                                stream_id, correlation_id, bytes});
   }
 
-  void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
-                            uint64_t end_ns, int64_t device_id,
-                            int64_t thread_id, uint32_t correlation_id) {
-    if (anno.empty()) {
-      VLOG(1) << "Empty timeline annotation.";
-      return;
-    }
-    thread_local std::forward_list<ActiveKindRecord>
-        *local_active_kind_records = nullptr;
-    if (local_active_kind_records == nullptr) {
-      std::lock_guard<std::mutex> l(trace_mu_);
-      active_kind_records_.emplace_front();
-      local_active_kind_records = &active_kind_records_.front();
-    }
-    // lock is not needed, only one thread call this function.
-    local_active_kind_records->push_front(ActiveKindRecord{
-        anno, start_ns, end_ns, device_id, thread_id, correlation_id});
-  }
-
   void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
                         int64_t device_id, int64_t stream_id,
                         uint32_t correlation_id) {
@@ -378,7 +355,6 @@ class DeviceTracerImpl : public DeviceTracer {
     }
     const std::vector<int> cbids {
       CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
-          CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
           CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
           CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
           CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
@@ -409,7 +385,6 @@ class DeviceTracerImpl : public DeviceTracer {
     correlations_.clear();
     for (auto &tmp : correlations_pairs) tmp.clear();
     for (auto &tmp : cpu_records_) tmp.clear();
-    for (auto &tmp : active_kind_records_) tmp.clear();
   }
 
   void GenEventKernelCudaElapsedTime() {
@@ -462,7 +437,7 @@ class DeviceTracerImpl : public DeviceTracer {
       event->set_device_id(r.device_id);
     }
     VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
-    for (auto &tmp : cpu_records_) {
+    for (auto &tmp : cpu_records_)
       for (const CPURecord &r : tmp) {
         auto *event = profile_pb.add_events();
         event->set_type(proto::Event::CPU);
@@ -472,24 +447,6 @@ class DeviceTracerImpl : public DeviceTracer {
         event->set_sub_device_id(r.thread_id);
         event->set_device_id(r.device_id);
       }
-    }
-    for (auto &tmp : active_kind_records_) {
-      for (const ActiveKindRecord &r : tmp) {
-        auto *event = profile_pb.add_events();
-        event->set_type(proto::Event::CPU);
-        auto c = correlations_.find(r.correlation_id);
-        if (c != correlations_.end() && c->second != nullptr) {
-          event->set_name(c->second->name());
-          event->set_detail_info(r.name);
-        } else {
-          event->set_name(r.name);
-        }
-        event->set_start_ns(r.start_ns);
-        event->set_end_ns(r.end_ns);
-        event->set_sub_device_id(r.thread_id);
-        event->set_device_id(r.device_id);
-      }
-    }
     miss = find = 0;
     for (const MemRecord &r : mem_records_) {
       auto *event = profile_pb.add_events();
@@ -553,7 +510,6 @@ class DeviceTracerImpl : public DeviceTracer {
   std::forward_list<KernelRecord> kernel_records_;
   std::forward_list<MemRecord> mem_records_;
   std::forward_list<std::forward_list<CPURecord>> cpu_records_;
-  std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
   std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
       correlations_pairs;
   std::unordered_map<uint32_t, Event *> correlations_;
@@ -657,7 +613,6 @@ void initCuptiCbidStr() {
   REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
   REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
   REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
-  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
 #if CUDA_VERSION >= 9000
   REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
   REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
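
The removed AddActiveKindRecords is a compact example of a pattern used throughout this tracer: each thread lazily registers its own forward_list shard under a mutex on first use, then appends to that shard without locking. A simplified, self-contained sketch of the idea; the Record struct and names here are illustrative, not the tracer's real types:

#include <cstdint>
#include <forward_list>
#include <mutex>
#include <string>
#include <utility>

struct Record {  // simplified stand-in for the removed ActiveKindRecord
  std::string name;
  uint64_t start_ns;
  uint64_t end_ns;
};

class Tracer {
 public:
  void Add(Record r) {
    // The shard pointer is thread-local, so each thread takes the lock
    // exactly once, to register its own shard.
    thread_local std::forward_list<Record> *local = nullptr;
    if (local == nullptr) {
      std::lock_guard<std::mutex> l(mu_);
      shards_.emplace_front();
      local = &shards_.front();
    }
    // After registration, appends touch only this thread's shard, so no
    // lock is needed on the hot path.
    local->push_front(std::move(r));
  }

 private:
  std::mutex mu_;
  std::forward_list<std::forward_list<Record>> shards_;
};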

paddle/fluid/platform/device_tracer.h

Lines changed: 1 addition & 12 deletions
@@ -63,14 +63,7 @@ class DeviceTracer {
     uint32_t correlation_id;
     uint64_t bytes;
   };
-  struct ActiveKindRecord {
-    std::string name;
-    uint64_t start_ns;
-    uint64_t end_ns;
-    int64_t device_id;
-    int64_t thread_id;
-    uint32_t correlation_id;
-  };
+
   virtual ~DeviceTracer() {}
   // Needs to be called once before use.
   virtual void Enable() = 0;
@@ -92,10 +85,6 @@ class DeviceTracer {
   virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
                              uint64_t end_ns, int64_t device_id,
                              int64_t thread_id) = 0;
-  virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
-                                    uint64_t end_ns, int64_t device_id,
-                                    int64_t thread_id,
-                                    uint32_t correlation_id) = 0;
 
   // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
   // added before for human readability.

tools/timeline.py

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ def _allocate_pids(self):
             if (k, event.device_id, "CPU") not in self._devices:
                 pid = self._allocate_pid()
                 self._devices[(k, event.device_id, "CPU")] = pid
-                # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
+                # -1 device id represents CUDA api call
                 if event.device_id == -1:
                     self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
                 else:
