Skip to content

Commit 9f11da5

Browse files
committed
Add synchronous TensorCopy and use it in double buffer
1 parent 3863c6a commit 9f11da5

File tree

5 files changed

+83
-29
lines changed

5 files changed

+83
-29
lines changed

paddle/fluid/framework/tensor_util.cc

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ namespace paddle {
2020
namespace framework {
2121

2222
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
23-
const platform::DeviceContext& ctx, Tensor* dst) {
23+
const platform::DeviceContext& ctx, Tensor* dst,
24+
bool sync = false) {
2425
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
2526
<< dst_place;
2627
src.check_memory_size();
@@ -47,9 +48,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
4748
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
4849
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
4950
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
50-
memory::Copy(
51-
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
52-
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
51+
auto stream =
52+
sync ? nullptr
53+
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
54+
.stream();
55+
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
5356
} else if (platform::is_cpu_place(src_place) &&
5457
platform::is_gpu_place(dst_place)) {
5558
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
@@ -58,18 +61,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
5861
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
5962
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
6063
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
61-
memory::Copy(
62-
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
63-
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
64+
auto stream =
65+
sync ? nullptr
66+
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
67+
.stream();
68+
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
6469
} else if (platform::is_gpu_place(src_place) &&
6570
platform::is_gpu_place(dst_place)) {
6671
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
6772
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
6873
auto ctx_place = ctx.GetPlace();
6974
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
70-
memory::Copy(
71-
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
72-
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
75+
auto stream =
76+
sync ? nullptr
77+
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
78+
.stream();
79+
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
7380
}
7481
#endif
7582
}

paddle/fluid/memory/memcpy.cc

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,29 +30,46 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
3030
// Device -> host (pageable) copy.
//
// With a non-null `stream` the copy is enqueued asynchronously on that
// stream; with a null stream it falls back to a blocking cudaMemcpy so the
// data is guaranteed to be on the host when this function returns.
//
// BUG FIX: the `= nullptr` default argument was removed from this
// definition.  C++ forbids default arguments on explicit template
// specializations ([dcl.fct.default]); the `stream = nullptr` default must
// instead be supplied on the Copy declaration in memcpy.h — TODO(review):
// confirm the header carries it so 5-argument callers keep compiling.
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
    platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
    const void* src, size_t num, cudaStream_t stream) {
  // Copies must be issued from the device owning the source memory.
  platform::SetDeviceId(src_place.device);
  if (stream) {
    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
  } else {
    platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
  }
}
3741

3842
// Host (pageable) -> device copy.
//
// With a non-null `stream` the copy is enqueued asynchronously on that
// stream; with a null stream it falls back to a blocking cudaMemcpy.
//
// BUG FIX: the `= nullptr` default argument was removed from this
// definition.  C++ forbids default arguments on explicit template
// specializations ([dcl.fct.default]); the default belongs on the Copy
// declaration in memcpy.h — TODO(review): confirm the header carries it.
template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>(
    platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
    const void* src, size_t num, cudaStream_t stream) {
  // Copies must be issued from the device owning the destination memory.
  platform::SetDeviceId(dst_place.device);
  if (stream) {
    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
  } else {
    platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
  }
}
4553

4654
// Device -> device copy, same GPU or across GPUs (peer copy).
//
// With a non-null `stream` the copy is asynchronous on that stream;
// with a null stream it blocks until the copy completes.
//
// BUG FIXES versus the committed code:
//  1. The `= nullptr` default argument was removed: C++ forbids default
//     arguments on explicit template specializations ([dcl.fct.default]);
//     the default belongs on the declaration in memcpy.h — TODO(review):
//     confirm the header carries it.
//  2. The synchronous peer branch passed `stream` to GpuMemcpyPeerSync,
//     but that function (see gpu_info.h) takes no stream parameter — the
//     call did not match its declaration and could not compile.
template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
    platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
    const void* src, size_t num, cudaStream_t stream) {
  if (dst_place == src_place) {
    // Same device: a plain device-to-device memcpy.
    platform::SetDeviceId(src_place.device);
    if (stream) {
      platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice,
                               stream);
    } else {
      platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
    }
  } else {
    // Different devices: use the peer-copy API.
    if (stream) {
      platform::GpuMemcpyPeerAsync(dst, dst_place.device, src,
                                   src_place.device, num, stream);
    } else {
      platform::GpuMemcpyPeerSync(dst, dst_place.device, src,
                                  src_place.device, num);
    }
  }
}
5875

// Device -> pinned host memory copy.
//
// With a non-null `stream` the copy is enqueued asynchronously (pinned
// memory makes the async path effective); with a null stream it falls back
// to a blocking cudaMemcpy.
//
// BUG FIX: the `= nullptr` default argument was removed from this
// definition.  C++ forbids default arguments on explicit template
// specializations ([dcl.fct.default]); the default belongs on the Copy
// declaration in memcpy.h — TODO(review): confirm the header carries it.
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
    platform::CUDAPinnedPlace dst_place, void* dst,
    platform::CUDAPlace src_place, const void* src, size_t num,
    cudaStream_t stream) {
  // Copies must be issued from the device owning the source memory.
  platform::SetDeviceId(src_place.device);
  if (stream) {
    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
  } else {
    platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
  }
}
88109

89110
// Pinned host memory -> device copy.
//
// With a non-null `stream` the copy is enqueued asynchronously (pinned
// memory makes the async path effective); with a null stream it falls back
// to a blocking cudaMemcpy.
//
// BUG FIX: the `= nullptr` default argument was removed from this
// definition.  C++ forbids default arguments on explicit template
// specializations ([dcl.fct.default]); the default belongs on the Copy
// declaration in memcpy.h — TODO(review): confirm the header carries it.
template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
    platform::CUDAPlace dst_place, void* dst,
    platform::CUDAPinnedPlace src_place, const void* src, size_t num,
    cudaStream_t stream) {
  // Copies must be issued from the device owning the destination memory.
  platform::SetDeviceId(dst_place.device);
  if (stream) {
    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
  } else {
    platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
  }
}
97122

98123
#endif

paddle/fluid/operators/reader/create_double_buffer_reader_op.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ void DoubleBufferReader::PrefetchThreadFunc() {
180180
auto* gpu_ctx = ctxs_[cached_tensor_id].get();
181181
gpu_batch.resize(cpu_batch.size());
182182
for (size_t i = 0; i < cpu_batch.size(); ++i) {
183-
framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
183+
framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i],
184+
true);
184185
gpu_batch[i].set_lod(cpu_batch[i].lod());
185186
}
186187
}

paddle/fluid/platform/gpu_info.cc

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,11 +127,24 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
127127
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
128128
}
129129

130-
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
131-
size_t count, cudaStream_t stream) {
130+
// Blocking memcpy between host/device pointers; the transfer direction is
// selected by `kind`.  Returns only after the copy has completed.
void GpuMemcpySync(void *dst, const void *src, size_t count,
                   enum cudaMemcpyKind kind) {
  const cudaError_t result = cudaMemcpy(dst, src, count, kind);
  PADDLE_ENFORCE(result,
                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
}
135+
136+
// Asynchronous copy between memory on two (possibly different) GPUs,
// enqueued on `stream`; the caller must synchronize before using `dst`.
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count, cudaStream_t stream) {
  const cudaError_t result =
      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream);
  PADDLE_ENFORCE(
      result,
      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync");
}
142+
143+
// Blocking copy between memory on two (possibly different) GPUs; returns
// only after the copy has completed.
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                       int src_device, size_t count) {
  const cudaError_t result =
      cudaMemcpyPeer(dst, dst_device, src, src_device, count);
  PADDLE_ENFORCE(
      result, "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync");
}
136149

137150
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {

paddle/fluid/platform/gpu_info.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,17 @@ size_t GpuMaxChunkSize();
5757
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
5858
enum cudaMemcpyKind kind, cudaStream_t stream);
5959

60-
//! Copy memory from one device to another device.
61-
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
62-
size_t count, cudaStream_t stream);
60+
//! Copy memory from address src to dst synchronously.
61+
void GpuMemcpySync(void *dst, const void *src, size_t count,
62+
enum cudaMemcpyKind kind);
63+
64+
//! Copy memory from one device to another device asynchronously.
65+
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
66+
int src_device, size_t count, cudaStream_t stream);
67+
68+
//! Copy memory from one device to another device synchronously.
69+
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
70+
int src_device, size_t count);
6371

6472
//! Set memory dst with value count size asynchronously
6573
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);

0 commit comments

Comments (0)