Skip to content

Commit dd59465

Browse files
authored
Merge pull request #10142 from JiayiFeng/Add_TensorCopySync
Add synchronous TensorCopy
2 parents 2486d56 + c5e178f commit dd59465

File tree

6 files changed

+79
-25
lines changed

6 files changed

+79
-25
lines changed

paddle/fluid/framework/tensor_util.cc

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ namespace paddle {
2020
namespace framework {
2121

2222
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
23-
const platform::DeviceContext& ctx, Tensor* dst) {
23+
const platform::DeviceContext& ctx, Tensor* dst, bool sync) {
2424
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
2525
<< dst_place;
2626
src.check_memory_size();
@@ -47,9 +47,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
4747
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
4848
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
4949
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
50-
memory::Copy(
51-
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
52-
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
50+
auto stream =
51+
sync ? nullptr
52+
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
53+
.stream();
54+
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
5355
} else if (platform::is_cpu_place(src_place) &&
5456
platform::is_gpu_place(dst_place)) {
5557
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
@@ -58,18 +60,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
5860
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
5961
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
6062
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
61-
memory::Copy(
62-
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
63-
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
63+
auto stream =
64+
sync ? nullptr
65+
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
66+
.stream();
67+
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
6468
} else if (platform::is_gpu_place(src_place) &&
6569
platform::is_gpu_place(dst_place)) {
6670
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
6771
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
6872
auto ctx_place = ctx.GetPlace();
6973
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
70-
memory::Copy(
71-
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
72-
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
74+
auto stream =
75+
sync ? nullptr
76+
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
77+
.stream();
78+
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
7379
}
7480
#endif
7581
}

paddle/fluid/framework/tensor_util.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ namespace paddle {
2424
namespace framework {
2525

2626
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
27-
const platform::DeviceContext& ctx, Tensor* dst);
27+
const platform::DeviceContext& ctx, Tensor* dst,
28+
bool sync = false);
2829
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
2930
Tensor* dst);
3031

paddle/fluid/memory/memcpy.cc

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,23 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
3232
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
3333
const void* src, size_t num, cudaStream_t stream) {
3434
platform::SetDeviceId(src_place.device);
35-
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
35+
if (stream) {
36+
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
37+
} else {
38+
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
39+
}
3640
}
3741

3842
template <>
3943
void Copy<platform::CUDAPlace, platform::CPUPlace>(
4044
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
4145
const void* src, size_t num, cudaStream_t stream) {
4246
platform::SetDeviceId(dst_place.device);
43-
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
47+
if (stream) {
48+
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
49+
} else {
50+
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
51+
}
4452
}
4553

4654
template <>
@@ -49,10 +57,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
4957
const void* src, size_t num, cudaStream_t stream) {
5058
if (dst_place == src_place) {
5159
platform::SetDeviceId(src_place.device);
52-
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
60+
if (stream) {
61+
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
62+
} else {
63+
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
64+
}
5365
} else {
54-
platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
55-
stream);
66+
if (stream) {
67+
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
68+
num, stream);
69+
} else {
70+
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
71+
num);
72+
}
5673
}
5774
}
5875

@@ -83,7 +100,11 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
83100
platform::CUDAPlace src_place, const void* src, size_t num,
84101
cudaStream_t stream) {
85102
platform::SetDeviceId(src_place.device);
86-
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
103+
if (stream) {
104+
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
105+
} else {
106+
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
107+
}
87108
}
88109

89110
template <>
@@ -92,7 +113,11 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
92113
platform::CUDAPinnedPlace src_place, const void* src, size_t num,
93114
cudaStream_t stream) {
94115
platform::SetDeviceId(dst_place.device);
95-
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
116+
if (stream) {
117+
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
118+
} else {
119+
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
120+
}
96121
}
97122

98123
#endif

paddle/fluid/operators/reader/create_double_buffer_reader_op.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ void DoubleBufferReader::PrefetchThreadFunc() {
180180
auto* gpu_ctx = ctxs_[cached_tensor_id].get();
181181
gpu_batch.resize(cpu_batch.size());
182182
for (size_t i = 0; i < cpu_batch.size(); ++i) {
183-
framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
183+
framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i],
184+
true);
184185
gpu_batch[i].set_lod(cpu_batch[i].lod());
185186
}
186187
}

paddle/fluid/platform/gpu_info.cc

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,11 +127,24 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
127127
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
128128
}
129129

130-
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
131-
size_t count, cudaStream_t stream) {
130+
void GpuMemcpySync(void *dst, const void *src, size_t count,
131+
enum cudaMemcpyKind kind) {
132+
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
133+
"cudaMemcpy failed in paddle::platform::GpuMemcpySync");
134+
}
135+
136+
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
137+
int src_device, size_t count, cudaStream_t stream) {
132138
PADDLE_ENFORCE(
133139
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
134-
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
140+
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync");
141+
}
142+
143+
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
144+
int src_device, size_t count) {
145+
PADDLE_ENFORCE(
146+
cudaMemcpyPeer(dst, dst_device, src, src_device, count),
147+
"cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync");
135148
}
136149

137150
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {

paddle/fluid/platform/gpu_info.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,17 @@ size_t GpuMaxChunkSize();
5757
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
5858
enum cudaMemcpyKind kind, cudaStream_t stream);
5959

60-
//! Copy memory from one device to another device.
61-
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
62-
size_t count, cudaStream_t stream);
60+
//! Copy memory from address src to dst synchronously.
61+
void GpuMemcpySync(void *dst, const void *src, size_t count,
62+
enum cudaMemcpyKind kind);
63+
64+
//! Copy memory from one device to another device asynchronously.
65+
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
66+
int src_device, size_t count, cudaStream_t stream);
67+
68+
//! Copy memory from one device to another device synchronously.
69+
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
70+
int src_device, size_t count);
6371

6472
//! Set memory dst with value count size asynchronously
6573
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);

0 commit comments

Comments (0)