Skip to content

Commit 2603cb7

Browse files
QiJun and wangkuiyi
authored and committed
Unify CUDA stream in Tensor CopyFrom interface (#4692)
* init * unify CopyFrom interface * fix gpu build error * fix bug in tensor_py.h * refine code comments and add TODO list * fix conflicts in FeedOp and FetchOp
1 parent d92f8de commit 2603cb7

14 files changed

+147
-86
lines changed

paddle/framework/tensor.h

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,26 +87,31 @@ class Tensor {
8787
/**
8888
* @brief Copy the content of external tensor to a new place.
8989
*
90-
* @param[in] src The external tensor.
91-
* @param[in] ctx The device context contains place where to store.
90+
* @param[in] src The external tensor.
91+
* @param[in] dst_place The dst place.
92+
* @param[in] ctx The device context contains device resources.
9293
*
9394
* @note CopyFrom supports CPU <-> GPU, GPU <-> GPU.
9495
*/
96+
// TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
97+
// Remove `CopyFrom` and `CopyFromVector` from Tensor interface
98+
// and make them global functions
9599
template <typename T>
96-
inline void CopyFrom(const Tensor& src, const platform::Place& dst_place);
100+
inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
101+
const platform::DeviceContext& ctx);
97102

98103
/**
99104
* @brief Copy the content of an external vector to a tensor.
100105
*
101-
* @param[in] src The external vector.
102-
* @param[in] ctx The device context contains place where to store.
106+
* @param[in] src The external tensor.
107+
* @param[in] ctx The device context contains device resources.
103108
*
104109
* * @note CopyFromVector assumes that the tensor has been resized
105110
* before invoking.
106111
*/
107112
template <typename T>
108113
inline void CopyFromVector(const std::vector<T>& src,
109-
const platform::Place& dst_place);
114+
const platform::DeviceContext& ctx);
110115

111116
/**
112117
* @brief Return the slice of the tensor.

paddle/framework/tensor_array.cc

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) {
9595

9696
values_[index].Resize(value.dims());
9797
values_[index].mutable_data<value_type>(platform::CPUPlace());
98-
values_[index].CopyFrom<value_type>(value, platform::CPUPlace());
98+
values_[index].CopyFrom<value_type>(value, platform::CPUPlace(),
99+
platform::CPUDeviceContext());
99100
}
100101

101102
void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
@@ -151,7 +152,8 @@ LoDTensor TensorArray::Stack() const {
151152

152153
for (size_t idx = 0; idx < size(); idx++) {
153154
result.Slice<value_type>(idx, idx + 1)
154-
.CopyFrom<value_type>(Read(idx), platform::CPUPlace());
155+
.CopyFrom<value_type>(Read(idx), platform::CPUPlace(),
156+
platform::CPUDeviceContext());
155157
}
156158
return result;
157159
}
@@ -182,7 +184,8 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
182184
// copy
183185
value.Resize(value_dims);
184186
value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
185-
platform::CPUPlace());
187+
platform::CPUPlace(),
188+
platform::CPUDeviceContext());
186189
}
187190
}
188191
}
@@ -236,7 +239,8 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
236239
auto target = result.Slice<value_type>(i, i + 1);
237240
auto source_ = source->Slice<value_type>(index, index + 1);
238241

239-
target.CopyFrom<value_type>(source_, platform::CPUPlace());
242+
target.CopyFrom<value_type>(source_, platform::CPUPlace(),
243+
platform::CPUDeviceContext());
240244
}
241245

242246
return result;
@@ -269,7 +273,8 @@ LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
269273
if (index >= seq_meta.end) break;
270274
auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
271275
auto target = result.Slice<float>(index, index + 1);
272-
target.CopyFrom<float>(source_, platform::CPUPlace());
276+
target.CopyFrom<float>(source_, platform::CPUPlace(),
277+
platform::CPUDeviceContext());
273278
}
274279
}
275280

paddle/framework/tensor_impl.h

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
8888

8989
template <typename T>
9090
inline void Tensor::CopyFrom(const Tensor& src,
91-
const platform::Place& dst_place) {
91+
const platform::Place& dst_place,
92+
const platform::DeviceContext& ctx) {
9293
src.check_memory_size<T>();
9394
Resize(src.dims());
9495

@@ -106,26 +107,45 @@ inline void Tensor::CopyFrom(const Tensor& src,
106107
#ifdef PADDLE_WITH_CUDA
107108
else if (platform::is_gpu_place(src_place) &&
108109
platform::is_cpu_place(dst_place)) {
109-
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
110-
boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
110+
auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
111+
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
112+
auto ctx_place = ctx.GetPlace();
113+
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
114+
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
115+
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
116+
memory::Copy(
117+
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
118+
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
111119
} else if (platform::is_cpu_place(src_place) &&
112120
platform::is_gpu_place(dst_place)) {
113-
memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
114-
boost::get<platform::CPUPlace>(src_place), src_ptr, size, 0);
121+
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
122+
auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
123+
auto ctx_place = ctx.GetPlace();
124+
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
125+
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
126+
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
127+
memory::Copy(
128+
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
129+
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
115130
} else if (platform::is_gpu_place(src_place) &&
116131
platform::is_gpu_place(dst_place)) {
117-
memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
118-
boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
132+
auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
133+
auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
134+
auto ctx_place = ctx.GetPlace();
135+
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
136+
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
137+
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
138+
memory::Copy(
139+
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
140+
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
119141
}
120-
PADDLE_ENFORCE(cudaStreamSynchronize(0),
121-
"cudaStreamSynchronize failed in Tensor CopyFrom");
122-
123142
#endif
124143
}
125144

126145
template <typename T>
127146
inline void Tensor::CopyFromVector(const std::vector<T>& src,
128-
const platform::Place& dst_place) {
147+
const platform::DeviceContext& ctx) {
148+
auto dst_place = ctx.GetPlace();
129149
auto src_ptr = static_cast<const void*>(src.data());
130150
platform::CPUPlace src_place;
131151
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
@@ -137,12 +157,11 @@ inline void Tensor::CopyFromVector(const std::vector<T>& src,
137157
}
138158
#ifdef PADDLE_WITH_CUDA
139159
else if (platform::is_gpu_place(dst_place)) {
140-
memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place,
141-
src_ptr, size, 0);
160+
memory::Copy(
161+
boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
162+
size,
163+
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
142164
}
143-
PADDLE_ENFORCE(cudaStreamSynchronize(0),
144-
"cudaStreamSynchronize failed in Tensor CopyFromVector");
145-
146165
#endif
147166
}
148167

paddle/framework/tensor_test.cc

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -194,14 +194,15 @@ TEST(Tensor, CopyFrom) {
194194
{
195195
Tensor src_tensor;
196196
Tensor dst_tensor;
197+
CPUDeviceContext cpu_ctx((CPUPlace()));
197198

198199
int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
199200

200201
int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
201202
memcpy(src_ptr, arr, 9 * sizeof(int));
202203

203204
auto cpu_place = new paddle::platform::CPUPlace();
204-
dst_tensor.CopyFrom<int>(src_tensor, *cpu_place);
205+
dst_tensor.CopyFrom<int>(src_tensor, *cpu_place, cpu_ctx);
205206

206207
const int* dst_ptr = dst_tensor.data<int>();
207208
ASSERT_NE(src_ptr, dst_ptr);
@@ -210,7 +211,7 @@ TEST(Tensor, CopyFrom) {
210211
}
211212

212213
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
213-
dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place);
214+
dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place, cpu_ctx);
214215
const int* slice_ptr = slice_tensor.data<int>();
215216
dst_ptr = dst_tensor.data<int>();
216217
ASSERT_NE(dst_ptr, slice_ptr);
@@ -231,13 +232,15 @@ TEST(Tensor, CopyFrom) {
231232

232233
// CPU Tensor to GPU Tensor
233234
auto gpu_place = new paddle::platform::GPUPlace(0);
234-
gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place);
235+
CUDADeviceContext gpu_ctx(*gpu_place);
236+
gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place, gpu_ctx);
235237

236238
// GPU Tensor to CPU Tensor
237239
auto cpu_place = new paddle::platform::CPUPlace();
238-
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
240+
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
239241

240-
// Compare Tensors
242+
// Sync before Compare Tensors
243+
gpu_ctx.Wait();
241244
const int* dst_ptr = dst_tensor.data<int>();
242245
ASSERT_NE(src_ptr, dst_ptr);
243246
for (size_t i = 0; i < 9; ++i) {
@@ -247,12 +250,13 @@ TEST(Tensor, CopyFrom) {
247250
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
248251

249252
// CPU Slice Tensor to GPU Tensor
250-
gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place);
253+
gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place, gpu_ctx);
251254

252255
// GPU Tensor to CPU Tensor
253-
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
256+
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
254257

255-
// Compare Slice Tensors
258+
// Sync before Compare Slice Tensors
259+
gpu_ctx.Wait();
256260
const int* slice_ptr = slice_tensor.data<int>();
257261
dst_ptr = dst_tensor.data<int>();
258262
ASSERT_NE(dst_ptr, slice_ptr);
@@ -273,7 +277,8 @@ TEST(Tensor, CopyFromVector) {
273277
// Copy to CPU Tensor
274278
cpu_tensor.Resize(make_ddim({3, 3}));
275279
auto cpu_place = new paddle::platform::CPUPlace();
276-
cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
280+
CPUDeviceContext cpu_ctx(*cpu_place);
281+
cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
277282

278283
// Compare Tensors
279284
const int* cpu_ptr = cpu_tensor.data<int>();
@@ -285,7 +290,7 @@ TEST(Tensor, CopyFromVector) {
285290

286291
src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
287292
cpu_tensor.Resize(make_ddim({2, 2}));
288-
cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
293+
cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
289294
cpu_ptr = cpu_tensor.data<int>();
290295
src_ptr = src_vec.data();
291296
ASSERT_NE(src_ptr, cpu_ptr);
@@ -306,16 +311,19 @@ TEST(Tensor, CopyFromVector) {
306311
// Copy to CPU Tensor
307312
cpu_tensor.Resize(make_ddim({3, 3}));
308313
auto cpu_place = new paddle::platform::CPUPlace();
309-
cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
314+
CPUDeviceContext cpu_ctx(*cpu_place);
315+
cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
310316

311317
// Copy to GPUTensor
312318
gpu_tensor.Resize(make_ddim({3, 3}));
313319
auto gpu_place = new paddle::platform::GPUPlace();
314-
gpu_tensor.CopyFromVector<int>(src_vec, *gpu_place);
320+
CUDADeviceContext gpu_ctx(*gpu_place);
321+
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
315322
// Copy from GPU to CPU tensor for comparison
316-
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
323+
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
317324

318-
// Compare Tensors
325+
// Sync before Compare Tensors
326+
gpu_ctx.Wait();
319327
const int* src_ptr = src_vec.data();
320328
const int* cpu_ptr = cpu_tensor.data<int>();
321329
const int* dst_ptr = dst_tensor.data<int>();
@@ -329,11 +337,13 @@ TEST(Tensor, CopyFromVector) {
329337
src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
330338

331339
cpu_tensor.Resize(make_ddim({2, 2}));
332-
cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
340+
cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
333341
gpu_tensor.Resize(make_ddim({2, 2}));
334-
gpu_tensor.CopyFromVector<int>(src_vec, *gpu_place);
335-
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
342+
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
343+
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
336344

345+
// Sync before Compare Tensors
346+
gpu_ctx.Wait();
337347
src_ptr = src_vec.data();
338348
cpu_ptr = cpu_tensor.data<int>();
339349
dst_ptr = dst_tensor.data<int>();

paddle/operators/feed_op.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class FeedKernel : public framework::OpKernel<T> {
3434
// TODO(qijun):
3535
// check tensors[col].dims() with attribute,
3636
// except the first dimenson.
37-
out->CopyFrom<T>(tensors[col], ctx.GetPlace());
37+
out->CopyFrom<T>(tensors[col], ctx.GetPlace(), ctx.device_context());
3838
}
3939
};
4040

paddle/operators/fetch_op.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ class FetchKernel : public framework::OpKernel<T> {
3535
PADDLE_ENFORCE_GT(tensors->size(), static_cast<size_t>(col));
3636
(*tensors)[col].Resize(input->dims());
3737
(*tensors)[col].mutable_data<T>(platform::CPUPlace());
38-
(*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace());
38+
(*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace(),
39+
ctx.device_context());
3940
// TODO(qijun): need to handle LodTensor later
4041
}
4142
};

paddle/operators/math/im2col_test.cc

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,22 @@ void testIm2col() {
4949
memcpy(input_ptr, arr, 6 * sizeof(float));
5050

5151
auto* place = new Place();
52+
paddle::platform::DeviceContext* context;
53+
if (paddle::platform::is_cpu_place(*place)) {
54+
context =
55+
new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
56+
} else {
57+
#ifdef PADDLE_WITH_CUDA
58+
context =
59+
new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
60+
#else
61+
PADDLE_THROW("no GPU support");
62+
#endif // PADDLE_ONLY_CPU
63+
}
5264
if (paddle::platform::is_cpu_place(*place)) {
5365
input = input_tmp;
5466
} else {
55-
input.CopyFrom<float>(input_tmp, *place);
67+
input.CopyFrom<float>(input_tmp, *place, *context);
5668
}
5769
output_cfo.mutable_data<float>(
5870
{1, filter_size, filter_size, output_height, output_width}, *place);
@@ -66,26 +78,15 @@ void testIm2col() {
6678
paddle::operators::math::ColFormat::kOCF, Place, float>
6779
im2col_ocf;
6880

69-
paddle::platform::DeviceContext* context;
70-
if (paddle::platform::is_cpu_place(*place)) {
71-
context =
72-
new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
73-
} else {
74-
#ifdef PADDLE_WITH_CUDA
75-
context =
76-
new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
77-
#else
78-
PADDLE_THROW("no GPU support");
79-
#endif // PADDLE_ONLY_CPU
80-
}
8181
im2col(*context, input, output_cfo, stride, stride, padding, padding);
8282
im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding);
8383

8484
float* out_cfo_ptr;
8585
if (paddle::platform::is_cpu_place(*place)) {
8686
out_cfo_ptr = output_cfo.data<float>();
8787
} else {
88-
output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace());
88+
output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace(),
89+
*context);
8990
out_cfo_ptr = output_tmp.data<float>();
9091
}
9192
EXPECT_EQ(out_cfo_ptr[0], 0);
@@ -101,7 +102,8 @@ void testIm2col() {
101102
if (paddle::platform::is_cpu_place(*place)) {
102103
out_ocf_ptr = output_ocf.data<float>();
103104
} else {
104-
output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace());
105+
output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace(),
106+
*context);
105107
out_ocf_ptr = output_tmp.data<float>();
106108
}
107109
EXPECT_EQ(out_ocf_ptr[0], 0);

0 commit comments

Comments
 (0)