Skip to content

Commit ba168bd

Browse files
committed
modify API.spec
1 parent c73c5ed commit ba168bd

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

paddle/fluid/API.spec

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs
162162
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
163163
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
164164
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
165+
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
165166
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
166167
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
167168
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))

paddle/fluid/operators/stack_op.h

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,17 +154,22 @@ class StackKernel : public framework::OpKernel<T> {
154154
if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value ||
155155
n > kMaxThreshold) {
156156
#ifdef __NVCC__
157+
VLOG(10) << "Stack more than " << kMaxThreshold
158+
<< " tensors on GPU may be slow.";
157159
thrust::device_vector<const T *> device_x_vec(x_datas);
158160
auto x_data_arr = device_x_vec.data().get();
159161
#else
160162
auto x_data_arr = x_datas.data();
161163
#endif
162164
StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
165+
#ifdef __NVCC__
166+
// Wait() must be called because device_x_vec may be destroyed before
167+
// the kernel finishes
168+
dev_ctx.Wait();
169+
#endif
163170
}
164171
#ifdef __NVCC__
165172
else { // NOLINT
166-
VLOG(10) << "Stack more than " << kMaxThreshold
167-
<< " tensors on GPU may be slow.";
168173
framework::Array<const T *, kMaxThreshold> x_data_arr;
169174
for (int i = 0; i < n; ++i) x_data_arr[i] = x_datas[i];
170175
StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
@@ -243,18 +248,23 @@ class StackGradKernel : public framework::OpKernel<T> {
243248
if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value ||
244249
n > kMaxThreshold) {
245250
#ifdef __NVCC__
251+
VLOG(10) << "Stack more than " << kMaxThreshold
252+
<< " tensors on GPU may be slow.";
246253
thrust::device_vector<T *> device_dx_vec(dx_datas);
247254
auto dx_data_arr = device_dx_vec.data().get();
248255
#else
249256
auto dx_data_arr = dx_datas.data();
250257
#endif
251258
StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n,
252259
post);
260+
#ifdef __NVCC__
261+
// Wait() must be called because device_dx_vec may be destroyed before
262+
// the kernel finishes
263+
dev_ctx.Wait();
264+
#endif
253265
}
254266
#ifdef __NVCC__
255267
else { // NOLINT
256-
VLOG(10) << "Stack more than " << kMaxThreshold
257-
<< " tensors on GPU may be slow.";
258268
framework::Array<T *, kMaxThreshold> dx_data_arr;
259269
for (int i = 0; i < n; ++i) dx_data_arr[i] = dx_datas[i];
260270
StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n,

0 commit comments

Comments
 (0)