
Commit 00ad751

Use stream while memory::Copy in GPU mode
1 parent 74b283c commit 00ad751
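
This change makes the GPU branch of SeqExpandKernel pass the CUDA stream of the operator's device context to memory::Copy, so the device-to-device copy can be ordered with the rest of the op's work on that stream instead of being issued without a stream. As background, a minimal standalone CUDA sketch of the idea (illustrative only, not Paddle code; the function name is made up):

// Illustrative CUDA-only sketch, not Paddle code.
// Issuing the copy with cudaMemcpyAsync on the op's stream keeps it
// ordered with the kernels already queued on that stream, rather than
// going through the legacy default stream.
#include <cuda_runtime.h>

void CopyOnStream(float* dst, const float* src, size_t n,
                  cudaStream_t stream) {
  // Device-to-device copy, ordered after earlier work on `stream`.
  cudaMemcpyAsync(dst, src, n * sizeof(float),
                  cudaMemcpyDeviceToDevice, stream);
}

In the diff below, the same idea appears as the stream taken from context.device_context() and forwarded as the extra argument to memory::Copy in the GPU branch.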

2 files changed (+31, -9 lines)


paddle/operators/seq_expand_op.cc

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ class SeqExpandOp : public framework::OperatorWithKernel {
       out_dim[0] = out_dim[0] * repeat;
     }
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of PadOp should not be null.");
+                   "Output(Out) of SeqExpandOp should not be null.");
     ctx->SetOutputDim("Out", out_dim);
   }
 };

paddle/operators/seq_expand_op.h

Lines changed: 30 additions & 8 deletions
@@ -75,15 +75,37 @@ class SeqExpandKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
     // copy data
-    Place place = boost::get<Place>(context.GetPlace());
+    auto place = context.GetPlace();
     size_t count = 0;
-    for (size_t i = 0; i < scales.size(); ++i) {
-      count = element_len * (x_lod[0][i + 1] - x_lod[0][i]);
-      for (size_t j = 0; j < scales[i]; ++j) {
-        memory::Copy(place, out_data, place, x_data, sizeof(T) * count);
-        out_data += count;
+    if (platform::is_cpu_place(place)) {
+      auto& cpu_place = boost::get<platform::CPUPlace>(place);
+      for (size_t i = 0; i < scales.size(); ++i) {
+        count = element_len * (x_lod[0][i + 1] - x_lod[0][i]);
+        for (size_t j = 0; j < scales[i]; ++j) {
+          memory::Copy(cpu_place, out_data, cpu_place, x_data,
+                       sizeof(T) * count);
+          out_data += count;
+        }
+        x_data += count;
       }
-      x_data += count;
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto& gpu_place = boost::get<platform::GPUPlace>(place);
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        context.device_context())
+                        .stream();
+      for (size_t i = 0; i < scales.size(); ++i) {
+        count = element_len * (x_lod[0][i + 1] - x_lod[0][i]);
+        for (size_t j = 0; j < scales[i]; ++j) {
+          memory::Copy(gpu_place, out_data, gpu_place, x_data,
+                       sizeof(T) * count, stream);
+          out_data += count;
+        }
+        x_data += count;
+      }
+#else
+      PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
     }
 
     out->set_lod(out_lod);

@@ -113,7 +135,7 @@ class SeqExpandGradKernel : public framework::OpKernel<T> {
       Eigen::TensorMap<Eigen::Tensor<T, 1>> d_x_t(
           d_x_data, static_cast<int>((ele_count * element_len) / repeat));
       auto place = context.GetEigenDevice<Place>();
-      d_x_t.device(place) = d_out_t.sum(Eigen::array<int, 1>({0}));
+      d_x_t.device(place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
       d_out_data += (ele_count * element_len);
       d_x_data += ((ele_count * element_len) / repeat);
     }
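
Note on the second hunk: Eigen::array<int, 1> behaves like a std::array-style aggregate in the Eigen Tensor module, so {{0}} is the fully braced initialization of the same single reduction dimension; the single-brace {0} can trigger missing-braces warnings on some compilers, which is presumably the reason for the change. A small standalone sketch (assumes the Eigen Tensor headers are available; not Paddle code):

// Standalone sketch, not Paddle code.
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Outer braces initialize the aggregate, inner braces its underlying
  // array; the single entry names reduction dimension 0.
  Eigen::array<int, 1> reduce_dim = {{0}};
  return reduce_dim[0];
}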
