
Commit 3636329

Merge pull request #12904 from tensor-tang/refine/jit
optimize cpu vec activations
2 parents 2b64a19 + 7bdaf09

File tree: 7 files changed (+530, -48 lines)


paddle/fluid/operators/attention_lstm_op.cc

Lines changed: 23 additions & 25 deletions
@@ -232,40 +232,28 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
   if (bias) {
-    for (int i = 0; i < n; ++i) {
-      y[i] = x[i] + bias[0];
-    }
-    math::vec_relu<T>(n, y, y);
+    math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, y, y);
   } else {
-    math::vec_relu<T>(n, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, x, y);
   }
 }
 
-template <typename DeviceContext, typename T>
-inline void vec_softmax(const math::BlasT<DeviceContext, T>& blas, const int n,
-                        const T* x, T* y) {
+template <typename T>
+inline void vec_softmax(const int n, const T* x, T* y) {
   T scalar = x[0];
   // max
   for (int i = 1; i < n; ++i) {
     scalar = scalar < x[i] ? x[i] : scalar;
   }
-
-  // sub
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] - scalar;
-  }
-
-  // exp
-  blas.VEXP(n, y, y);
-
+  math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y);  // sub
+  math::vec_exp<T>(n, y, y);                                    // exp
   // sum
   scalar = T(0);
   for (int i = 0; i < n; ++i) {
     scalar += y[i];
   }
-
-  // scale
-  blas.SCAL(n, static_cast<T>(1) / scalar, y);
+  math::vec_scal<T>(n, static_cast<T>(1) / scalar, y);  // scale
 }
 
 template <typename T>
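Note: the rewritten vec_softmax keeps the standard numerically stable softmax recipe (max, subtract, exp, sum, scale), but routes the sub/exp/scale steps through the vectorized math::vec_* kernels instead of hand-written loops and BLAS calls. For reference, a minimal scalar sketch of the same math in plain C++ with no Paddle dependencies; softmax_reference is an illustrative name, not part of this patch:

#include <algorithm>
#include <cmath>

// Scalar reference for the pattern used by vec_softmax above:
// max -> subtract -> exp -> sum -> scale. Shifting by the max keeps
// std::exp in range without changing the result.
template <typename T>
void softmax_reference(const int n, const T* x, T* y) {
  T max_val = x[0];
  for (int i = 1; i < n; ++i) max_val = std::max(max_val, x[i]);
  T sum = T(0);
  for (int i = 0; i < n; ++i) {
    y[i] = std::exp(x[i] - max_val);
    sum += y[i];
  }
  const T inv_sum = T(1) / sum;
  for (int i = 0; i < n; ++i) y[i] *= inv_sum;
}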
@@ -311,11 +299,21 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
     fc_out->Resize({max_seq_len, 1});
 
-    math::VecActivations<T> act_functor;
     std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
-    act_gate = act_functor(ctx.Attr<std::string>("gate_activation"));
-    act_cell = act_functor(ctx.Attr<std::string>("cell_activation"));
-    act_cand = act_functor(ctx.Attr<std::string>("candidate_activation"));
+    auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
+    auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
+    auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
+    if (platform::jit::MayIUse(platform::jit::avx)) {
+      math::VecActivations<T, platform::jit::avx> act_functor;
+      act_gate = act_functor(act_gate_str);
+      act_cell = act_functor(act_cell_str);
+      act_cand = act_functor(act_cand_str);
+    } else {
+      math::VecActivations<T, platform::jit::isa_any> act_functor;
+      act_gate = act_functor(act_gate_str);
+      act_cell = act_functor(act_cell_str);
+      act_cand = act_functor(act_cand_str);
+    }
 
     const T* x_data = x->data<T>();
     const T* h0_data = h0 ? h0->data<T>() : NULL;
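Note: this hunk adds runtime ISA dispatch: MayIUse(avx) selects the AVX specialization of VecActivations, with isa_any as the portable fallback, so one binary serves both AVX and non-AVX machines. A minimal sketch of the same dispatch shape under assumed names (has_avx, relu_avx, relu_generic, and pick_relu are stand-ins, not Paddle APIs):

#include <functional>

// Stand-in for a CPU feature probe such as platform::jit::MayIUse(avx);
// a real implementation would query CPUID.
static bool has_avx() { return false; }

// Two implementations of the same activation, one per ISA level.
// (Both are plain loops here; only the dispatch shape matters.)
static void relu_avx(const int n, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = x[i] > 0.f ? x[i] : 0.f;
}
static void relu_generic(const int n, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = x[i] > 0.f ? x[i] : 0.f;
}

// Bind the best available kernel once; the hot path then calls through
// std::function with no further ISA branching, mirroring how act_gate,
// act_cell, and act_cand are selected in the hunk above.
static std::function<void(const int, const float*, float*)> pick_relu() {
  return has_avx() ? relu_avx : relu_generic;
}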
@@ -363,7 +361,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
                               fc_out_data);
     }
     // 1d. softmax
-    vec_softmax<DeviceContext, T>(blas, seq_len, fc_out_data, fc_out_data);
+    vec_softmax<T>(seq_len, fc_out_data, fc_out_data);
     // mul x(seq_len*M) and sum pool
     math::FCCompute<DeviceContext, T>(blas, 1, M, seq_len, fc_out_data,
                                       cur_x_data, lstm_x_data);

paddle/fluid/operators/math/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -65,3 +65,4 @@ if(WITH_GPU)
   nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat)
+cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
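The newly registered cpu_vec_test presumably compares the vectorized kernels against scalar references; its source is not part of this diff. A hypothetical sketch of such a check (gtest-style; vec_relu_under_test and the test name are stand-ins, not code from the patch):

#include <algorithm>
#include <vector>
#include "gtest/gtest.h"

// Stand-in for the kernel being verified (e.g. math::vec_relu); not a
// name from this patch.
template <typename T>
void vec_relu_under_test(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) y[i] = x[i] > T(0) ? x[i] : T(0);
}

TEST(cpu_vec, relu_matches_scalar_reference) {
  const int n = 37;  // deliberately not a multiple of any SIMD width
  std::vector<float> x(n), y(n);
  for (int i = 0; i < n; ++i) x[i] = 0.1f * i * (i % 2 ? 1.0f : -1.0f);
  vec_relu_under_test(n, x.data(), y.data());
  for (int i = 0; i < n; ++i) {
    EXPECT_FLOAT_EQ(y[i], std::max(x[i], 0.0f));
  }
}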
