PaddlePaddle
diff --git a/Dockerfile b/Dockerfile
Lines changed: 2 additions & 0 deletions
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
Lines changed: 41 additions & 26 deletions
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
Lines changed: 46 additions & 27 deletions
@@ -43,6 +43,8 @@ RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
     CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
     make -j8 > /dev/null && make altinstall > /dev/null
 
+RUN rm -r /root/python_build
+
 RUN apt-get update && \
     apt-get install -y --allow-downgrades patchelf \
     python3 python3-dev python3-pip \
 
@@ -183,24 +183,27 @@ class FusionGRUKernel : public framework::OpKernel<T> {
   const int total_T = x_dims[0];           \
   const int D3 = wh_dims[1]
 
-#define INIT_OTHER_DEFINES                                                     \
-  auto* h0 = ctx.Input<Tensor>("H0");                                          \
-  auto* wx = ctx.Input<Tensor>("WeightX");                                     \
-  auto* bias = ctx.Input<Tensor>("Bias");                                      \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                          \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");                              \
-  const int M = x_dims[1];                                                     \
-  const int D = wh_dims[0];                                                    \
-  const int D2 = D * 2;                                                        \
-  const auto& ker = math::jitkernel::KernelPool::Instance()                    \
-                        .template Get<math::jitkernel::GRUKernel<T>,           \
-                                      const std::string&, const std::string&>( \
-                            ctx.Attr<std::string>("gate_activation"),          \
-                            ctx.Attr<std::string>("activation"), D);           \
-  const T* x_data = x->data<T>();                                              \
-  const T* wx_data = wx->data<T>();                                            \
-  const T* wh_data = wh->data<T>();                                            \
-  auto place = ctx.GetPlace();                                                 \
+#define INIT_OTHER_DEFINES                                         \
+  auto* h0 = ctx.Input<Tensor>("H0");                              \
+  auto* wx = ctx.Input<Tensor>("WeightX");                         \
+  auto* bias = ctx.Input<Tensor>("Bias");                          \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");              \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");                  \
+  const int M = x_dims[1];                                         \
+  const int D = wh_dims[0];                                        \
+  const int D2 = D * 2;                                            \
+  const math::jitkernel::gru_attr_t attr(                          \
+      D, ctx.Attr<std::string>("gate_activation"),                 \
+      ctx.Attr<std::string>("activation"));                        \
+  math::jitkernel::gru_t one_step;                                 \
+  const auto& ker =                                                \
+      math::jitkernel::KernelPool::Instance()                      \
+          .template Get<math::jitkernel::GRUKernel<T>,             \
+                        const math::jitkernel::gru_attr_t&>(attr); \
+  const T* x_data = x->data<T>();                                  \
+  const T* wx_data = wx->data<T>();                                \
+  const T* wh_data = wh->data<T>();                                \
+  auto place = ctx.GetPlace();                                     \
   T* xx_data = xx->mutable_data<T>(place)
 
   void SeqCompute(const framework::ExecutionContext& ctx) const {
@@ -237,7 +240,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       if (h0_data) {
         prev_hidden_data = h0_data + bid * D;
       } else {
-        ker->ComputeH1(xx_data, hidden_out_data);
+        one_step.gates = xx_data;
+        one_step.ht = hidden_out_data;
+        ker->ComputeH1(&one_step, &attr);
         prev_hidden_data = hidden_out_data;
         tstart = 1;
         move_step();
@@ -247,12 +252,15 @@ class FusionGRUKernel : public framework::OpKernel<T> {
         blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
                   prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
                   D3);
-        ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data);
+        one_step.gates = xx_data;
+        one_step.ht_1 = prev_hidden_data;
+        one_step.ht = hidden_out_data;
+        ker->ComputeHtPart1(&one_step, &attr);
         // gemm rt * Ws
         blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
                   hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
                   xx_data + D2, D3);
-        ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data);
+        ker->ComputeHtPart2(&one_step, &attr);
         // save prev
         prev_hidden_data = hidden_out_data;
         move_step();
@@ -314,7 +322,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       T* cur_out_data = batched_out_data;
       // W: {W_update, W_reset; W_state}
       for (int i = 0; i < max_bs; ++i) {
-        ker->ComputeH1(cur_in_data, cur_out_data);
+        one_step.gates = cur_in_data;
+        one_step.ht = cur_out_data;
+        ker->ComputeH1(&one_step, &attr);
         // add offset
         cur_in_data += D3;
         cur_out_data += D;
@@ -339,8 +349,11 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       T* cur_out_data = batched_out_data;
       T* cur_prev_hidden_data = prev_hidden_data;
       for (int i = 0; i < cur_bs; ++i) {
-        ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data,
-                            cur_out_data);
+        one_step.gates = cur_batched_data;
+        one_step.ht_1 = cur_prev_hidden_data;
+        one_step.ht = cur_out_data;
+        ker->ComputeHtPart1(&one_step, &attr);
+
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
         cur_out_data += D;
@@ -354,8 +367,10 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 
       cur_prev_hidden_data = prev_hidden_data;
       for (int i = 0; i < cur_bs; ++i) {
-        ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data,
-                            cur_out_data);
+        one_step.gates = cur_batched_data;
+        one_step.ht_1 = cur_prev_hidden_data;
+        one_step.ht = cur_out_data;
+        ker->ComputeHtPart2(&one_step, &attr);
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
         cur_out_data += D;
 
@@ -236,27 +236,31 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
   const int D = wh_dims[0];                                 \
   const int D4 = wh_dims[1]
 
-#define INIT_OTHER_DEFINES                                                  \
-  const T* x_data = x->data<T>();                                           \
-  const T* wx_data = wx->data<T>();                                         \
-  const T* wh_data = wh->data<T>();                                         \
-  /* diagonal weight*/                                                      \
-  const T* wp_data = bias->data<T>() + D4;                                  \
-  /* for peephole only*/                                                    \
-  T* checked_cell_data = nullptr;                                           \
-  auto place = ctx.GetPlace();                                              \
-  if (use_peepholes) {                                                      \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                        \
-    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                 \
-    checked_cell_data = checked_cell->mutable_data<T>(place);               \
-  }                                                                         \
-  const auto& ker =                                                         \
-      math::jitkernel::KernelPool::Instance()                               \
-          .template Get<math::jitkernel::LSTMKernel<T>, const std::string&, \
-                        const std::string&, const std::string&>(            \
-              ctx.Attr<std::string>("gate_activation"),                     \
-              ctx.Attr<std::string>("candidate_activation"),                \
-              ctx.Attr<std::string>("cell_activation"), D, use_peepholes)
+#define INIT_OTHER_DEFINES                                      \
+  const T* x_data = x->data<T>();                               \
+  const T* wx_data = wx->data<T>();                             \
+  const T* wh_data = wh->data<T>();                             \
+  /* diagonal weight*/                                          \
+  const T* wp_data = bias->data<T>() + D4;                      \
+  /* for peephole only*/                                        \
+  T* checked_cell_data = nullptr;                               \
+  auto place = ctx.GetPlace();                                  \
+  if (use_peepholes) {                                          \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/            \
+    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");     \
+    checked_cell_data = checked_cell->mutable_data<T>(place);   \
+  }                                                             \
+  const math::jitkernel::lstm_attr_t attr(                      \
+      D, ctx.Attr<std::string>("gate_activation"),              \
+      ctx.Attr<std::string>("candidate_activation"),            \
+      ctx.Attr<std::string>("cell_activation"), use_peepholes); \
+  math::jitkernel::lstm_t one_step;                             \
+  one_step.wp = wp_data;                                        \
+  one_step.checked = checked_cell_data;                         \
+  const auto& ker =                                             \
+      math::jitkernel::KernelPool::Instance()                   \
+          .template Get<math::jitkernel::LSTMKernel<T>,         \
+                        const math::jitkernel::lstm_attr_t&>(attr)
 
 // Wh GEMM
 #define GEMM_WH_ADDON(bs, prev, out)                                           \
@@ -299,7 +303,10 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
         prev_h_data = h0_data + bid * D;
         prev_c_data = c0_data + bid * D;
       } else {
-        ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data);
+        one_step.gates = xx_data;
+        one_step.ct = c_out_data;
+        one_step.ht = h_out_data;
+        ker->ComputeC1H1(&one_step, &attr);
         tstart = 1;
         // move one step
         prev_h_data = h_out_data;
@@ -310,8 +317,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       }
       for (int step = tstart; step < seq_len; ++step) {
         GEMM_WH_ADDON(1, prev_h_data, xx_data);
-        ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data,
-                         checked_cell_data);
+
+        one_step.gates = xx_data;
+        one_step.ct_1 = prev_c_data;
+        one_step.ct = c_out_data;
+        one_step.ht = h_out_data;
+        ker->ComputeCtHt(&one_step, &attr);
         // move one step
         prev_h_data = h_out_data;
         prev_c_data = c_out_data;
@@ -388,7 +399,11 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       T* cur_h_out_data = batched_h_out_data;
       T* cur_c_out_data = batched_c_out_data;
       for (int i = 0; i < max_bs; ++i) {
-        ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data);
+        one_step.gates = cur_in_data;
+        one_step.ct = cur_c_out_data;
+        one_step.ht = cur_h_out_data;
+        ker->ComputeC1H1(&one_step, &attr);
+
         cur_in_data += D4;
         cur_c_out_data += D;
         cur_h_out_data += D;
@@ -413,8 +428,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
       T* cur_c_out_data = batched_c_out_data;
       T* cur_h_out_data = batched_h_out_data;
       for (int i = 0; i < cur_bs; ++i) {
-        ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
-                         cur_h_out_data, wp_data, checked_cell_data);
+        one_step.gates = cur_in_data;
+        one_step.ct_1 = cur_prev_c_data;
+        one_step.ct = cur_c_out_data;
+        one_step.ht = cur_h_out_data;
+        ker->ComputeCtHt(&one_step, &attr);
+
         // move one batch
         cur_in_data += D4;
         cur_prev_c_data += D;