Skip to content

Commit 815d888

Browse files
committed
Clean MatMul
1 parent 9d7279b commit 815d888

File tree

12 files changed

+156
-299
lines changed

12 files changed

+156
-299
lines changed

paddle/fluid/operators/conv_op.h

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
161161
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
162162

163163
auto& dev_ctx = context.template device_context<DeviceContext>();
164+
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
164165
for (int i = 0; i < batch_size; i++) {
165166
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
166167
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
@@ -186,8 +187,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
186187
// gemm
187188
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
188189
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
189-
math::matmul<DeviceContext, T>(dev_ctx, filter_slice, false, col_matrix,
190-
false, T(1.0), &out_slice, T(0.0));
190+
blas.MatMul(filter_slice, col_matrix, &out_slice);
191191
}
192192
}
193193
}
@@ -274,6 +274,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
274274

275275
math::SetConstant<DeviceContext, T> set_zero;
276276
auto& dev_ctx = context.template device_context<DeviceContext>();
277+
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
277278

278279
if (input_grad) {
279280
input_grad->mutable_data<T>(context.GetPlace());
@@ -303,9 +304,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
303304
col_matrix.ShareDataWith(in_grad_slice);
304305
col_matrix.Resize(col_matrix_shape);
305306
}
306-
math::matmul<DeviceContext, T>(dev_ctx, filter_slice, true,
307-
out_grad_slice, false, T(1.0),
308-
&col_matrix, T(0.0));
307+
blas.MatMul(filter_slice, true, out_grad_slice, false, &col_matrix);
309308

310309
if (is_expand && data_dim == 2U) {
311310
col2im(dev_ctx, col, dilations, strides,
@@ -352,9 +351,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
352351
// gemm
353352
Tensor filter_grad_slice =
354353
filter_grad_.Slice(g * out_step, (g + 1) * out_step);
355-
math::matmul<DeviceContext, T>(dev_ctx, out_grad_slice, false,
356-
col_matrix, true, T(1.0),
357-
&filter_grad_slice, T(1.0));
354+
blas.MatMul(out_grad_slice, false, col_matrix, true,
355+
&filter_grad_slice);
358356
}
359357
}
360358
}

paddle/fluid/operators/conv_transpose_op.h

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
118118
output->mutable_data<T>(context.GetPlace());
119119
math::SetConstant<DeviceContext, T> set_zero;
120120
auto& dev_ctx = context.template device_context<DeviceContext>();
121+
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
121122
set_zero(dev_ctx, output, static_cast<T>(0));
122123

123124
math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
@@ -134,9 +135,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
134135

135136
// col_matrix = filter * input_batch
136137
// of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
137-
math::matmul<DeviceContext, T>(dev_ctx, filter, true, input_batch, false,
138-
static_cast<T>(1.0), &col_matrix,
139-
static_cast<T>(0.0));
138+
blas.MatMul(filter, true, input_batch, false, &col_matrix);
140139

141140
if (data_dim == 2U) {
142141
// col2im: col_matrix -> dy
@@ -213,6 +212,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
213212
// im2col + gemm (similar to conv-forward)
214213
// input need to compute gradient
215214
auto& dev_ctx = context.template device_context<DeviceContext>();
215+
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
216216
if (input_grad || filter_grad) {
217217
Tensor col;
218218
col.mutable_data<T>(col_shape, context.GetPlace());
@@ -267,9 +267,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
267267
// or
268268
// (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
269269
// d, h, w)
270-
math::matmul<DeviceContext, T>(
271-
dev_ctx, filter, false, col_matrix, false, static_cast<T>(1.0),
272-
&input_grad_batch, static_cast<T>(0.0));
270+
blas.MatMul(filter, false, col_matrix, false, &input_grad_batch);
273271
}
274272
if (filter_grad) {
275273
// input batch
@@ -279,9 +277,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
279277
// or
280278
// (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
281279
// k_h * k_w)
282-
math::matmul<DeviceContext, T>(dev_ctx, in_batch, false, col_matrix,
283-
true, static_cast<T>(1.0),
284-
&filter_grad_, static_cast<T>(1.0));
280+
blas.MatMul(in_batch, false, col_matrix, true, &filter_grad_);
285281
}
286282
}
287283
}

paddle/fluid/operators/lstm_op.h

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ class LSTMKernel : public framework::OpKernel<T> {
114114
auto cand_act = math::detail::GetActivationType(
115115
ctx.Attr<std::string>("candidate_activation"));
116116

117+
auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
117118
for (size_t n = 0; n < num_batch; n++) {
118119
int bstart = static_cast<int>(batch_starts[n]);
119120
int bend = static_cast<int>(batch_starts[n + 1]);
@@ -129,9 +130,8 @@ class LSTMKernel : public framework::OpKernel<T> {
129130
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
130131
int pre_h_end = pre_h_start + cur_batch_size;
131132
auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
132-
math::matmul<DeviceContext, T>(device_ctx, pre_hidden_t, false, *weight,
133-
false, static_cast<T>(1.0), &gate_t,
134-
static_cast<T>(1.0));
133+
blas.MatMul(pre_hidden_t, false, *weight, false, static_cast<T>(1.0),
134+
&gate_t, static_cast<T>(1.0));
135135
} else if (hidden_t0) {
136136
// If n == 0 and there is no initialized hidden state, that is to say
137137
// the H0 is zeros, the calculation W_h * H0 will be skipped.
@@ -143,9 +143,8 @@ class LSTMKernel : public framework::OpKernel<T> {
143143
Tensor ordered_h0;
144144
ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
145145
&ordered_h0, true);
146-
math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false, *weight,
147-
false, static_cast<T>(1.0), &gate_t,
148-
static_cast<T>(1.0));
146+
blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
147+
&gate_t, static_cast<T>(1.0));
149148
}
150149

151150
lstm_value.gate_value = gate_t.data<T>();
@@ -282,6 +281,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
282281

283282
auto batch_starts = batch_gate->lod()[0];
284283
size_t num_batch = batch_starts.size() - 1;
284+
auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
285285
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
286286
int bstart = static_cast<int>(batch_starts[n]);
287287
int bend = static_cast<int>(batch_starts[n + 1]);
@@ -320,29 +320,25 @@ class LSTMGradKernel : public framework::OpKernel<T> {
320320
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
321321
int pre_h_end = pre_h_start + cur_batch_size;
322322
auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
323-
math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
324-
static_cast<T>(1.0), &pre_hidden_g,
325-
static_cast<T>(1.0));
323+
blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
324+
&pre_hidden_g, static_cast<T>(1.0));
326325
if (weight_g) {
327326
/* backward weight */
328327
auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end);
329-
math::matmul<DeviceContext, T>(device_ctx, pre_hidden, true, gate_g,
330-
false, static_cast<T>(1.0), weight_g,
331-
static_cast<T>(1.0));
328+
blas.MatMul(pre_hidden, true, gate_g, false, static_cast<T>(1.0),
329+
weight_g, static_cast<T>(1.0));
332330
}
333331
} else {
334332
if (h0 && weight_g) {
335333
ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
336334
&ordered_h0, true);
337-
math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true, gate_g,
338-
false, static_cast<T>(1.0), weight_g,
339-
static_cast<T>(1.0));
335+
blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
336+
weight_g, static_cast<T>(1.0));
340337
}
341338
if (h0 && h0_g) {
342339
ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
343-
math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
344-
true, static_cast<T>(1.0),
345-
&ordered_h0_g, static_cast<T>(0.0));
340+
blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
341+
&ordered_h0_g, static_cast<T>(0.0));
346342
}
347343
}
348344
}

paddle/fluid/operators/lstmp_op.h

Lines changed: 26 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
143143
auto proj_act = math::detail::GetActivationType(
144144
ctx.Attr<std::string>("proj_activation"));
145145
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
146-
146+
auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
147147
for (size_t n = 0; n < num_batch; n++) {
148148
int bstart = static_cast<int>(batch_starts[n]);
149149
int bend = static_cast<int>(batch_starts[n + 1]);
@@ -160,9 +160,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
160160
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
161161
int pre_h_end = pre_h_start + cur_batch_size;
162162
auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end);
163-
math::matmul<DeviceContext, T>(device_ctx, pre_proj_t, false, *weight,
164-
false, static_cast<T>(1.0), &gate_t,
165-
static_cast<T>(1.0));
163+
blas.MatMul(pre_proj_t, false, *weight, false, static_cast<T>(1.0),
164+
&gate_t, static_cast<T>(1.0));
166165
} else if (hidden_t0) {
167166
// If n == 0 and there is no initialized hidden state, that is to say
168167
// the H0 is zeros, the calculation W_h * H0 will be skiped.
@@ -176,16 +175,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
176175
ordered_proj0->mutable_data<T>(ctx.GetPlace());
177176
ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
178177
&ordered_h0, true);
179-
math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
180-
*proj_weight, false, static_cast<T>(1.0),
181-
ordered_proj0, static_cast<T>(0.0));
178+
blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast<T>(1.0),
179+
ordered_proj0, static_cast<T>(0.0));
182180
if (proj_act != math::detail::ActivationType::kIdentity) {
183181
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
184182
ActCompute(cell_act, place, proj0_dev, proj0_dev);
185183
}
186-
math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, false,
187-
*weight, false, static_cast<T>(1.0),
188-
&gate_t, static_cast<T>(1.0));
184+
blas.MatMul(*ordered_proj0, false, *weight, false, static_cast<T>(1.0),
185+
&gate_t, static_cast<T>(1.0));
189186
}
190187

191188
lstmp_value.gate_value = gate_t.data<T>();
@@ -196,9 +193,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
196193
device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
197194
cell_act, cand_act);
198195
lstmp_value.prev_state_value = lstmp_value.state_value;
199-
math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
200-
false, static_cast<T>(1.0), &proj_t,
201-
static_cast<T>(0.0));
196+
blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
197+
&proj_t, static_cast<T>(0.0));
202198
if (proj_act != math::detail::ActivationType::kIdentity) {
203199
auto proj_t_dev = EigenMatrix<T>::From(proj_t);
204200
ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
@@ -361,6 +357,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
361357

362358
auto batch_starts = batch_gate->lod()[0];
363359
size_t num_batch = batch_starts.size() - 1;
360+
auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
364361
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
365362
int bstart = static_cast<int>(batch_starts[n]);
366363
int bend = static_cast<int>(batch_starts[n + 1]);
@@ -375,15 +372,13 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
375372
}
376373
/* hidden state backward */
377374
Tensor out_g = batch_hidden_g.Slice(bstart, bend);
378-
math::matmul<DeviceContext, T>(device_ctx, proj_g, false, *proj_weight,
379-
true, static_cast<T>(1.0), &out_g,
380-
static_cast<T>(0.0));
375+
blas.MatMul(proj_g, false, *proj_weight, true, static_cast<T>(1.0),
376+
&out_g, static_cast<T>(0.0));
381377
/* projection weight backward*/
382378
if (proj_weight_g) {
383379
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
384-
math::matmul<DeviceContext, T>(device_ctx, hidden_t, true, proj_g,
385-
false, static_cast<T>(1.0),
386-
proj_weight_g, static_cast<T>(1.0));
380+
blas.MatMul(hidden_t, true, proj_g, false, static_cast<T>(1.0),
381+
proj_weight_g, static_cast<T>(1.0));
387382
}
388383

389384
Tensor gate = batch_gate->Slice(bstart, bend);
@@ -419,49 +414,43 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
419414
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
420415
int pre_h_end = pre_h_start + cur_batch_size;
421416
auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end);
422-
math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
423-
static_cast<T>(1.0), &pre_proj_g,
424-
static_cast<T>(1.0));
417+
blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
418+
&pre_proj_g, static_cast<T>(1.0));
425419
if (weight_g) {
426420
/* weight backward*/
427421
auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end);
428-
math::matmul<DeviceContext, T>(device_ctx, pre_proj, true, gate_g,
429-
false, static_cast<T>(1.0), weight_g,
430-
static_cast<T>(1.0));
422+
blas.MatMul(pre_proj, true, gate_g, false, static_cast<T>(1.0),
423+
weight_g, static_cast<T>(1.0));
431424
}
432425
} else {
433426
if (h0 && weight_g) {
434427
ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
435428
&ordered_h0, true);
436429
if (weight_g) {
437-
math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, true,
438-
gate_g, false, static_cast<T>(1.0),
439-
weight_g, static_cast<T>(1.0));
430+
blas.MatMul(*ordered_proj0, true, gate_g, false,
431+
static_cast<T>(1.0), weight_g, static_cast<T>(1.0));
440432
}
441433
}
442434
if (h0 && (h0_g || proj_weight_g)) {
443435
ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
444436
Tensor proj0_g;
445437
proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
446438
proj0_g.mutable_data<T>(ctx.GetPlace());
447-
math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
448-
true, static_cast<T>(1.0), &proj0_g,
449-
static_cast<T>(0.0));
439+
blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
440+
&proj0_g, static_cast<T>(0.0));
450441
if (proj_act != math::detail::ActivationType::kIdentity) {
451442
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
452443
auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
453444
ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
454445
proj0_g_dev);
455446
}
456447
if (h0_g) {
457-
math::matmul<DeviceContext, T>(
458-
device_ctx, proj0_g, false, *proj_weight, true,
459-
static_cast<T>(1.0), &ordered_h0_g, static_cast<T>(0.0));
448+
blas.MatMul(proj0_g, false, *proj_weight, true, static_cast<T>(1.0),
449+
&ordered_h0_g, static_cast<T>(0.0));
460450
}
461451
if (proj_weight_g) {
462-
math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true,
463-
proj0_g, false, static_cast<T>(1.0),
464-
proj_weight_g, static_cast<T>(1.0));
452+
blas.MatMul(ordered_h0, true, proj0_g, false, static_cast<T>(1.0),
453+
proj_weight_g, static_cast<T>(1.0));
465454
}
466455
}
467456
}

0 commit comments

Comments
 (0)