PaddlePaddle
diff --git a/‎paddle/fluid/operators/bilinear_tensor_product_op.h
Lines changed: 11 additions & 12 deletions b/‎paddle/fluid/operators/bilinear_tensor_product_op.h
Lines changed: 11 additions & 12 deletions
diff --git a/‎paddle/fluid/operators/gru_unit_op.h
Lines changed: 23 additions & 29 deletions b/‎paddle/fluid/operators/gru_unit_op.h
Lines changed: 23 additions & 29 deletions
diff --git a/‎paddle/fluid/operators/math/blas_impl.cu.h
Lines changed: 151 additions & 0 deletions b/‎paddle/fluid/operators/math/blas_impl.cu.h
Lines changed: 151 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/math/blas_impl.h
Lines changed: 74 additions & 0 deletions b/‎paddle/fluid/operators/math/blas_impl.h
Lines changed: 74 additions & 0 deletions
@@ -61,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
       auto output_col_vec = output_mat.chip(i, 1);
       Tensor weight_mat =
           weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-      math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasNoTrans,
-                                   batch_size, y_dim, x_dim, 1, x->data<T>(),
-                                   weight_mat.data<T>(), 0, left_mul.data<T>());
+      math::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
+          CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data<T>(),
+          weight_mat.data<T>(), 0, left_mul.data<T>());
       output_col_vec.device(place) =
           (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
     }
@@ -125,6 +125,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
       set_zero(dev_ctx, d_y, static_cast<T>(0));
     }
 
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+
     // Caculate the Output(X@Grad) and Output(Y@Grad).
     if (d_x || d_y) {
       Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
@@ -138,18 +140,16 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
               output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                   .broadcast(bcast_for_x) *
               y_mat;
-          math::gemm<DeviceContext, T>(
-              dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
-              y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
+          blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
+                    y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
         }
         if (d_y) {
           x_scale_mat.device(place) =
               output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                   .broadcast(bcast_for_y) *
               x_mat;
-          math::gemm<DeviceContext, T>(
-              dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
-              x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
+          blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
+                    x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
         }
       }
     }
@@ -166,9 +166,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
             output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                 .broadcast(bcast_for_weight) *
             x_mat;
-        math::gemm<DeviceContext, T>(dev_ctx, CblasTrans, CblasNoTrans, x_dim,
-                                     y_dim, batch_size, 1, x_scale.data<T>(),
-                                     y->data<T>(), 0, d_weight_i.data<T>());
+        blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1,
+                  x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>());
       }
     }
 
 
@@ -87,10 +87,10 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     const T* weight_data = weight->data<T>();
     T* gate_data = gate->data<T>();
     T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    math::gemm<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), false, false,
-        batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size,
-        weight_data, frame_size * 2, 1, gate_data, frame_size * 3);
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1,
+              hidden_prev_data, frame_size, weight_data, frame_size * 2, 1,
+              gate_data, frame_size * 3);
 
     // calculate activited gate
     Eigen::array<int, 2> extents({{batch_size, frame_size}});
@@ -103,11 +103,10 @@ class GRUUnitKernel : public framework::OpKernel<T> {
                g.slice(r_offsets, extents), g.slice(r_offsets, extents));
     auto r = g.slice(r_offsets, extents);  // reset gate
     r_h_p.device(place) = r * h_p;         // reset previous hidden state
-    math::gemm<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), false, false,
-        batch_size, frame_size, frame_size, 1, reset_hidden_prev_data,
-        frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1,
-        gate_data + frame_size * 2, frame_size * 3);
+    blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
+              reset_hidden_prev_data, frame_size,
+              weight_data + frame_size * frame_size * 2, frame_size, 1,
+              gate_data + frame_size * 2, frame_size * 3);
 
     Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
     ActCompute(context.Attr<int>("activation"), place,
@@ -188,42 +187,37 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     ActGradCompute(context.Attr<int>("activation"), place, c, c,
                    d_g.slice(c_offsets, extents), d_h * u);
     // backward for reset_hidden_prev
-    math::gemm<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), false, true,
-        batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2,
-        frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size,
-        0, reset_hidden_prev_grad_data, frame_size);
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
+              gate_grad_data + frame_size * 2, frame_size * 3,
+              weight_data + frame_size * frame_size * 2, frame_size, 0,
+              reset_hidden_prev_grad_data, frame_size);
     // backward for unactivated reset gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
                    d_g.slice(r_offsets, extents), d_r_h_p * h_p);
     // backward for weight
     if (weight_grad) {
       T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
       // backward for state_weight
-      math::gemm<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), true, false,
-          frame_size, frame_size, batch_size, 1, reset_hidden_prev_data,
-          frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0,
-          weight_grad_data + frame_size * frame_size * 2, frame_size);
+      blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
+                reset_hidden_prev_data, frame_size,
+                gate_grad_data + frame_size * 2, frame_size * 3, 0,
+                weight_grad_data + frame_size * frame_size * 2, frame_size);
 
       // backward for update_gate_weight and reset_gate_weight
-      math::gemm<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), true, false,
-          frame_size, frame_size * 2, batch_size, 1, hidden_prev_data,
-          frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data,
-          frame_size * 2);
+      blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
+                hidden_prev_data, frame_size, gate_grad_data, frame_size * 3, 0,
+                weight_grad_data, frame_size * 2);
     }
     // backward for hidden_prev
     if (hidden_prev_grad) {
       T* hidden_prev_grad_data =
           hidden_prev_grad->mutable_data<T>(context.GetPlace());
       auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
       d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
-      math::gemm<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), false, true,
-          batch_size, frame_size, frame_size * 2, 1, gate_grad_data,
-          frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data,
-          frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
+                gate_grad_data, frame_size * 3, weight_data, frame_size * 2, 1,
+                hidden_prev_grad_data, frame_size);
     }
     // backward for input
     if (input_grad) {
 
@@ -0,0 +1,151 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/dynload/cublas.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Per-element-type dispatch into cuBLAS. Only the specializations that follow
+// are defined; the primary template is declared but never defined.
+template <typename T>
+struct CUBlas;
+
+// Single-precision specialization: hands every argument to cublasSgemm as-is
+// and wraps the call in PADDLE_ENFORCE.
+template <>
+struct CUBlas<float> {
+  template <typename... Args>
+  static void GEMM(Args... params) {
+    PADDLE_ENFORCE(platform::dynload::cublasSgemm(params...));
+  }
+};
+
+// Double-precision specialization: hands every argument to cublasDgemm as-is
+// and wraps the call in PADDLE_ENFORCE.
+template <>
+struct CUBlas<double> {
+  template <typename... Args>
+  static void GEMM(Args... params) {
+    PADDLE_ENFORCE(platform::dynload::cublasDgemm(params...));
+  }
+};
+
+// Half-precision specialization. Unlike the float/double versions this one
+// has an explicit signature, because the platform::float16 pointers must be
+// reinterpreted as __half pointers before they are handed to cublasHgemm.
+template <>
+struct CUBlas<platform::float16> {
+  using float16 = platform::float16;
+
+  static void GEMM(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const float16 *alpha, const float16 *A, int lda,
+                   const float16 *B, int ldb, const float16 *beta, float16 *C,
+                   int ldc) {
+    // View the scalars and matrices through cuBLAS's own half type.
+    const __half *alpha_h = reinterpret_cast<const __half *>(alpha);
+    const __half *beta_h = reinterpret_cast<const __half *>(beta);
+    const __half *a_h = reinterpret_cast<const __half *>(A);
+    const __half *b_h = reinterpret_cast<const __half *>(B);
+    __half *c_h = reinterpret_cast<__half *>(C);
+
+    PADDLE_ENFORCE(platform::dynload::cublasHgemm(handle, transa, transb, m, n,
+                                                  k, alpha_h, a_h, lda, b_h,
+                                                  ldb, beta_h, c_h, ldc));
+  }
+};
+
+// Contiguous GEMM on CUDA taking cblas-style arguments. Leading dimensions
+// are derived from the transpose flags and the output stride is N.
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::GEMM(const CBLAS_TRANSPOSE transA,
+                                             const CBLAS_TRANSPOSE transB,
+                                             const int M, const int N,
+                                             const int K, const T alpha,
+                                             const T *A, const T *B,
+                                             const T beta, T *C) const {
+  const bool a_plain = (transA == CblasNoTrans);
+  const bool b_plain = (transB == CblasNoTrans);
+
+  const int lda = a_plain ? K : M;
+  const int ldb = b_plain ? N : K;
+  const cublasOperation_t op_a = a_plain ? CUBLAS_OP_N : CUBLAS_OP_T;
+  const cublasOperation_t op_b = b_plain ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  // cuBLAS follows Fortran order, so the operands are handed over in swapped
+  // order (B before A, N before M) relative to the cblas convention.
+  CUBlas<T>::GEMM(context_.cublas_handle(), op_b, op_a, N, M, K, &alpha, B,
+                  ldb, A, lda, &beta, C, N);
+}
+
+// float16 specialization of the contiguous GEMM wrapper on CUDA.
+//
+// Accepts cblas-style arguments (transA/transB, M, N, K, alpha, A, B, beta, C),
+// derives the leading dimensions from the transpose flags, and enforces a GPU
+// compute capability of at least 53.
+//   * CUDA >= 8.0: calls cublasGemmEx with fp16 inputs/outputs and fp32
+//     computation; with CUDA >= 9.0 and compute capability >= 70 the tensor-op
+//     math mode and algorithm are selected.
+//   * older CUDA: falls back to CUBlas<float16>::GEMM (cublasHgemm).
+template <>
+template <>
+inline void Blas<platform::CUDADeviceContext>::GEMM(
+    const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M,
+    const int N, const int K, const platform::float16 alpha,
+    const platform::float16 *A, const platform::float16 *B,
+    const platform::float16 beta, platform::float16 *C) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  // TODO(kexinzhao): add processing code for compute capability < 53 case
+  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
+                    "cublas fp16 gemm requires GPU compute capability >= 53");
+
+#if CUDA_VERSION >= 8000
+  // cublasGemmEx is asked to compute in fp32 (CUDA_R_32F), so the scalars are
+  // widened to float for this branch only.
+  float h_alpha = static_cast<float>(alpha);
+  float h_beta = static_cast<float>(beta);
+
+  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+#if CUDA_VERSION >= 9000
+  if (context_.GetComputeCapability() >= 70) {
+    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
+        context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH));
+    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+  } else {
+    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
+        context_.cublas_handle(), CUBLAS_DEFAULT_MATH));
+  }
+#endif  // CUDA_VERSION >= 9000
+
+  // cublasHgemm does true FP16 computation which is slow for non-Volta
+  // GPUs. So use cublasGemmEx instead which does pseudo FP16 computation:
+  // input/output in fp16, computation in fp32, which can also be accelerated
+  // using tensor cores in volta GPUs.
+  PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
+      context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
+      CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
+      CUDA_R_32F, algo));
+#else
+  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm.
+  // The hgemm wrapper takes float16 scalars and float16 operand pointers, so
+  // alpha/beta and A/B/C are passed through directly; the float
+  // h_alpha/h_beta locals exist only in the cublasGemmEx branch above.
+  CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
+                                  N, M, K, &alpha, B, ldb, A, lda, &beta, C,
+                                  N);
+#endif  // CUDA_VERSION >= 8000
+}
+
+// GEMM on CUDA with boolean transpose flags and caller-supplied leading
+// dimensions (lda, ldb, ldc).
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::GEMM(
+    const bool transA, const bool transB, const int M, const int N, const int K,
+    const T alpha, const T *A, const int lda, const T *B, const int ldb,
+    const T beta, T *C, const int ldc) const {
+  const cublasOperation_t op_a = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  const cublasOperation_t op_b = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+  // cuBLAS follows Fortran order, so the operands are handed over in swapped
+  // order (B before A, N before M) relative to the cblas convention.
+  CUBlas<T>::GEMM(context_.cublas_handle(), op_b, op_a, N, M, K, &alpha, B,
+                  ldb, A, lda, &beta, C, ldc);
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
@@ -0,0 +1,74 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Per-element-type dispatch into the CPU CBLAS routines. Only the
+// specializations that follow are defined.
+template <typename T>
+struct CBlas;
+
+// Single-precision specialization: passes every argument to cblas_sgemm as-is.
+template <>
+struct CBlas<float> {
+  template <typename... Args>
+  static void GEMM(Args... params) {
+    cblas_sgemm(params...);
+  }
+};
+
+// Double-precision specialization: passes every argument to cblas_dgemm as-is.
+template <>
+struct CBlas<double> {
+  template <typename... Args>
+  static void GEMM(Args... params) {
+    cblas_dgemm(params...);
+  }
+};
+
+// Half-precision specialization: accepts any arguments and always raises via
+// PADDLE_THROW, since no CPU float16 GEMM is wired up here.
+template <>
+struct CBlas<platform::float16> {
+  static void GEMM(...) {
+    PADDLE_THROW("float16 GEMM not supported on CPU");
+  }
+};
+
+// Contiguous row-major GEMM on CPU. Leading dimensions are derived from the
+// transpose flags and the output stride is N.
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM(const CBLAS_TRANSPOSE transA,
+                                            const CBLAS_TRANSPOSE transB,
+                                            const int M, const int N,
+                                            const int K, const T alpha,
+                                            const T *A, const T *B,
+                                            const T beta, T *C) const {
+  const bool a_plain = (transA == CblasNoTrans);
+  const bool b_plain = (transB == CblasNoTrans);
+
+  const int lda = a_plain ? K : M;
+  const int ldb = b_plain ? N : K;
+  const int ldc = N;
+
+  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                 beta, C, ldc);
+}
+
+// Row-major GEMM on CPU with boolean transpose flags and caller-supplied
+// leading dimensions (lda, ldb, ldc).
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM(
+    const bool transA, const bool transB, const int M, const int N, const int K,
+    const T alpha, const T *A, const int lda, const T *B, const int ldb,
+    const T beta, T *C, const int ldc) const {
+  // Map the boolean flags onto the CBLAS transpose enumerators.
+  const CBLAS_TRANSPOSE op_a = transA ? CblasTrans : CblasNoTrans;
+  const CBLAS_TRANSPOSE op_b = transB ? CblasTrans : CblasNoTrans;
+
+  CBlas<T>::GEMM(CblasRowMajor, op_a, op_b, M, N, K, alpha, A, lda, B, ldb,
+                 beta, C, ldc);
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle