Commit 705e734

Merge pull request #10449 from reyoung/feature/clean_matmul
Rewrite Matmul, make code cleaner
2 parents: 3665358 + ad594b9

File tree

8 files changed, +423 / -543 lines

paddle/fluid/operators/math/blas.cc

Lines changed: 31 additions & 1 deletion
@@ -13,10 +13,40 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/math/blas.h"
+
+#include <utility>
 namespace paddle {
 namespace operators {
 namespace math {
-// Do nothing. Blas is a header only library.
+MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim,
+                                     int num_flatten_cols, bool trans) {
+  PADDLE_ENFORCE_GT(tensor_dim.size(), 1);
+  MatDescriptor retv;
+  if (num_flatten_cols > 1) {
+    auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols);
+    retv.height_ = flatten_dim[0];
+    retv.width_ = flatten_dim[1];
+  } else {
+    if (tensor_dim.size() == 2) {
+      retv.height_ = tensor_dim[0];
+      retv.width_ = tensor_dim[1];
+    } else {
+      auto dim_vec = framework::vectorize(tensor_dim);
+      retv.batch_size_ = 1;
+      for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
+        retv.batch_size_ *= dim_vec[i];
+      }
+      retv.height_ = dim_vec[dim_vec.size() - 2];
+      retv.width_ = dim_vec[dim_vec.size() - 1];
+      retv.stride_ = retv.height_ * retv.width_;
+    }
+  }
+  if (trans) {
+    std::swap(retv.width_, retv.height_);
+  }
+  retv.trans_ = trans;
+  return retv;
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
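As a rough illustration of the shape bookkeeping above, the following standalone sketch mirrors CreateMatrixDescriptor's branching with a plain std::vector<int64_t> in place of framework::DDim. The flattening path is omitted, and the names Desc/Describe are illustrative only, not part of the Paddle API.

// Standalone sketch of the descriptor construction in the diff above.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

struct Desc {
  int64_t height_{0};
  int64_t width_{0};
  int64_t stride_{0};      // height_ * width_ for batched operands, else 0
  int64_t batch_size_{0};  // product of the leading rank-2 dims, else 0
  bool trans_{false};
};

Desc Describe(const std::vector<int64_t>& dim, bool trans) {
  assert(dim.size() > 1);
  Desc d;
  if (dim.size() == 2) {  // plain matrix: [H, W]
    d.height_ = dim[0];
    d.width_ = dim[1];
  } else {                // batched matrix: [D0, ..., Dk, H, W]
    d.batch_size_ = 1;
    for (size_t i = 0; i + 2 < dim.size(); ++i) d.batch_size_ *= dim[i];
    d.height_ = dim[dim.size() - 2];
    d.width_ = dim[dim.size() - 1];
    d.stride_ = d.height_ * d.width_;
  }
  if (trans) std::swap(d.height_, d.width_);
  d.trans_ = trans;
  return d;
}

int main() {
  Desc a = Describe({2, 3, 4}, false);  // a [2, 3, 4] tensor
  assert(a.batch_size_ == 2 && a.height_ == 3 && a.width_ == 4 && a.stride_ == 12);
  Desc b = Describe({4, 5}, true);      // a transposed [4, 5] matrix
  assert(b.height_ == 5 && b.width_ == 4 && b.batch_size_ == 0);
  return 0;
}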

paddle/fluid/operators/math/blas.h

Lines changed: 49 additions & 0 deletions
@@ -46,6 +46,50 @@ namespace paddle {
 namespace operators {
 namespace math {
 
+/**
+ * Matrix descriptor of a memory buffer.
+ *
+ * It is used by Blas::MatMul. The MatMul operator can be batched: if Mat A is
+ * [BatchSize, H1, W1] and Mat B is [BatchSize, W1, W2], the call becomes
+ * `batch_size` GEMMs. The batched GEMM can be faster, depending on the
+ * implementation of the BLAS library. The batch size may also be zero. If any
+ * matrix of `matmul` has a batch size, the call is a batched GEMM as well,
+ * e.g., if Mat A is [BatchSize, H1, W1] and Mat B is [W1, W2], the result
+ * matrix will be [BatchSize, H1, W2].
+ *
+ * The boolean flag `trans` describes whether the memory holds the transpose
+ * of the matrix. If `trans` is true, the last two dims of the matrix are
+ * transposed, i.e., the memory layout is [Width, Height] or
+ * [BatchSize, Width, Height].
+ *
+ * The MatDescriptor is not only the dimension or shape of a matrix; it also
+ * contains the layout and stride of the matrix. It is clearer to have a
+ * dedicated structure than to reuse `DDim`.
+ */
+struct MatDescriptor {
+  int64_t height_;
+  int64_t width_;
+  int64_t stride_{0};
+  int64_t batch_size_{0};
+  bool trans_;
+};
+
+/**
+ * Create a MatDescriptor from a tensor dim, num_flatten_cols, and transpose
+ * flag.
+ *
+ * @param tensor_dim: The dimension of the tensor. Its rank must be larger
+ * than 1.
+ *
+ * @param num_flatten_cols: Reshape the tensor to a matrix whose first
+ * dimension (column length) is the product of the tensor's first
+ * `num_flatten_cols` dimensions. If num_flatten_cols is zero, the first N-2
+ * dimensions form the batch_size of the descriptor instead.
+ *
+ * @param trans: True if the matrix is transposed.
+ */
+extern MatDescriptor CreateMatrixDescriptor(const framework::DDim& tensor_dim,
+                                            int num_flatten_cols, bool trans);
+
 template <typename DeviceContext>
 class Blas {
  public:
@@ -90,6 +134,11 @@ class Blas {
                   int K, T alpha, const T* A, const T* B, T beta, T* C,
                   int batchCount, int64_t strideA, int64_t strideB) const;
 
+  template <typename T>
+  void MatMul(const framework::Tensor& mat_a, const MatDescriptor& dim_a,
+              const framework::Tensor& mat_b, const MatDescriptor& dim_b,
+              T alpha, framework::Tensor* mat_out, T beta) const;
+
  private:
  const DeviceContext& context_;
 };
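For context on how these declarations fit together, here is a hedged call-site sketch. It is not code from this commit: it assumes the GetBlas helper declared elsewhere in blas.h, a kernel ExecutionContext providing inputs "X", "Y" and an output "Out", and the name MatMulKernelSketch is illustrative only.

// Hedged call-site sketch (not part of this diff).
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/blas.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
void MatMulKernelSketch(const framework::ExecutionContext& ctx) {
  auto& x = *ctx.Input<framework::Tensor>("X");
  auto& y = *ctx.Input<framework::Tensor>("Y");
  auto* out = ctx.Output<framework::Tensor>("Out");
  out->mutable_data<T>(ctx.GetPlace());

  // One descriptor per operand; num_flatten_cols = 0 keeps any leading
  // dimensions as a batch instead of folding them into the matrix height.
  auto dim_x = math::CreateMatrixDescriptor(x.dims(), 0, /*trans=*/false);
  auto dim_y = math::CreateMatrixDescriptor(y.dims(), 0, /*trans=*/false);

  // Out = 1 * X * Y + 0 * Out; MatMul picks plain or batched GEMM internally.
  auto blas = math::GetBlas<DeviceContext, T>(ctx);  // assumed helper in blas.h
  blas.MatMul(x, dim_x, y, dim_y, static_cast<T>(1), out, static_cast<T>(0));
}

}  // namespace operators
}  // namespace paddle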

paddle/fluid/operators/math/blas_impl.h

Lines changed: 25 additions & 0 deletions
@@ -180,6 +180,31 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMM(
 #endif
 }
 
+template <typename DeviceContext>
+template <typename T>
+void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
+                                 const MatDescriptor &dim_a,
+                                 const framework::Tensor &mat_b,
+                                 const MatDescriptor &dim_b, T alpha,
+                                 framework::Tensor *mat_out, T beta) const {
+  PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
+  CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
+  if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
+    this->template GEMM<T>(transA, transB, dim_a.height_, dim_b.width_,
+                           dim_a.width_, alpha, mat_a.data<T>(),
+                           mat_b.data<T>(), beta, mat_out->data<T>());
+  } else {
+    PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
+                   dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
+    this->template BatchedGEMM<T>(
+        transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
+        mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
+        dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
+        dim_a.stride_, dim_b.stride_);
+  }
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
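The dispatch rule above is: if neither descriptor carries a batch size, a single GEMM is issued; otherwise a batched GEMM runs with the batch count taken from whichever operand is batched, and a zero stride_ makes the un-batched operand be reused for every batch. Below is a minimal standalone sketch of just that decision (descriptor fields only, no BLAS call; the names Desc/DescribeDispatch are illustrative, not Paddle API).

// Standalone sketch of the GEMM-vs-BatchedGEMM dispatch in Blas::MatMul above.
#include <cstdint>
#include <iostream>

struct Desc {           // only the MatDescriptor fields the dispatch looks at
  int64_t height_;
  int64_t width_;
  int64_t stride_;      // 0 for an un-batched operand
  int64_t batch_size_;  // 0 for an un-batched operand
};

void DescribeDispatch(const Desc& a, const Desc& b) {
  if (a.batch_size_ == 0 && b.batch_size_ == 0) {
    std::cout << "single GEMM: " << a.height_ << "x" << a.width_ << " * "
              << b.height_ << "x" << b.width_ << "\n";
  } else {
    int64_t batch = a.batch_size_ == 0 ? b.batch_size_ : a.batch_size_;
    std::cout << "batched GEMM, batch=" << batch << ", strideA=" << a.stride_
              << ", strideB=" << b.stride_ << "\n";
  }
}

int main() {
  Desc a{3, 4, 12, 2};  // a [2, 3, 4] tensor: batched, stride 3*4
  Desc b{4, 5, 0, 0};   // a plain [4, 5] matrix: stride 0, reused per batch
  DescribeDispatch(a, b);  // prints: batched GEMM, batch=2, strideA=12, strideB=0
  return 0;
}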

paddle/fluid/operators/math/matmul.h

Lines changed: 0 additions & 149 deletions
This file was deleted.
