Skip to content

Commit c6a6d87

Browse files
committed
Rewrite Matmul, make code cleaner
1 parent 0285a2b commit c6a6d87

File tree

6 files changed

+258
-418
lines changed

6 files changed

+258
-418
lines changed

paddle/fluid/operators/math/blas.cc

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,47 @@
1313
// limitations under the License.
1414

1515
#include "paddle/fluid/operators/math/blas.h"
16+
17+
#include <utility>
1618
namespace paddle {
1719
namespace operators {
1820
namespace math {
19-
// Do nothing. Blas is a header only library.
21+
// Derive the matmul layout (batch count, matrix height/width, batch stride)
// of a tensor shape.
//
// - num_flatten_cols > 1: flatten `dim` to 2-D at that axis; result is a
//   single (non-batched) matrix.
// - rank 1: treated as a 1 x dim[0] row vector.
// - rank 2: plain matrix.
// - rank 3: dim[0] is the batch, trailing two dims are the matrix.
// - rank > 3: all leading dims are folded into the batch, trailing two dims
//   are the matrix.
//
// `trans` swaps height/width and is recorded in `trans_` so the GEMM call can
// pass the transpose flag. `stride_` (elements between consecutive batch
// matrices) is computed before the swap; height * width is symmetric, so the
// order does not matter.
MatDim GetMatDim(const framework::DDim& dim, int num_flatten_cols, bool trans) {
  MatDim retv;
  if (num_flatten_cols > 1) {
    auto flatten_dim = framework::flatten_to_2d(dim, num_flatten_cols);
    retv.height_ = flatten_dim[0];
    retv.width_ = flatten_dim[1];
  } else {
    if (dim.size() == 1) {
      retv.height_ = 1;
      retv.width_ = dim[0];
    } else if (dim.size() == 2) {
      retv.height_ = dim[0];
      retv.width_ = dim[1];
    } else {
      if (dim.size() == 3) {
        retv.batch_size_ = dim[0];
        retv.height_ = dim[1];
        retv.width_ = dim[2];
      } else {
        auto dim_vec = framework::vectorize(dim);
        retv.batch_size_ = 1;
        for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
          retv.batch_size_ *= dim_vec[i];
        }
        // Loop-invariant: the matrix shape is always the trailing two dims,
        // so set it once after accumulating the batch size (previously these
        // assignments were redundantly repeated inside the loop).
        retv.height_ = dim_vec[dim_vec.size() - 2];
        retv.width_ = dim_vec[dim_vec.size() - 1];
      }
      retv.stride_ = retv.height_ * retv.width_;
    }
  }
  if (trans) {
    std::swap(retv.width_, retv.height_);
  }
  retv.trans_ = trans;
  return retv;
}
2057
} // namespace math
2158
} // namespace operators
2259
} // namespace paddle

paddle/fluid/operators/math/blas.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,17 @@ namespace paddle {
4646
namespace operators {
4747
namespace math {
4848

49+
// Shape metadata describing one operand of a (possibly batched) matrix
// multiply. A batch_size_ of 0 means "not batched"; stride_ is the element
// offset between consecutive matrices of a batch.
struct MatDim {
  int64_t height_{0};      // rows of the (logical, post-transpose) matrix
  int64_t width_{0};       // columns of the (logical, post-transpose) matrix
  int64_t stride_{0};      // per-batch element stride (0 when not batched)
  int64_t batch_size_{0};  // number of batched matrices (0 = single matrix)
  bool trans_{false};      // pass the operand transposed to GEMM
};
56+
57+
// Derives the matmul layout (batch/height/width/stride) of `tensor`'s shape;
// defined in blas.cc.
extern MatDim GetMatDim(const framework::DDim& tensor, int num_flatten_cols,
                        bool trans);
59+
4960
template <typename DeviceContext>
5061
class Blas {
5162
public:
@@ -90,6 +101,28 @@ class Blas {
90101
int K, T alpha, const T* A, const T* B, T beta, T* C,
91102
int batchCount, int64_t strideA, int64_t strideB) const;
92103

104+
template <typename T>
105+
void MatMul(const framework::Tensor& mat_a, const MatDim& dim_a,
106+
const framework::Tensor& mat_b, const MatDim& dim_b, T alpha,
107+
framework::Tensor* mat_out, T beta) const {
108+
PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
109+
CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
110+
CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
111+
if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
112+
this->template GEMM<T>(transA, transB, dim_a.height_, dim_b.width_,
113+
dim_a.width_, alpha, mat_a.data<T>(),
114+
mat_b.data<T>(), beta, mat_out->data<T>());
115+
} else {
116+
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
117+
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
118+
this->template BatchedGEMM<T>(
119+
transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
120+
mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
121+
dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
122+
dim_a.stride_, dim_b.stride_);
123+
}
124+
}
125+
93126
private:
94127
const DeviceContext& context_;
95128
};

paddle/fluid/operators/math/matmul.h

Lines changed: 0 additions & 149 deletions
This file was deleted.

paddle/fluid/operators/matmul_op.cc

Lines changed: 26 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -36,121 +36,39 @@ class MatMulOp : public framework::OperatorWithKernel {
3636

3737
auto dim_x = context->GetInputDim("X");
3838
auto dim_y = context->GetInputDim("Y");
39-
bool transpose_x = context->Attrs().Get<bool>("transpose_X");
40-
bool transpose_y = context->Attrs().Get<bool>("transpose_Y");
41-
42-
PADDLE_ENFORCE_GE(dim_x.size(), 1,
43-
"Input tensor X must be at least 1-dimensional.");
44-
PADDLE_ENFORCE_GE(dim_y.size(), 1,
45-
"Input tensor Y must be at least 1-dimensional.");
46-
47-
std::vector<int64_t> out_dim;
48-
int64_t batch_count = 1;
49-
if (dim_x.size() > 3) {
50-
PADDLE_ENFORCE_EQ(
51-
dim_y.size(), dim_x.size(),
52-
"The dimensions of X and Y must be the same, and both of "
53-
"them should be %d-dimensional.",
54-
dim_x.size());
55-
56-
// The first rank-2 dimensions are accumulated on the batch_count, and the
57-
// last two dimensions are used for matrix multiplication.
58-
for (int j = 0; j < dim_x.size() - 2; ++j) {
59-
PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j],
60-
"The %d-th dimension of X and Y must be the same.",
61-
j);
62-
out_dim.push_back(dim_x[j]);
63-
batch_count *= dim_x[j];
64-
}
65-
}
6639

67-
int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0;
68-
bool remove_initial_dim = false, remove_final_dim = false;
69-
70-
switch (dim_x.size()) {
71-
case 1:
72-
if (transpose_x) {
73-
M = dim_x[0];
74-
KX = 1;
75-
} else {
76-
M = 1;
77-
KX = dim_x[0];
78-
remove_initial_dim = true;
79-
}
80-
break;
81-
case 2:
82-
M = transpose_x ? dim_x[1] : dim_x[0];
83-
KX = transpose_x ? dim_x[0] : dim_x[1];
84-
break;
85-
case 3:
86-
batchCountX = dim_x[0];
87-
M = transpose_x ? dim_x[2] : dim_x[1];
88-
KX = transpose_x ? dim_x[1] : dim_x[2];
89-
break;
90-
default:
91-
batchCountX = batch_count;
92-
size_t mat_s = dim_x.size() - 2;
93-
M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s];
94-
KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1];
95-
break;
96-
}
40+
auto mat_dim_x = math::GetMatDim(GetXDim(dim_x), 0,
41+
context->Attrs().Get<bool>("transpose_X"));
42+
auto mat_dim_y = math::GetMatDim(GetYDim(dim_y), 0,
43+
context->Attrs().Get<bool>("transpose_Y"));
9744

98-
switch (dim_y.size()) {
99-
case 1:
100-
if (transpose_y) {
101-
N = dim_y[0];
102-
KY = 1;
103-
} else {
104-
N = 1;
105-
KY = dim_y[0];
106-
remove_final_dim = true;
107-
}
108-
break;
109-
case 2:
110-
KY = transpose_y ? dim_y[1] : dim_y[0];
111-
N = transpose_y ? dim_y[0] : dim_y[1];
112-
break;
113-
case 3:
114-
batchCountY = dim_y[0];
115-
KY = transpose_y ? dim_y[2] : dim_y[1];
116-
N = transpose_y ? dim_y[1] : dim_y[2];
117-
break;
118-
default:
119-
batchCountY = batch_count;
120-
size_t mat_s = dim_y.size() - 2;
121-
KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s];
122-
N = transpose_y ? dim_y[mat_s] : dim_y[mat_s + 1];
45+
PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
46+
PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
47+
mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
48+
std::vector<int64_t> dim_out;
49+
if (mat_dim_x.batch_size_ != 0) {
50+
dim_out = framework::vectorize(dim_x);
51+
dim_out[dim_out.size() - 2] = mat_dim_x.height_;
52+
dim_out[dim_out.size() - 1] = mat_dim_y.width_;
53+
} else if (mat_dim_y.batch_size_ != 0) {
54+
dim_out = framework::vectorize(dim_y);
55+
dim_out[dim_out.size() - 2] = mat_dim_x.height_;
56+
dim_out[dim_out.size() - 1] = mat_dim_y.width_;
57+
} else {
58+
dim_out = {mat_dim_x.height_, mat_dim_y.width_};
12359
}
12460

125-
PADDLE_ENFORCE_EQ(
126-
KX, KY,
127-
"First matrix's width must be equal with second matrix's height.");
128-
if (batchCountX && batchCountY) {
129-
PADDLE_ENFORCE_EQ(
130-
batchCountX, batchCountY,
131-
"When Input(X) and Input(Y) are both three dimensional, they "
132-
"must have the same batch dimension.");
61+
if (dim_x.size() == 1 && dim_out[dim_out.size() - 2] == 1) {
62+
std::swap(dim_out[dim_out.size() - 2], dim_out[dim_out.size() - 1]);
63+
dim_out.resize(dim_out.size() - 1);
13364
}
134-
int batchCount = std::max(batchCountX, batchCountY);
13565

136-
std::vector<int64_t> dim_out;
137-
if (batchCount) {
138-
if (dim_x.size() > 3) {
139-
dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end());
140-
} else {
141-
dim_out.push_back(batchCount);
142-
}
66+
if (dim_y.size() == 1 && dim_out[dim_out.size() - 1] == 1) {
67+
dim_out.resize(dim_out.size() - 1);
14368
}
144-
if (!remove_initial_dim) {
145-
dim_out.push_back(M);
146-
}
147-
if (!remove_final_dim) {
148-
dim_out.push_back(N);
149-
}
150-
if (dim_out.size() == 0) {
151-
// We don't support 0-dimensional Tensors (scalars), so instead
152-
// treat the output as a Tensor of shape (1, ) in this case.
153-
dim_out.push_back(1);
69+
70+
if (dim_out.empty()) {
71+
dim_out = {1};
15472
}
15573
context->SetOutputDim("Out", framework::make_ddim(dim_out));
15674
context->ShareLoD("X", /*->*/ "Out");

0 commit comments

Comments (0)