PaddlePaddle
diff --git a/‎paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
Lines changed: 24 additions & 23 deletions b/‎paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
Lines changed: 24 additions & 23 deletions
diff --git a/‎paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
Lines changed: 14 additions & 11 deletions b/‎paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
Lines changed: 14 additions & 11 deletions
diff --git a/‎paddle/fluid/operators/jit/benchmark.cc
Lines changed: 3 additions & 2 deletions b/‎paddle/fluid/operators/jit/benchmark.cc
Lines changed: 3 additions & 2 deletions
diff --git a/‎paddle/fluid/operators/jit/gen/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎paddle/fluid/operators/jit/gen/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎paddle/fluid/operators/jit/gen/matmul.cc
Lines changed: 128 additions & 0 deletions b/‎paddle/fluid/operators/jit/gen/matmul.cc
Lines changed: 128 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/jit/gen/matmul.h
Lines changed: 62 additions & 0 deletions b/‎paddle/fluid/operators/jit/gen/matmul.h
Lines changed: 62 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/jit/gen_base.cc
Lines changed: 31 additions & 0 deletions b/‎paddle/fluid/operators/jit/gen_base.cc
Lines changed: 31 additions & 0 deletions
@@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() {
 }
 
 template <typename T>
-static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n,
-                    int k) {
+static void fc_relu(const T* x, const T* w, const T* b, T* y,
+                    const jit::matmul_attr_t& attr) {
   auto matmul =
-      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);
+      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
   auto addbias_relu =
-      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(n);
-  matmul(x, w, y, m, n, k);
+      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n);
+  matmul(x, w, y, &attr);
   T* dst = y;
-  for (int i = 0; i < m; ++i) {
-    addbias_relu(b, dst, dst, n);
-    dst += n;
+  for (int i = 0; i < attr.m; ++i) {
+    addbias_relu(b, dst, dst, attr.n);
+    dst += attr.n;
   }
 }
 
@@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel<T> {
 
     auto i_dims = in->dims();
     auto w_dims = weights[0]->dims();
-    int m = i_dims[0];
-    int n = w_dims[1];
-    int k = w_dims[0];
-    relus[0]->Resize({m, n});
+    jit::matmul_attr_t attr;
+    attr.m = i_dims[0];
+    attr.n = w_dims[1];
+    attr.k = w_dims[0];
+    relus[0]->Resize({attr.m, attr.n});
     fc_relu(in->data<T>(), weights[0]->data<T>(), biases[0]->data<T>(),
-            relus[0]->mutable_data<T>(place), m, n, k);
+            relus[0]->mutable_data<T>(place), attr);
 
     for (int i = 1; i < weight_sz - 1; ++i) {
       auto i_dims = relus[i - 1]->dims();
       auto w_dims = weights[i]->dims();
-      int m = i_dims[0];
-      int n = w_dims[1];
-      int k = w_dims[0];
-      relus[i]->Resize({m, n});
+      attr.m = i_dims[0];
+      attr.n = w_dims[1];
+      attr.k = w_dims[0];
+      relus[i]->Resize({attr.m, attr.n});
       fc_relu(relus[i - 1]->data<T>(), weights[i]->data<T>(),
-              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), m, n, k);
+              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), attr);
     }
 
     auto i_dims_last = relus[weight_sz - 2]->dims();
     auto w_dims_last = weights[weight_sz - 1]->dims();
-    m = i_dims_last[0];
-    n = w_dims_last[1];
-    k = w_dims_last[0];
+    attr.m = i_dims_last[0];
+    attr.n = w_dims_last[1];
+    attr.k = w_dims_last[0];
     fc_relu(relus[weight_sz - 2]->data<T>(), weights[weight_sz - 1]->data<T>(),
-            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place), m, n,
-            k);
+            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place),
+            attr);
   }
 };
 
 
@@ -87,23 +87,26 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
 
     auto x_dims = x->dims();
     auto y_dims = y->dims();
-    int m = x_dims[0];
-    int k = x_dims[1];
-    int n = y_dims[1];
-    int o_numel = m * n;
+    jit::matmul_attr_t attr;
+    attr.m = x_dims[0];
+    attr.k = x_dims[1];
+    attr.n = y_dims[1];
+    int o_numel = attr.m * attr.n;
 
     auto vsquare_x =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(m * k);
+        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m *
+                                                                       attr.k);
     auto vsquare_y =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(k * n);
+        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k *
+                                                                       attr.n);
     auto vsquare_xy =
         jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
     auto vsub =
         jit::Get<jit::kVSub, jit::XYZNTuples<T>, platform::CPUPlace>(o_numel);
     auto vscal =
         jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
     auto matmul =
-        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);
+        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
 
     const T* x_data = x->data<T>();
     const T* y_data = y->data<T>();
@@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
     T* squared_xy_data = squared_xy->mutable_data<T>(place);
     T* o_data = out->mutable_data<T>(place);
 
-    matmul(x_data, y_data, squared_xy_data, m, n, k);
+    matmul(x_data, y_data, squared_xy_data, &attr);
     vsquare_xy(squared_xy_data, squared_xy_data, o_numel);
 
-    vsquare_x(x_data, squared_x_data, m * k);
-    vsquare_y(y_data, squared_y_data, k * n);
-    matmul(squared_x_data, squared_y_data, o_data, m, n, k);
+    vsquare_x(x_data, squared_x_data, attr.m * attr.k);
+    vsquare_y(y_data, squared_y_data, attr.k * attr.n);
+    matmul(squared_x_data, squared_y_data, o_data, &attr);
 
     vsub(squared_xy_data, o_data, o_data, o_numel);
     vscal(&scalar, o_data, o_data, o_numel);
 
@@ -311,8 +311,9 @@ void BenchMatMulKernel() {
         const T* a_data = a.data<T>();
         const T* b_data = b.data<T>();
         T* c_data = c.mutable_data<T>(PlaceType());
-        BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data,
-                                                           c_data, m, n, k);
+        const jit::matmul_attr_t attr{m, n, k};
+        BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data,
+                                                           c_data, &attr);
       }
     }
   }
 
@@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET)
 endfunction()
 
 # use gen jitcode kernel by name
+USE_JITKERNEL_GEN(kMatMul)
 USE_JITKERNEL_GEN(kVMul)
 USE_JITKERNEL_GEN(kVAdd)
 USE_JITKERNEL_GEN(kVSub)
 
@@ -0,0 +1,128 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/matmul.h"
+#include <stddef.h>  // offsetof
+#include <vector>
+
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+// Emits the code for z = x * y for the m == 1 case.
+//
+// The n output columns are split into vector-width blocks, and the blocks are
+// processed in groups small enough to keep one accumulator register per
+// block. For each group, every element of x is broadcast in turn and
+// multiplied into the accumulators against the matching slice of row k of y;
+// after the last k the accumulators are stored to z.
+//
+// Assumes y is a dense row-major k_ x n_ float buffer read in place through
+// param_y -- TODO confirm against the kMatMul reference implementation.
+void MatMulJitCode::genCode() {
+  preCode();
+  int block, rest;
+  const auto groups = packed_groups(n_, k_, &block, &rest);
+  PADDLE_ENFORCE_GT(groups.front(), 0);
+
+  const int block_len = sizeof(float) * block;
+  // The two highest-numbered vector registers hold the broadcast x element
+  // and the current block of y; registers 0..groups[g]-1 are accumulators.
+  const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
+  const int w_reg_idx = x_reg_idx - 1;
+  // Weights are read directly from y. The original note sketched a pre-packed
+  // alternative that would load the pointer from the attr instead, i.e.
+  //   mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t, packed_weight)]);
+  mov(reg_ptr_wgt, param_y);
+  // Byte length of one row of y.
+  const size_t row_len = sizeof(float) * static_cast<size_t>(n_);
+  // Byte offset of the current group's first column. It is the same offset
+  // inside z and inside any row of y.
+  size_t z_offset = 0;
+  for (size_t g = 0; g < groups.size(); ++g) {
+    size_t x_offset = 0;
+    for (int k = 0; k < k_; ++k) {
+      // Start at row k of y, at this group's first column. Deriving the offset
+      // from (k, group) instead of accumulating it across groups keeps the
+      // second and later groups aligned with their own columns; for a single
+      // group with rest == 0 the addresses are the same as a running offset.
+      size_t wgt_offset = static_cast<size_t>(k) * row_len + z_offset;
+      vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]);
+      // Zero the accumulators before the first multiply-add of this group.
+      if (k == 0) {
+        for (int i = 0; i < groups[g]; ++i) {
+          vxorps(zmm_t(i), zmm_t(i), zmm_t(i));
+        }
+      }
+      for (int i = 0; i < groups[g]; ++i) {
+        vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]);
+        vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx));
+        wgt_offset += block_len;
+      }
+      // After the last k, write the finished accumulators to z.
+      if (k == k_ - 1) {
+        for (int i = 0; i < groups[g]; ++i) {
+          // The final block of the final group is partial when rest != 0; it
+          // is written by the narrower stores below instead.
+          if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) {
+            break;
+          }
+          vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i));
+        }
+      }
+      x_offset += sizeof(float);
+    }
+    z_offset += block_len * groups[g];
+  }
+
+  if (rest != 0) {
+    // Store the partial last block with progressively narrower moves.
+    // NOTE(review): every store below reads the low lanes of the same
+    // register; no lane shift is emitted and reg_idx never changes, so a
+    // remainder that needs more than one store would repeat the low elements.
+    // This should be refined with a masked store. The path is not reached
+    // while MatMulCreator::UseMe requires n % ZMM_FLOAT_BLOCK == 0.
+    int reg_idx = groups.back() - 1;
+    z_offset = (n_ - rest) * sizeof(float);
+    int inner_block = 8;
+    while (rest > 0) {
+      if (rest >= 8) {
+        inner_block = 8;
+        vmovups(ptr[param_z + z_offset], ymm_t(reg_idx));
+        // shift zmm of inner_block, change reg_idx if update
+      } else if (rest >= 4) {
+        inner_block = 4;
+        vmovups(ptr[param_z + z_offset], xmm_t(reg_idx));
+      } else if (rest >= 2) {
+        inner_block = 2;
+        vmovq(ptr[param_z + z_offset], xmm_t(reg_idx));
+      } else {
+        inner_block = 1;
+        vmovss(ptr[param_z + z_offset], xmm_t(reg_idx));
+      }
+      z_offset += inner_block * sizeof(float);
+      rest -= inner_block;
+    }
+  }
+
+  postCode();
+}
+
+// Creator registered for kMatMul: decides when the generated kernel applies,
+// estimates the code buffer size, and builds the MatMulJitCode instance.
+class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
+ public:
+  // The generated kernel is used only for a single-row x on AVX-512F, with n
+  // a whole number of ZMM float blocks and k below 512.
+  bool UseMe(const matmul_attr_t& attr) const override {
+    if (attr.m != 1) {
+      return false;
+    }
+    if (!platform::MayIUse(platform::avx512f)) {
+      return false;
+    }
+    return attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
+  }
+  // Size estimate in bytes for the code buffer: a fixed part plus a term
+  // proportional to k times the number of column blocks (rounded up by one).
+  size_t CodeSize(const matmul_attr_t& attr) const override {
+    const int block =
+        platform::MayIUse(platform::avx512f) ? ZMM_FLOAT_BLOCK : YMM_FLOAT_BLOCK;
+    const int blocks_per_row = attr.n / block + 1;
+    return 96 + 4 * attr.k * blocks_per_row * 8;
+  }
+  // Checks that all three dimensions are positive, then generates the code.
+  std::unique_ptr<GenBase> CreateJitCode(
+      const matmul_attr_t& attr) const override {
+    PADDLE_ENFORCE_GT(attr.m, 0);
+    PADDLE_ENFORCE_GT(attr.n, 0);
+    PADDLE_ENFORCE_GT(attr.k, 0);
+    const size_t code_size = CodeSize(attr);
+    return make_unique<MatMulJitCode>(attr, code_size);
+  }
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator);
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <stdlib.h>  // for malloc and free
+#include <string>
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+// JIT code generator for the kMatMul kernel. Only m == 1 is supported; the
+// constructor enforces that and generates the code immediately.
+class MatMulJitCode : public JitCode {
+ public:
+  // attr      : matrix dimensions (m, n, k) the code is specialised for.
+  // code_size : size of the code buffer handed to JitCode.
+  // code_ptr  : optional buffer handed to JitCode (nullptr by default).
+  explicit MatMulJitCode(const matmul_attr_t& attr,
+                         size_t code_size = 256 * 1024,
+                         void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr),
+        m_(attr.m),
+        n_(attr.n),
+        k_(attr.k),
+        name_("MatMulJitCode_M" + std::to_string(attr.m) + "_N" +
+              std::to_string(attr.n) + "_K" + std::to_string(attr.k)) {
+    PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
+    this->genCode();
+  }
+
+  // Returns "MatMulJitCode_M<m>_N<n>_K<k>". The string is owned by this
+  // object, so the pointer stays valid for the object's lifetime; returning
+  // c_str() of a function-local string would hand back a dangling pointer.
+  virtual const char* name() const { return name_.c_str(); }
+  void genCode() override;
+
+ private:
+  int m_, n_, k_;
+  // Built once in the constructor, before genCode() runs. Declared after the
+  // dimensions so that it matches the initializer-list order.
+  std::string name_;
+
+  // Kernel arguments in ABI order: x, y, z and the attr pointer.
+  reg64_t param_x{abi_param1};
+  reg64_t param_y{abi_param2};
+  reg64_t param_z{abi_param3};
+  reg64_t param_attr{abi_param4};
+  reg64_t reg_tmp{rax};
+
+  // Base pointer used for the weight (y) loads in genCode().
+  reg64_t reg_ptr_wgt{r10};
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
@@ -16,6 +16,8 @@
 #include <fstream>
 #include <iostream>
 #include <sstream>
+#include <vector>
+#include "paddle/fluid/platform/cpu_info.h"
 
 DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
 
@@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const {
   }
 }
 
+// Splits the n output columns into vector-width blocks and partitions those
+// blocks into groups that fit the available accumulator registers.
+//
+// Returns the number of blocks in each group, in order. When the pointers are
+// non-null, *block_out receives the number of floats per vector register and
+// *rest_out receives n % block (the size of a trailing partial block, or 0).
+// The k argument is currently unused.
+std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
+  const bool has_avx512 = platform::MayIUse(platform::avx512f);
+  const int lanes = has_avx512 ? ZMM_FLOAT_BLOCK : YMM_FLOAT_BLOCK;
+  const int total_regs = has_avx512 ? 32 : 16;
+
+  // Two registers are reserved (one for x, one for y); the rest accumulate z.
+  const int acc_regs = total_regs - 2;
+
+  // Round n up to a whole number of blocks.
+  int padded_n = n;
+  if (n % lanes != 0) {
+    padded_n = (n / lanes + 1) * lanes;
+  }
+  const int block_count = padded_n / lanes;
+
+  // Full groups first, then one smaller group for any leftover blocks.
+  std::vector<int> groups(block_count / acc_regs, acc_regs);
+  const int tail_blocks = block_count % acc_regs;
+  if (tail_blocks != 0) {
+    groups.push_back(tail_blocks);
+  }
+
+  if (block_out != nullptr) {
+    *block_out = lanes;
+  }
+  if (rest_out != nullptr) {
+    *rest_out = n % lanes;
+  }
+  return groups;
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
Original file line number	Diff line number	Diff line change
`@@ -311,8 +311,9 @@ void BenchMatMulKernel() {`
`311`	`311`	`const T* a_data = a.data<T>();`
`312`	`312`	`const T* b_data = b.data<T>();`
`313`	`313`	`T* c_data = c.mutable_data<T>(PlaceType());`
`314`		`- BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data,`
`315`		`- c_data, m, n, k);`
	`314`	`+ const jit::matmul_attr_t attr{m, n, k};`
	`315`	`+ BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data,`
	`316`	`+ c_data, &attr);`
`316`	`317`	`}`
`317`	`318`	`}`
`318`	`319`	`}`