PaddlePaddle
diff --git a/‎paddle/fluid/operators/math/jit_code.cc
Lines changed: 126 additions & 0 deletions b/‎paddle/fluid/operators/math/jit_code.cc
Lines changed: 126 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/math/jit_code.h
Lines changed: 24 additions & 0 deletions b/‎paddle/fluid/operators/math/jit_code.h
Lines changed: 24 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/math/jit_kernel.h
Lines changed: 1 addition & 0 deletions b/‎paddle/fluid/operators/math/jit_kernel.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎paddle/fluid/operators/math/jit_kernel_blas.cc
Lines changed: 9 additions & 22 deletions b/‎paddle/fluid/operators/math/jit_kernel_blas.cc
Lines changed: 9 additions & 22 deletions
@@ -151,6 +151,132 @@ void ReluJitCode::generate() {
   }
   ret();
 }
+
+bool VExpJitCode::init(int d) {  // Tells the caller whether a JIT exp kernel may be used for a vector of length d.
+  return MayIUse(avx) && d == 8;  // accepted only when MayIUse(avx) holds and d is exactly 8; other lengths are not handled yet
+}
+
+#define ALIGN32 __attribute__((aligned(32)))  // 32-byte alignment, needed because the tables below are read with vmovaps/vmovdqa
+#define EXP_HIG 88.3762626647949f  // upper bound the input is clamped to in generate() (vminps)
+#define EXP_LOW -88.3762626647949f  // lower bound the input is clamped to in generate() (vmaxps)
+#define CEPHES_LOG2EF 1.44269504088896341  // log2(e); x * log2(e) yields the power-of-two exponent n
+#define CEPHES_EXP_C1 0.693359375  // C1 + C2 ~= ln(2), split in two parts for the range reduction x - n*ln(2)
+#define CEPHES_EXP_C2 -2.12194440e-4  // small correction term of the ln(2) split
+#define CEPHES_EXP_P0 1.9875691500E-4  // P0..P5: polynomial coefficients, consumed in this order by generate()
+#define CEPHES_EXP_P1 1.3981999507E-3
+#define CEPHES_EXP_P2 8.3334519073E-3
+#define CEPHES_EXP_P3 4.1665795894E-2
+#define CEPHES_EXP_P4 1.6666665459E-1
+#define CEPHES_EXP_P5 5.0000001201E-1
+
+#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val  // broadcasts one value into 8 consecutive array slots
+
+#define OFFSET_EXP_0P5 1 * AVX_FLOAT_BLOCK * sizeof(float)  // byte offsets into exp_float_consts; offset 0 (no macro) is the block of 1.f
+#define OFFSET_EXP_HIG 2 * AVX_FLOAT_BLOCK * sizeof(float)  // offsets match the table only if AVX_FLOAT_BLOCK == 8 (REPEAT_8TIMES) -- TODO confirm
+#define OFFSET_EXP_LOW 3 * AVX_FLOAT_BLOCK * sizeof(float)  // NOTE(review): expansions are unparenthesized; fine for the `base + OFFSET` and `i < OFFSET` uses in generate()
+#define OFFSET_EXP_LOG2EF 4 * AVX_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_C1 5 * AVX_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_C2 6 * AVX_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P0 7 * AVX_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P1 8 * AVX_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P2 9 * AVX_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P3 10 * AVX_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P4 11 * AVX_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P5 12 * AVX_FLOAT_BLOCK * sizeof(float)
+
+static const float exp_float_consts[] ALIGN32 = {  // 13 blocks of 8 identical floats; block order must stay in sync with the OFFSET_EXP_* macros
+    REPEAT_8TIMES(1.f),           REPEAT_8TIMES(0.5f),
+    REPEAT_8TIMES(EXP_HIG),       REPEAT_8TIMES(EXP_LOW),
+    REPEAT_8TIMES(CEPHES_LOG2EF), REPEAT_8TIMES(CEPHES_EXP_C1),
+    REPEAT_8TIMES(CEPHES_EXP_C2), REPEAT_8TIMES(CEPHES_EXP_P0),
+    REPEAT_8TIMES(CEPHES_EXP_P1), REPEAT_8TIMES(CEPHES_EXP_P2),
+    REPEAT_8TIMES(CEPHES_EXP_P3), REPEAT_8TIMES(CEPHES_EXP_P4),
+    REPEAT_8TIMES(CEPHES_EXP_P5)};
+
+static const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};  // 0x7f added to n before the shift by 23 in generate() (float exponent bias)
+static int g_tmp_mem[16] ALIGN32 = {0};  // scratch for the AVX-only path of generate(); NOTE(review): one shared mutable buffer written by the generated code -- looks racy if kernels run on several threads, confirm
+
+void VExpJitCode::generate() {  // Emits code that computes exp() of the 8 floats at param1 and stores the 8 results at param2.
+  preCode();
+  // NOTE(review): no registers are pushed here; presumably preCode()/postCode() handle prologue and epilogue -- confirm
+  // registers (see the member declarations): ymm0 = input / reduced argument, ymm1 = result
+  // ymm2..ymm5 are scratch; rax holds the address of the constant table (later of the scratch buffer)
+  int offset = 0;
+  vmovups(ymm_src, ptr[param1 + offset]);  // unaligned load of the 8 input floats
+  mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
+  vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
+  vminps(ymm_src, ymm_src, ymm_tmp);  // x = min(x, EXP_HIG)
+  vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
+  vmaxps(ymm_src, ymm_src, ymm_tmp);  // x = max(x, EXP_LOW)
+  // range reduction: write exp(x) as exp(g) * 2^n with x = g + n*ln(2)
+  vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
+  vmulps(ymm_fx, ymm_src, ymm_tmp);
+  vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
+  vaddps(ymm_fx, ymm_fx, ymm_tmp);  // fx = x * log2(e) + 0.5
+  vroundps(ymm_fy, ymm_fx, 0x01);  // imm 0x01 selects rounding toward -inf, i.e. fy = floor(fx)
+  // where fy > fx, subtract 1; NOTE(review): after a floor this mask looks always empty -- confirm it is only a safety net
+  vcmpgtps(ymm_mask, ymm_fy, ymm_fx);
+  vmovaps(ymm_tmp, ptr[reg_ptr_global]);  // offset 0 of the table is the block of 1.f
+  vandps(ymm_mask, ymm_mask, ymm_tmp);  // all-ones lanes become 1.0f, the others stay 0
+  vsubps(ymm_fx, ymm_fy, ymm_mask);  // fx now holds n as a float
+  vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]);
+  vmulps(ymm_fy, ymm_fx, ymm_tmp);
+  vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
+  vmulps(ymm_z, ymm_fx, ymm_tmp);  // ymm_z shares its register with ymm_mask, whose last use is the vsubps above
+  vsubps(ymm_src, ymm_src, ymm_fy);
+  vsubps(ymm_src, ymm_src, ymm_z);  // g = x - n*C1 - n*C2
+  vmulps(ymm_z, ymm_src, ymm_src);  // z = g * g
+  vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
+  vmulps(ymm_dst, ymm_src, ymm_tmp);  // Horner evaluation starts with y = P0 * g
+  for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
+       i += (AVX_FLOAT_BLOCK * sizeof(float))) {
+    vmovaps(ymm_tmp, ptr[reg_ptr_global + i]);  // coefficients P1..P4, one table block per iteration (loop unrolled at generation time)
+    vaddps(ymm_dst, ymm_dst, ymm_tmp);
+    vmulps(ymm_dst, ymm_dst, ymm_src);  // y = (y + Pi) * g
+  }
+  vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
+  vaddps(ymm_dst, ymm_dst, ymm_tmp);
+  vmulps(ymm_dst, ymm_dst, ymm_z);  // y = (y + P5) * g^2
+  vaddps(ymm_dst, ymm_dst, ymm_src);
+  vmovaps(ymm_tmp, ptr[reg_ptr_global]);
+  vaddps(ymm_dst, ymm_dst, ymm_tmp);  // y = y + g + 1, the approximation of exp(g)
+
+  // build 2^n as a float bit pattern: (n + 0x7f) << 23 places the biased exponent in the exponent field
+  ymm_t ymm_int = ymm_fx;  // alias: the integer form of n overwrites the register that held fx
+  vcvttps2dq(ymm_int, ymm_fx);  // float -> int32 with truncation
+  mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
+  vmovdqa(ymm_tmp, ptr[reg_ptr_global]);
+  if (MayIUse(avx2)) {  // 256-bit integer add/shift available: handle all 8 lanes at once
+    vpaddd(ymm_int, ymm_int, ymm_tmp);
+    vpslld(ymm_int, ymm_int, 23);
+  } else if (MayIUse(avx)) {  // AVX only: integer ops are done per 128-bit half, going through g_tmp_mem
+    // clobbers ymm_int, ymm_tmp and reg_ptr_global
+    xmm_t xtmp1 = xmm_t(ymm_int);  // intended as the 128-bit view of ymm_int's register; NOTE(review): confirm this conversion really yields an xmm operand
+    xmm_t xtmp2 = xmm_t(ymm_tmp);  // intended as the 128-bit view of ymm_tmp's register; same caveat as above
+    mov(reg_ptr_global, reinterpret_cast<size_t>(g_tmp_mem));
+    vmovdqa(ptr[reg_ptr_global], ymm_int);  // spill all 8 exponents to g_tmp_mem[0..7]
+    vmovdqa(ptr[reg_ptr_global + AVX_FLOAT_BLOCK * sizeof(float)], ymm_tmp);  // spill the 0x7f lanes behind them
+    vpaddd(xtmp1, xtmp1, xtmp2);
+    vpslld(xtmp1, xtmp1, 23);
+    vmovdqa(ptr[reg_ptr_global], xtmp1);  // lower 4 lanes finished, written back to g_tmp_mem[0..3]
+    // upper 128 bits: reload both operands from the spill area and repeat
+    vmovdqa(xtmp1, ptr[reg_ptr_global + 4 /*xmm float block*/ * sizeof(float)]);
+    vmovdqa(xtmp2,
+            ptr[reg_ptr_global +
+                (AVX_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]);
+    vpaddd(xtmp1, xtmp1, xtmp2);
+    vpslld(xtmp1, xtmp1, 23);
+    vmovdqa(ptr[reg_ptr_global + 4 /*xmm float block*/ * sizeof(float)], xtmp1);
+    // reload the combined 256-bit result
+    vmovdqa(ymm_int, ptr[reg_ptr_global]);
+  }
+  vmulps(ymm_dst, ymm_dst, ymm_int);  // exp(x) = exp(g) * 2^n, ymm_int being read as floats here
+  vmovups(ptr[param2 + offset], ymm_dst);  // unaligned store of the 8 results
+
+  //  no explicit ret() here; presumably postCode() emits the return -- confirm
+  postCode();
+}
+
 }  // namespace gen
 }  // namespace jitkernel
 }  // namespace math
 
@@ -108,6 +108,30 @@ class ReluJitCode : public JitCode {
   ymm_t ymm_dst = ymm_t(1);
 };
 
+class VExpJitCode : public JitCode {  // JIT code generator for a vectorized exp() over floats; see generate() for the emitted sequence
+ public:
+  DECLARE_JIT_CODE(VExpJitCode);
+  explicit VExpJitCode(int d, size_t code_size = 256 * 1024,  // d: vector length; code_size / code_ptr are forwarded to JitCode
+                       void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr), num_(d) {}
+  static bool init(int d);  // true when this generator can be used for length d
+  void generate() override;  // emits the kernel body
+
+ private:
+  int num_;  // vector length given at construction; not read by the generate() shown in this change
+  reg64_t param1{abi_param1};  // first call argument: generate() loads the input from this address
+  reg64_t param2{abi_param2};  // second call argument: generate() stores the result to this address
+
+  reg64_t reg_ptr_global = rax;  // holds the address of whichever constant table / scratch buffer is in use
+  ymm_t ymm_src = ymm_t(0);  // input, later the reduced argument g
+  ymm_t ymm_dst = ymm_t(1);  // polynomial accumulator and final result
+  ymm_t ymm_fx = ymm_t(2);  // x * log2(e) + 0.5, later n (also reused as the integer register in generate())
+  ymm_t ymm_fy = ymm_t(3);  // floor(fx), later n * C1
+  ymm_t ymm_mask = ymm_t(4);  // comparison mask; deliberately the same register as ymm_z
+  ymm_t ymm_z = ymm_t(4);  // n * C2, later g^2; only written after ymm_mask's last use
+  ymm_t ymm_tmp = ymm_t(5);  // staging register for constants loaded from memory
+};
+
 }  // namespace gen
 }  // namespace jitkernel
 }  // namespace math
 
@@ -117,6 +117,7 @@ template <typename T>
 class VExpKernel : public VActKernel<T> {  // Interface of the vectorized exp kernel for element type T.
  public:
   virtual void ComputeDeprecated(const T *x, T *y) const = 0;  // older virtual entry point; takes input x and output y, no explicit length
+  void (*Compute)(const T *, T *, int);  // function-pointer entry point (input, output, int); presumably the int is the element count -- confirm against callers
 };
 
 template <typename T>
 
@@ -25,10 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -128,18 +124,11 @@ void VScalMKL<double>(const double* a, const double* x, double* y, int n) {
 
 #endif
 
-#define DECLARE_STATIC_FUNC                                 \
-  static inline std::string name(int d) {                   \
-    PADDLE_THROW("DType should be either float or double"); \
-  }                                                         \
-  static inline bool useJIT(int d) { return false; }        \
-  static inline bool useMKL(int d) { return false; }
-
 /* VMUL JitKernel */
 template <typename T>
 class VMulKernelImpl : public VMulKernel<T> {
  public:
-  DECLARE_STATIC_FUNC;
+  JITKERNEL_DECLARE_STATIC_FUNC;
   explicit VMulKernelImpl(int d) : VMulKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
@@ -191,7 +180,7 @@ bool VMulKernelImpl<double>::useMKL(int d) {
 template <typename T>
 class VAddKernelImpl : public VAddKernel<T> {
  public:
-  DECLARE_STATIC_FUNC;
+  JITKERNEL_DECLARE_STATIC_FUNC;
   explicit VAddKernelImpl(int d) : VAddKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
@@ -241,7 +230,7 @@ bool VAddKernelImpl<double>::useMKL(int d) {
 template <typename T>
 class VAddReluKernelImpl : public VAddReluKernel<T> {
  public:
-  DECLARE_STATIC_FUNC;
+  JITKERNEL_DECLARE_STATIC_FUNC;
   explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
@@ -273,7 +262,7 @@ bool VAddReluKernelImpl<float>::useJIT(int d) {
 template <typename T>
 class VScalKernelImpl : public VScalKernel<T> {
  public:
-  DECLARE_STATIC_FUNC;
+  JITKERNEL_DECLARE_STATIC_FUNC;
   explicit VScalKernelImpl(int d) : VScalKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
@@ -322,7 +311,7 @@ bool VScalKernelImpl<double>::useMKL(int d) {
 template <typename T>
 class VAddBiasKernelImpl : public VAddBiasKernel<T> {
  public:
-  DECLARE_STATIC_FUNC;
+  JITKERNEL_DECLARE_STATIC_FUNC;
   explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
@@ -355,14 +344,14 @@ bool VAddBiasKernelImpl<float>::useJIT(int d) {
 template <typename T>
 class VReluKernelImpl : public VReluKernel<T> {
  public:
-  DECLARE_STATIC_FUNC;
+  JITKERNEL_DECLARE_STATIC_FUNC;
   explicit VReluKernelImpl(int d) : VReluKernel<T>() {
     this->num_ = d;  // TODO(TJ): remove me when ComputeDeprecated done
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
-      size_t sz = 96 /*init*/ +
-                  d / AVX_FLOAT_BLOCK * 4 /* instructions*/ *
-                      8 /*everage byte for each instruction*/;
+      size_t sz = 96 /* init size */ +
+                  d / AVX_FLOAT_BLOCK * 4 /* instructions */ *
+                      8 /* average bytes for each instruction */;
       jitcode_.reset(new gen::ReluJitCode(d, sz > 4096 ? sz : 4096));
       this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
       return;
@@ -388,8 +377,6 @@ bool VReluKernelImpl<float>::useJIT(int d) {
 }
 #endif
 
-#undef DECLARE_STATIC_FUNC
-
 REGISTER_JITKERNEL(vmul, VMulKernel);
 REGISTER_JITKERNEL(vadd, VAddKernel);
 REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);