exp, sigmoid, tanh jitcode support more size

tensor-tang · tensor-tang · commit 1f00723fa379 · 2018-11-16T07:44:45.000Z
test=develop
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
@@ -33,11 +33,11 @@ namespace math {
 #define SIGMOID_THRESHOLD_MIN -40.0
 #define SIGMOID_THRESHOLD_MAX 13.0
 
-#define AVX_FLOAT_BLOCK 8
+#define YMM_FLOAT_BLOCK 8
 #define AVX_DOUBLE_BLOCK 4
-#define AVX2_FLOAT_BLOCK 8
+#define YMM_FLOAT_BLOCK 8
 #define AVX2_DOUBLE_BLOCK 4
-#define AVX512_FLOAT_BLOCK 16
+#define ZMM_FLOAT_BLOCK 16
 #define AVX512_DOUBLE_BLOCK 8
 
 template <typename T>
@@ -88,7 +88,7 @@ template <>
 inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
                                                 const float* x, float* y) {
 #ifdef __AVX__
-  constexpr int block = AVX_FLOAT_BLOCK;
+  constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
     vec_scal<float, platform::jit::isa_any>(n, a, x, y);
     return;
@@ -142,7 +142,7 @@ template <>
 inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
                                                     const float* x, float* y) {
 #ifdef __AVX__
-  constexpr int block = AVX_FLOAT_BLOCK;
+  constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
     vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
     return;
@@ -200,7 +200,7 @@ inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
                                                  const float* y, const float* z,
                                                  float* out) {
 #ifdef __AVX__
-  constexpr int block = AVX_FLOAT_BLOCK;
+  constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
     vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
     return;
@@ -257,7 +257,7 @@ template <>
 inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
                                                     const float* x, float* y) {
 #ifdef __AVX__
-  constexpr int block = AVX_FLOAT_BLOCK;
+  constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
     vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
     return;
@@ -326,7 +326,7 @@ template <>
 inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
                                                    float* y) {
 #ifdef __AVX__
-  constexpr int block = AVX_FLOAT_BLOCK;
+  constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
     vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
     return;
@@ -415,7 +415,7 @@ template <>
 inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
                                                 float* y) {
 #ifdef __AVX__
-  constexpr int block = AVX_FLOAT_BLOCK;
+  constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block * 4) {
     vec_relu<float, platform::jit::isa_any>(n, x, y);
     return;
diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
@@ -41,7 +41,7 @@ void VXXJitCode::generate() {
   } else if (scalar_index_ == 2) {
     vbroadcastss(ymm_src2, ptr[param2]);
   }
-  for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) {
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
     if (scalar_index_ != 1) {
       vmovups(ymm_src1, ptr[param1 + offset]);
     }
@@ -57,9 +57,9 @@ void VXXJitCode::generate() {
       vmaxps(ymm_dst, ymm_zero, ymm_dst);
     }
     vmovups(ptr[param3 + offset], ymm_dst);
-    offset += sizeof(float) * AVX_FLOAT_BLOCK;
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
   }
-  int rest = num_ % AVX_FLOAT_BLOCK;
+  int rest = num_ % YMM_FLOAT_BLOCK;
   if (rest >= 4) {
     if (scalar_index_ != 1) {
       vmovups(xmm_src1, ptr[param1 + offset]);
@@ -133,23 +133,23 @@ void VXXJitCode::generate() {
 
 #define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
 
-#define OFFSET_EXP_ONE 0 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_TWO 1 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_0P5 2 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_HIG 3 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_LOW 4 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_LOG2EF 5 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_C1 6 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_C2 7 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P0 8 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P1 9 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P2 10 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P3 11 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P4 12 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_P5 13 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_EXP_MAX_INPUT 14 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_SIGMOID_MAX 15 * AVX_FLOAT_BLOCK * sizeof(float)
-#define OFFSET_SIGMOID_MIN 16 * AVX_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
+#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
 
 static const float exp_float_consts[] ALIGN32 = {
     REPEAT_8TIMES(1.f),
@@ -177,9 +177,12 @@ bool VActJitCode::init(int d, operand_type type) {
   bool ok = MayIUse(avx);
   if (type == operand_type::relu) {
     return ok;
+  } else if (type == operand_type::exp) {
+    // exp is slower than mkl when d >= 256
+    return ok && d % 8 == 0 && d < 256;
   } else {
     // TODO(TJ): support more
-    return ok && d == 8;  // only 8 yet
+    return ok && d % 8 == 0;
   }
 }
 
@@ -224,7 +227,7 @@ void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
   vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
   vmulps(ymm_dst, ymm_src, ymm_tmp);
   for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
-       i += (AVX_FLOAT_BLOCK * sizeof(float))) {
+       i += (YMM_FLOAT_BLOCK * sizeof(float))) {
     vmovaps(ymm_tmp, ptr[reg_ptr_global + i]);  // P1~P4
     vaddps(ymm_dst, ymm_dst, ymm_tmp);
     vmulps(ymm_dst, ymm_dst, ymm_src);
@@ -249,15 +252,15 @@ void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
     reg64_t reg_ptr_tmp = reg_ptr_global;
     mov(reg_ptr_tmp, reinterpret_cast<size_t>(g_tmp_mem));
     vmovdqa(ptr[reg_ptr_tmp], ymm_int);
-    vmovdqa(ptr[reg_ptr_tmp + AVX_FLOAT_BLOCK * sizeof(float)], ymm_tmp);
+    vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], ymm_tmp);
     vpaddd(xtmp1, xtmp1, xtmp2);
     vpslld(xtmp1, xtmp1, 23);
     vmovdqa(ptr[reg_ptr_tmp], xtmp1);
     // next 128bits
     vmovdqa(xtmp1, ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)]);
     vmovdqa(xtmp2,
             ptr[reg_ptr_tmp +
-                (AVX_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]);
+                (YMM_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]);
     vpaddd(xtmp1, xtmp1, xtmp2);
     vpslld(xtmp1, xtmp1, 23);
     vmovdqa(ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)], xtmp1);
@@ -317,7 +320,7 @@ void VActJitCode::generate() {
     vxorps(ymm_zero, ymm_zero, ymm_zero);
   }
   int offset = 0;
-  for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) {
+  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
     vmovups(ymm_src, ptr[param1 + offset]);
     switch (type_) {
       case operand_type::relu:
@@ -338,14 +341,14 @@ void VActJitCode::generate() {
         break;
     }
     vmovups(ptr[param2 + offset], ymm_dst);
-    offset += sizeof(float) * AVX_FLOAT_BLOCK;
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
   }
   if (type_ != operand_type::relu) {
     // TODO(TJ): remove me
     ret();
     return;
   }
-  int rest = num_ % AVX_FLOAT_BLOCK;
+  int rest = num_ % YMM_FLOAT_BLOCK;
   if (rest >= 4) {
     vmovups(xmm_src, ptr[param1 + offset]);
     vmaxps(xmm_dst, xmm_zero, xmm_src);
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
@@ -29,10 +29,9 @@ namespace jitkernel {
 #define SIGMOID_THRESHOLD_MIN -40.0
 #define SIGMOID_THRESHOLD_MAX 13.0
 #define EXP_MAX_INPUT 40.0
-// TODO(TJ): change AVX_FLOAT_BLOCK to YMM_FLOAT_BLOCK
-#define AVX_FLOAT_BLOCK 8
-#define AVX2_FLOAT_BLOCK 8
-#define AVX512_FLOAT_BLOCK 16
+#define XMM_FLOAT_BLOCK 4
+#define YMM_FLOAT_BLOCK 8
+#define ZMM_FLOAT_BLOCK 16
 
 typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block;
 
diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -133,7 +133,7 @@ class VMulKernelImpl : public VMulKernel<T> {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
       // roughly estimate the size of code
-      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
+      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
       jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 0, false,
                                          sz > 4096 ? sz : 4096));
       this->Compute =
@@ -184,7 +184,7 @@ class VAddKernelImpl : public VAddKernel<T> {
   explicit VAddKernelImpl(int d) : VAddKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
-      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
+      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
       jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, false,
                                          sz > 4096 ? sz : 4096));
       this->Compute =
@@ -234,7 +234,7 @@ class VAddReluKernelImpl : public VAddReluKernel<T> {
   explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
-      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
+      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
       jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 0, true,
                                          sz > 4096 ? sz : 4096));
       this->Compute =
@@ -266,7 +266,7 @@ class VScalKernelImpl : public VScalKernel<T> {
   explicit VScalKernelImpl(int d) : VScalKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
-      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
+      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
       jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::mul, 1, false,
                                          sz > 4096 ? sz : 4096));
       this->Compute =
@@ -315,7 +315,7 @@ class VAddBiasKernelImpl : public VAddBiasKernel<T> {
   explicit VAddBiasKernelImpl(int d) : VAddBiasKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
-      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
+      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
       jitcode_.reset(new gen::VXXJitCode(d, gen::operand_type::add, 1, false,
                                          sz > 4096 ? sz : 4096));
       this->Compute =
@@ -349,7 +349,7 @@ class VReluKernelImpl : public VReluKernel<T> {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
       size_t sz = 96 /* init size */ +
-                  d / AVX_FLOAT_BLOCK * 4 /* instructions */ *
+                  d / YMM_FLOAT_BLOCK * 4 /* instructions */ *
                       8 /* average bytes for each instruction */;
       jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::relu,
                                           sz > 4096 ? sz : 4096));
diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -105,14 +105,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
       int tag_num)                                                             \
       : CRFDecodeKernel<float>() {                                             \
     this->num_ = tag_num;                                                      \
-    this->end_ = this->num_ / AVX_FLOAT_BLOCK;                                 \
-    this->rest_ = this->num_ % AVX_FLOAT_BLOCK;                                \
+    this->end_ = this->num_ / YMM_FLOAT_BLOCK;                                 \
+    this->rest_ = this->num_ % YMM_FLOAT_BLOCK;                                \
   }                                                                            \
   template <>                                                                  \
   void CRFDecodeKernelImpl<float, jit::avx, block>::Compute(                   \
       const int seq_len, const float* x, const float* w, float* alpha,         \
       int* track) const {                                                      \
-    INIT_ALPHA(AVX_FLOAT_BLOCK)                                                \
+    INIT_ALPHA(YMM_FLOAT_BLOCK)                                                \
     /* Use the column-major strategy to get the location of maximum score.*/   \
     int seq_offset = 0;                                                        \
     constexpr int state_trans_base_idx = 2;                                    \
@@ -150,7 +150,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
           max_score = _mm256_max_ps(max_score, score_v);                       \
           trans_offset += this->num_;                                          \
         }                                                                      \
-        UPDATE_ALPHA(AVX_FLOAT_BLOCK)                                          \
+        UPDATE_ALPHA(YMM_FLOAT_BLOCK)                                          \
       }                                                                        \
       seq_offset += this->num_;                                                \
     }                                                                          \
@@ -161,14 +161,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
   CRFDecodeKernelImpl<float, isa, block>::CRFDecodeKernelImpl(int tag_num)     \
       : CRFDecodeKernel<float>() {                                             \
     this->num_ = tag_num;                                                      \
-    this->end_ = this->num_ / AVX2_FLOAT_BLOCK;                                \
-    this->rest_ = this->num_ % AVX2_FLOAT_BLOCK;                               \
+    this->end_ = this->num_ / YMM_FLOAT_BLOCK;                                 \
+    this->rest_ = this->num_ % YMM_FLOAT_BLOCK;                                \
   }                                                                            \
   template <>                                                                  \
   void CRFDecodeKernelImpl<float, isa, block>::Compute(                        \
       const int seq_len, const float* x, const float* w, float* alpha,         \
       int* track) const {                                                      \
-    INIT_ALPHA(AVX2_FLOAT_BLOCK)                                               \
+    INIT_ALPHA(YMM_FLOAT_BLOCK)                                                \
     /* Use the column-major strategy to get the location of maximum score.*/   \
     int seq_offset = 0;                                                        \
     constexpr int state_trans_base_idx = 2;                                    \
@@ -196,7 +196,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
           max_score = _mm256_max_ps(max_score, score_v);                       \
           trans_offset += this->num_;                                          \
         }                                                                      \
-        UPDATE_ALPHA(AVX2_FLOAT_BLOCK)                                         \
+        UPDATE_ALPHA(YMM_FLOAT_BLOCK)                                          \
       }                                                                        \
       seq_offset += this->num_;                                                \
     }                                                                          \
@@ -208,14 +208,14 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
       int tag_num)                                                             \
       : CRFDecodeKernel<float>() {                                             \
     this->num_ = tag_num;                                                      \
-    this->end_ = this->num_ / AVX512_FLOAT_BLOCK;                              \
-    this->rest_ = this->num_ % AVX512_FLOAT_BLOCK;                             \
+    this->end_ = this->num_ / ZMM_FLOAT_BLOCK;                                 \
+    this->rest_ = this->num_ % ZMM_FLOAT_BLOCK;                                \
   }                                                                            \
   template <>                                                                  \
   void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute(               \
       const int seq_len, const float* x, const float* w, float* alpha,         \
       int* track) const {                                                      \
-    INIT_ALPHA(AVX512_FLOAT_BLOCK)                                             \
+    INIT_ALPHA(ZMM_FLOAT_BLOCK)                                                \
     /* Use the column-major strategy to get the location of maximum score.*/   \
     int seq_offset = 0;                                                        \
     constexpr int state_trans_base_idx = 2;                                    \
@@ -250,7 +250,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
                                                        this->num_ + j_offset), \
                             max_j);                                            \
         /* Calculate the offset of next step*/                                 \
-        j_offset += AVX512_FLOAT_BLOCK;                                        \
+        j_offset += ZMM_FLOAT_BLOCK;                                           \
         if (j == this->end_ - 1) {                                             \
           if (this->rest_ > 0) {                                               \
             j_offset += last_offset;                                           \
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -116,7 +116,7 @@ class VExpKernelImpl : public VExpKernel<T> {
   explicit VExpKernelImpl(int d) : VExpKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
-      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;  // should change
+      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 70 * 8;
       jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::exp,
                                           sz > 4096 ? sz : 4096));
       this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
@@ -167,7 +167,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
   explicit VSigmoidKernelImpl(int d) : VSigmoidKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
-      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;  // should change
+      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 82 * 8;
       jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::sigmoid,
                                           sz > 4096 ? sz : 4096));
       this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
@@ -219,7 +219,7 @@ class VTanhKernelImpl : public VTanhKernel<T> {
   explicit VTanhKernelImpl(int d) : VTanhKernel<T>() {
 #ifdef PADDLE_WITH_XBYAK
     if (useJIT(d)) {
-      size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;  // should change
+      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 84 * 8;
       jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::tanh,
                                           sz > 4096 ? sz : 4096));
       this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h