
Commit ea47685

Merge pull request #14646 from jczaja/prv-softmax-mkl-sasum
Softmax for inference MKL further changes
2 parents 2238b96 + 48e1b97 commit ea47685

5 files changed: 39 additions, 8 deletions


paddle/fluid/operators/math/blas.h

Lines changed: 8 additions & 0 deletions
@@ -168,6 +168,9 @@ class Blas {
   template <typename T>
   void SCAL(int n, const T a, T* x) const;
 
+  template <typename T>
+  T ASUM(int n, T* x, int inc) const;
+
   template <typename T>
   void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
                    int K, T alpha, const T* A, const T* B, T beta, T* C,
@@ -269,6 +272,11 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template SCAL<T>(args...);
   }
 
+  template <typename... ARGS>
+  T ASUM(ARGS... args) const {
+    return Base()->template ASUM<T>(args...);
+  }
+
   template <typename... ARGS>
   void BatchedGEMM(ARGS... args) const {
     Base()->template BatchedGEMM<T>(args...);
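
Note: the new ASUM entry mirrors the existing SCAL wrapper, with BlasT forwarding to the device-specific Blas implementation. A hedged usage sketch follows; RowAsum and the GetBlas call pattern are illustrative assumptions about the surrounding framework, not part of this commit.

#include "paddle/fluid/operators/math/blas.h"  // for math::GetBlas / BlasT (assumed include path)

// Sketch only: sum one softmax row through the templated BLAS wrapper.
template <typename DeviceContext>
float RowAsum(const DeviceContext& ctx, float* row, int num_classes) {
  auto blas = paddle::operators::math::GetBlas<DeviceContext, float>(ctx);
  return blas.ASUM(num_classes, row, /*inc=*/1);  // dispatches to cblas_sasum under MKLML
}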

paddle/fluid/operators/math/blas_impl.h

Lines changed: 26 additions & 0 deletions
@@ -84,6 +84,11 @@ struct CBlas<float> {
     platform::dynload::cblas_sscal(args...);
   }
 
+  template <typename... ARGS>
+  static float ASUM(ARGS... args) {
+    return platform::dynload::cblas_sasum(args...);
+  }
+
   template <typename... ARGS>
   static void GEMM_BATCH(ARGS... args) {
     platform::dynload::cblas_sgemm_batch(args...);
@@ -174,6 +179,11 @@ struct CBlas<double> {
     platform::dynload::cblas_dscal(args...);
   }
 
+  template <typename... ARGS>
+  static double ASUM(ARGS... args) {
+    return platform::dynload::cblas_dasum(args...);
+  }
+
   template <typename... ARGS>
   static void GEMM_BATCH(ARGS... args) {
     platform::dynload::cblas_dgemm_batch(args...);
@@ -268,6 +278,7 @@ struct CBlas<platform::float16> {
   static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
   static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
   static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
+  static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); };
 #ifdef PADDLE_WITH_MKLML
   static void GEMM_BATCH(...) {
     PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
@@ -476,6 +487,21 @@ void Blas<platform::CPUDeviceContext>::SCAL(int n, const T a, T *x) const {
 #endif
 }
 
+template <>
+template <typename T>
+T Blas<platform::CPUDeviceContext>::ASUM(int n, T *x, int inc) const {
+  auto sum = static_cast<T>(0.0);
+#ifdef PADDLE_WITH_MKLML
+  sum = CBlas<T>::ASUM(n, x, inc);
+#else
+  // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum
+  for (int c = 0; c < n; ++c) {
+    sum += x[c];
+  }
+#endif
+  return sum;
+}
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
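
Note: BLAS sasum/dasum return the sum of magnitudes, sum over |x[i*inc]|, whereas the non-MKLML fallback above sums raw values with unit stride. That is sufficient for the softmax caller (the inputs are exp() outputs, so non-negative, and inc is 1), but it is not a general asum. A small reference sketch, illustrative only and not part of the commit:

#include <cmath>  // std::abs

// What a stride-aware BLAS-style asum computes, written as a plain loop.
template <typename T>
T ReferenceAsum(int n, const T* x, int inc) {
  T sum = static_cast<T>(0);
  for (int i = 0; i < n; ++i) {
    sum += std::abs(x[i * inc]);  // sum of magnitudes, per the BLAS definition
  }
  return sum;
}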

paddle/fluid/operators/math/softmax_impl.h

Lines changed: 2 additions & 5 deletions
@@ -100,11 +100,8 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
 
     blas.VEXP(num_classes * batch_size, out_data, out_data);
     for (int n = 0; n < batch_size; ++n) {
-      entities[n] = out_data[n * num_classes];
-      for (int c = 1; c < num_classes; ++c) {
-        entities[n] += out_data[n * num_classes + c];
-      }
-      blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]);
+      auto sum = blas.ASUM(num_classes, &out_data[n * num_classes], 1);
+      blas.SCAL(num_classes, 1.0f / sum, &out_data[n * num_classes]);
     }
   }
 };
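
Note: this loop is the per-row softmax normalization, out[n][c] = exp(in[n][c]) / sum over k of exp(in[n][k]). VEXP has already exponentiated every element; ASUM now produces each row's denominator in a single BLAS call (the exponentials are non-negative, so the sum of magnitudes equals the plain sum), and SCAL divides the row by it. The manual accumulation loop and the entities[n] buffer are no longer needed for the sum.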

paddle/fluid/operators/softmax_op.h

Lines changed: 1 addition & 3 deletions
@@ -36,9 +36,7 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
 
 #ifdef PADDLE_ON_INFERENCE
-    math::SoftmaxFunctor<
-        DeviceContext, T,
-        std::is_same<DeviceContext, platform::CPUDeviceContext>::value>()(
+    math::SoftmaxFunctor<DeviceContext, T, true>()(
         context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
 #else
     math::SoftmaxFunctor<DeviceContext, T, false>()(
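
Note: under PADDLE_ON_INFERENCE the third (boolean) template argument of SoftmaxFunctor is now simply true, rather than being derived from a std::is_same check against platform::CPUDeviceContext; when the kernel runs with CPUDeviceContext, this selects the CPU-specialized functor shown in softmax_impl.h above. The non-inference path still passes false.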

paddle/fluid/platform/dynload/mklml.h

Lines changed: 2 additions & 0 deletions
@@ -68,6 +68,8 @@ extern void* mklml_dso_handle;
   __macro(cblas_dgemm_batch); \
   __macro(cblas_sdot);        \
   __macro(cblas_ddot);        \
+  __macro(cblas_sasum);       \
+  __macro(cblas_dasum);       \
   __macro(cblas_sscal);       \
   __macro(cblas_dscal);       \
   __macro(vsAdd);             \
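
Note: adding cblas_sasum and cblas_dasum to this macro list is what makes platform::dynload::cblas_sasum / cblas_dasum available to the CBlas<float> / CBlas<double> wrappers in blas_impl.h: each __macro(name) entry declares a dynamic-load wrapper that resolves the symbol from the MKLML shared library (mklml_dso_handle) at runtime.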
