
Commit 36dd770

Author: wanghaox
Commit message: add roi operator unittest
2 parents: 7960928 + 53bd51e

85 files changed: +2441 −7522 lines

(Large commit: only a subset of the changed files is shown below.)

benchmark/IntelOptimizedPaddle.md
Lines changed: 19 additions & 8 deletions

@@ -12,11 +12,11 @@ Machine:
 
 System: CentOS release 6.3 (Final), Docker 1.12.1.
 
-PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
-
-- MKL-DNN tag v0.10
-- MKLML 2018.0.20170720
+PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+- MKL-DNN tag v0.11
+- MKLML 2018.0.1.20171007
 - OpenBLAS v0.2.20
+(TODO: will rerun after 0.11.0)
 
 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
 
@@ -31,15 +31,26 @@ Input image size - 3 * 224 * 224, Time: images/second
 
 | BatchSize | 64    | 128   | 256   |
 |-----------|-------|-------|-------|
-| OpenBLAS  | 7.82  | 8.62  | 10.34 |
-| MKLML     | 11.02 | 12.86 | 15.33 |
-| MKL-DNN   | 27.69 | 28.8  | 29.27 |
+| OpenBLAS  | 7.80  | 9.00  | 10.80 |
+| MKLML     | 12.12 | 13.70 | 16.18 |
+| MKL-DNN   | 28.46 | 29.83 | 30.44 |
+
+
+chart on batch size 128
+TBD
+
+- ResNet-50
+
+| BatchSize | 64    | 128   | 256   |
+|-----------|-------|-------|-------|
+| OpenBLAS  | 25.22 | 25.68 | 27.12 |
+| MKLML     | 32.52 | 31.89 | 33.12 |
+| MKL-DNN   | 81.69 | 82.35 | 84.08 |
 
 
 chart on batch size 128
 TBD
 
-- ResNet
 - GoogLeNet
 
 ### Laptop

paddle/gserver/activations/ActivationFunction.cpp
Lines changed: 31 additions & 0 deletions

@@ -212,6 +212,37 @@ Error __must_check backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
 
+/*
+ * @brief SoftSign Activation.
+ * \f[
+ *    f(z) = \frac{z}{1 + |z|}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softsign)
+private:
+MatrixPtr denominator_;
+
+Error __must_check forward(Argument& act) {
+  size_t height = act.value->getHeight();
+  size_t width = act.value->getWidth();
+  Matrix::resizeOrCreate(
+      denominator_, height, width, false, useGpu(act.deviceId));
+  denominator_->assign(*act.value);
+  denominator_->abs2();
+  denominator_->add(1.);
+
+  act.value->dotDiv(*act.value, *denominator_);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  denominator_->square2();
+  denominator_->scalarDiv(*denominator_, 1.);
+  act.grad->dotMul(*act.grad, *denominator_);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softsign)
+
 /**
  * @brief Relu Activation.
  * forward. y = max(0, z)
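
Note on the math: the forward pass computes f(z) = z / (1 + |z|) and caches the denominator, and since f'(z) = 1 / (1 + |z|)^2, the backward pass squares the cached denominator, takes its reciprocal (the square2() / scalarDiv(..., 1.) pair), and multiplies it into the incoming gradient. A minimal standalone sketch of the same computation on raw arrays, independent of Paddle's Matrix API:

// Sketch only: mirrors the softsign forward/backward added above,
// using raw float arrays instead of Paddle's Matrix operations.
#include <cmath>
#include <cstddef>

// forward: y = z / (1 + |z|); denom caches 1 + |z|, like denominator_.
void softsignForward(const float* z, float* y, float* denom, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    denom[i] = 1.0f + std::fabs(z[i]);
    y[i] = z[i] / denom[i];
  }
}

// backward: grad *= 1 / (1 + |z|)^2, reusing the cached denominator.
void softsignBackward(float* grad, const float* denom, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    grad[i] /= denom[i] * denom[i];
  }
}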

paddle/gserver/layers/MKLDNNAddtoLayer.cpp
Lines changed: 20 additions & 23 deletions

@@ -38,12 +38,13 @@ bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
 }
 
 void MKLDNNAddtoLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
   reshapeInput(bs, ih, iw);
   ic = inputLayers_[0]->getSize() / ih / iw;
   CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
   for (size_t i = 0; i < inputLayers_.size(); i++) {
     CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
     CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
@@ -57,47 +58,43 @@ void MKLDNNAddtoLayer::reshape(
 }
 
 void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
                                 MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inVals_, bias, out);
-  in = inVals_[0];
+  resetFwdBuffers(inputs, biasVal_, out);
 
   std::shared_ptr<sum::primitive_desc> fwdPD;
   std::shared_ptr<sum::primitive_desc> biasPD;
-  resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);
 
-  resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
 }
 
 void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
                                 MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, bias, out);
-  in = inGrads_[0];
+  resetBwdBuffers(inputs, biasGrad_, out);
 
   // backward only need share output grad to input grad
-  for (size_t i = 0; i < inGrads_.size(); i++) {
-    if (inGrads_[i] != nullptr) {
-      inGrads_[i] = out;
-      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i] != nullptr) {
+      inputs[i] = out;
+      inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
     }
   }
 
   // backward bias
   bwdBias_ = nullptr;
-  if (bias) {
+  if (biasGrad_) {
     std::vector<float> scales(bs_, 1.0);
-    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
-    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+    std::vector<memory::primitive_desc> srcPDs(bs_,
+                                               biasGrad_->getPrimitiveDesc());
+    auto biasPD =
+        sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
     std::vector<primitive::at> srcs;
     for (size_t i = 0; i < grads_.size(); ++i) {
      srcs.push_back(*(grads_[i]));
    }
-    bwdBias_.reset(new sum(biasPD, srcs, *bias));
+    bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
     pipeline.push_back(*bwdBias_);
   }
 }
@@ -208,7 +205,7 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
 
   inputs.resize(inputLayers_.size());
   for (size_t i = 0; i < inputs.size(); i++) {
-    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
     CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
   }
 
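
Note on the "share output grad to input grad" loop: for an addto layer, y = x_0 + x_1 + ... + x_{n-1}, so the derivative of y with respect to every x_i is 1 and each input gradient is exactly the output gradient. The loop therefore aliases each input grad to the output grad instead of copying it. A minimal sketch of that aliasing idea, with hypothetical types standing in for MKLDNNMatrixPtr:

// Sketch only: GradBuffer/GradPtr are illustrative stand-ins, not Paddle types.
#include <memory>
#include <vector>

struct GradBuffer {
  std::vector<float> data;
};
using GradPtr = std::shared_ptr<GradBuffer>;

// For addto, dL/dx_i == dL/dy for every input, so each input grad can
// simply point at the output grad: O(1) aliasing, no elementwise copy.
void shareOutputGradToInputs(std::vector<GradPtr>& inputGrads,
                             const GradPtr& outputGrad) {
  for (auto& g : inputGrads) {
    if (g != nullptr) {
      g = outputGrad;
    }
  }
}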

paddle/gserver/layers/MKLDNNAddtoLayer.h
Lines changed: 3 additions & 46 deletions

@@ -26,9 +26,6 @@ namespace paddle {
  */
 class MKLDNNAddtoLayer : public MKLDNNLayer {
 protected:
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
-
   // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
   size_t layerSize_;
 
@@ -50,52 +47,19 @@ class MKLDNNAddtoLayer : public MKLDNNLayer {
                const ParameterMap& parameterMap) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void updateWeights(const UpdateCallback& callback) override;
 
-  void printValueFormat() override {
-    for (size_t i = 0; i < inVals_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
-    }
-    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
-    }
-    if (extOutVal_) {
-      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
-    }
-  }
-
-  void printGradFormat() override {
-    if (extOutGrad_) {
-      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
-    }
-    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
-    }
-    for (size_t i = 0; i < inGrads_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
-    }
-  }
-
 protected:
-  /**
-   * Forward functions: reset buffers(inputs, output, bias),
-   * reset primitive descriptor,
-   * reset pipeline.
-   */
   void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
@@ -110,17 +74,10 @@ class MKLDNNAddtoLayer : public MKLDNNLayer {
                   std::vector<MKLDNNMatrixPtr>& inputs,
                   MKLDNNMatrixPtr& bias,
                   MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(inputs, output, bias)
-   */
   void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
 
-  /**
-   * prepare for bias
-   */
   void prepareBias(MKLDNNMatrixPtr& bias,
                    const MatrixPtr& biasMat,
                    const MKLDNNMatrixPtr& out,

paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
Lines changed: 17 additions & 19 deletions

@@ -116,21 +116,20 @@ void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
 }
 
 void MKLDNNBatchNormLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
   oh = ih;
   ow = iw;
   // ic_ and oc can not be changed
-  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
       << "Input channel can not be changed";
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
 }
 
 void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                     MKLDNNMatrixPtr& out) {
   // In training phase, it will always calculate mean and var,
   // so useGlobalStats must be false.
@@ -140,25 +139,23 @@ void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
     useGlobalStats_ = false;
   }
 
-  resetFwdBuffers(in, wgt, out);
+  resetFwdBuffers(inputs[0], wgtVal_, out);
 
-  resetFwdPD(fwdPD_, in, wgt, out);
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);
 
-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
 }
 
 void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                     MKLDNNMatrixPtr& out) {
   std::shared_ptr<bn_bwd::primitive_desc> pd;
 
-  resetBwdBuffers(in, wgt, out);
+  resetBwdBuffers(inputs[0], wgtGrad_, out);
 
-  resetBwdPD(pd, in, wgt, out);
+  resetBwdPD(pd, inputs[0], wgtGrad_, out);
 
-  resetBwdPipeline(pipeline, pd, in, wgt, out);
+  resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
 }
 
 void MKLDNNBatchNormLayer::forward(PassType passType) {
@@ -260,9 +257,9 @@ void MKLDNNBatchNormLayer::resetFwdPipeline(
 void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                            MKLDNNMatrixPtr& wgt,
                                            MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
   resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
   if (gradScaleShift_) {
     CHECK(wgtVal_);
     resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
@@ -297,11 +294,12 @@ void MKLDNNBatchNormLayer::resetBwdPipeline(
   if (pd == nullptr) {
     return;
   }
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
   bwdData_.reset(
       wgt && wgtVal_
-          ? new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *wgtVal_, *in, *wgt)
-          : new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *in));
+          ? new bn_bwd(
+                *pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
+          : new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
   pipeline.push_back(*bwdData_);
 }
 
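
Note on the reworked reshape() check: it derives the channel count from the live element count of the input instead of the previously cached inputElemenCnt_, verifying the invariant elemCnt == bs * ic * ih * iw. A small self-contained illustration of that arithmetic:

// Sketch of the invariant checked in reshape(): the input's element count
// must factor exactly into batch * channels * height * width.
#include <cassert>
#include <cstddef>

size_t inferChannels(size_t elemCnt, size_t bs, size_t ih, size_t iw) {
  return elemCnt / bs / ih / iw;  // same order of divisions as the diff
}

int main() {
  const size_t bs = 64, ic = 3, ih = 224, iw = 224;  // e.g. a VGG input batch
  const size_t elemCnt = bs * ic * ih * iw;
  assert(inferChannels(elemCnt, bs, ih, iw) == ic);
  return 0;
}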

paddle/gserver/layers/MKLDNNBatchNormLayer.h
Lines changed: 4 additions & 18 deletions

@@ -73,18 +73,14 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer {
   void forward(PassType passType) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void updateWeights(const UpdateCallback& callback) override;
@@ -98,11 +94,7 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer {
    * moving = moving * AvgFraction + local * (1 - AvgFraction)
    */
   void calMovingMeanAndVar();
-  /**
-   * Forward functions: reset buffers(input, weight, output),
-   * reset primitive descriptor,
-   * reset pipeline.
-   */
+
   void resetFwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& out);
@@ -115,12 +107,6 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer {
                   MKLDNNMatrixPtr& in,
                   MKLDNNMatrixPtr& wgt,
                   MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, weight, output),
-   * reset primitive descriptor,
-   * reset pipeline.
-   */
   void resetBwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& out);
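
Note on the calMovingMeanAndVar() doc comment kept above: it specifies the running-statistics update moving = moving * AvgFraction + local * (1 - AvgFraction). A one-line sketch of that exponential moving average (the function name here is illustrative, not Paddle's):

// Exponential moving average per the header comment:
// moving = moving * avgFraction + local * (1 - avgFraction)
inline void updateMovingStat(float& moving, float local, float avgFraction) {
  moving = moving * avgFraction + local * (1.0f - avgFraction);
}

With avgFraction close to 1, the running mean and variance change slowly across batches, which is what makes them usable as global statistics at inference time.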
