
Commit ae7452f

Merge branch 'develop' of github.com:baidu/Paddle into feature/fix_pydataprovider_multiple_obj_bugs

2 parents: 33b8164 + ca0bb40

31 files changed: +1231 −351 lines

doc/source/gserver/layers/layer.rst

Lines changed: 5 additions & 0 deletions
@@ -465,6 +465,11 @@ SumOfSquaresCostLayer
 .. doxygenclass:: paddle::SumOfSquaresCostLayer
    :members:
 
+SumCostLayer
+`````````````````````
+.. doxygenclass:: paddle::SumCostLayer
+   :members:
+
 CosSimLayer
 -----------
 .. doxygenclass:: paddle::CosSimLayer

doc/ui/api/trainer_config_helpers/layers.rst

Lines changed: 24 additions & 0 deletions
@@ -46,6 +46,12 @@ conv_operator
     :members: conv_operator
     :noindex:
 
+conv_projection
+-------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: conv_projection
+    :noindex:
+
 conv_shift_layer
 ------------------
 .. automodule:: paddle.trainer_config_helpers.layers
@@ -71,6 +77,12 @@ img_pool_layer
 --------------
 .. automodule:: paddle.trainer_config_helpers.layers
     :members: img_pool_layer
+    :noindex:
+
+spp_layer
+--------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: spp_layer
     :noindex:
 
 maxout_layer
@@ -254,6 +266,12 @@ expand_layer
     :members: expand_layer
     :noindex:
 
+repeat_layer
+------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: repeat_layer
+    :noindex:
+
 Math Layers
 ===========
 
@@ -401,6 +419,12 @@ hsigmoid
     :members: hsigmoid
     :noindex:
 
+sum_cost
+---------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: sum_cost
+    :noindex:
+
 Check Layer
 ============
 

paddle/cuda/include/hl_cnn.h

Lines changed: 10 additions & 4 deletions
@@ -91,6 +91,7 @@ extern void hl_expand_feature2col(
  * @param[in] paddingH padding height.
  * @param[in] paddingW padding width.
  * @param[out] tgtData output data.
+ * @param[in] tgtStride stride between output data samples.
  *
  */
 extern void hl_maxpool_forward(
@@ -100,7 +101,8 @@ extern void hl_maxpool_forward(
     const int pooledH, const int pooledW,
     const int sizeX, const int sizeY,
     const int strideH, const int strideW,
-    const int paddingH, const int paddingW, real* tgtData);
+    const int paddingH, const int paddingW,
+    real* tgtData, const int tgtStride);
 
 /**
  * @brief Maximum pool backward.
@@ -123,6 +125,7 @@ extern void hl_maxpool_forward(
  * @param[in] paddingH padding height.
  * @param[in] paddingW padding width.
  * @param[out] targetGrad output grad.
+ * @param[in] outStride stride between output data samples.
  *
  */
 extern void hl_maxpool_backward(
@@ -135,7 +138,7 @@ extern void hl_maxpool_backward(
     const int strideH, const int strideW,
     const int paddingH, const int paddingW,
     real scaleA, real scaleB,
-    real* targetGrad);
+    real* targetGrad, const int outStride);
 
 /**
  * @brief Average pool forward.
@@ -154,6 +157,7 @@ extern void hl_maxpool_backward(
  * @param[in] paddingH padding height.
  * @param[in] paddingW padding width.
  * @param[out] tgtData output data.
+ * @param[in] tgtStride stride between output data samples.
  *
  */
 extern void hl_avgpool_forward(
@@ -163,7 +167,8 @@ extern void hl_avgpool_forward(
     const int pooledH, const int pooledW,
     const int sizeX, const int sizeY,
     const int strideH, const int strideW,
-    const int paddingH, const int paddingW, real* tgtData);
+    const int paddingH, const int paddingW,
+    real* tgtData, const int tgtStride);
 
 /**
  * @brief Average pool backward.
@@ -184,6 +189,7 @@ extern void hl_avgpool_forward(
  * @param[in] scaleA scale.
  * @param[in] scaleB scale.
  * @param[out] backGrad output grad.
+ * @param[in] outStride stride between output data samples.
  *
  */
 extern void hl_avgpool_backward(
@@ -195,7 +201,7 @@ extern void hl_avgpool_backward(
     const int strideH, const int strideW,
     int paddingH, int paddingW,
     real scaleA, real scaleB,
-    real* backGrad);
+    real* backGrad, const int outStride);
 
 /**
  * @brief Cross-map-response normalize forward.
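What the new tgtStride / outStride parameters buy: the pooled result for sample frameNum now starts at row offset frameNum * tgtStride in the output matrix, instead of being assumed densely packed at channels * pooledH * pooledW values per sample, so a pooling projection can write into a row it shares with other projections. A minimal CPU sketch of that indexing, assuming row-major layout; the function and names below are illustrative, not part of the Paddle API:

#include <vector>

// Hypothetical sketch: copy one sample's pooled result into a row-major output
// whose rows may be wider than one sample (tgtStride >= sampleSize). With
// tgtStride == sampleSize this degenerates to the old densely packed layout.
void writePooledSample(const std::vector<float>& pooled,  // sampleSize values
                       int frameNum,    // sample index within the batch
                       int sampleSize,  // channels * pooledH * pooledW
                       int tgtStride,   // row width of the target matrix
                       float* tgtData) {
  for (int i = 0; i < sampleSize; ++i) {
    // Mirrors the kernels' tgtIndex = index % sampleSize + frameNum * tgtStride.
    tgtData[frameNum * tgtStride + i] = pooled[i];
  }
}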

paddle/cuda/include/stub/hl_cnn_stub.h

Lines changed: 6 additions & 4 deletions
@@ -44,7 +44,8 @@ inline void hl_maxpool_forward(
     const int pooledH, const int pooledW,
     const int sizeX, const int sizeY,
     const int strideH, const int strideW,
-    const int paddingH, const int paddingW, real* tgtData) {}
+    const int paddingH, const int paddingW,
+    real* tgtData, const int tgtStride) {}
 
 inline void hl_maxpool_backward(
     const int frameCnt, const real* inputData,
@@ -56,7 +57,7 @@ inline void hl_maxpool_backward(
     const int strideH, const int strideW,
     const int paddingH, const int paddingW,
     real scaleA, real scaleB,
-    real* targetGrad) {}
+    real* targetGrad, const int outStride) {}
 
 inline void hl_avgpool_forward(
     const int frameCnt, const real* inputData,
@@ -65,7 +66,8 @@ inline void hl_avgpool_forward(
     const int pooledH, const int pooledW,
     const int sizeX, const int sizeY,
     const int strideH, const int strideW,
-    const int paddingH, const int paddingW, real* tgtData) {}
+    const int paddingH, const int paddingW,
+    real* tgtData, const int tgtStride) {}
 
 inline void hl_avgpool_backward(
     const int frameCnt, const real* outGrad,
@@ -76,7 +78,7 @@ inline void hl_avgpool_backward(
     const int strideH, const int strideW,
     int paddingH, int paddingW,
     real scaleA, real scaleB,
-    real* backGrad) {}
+    real* backGrad, const int outStride) {}
 
 inline void hl_CMRNorm_forward(
     size_t frameCnt, const real* in, real* scale, real* out,
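The stubs repeat every signature from hl_cnn.h with an empty body so CPU-only builds still compile and link, which is why the parameter additions above must be mirrored here verbatim. A hedged illustration of the pattern with a made-up function name (the exact header-selection mechanism is presumably driven by the PADDLE_ONLY_CPU build flag):

// Hypothetical sketch of the stub pattern: call sites compile against one
// declaration; GPU builds link the real CUDA definition, while CPU-only
// builds see an empty inline stub, so call sites need no #ifdefs.
#ifndef PADDLE_ONLY_CPU
extern void hl_example_op(const float* in, float* out, int n);  // defined in a .cu file
#else
inline void hl_example_op(const float* in, float* out, int n) {}  // no-op stub
#endif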

paddle/cuda/src/hl_cuda_cnn.cu

Lines changed: 23 additions & 17 deletions
@@ -152,7 +152,7 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
                                  const int ksizeW, const int ksizeH,
                                  const int strideH, const int strideW,
                                  const int offsetH, const int offsetW,
-                                 real* tgtData) {
+                                 real* tgtData, const int tgtStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
@@ -173,7 +173,9 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
         maxval = inputData[h * width + w];
       }
     }
-    tgtData[index] = maxval;
+    int tgtIndex = index % (pooledW * pooledH * channels) +
+                   frameNum * tgtStride;
+    tgtData[tgtIndex] = maxval;
   }
 }
 
@@ -184,7 +186,7 @@ void hl_maxpool_forward(const int frameCnt, const real* inputData,
                         const int sizeX, const int sizeY,
                         const int strideH, const int strideW,
                         const int paddingH, const int paddingW,
-                        real* tgtData) {
+                        real* tgtData, const int tgtStride) {
 
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
@@ -194,7 +196,7 @@ void hl_maxpool_forward(const int frameCnt, const real* inputData,
   KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>>
       (num_kernels, inputData, channels, height, width,
        pooledH, pooledW, sizeX, sizeY, strideH, strideW,
-       paddingH, paddingW, tgtData);
+       paddingH, paddingW, tgtData, tgtStride);
   CHECK_SYNC("hl_maxpool_forward failed");
 }
 
@@ -207,7 +209,7 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
                                   const int strideH, const int strideW,
                                   const int padH, const int padW,
                                   real scaleA, real scaleB,
-                                  real* targetGrad) {
+                                  real* targetGrad, const int outStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     // find out the local index
@@ -223,8 +225,8 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
     int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
     real gradient = 0;
     real input = inputData[index];
-    outData += (frameNum * channels + offsetC) * pooledH * pooledW;
-    outGrad += (frameNum * channels + offsetC) * pooledH * pooledW;
+    outData += (frameNum * outStride + offsetC * pooledH * pooledW);
+    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
     for (int ph = phstart; ph < phend; ++ph) {
       for (int pw = pwstart; pw < pwend; ++pw) {
         if (input == outData[ph * pooledW + pw]) {
@@ -246,7 +248,7 @@ void hl_maxpool_backward(const int frameCnt, const real* inputData,
                          const int strideH, const int strideW,
                          const int paddingH, const int paddingW,
                          real scaleA, real scaleB,
-                         real* targetGrad) {
+                         real* targetGrad, const int outStride) {
 
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
@@ -257,7 +259,7 @@ void hl_maxpool_backward(const int frameCnt, const real* inputData,
        strideH, strideW,
        paddingH, paddingW,
        scaleA, scaleB,
-       targetGrad);
+       targetGrad, outStride);
   CHECK_SYNC("hl_maxpool_backward");
 }
 
@@ -268,7 +270,7 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
                                  const int sizeX, const int sizeY,
                                  const int strideH, const int strideW,
                                  const int padH, const int padW,
-                                 real* tgtData) {
+                                 real* tgtData, const int tgtStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
@@ -293,7 +295,9 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
         aveval += inputData[h * width + w];
       }
     }
-    tgtData[index] = aveval / pool_size;
+    int tgtIndex = index % (pooledW * pooledH * channels) +
+                   frameNum * tgtStride;
+    tgtData[tgtIndex] = aveval / pool_size;
   }
 }
 
@@ -303,14 +307,15 @@ void hl_avgpool_forward(const int frameCnt, const real* inputData,
                         const int pooledH, const int pooledW,
                         const int sizeX, const int sizeY,
                         const int strideH, const int strideW,
-                        const int paddingH, const int paddingW, real* tgtData) {
+                        const int paddingH, const int paddingW,
+                        real* tgtData, const int tgtStride) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>>
       (num_kernels, inputData, channels,
        height, width, pooledH, pooledW,
        sizeX, sizeY, strideH, strideW,
-       paddingH, paddingW, tgtData);
+       paddingH, paddingW, tgtData, tgtStride);
   CHECK_SYNC("hl_avgpool_forward failed");
 }
 
@@ -322,7 +327,7 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
                                   const int strideH, const int strideW,
                                   const int padH, const int padW,
                                   real scaleA, real scaleB,
-                                  real* tgtGrad) {
+                                  real* tgtGrad, const int outStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int offsetW = index % width + padW;
@@ -335,7 +340,8 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
     int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0;
     int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
     real gradient = 0;
-    outGrad += (frameNum * channels + offsetC) * pooledH * pooledW;
+    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
+
 
     for (int ph = phstart; ph < phend; ++ph) {
       for (int pw = pwstart; pw < pwend; ++pw) {
@@ -360,7 +366,7 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad,
                          const int strideH, const int strideW,
                          const int paddingH, const int paddingW,
                          real scaleA, real scaleB,
-                         real* backGrad) {
+                         real* backGrad, const int outStride) {
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
 
@@ -370,7 +376,7 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad,
        strideH, strideW,
        paddingH, paddingW,
        scaleA, scaleB,
-       backGrad);
+       backGrad, outStride);
   CHECK_SYNC("hl_avgpool_backward failed");
 }
 
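A sanity check of the remapping the forward kernels now perform: the dense thread index enumerates (frame, channel, ph, pw), index % (pooledW * pooledH * channels) recovers the within-sample offset, and frameNum * tgtStride jumps to that sample's row in the (possibly wider) output. A standalone check with hypothetical sizes:

#include <cassert>

int main() {
  // Hypothetical sizes: 3 channels, 4x4 pooled map, and an output row wide
  // enough to hold two projections of this size side by side.
  const int channels = 3, pooledH = 4, pooledW = 4;
  const int sampleSize = channels * pooledH * pooledW;  // 48
  const int tgtStride = 2 * sampleSize;                 // 96

  // Dense thread index for frame 1, channel 2, ph = 3, pw = 1.
  const int frameNum = 1, c = 2, ph = 3, pw = 1;
  const int index = ((frameNum * channels + c) * pooledH + ph) * pooledW + pw;  // 93

  // The kernels' remapping: keep the within-sample offset, jump by tgtStride.
  const int tgtIndex = index % sampleSize + frameNum * tgtStride;  // 45 + 96
  assert(tgtIndex == (c * pooledH + ph) * pooledW + pw + frameNum * tgtStride);
  return 0;
}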

paddle/gserver/layers/CostLayer.cpp

Lines changed: 35 additions & 0 deletions
@@ -562,4 +562,39 @@ void HuberTwoClass::backwardImpIn(
   }
 }
 
+/**
+ * This cost layer computes the sum of its input as loss.
+ * \f[
+ * o(i) = \sum_{j=1}^D y_{ij}
+ * \f]
+ */
+class SumCostLayer : public Layer {
+public:
+  explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+    bool ret = Layer::init(layerMap, parameterMap);
+    if (!ret) return ret;
+    CHECK_EQ(inputLayers_.size(), 1UL);
+    return true;
+  }
+
+  virtual void forward(PassType passType) {
+    Layer::forward(passType);
+    const MatrixPtr& input = getInputValue(0);
+
+    /* malloc memory for the output_ if necessary */
+    int batchSize = input->getHeight();
+    int size = 1;
+    resizeOutput(batchSize, size);
+    output_.value->sumRows(*input);
+  }
+
+  virtual void backward(const UpdateCallback& callback = nullptr) {
+    getInputGrad(0)->add((real)1);
+  }
+};
+
+REGISTER_LAYER(sum_cost, SumCostLayer);
+
 }  // namespace paddle
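Because the loss is a plain sum over the input elements, the gradient with respect to every element is exactly 1, which is why backward() simply adds 1 to the input gradient. A self-contained sketch of the same forward/backward semantics using plain C++ containers instead of Paddle's Matrix (names here are illustrative):

#include <vector>

// Minimal sketch of sum_cost on a [batchSize x dim] row-major input.
// forward: out[i] = sum of row i; backward: dL/dIn is all ones.
void sumCostForward(const std::vector<float>& in, int batchSize, int dim,
                    std::vector<float>& out) {
  out.assign(batchSize, 0.0f);
  for (int i = 0; i < batchSize; ++i)
    for (int j = 0; j < dim; ++j)
      out[i] += in[i * dim + j];  // counterpart of output_.value->sumRows(*input)
}

void sumCostBackward(std::vector<float>& inGrad) {
  for (float& g : inGrad) g += 1.0f;  // counterpart of getInputGrad(0)->add((real)1)
}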

paddle/gserver/layers/CostLayer.h

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ class SoftBinaryClassCrossEntropy : public CostLayer {
  * This cost layer computes Euclidean (L2) loss for real-valued regression
  * tasks.
  * \f[
- * L = \frac{1}{2N} \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
+ * L = \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
  * \f]
  */
 class SumOfSquaresCostLayer : public CostLayer {
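Dropping the \frac{1}{2N} factor rescales the documented loss and, with it, the implied gradient. As a quick check from the two formulas alone (this says nothing about how the implementation actually scales its gradients):

L_{\text{old}} = \frac{1}{2N} \sum_{i=1}^N \|\hat{y}_i - y_i\|_2^2
\;\Rightarrow\;
\frac{\partial L_{\text{old}}}{\partial \hat{y}_i} = \frac{1}{N}(\hat{y}_i - y_i),
\qquad
L_{\text{new}} = \sum_{i=1}^N \|\hat{y}_i - y_i\|_2^2
\;\Rightarrow\;
\frac{\partial L_{\text{new}}}{\partial \hat{y}_i} = 2(\hat{y}_i - y_i).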

paddle/gserver/layers/PoolLayer.cpp

Lines changed: 2 additions & 4 deletions
@@ -52,10 +52,8 @@ bool PoolLayer::init(const LayerMap& layerMap,
 Layer* PoolLayer::create(const LayerConfig& config) {
   CHECK_EQ(config.inputs_size(), 1);
   const std::string& pool = config.inputs(0).pool_conf().pool_type();
-  if (pool == "max-projection") {
-    return new MaxPoolProjectionLayer(config);
-  } else if (pool == "avg-projection") {
-    return new AvgPoolProjectionLayer(config);
+  if (pool == "max-projection" || pool == "avg-projection") {
+    return new PoolProjectionLayer(config);
 #ifndef PADDLE_ONLY_CPU
   } else if (CudnnPoolLayer::typeCheck(pool)) {
     return new CudnnPoolLayer(config);
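The factory previously gave each projection pool type its own class; both strings now map to a single PoolProjectionLayer, which presumably resolves max vs. avg from pool_type at runtime rather than through the class hierarchy. A reduced sketch of that dispatch with hypothetical names (the cuDNN type strings below are illustrative, not Paddle's actual ones):

#include <memory>
#include <stdexcept>
#include <string>

// Hypothetical reduction of PoolLayer::create: one class serves both
// projection modes and reads the mode from its config; other pool types
// keep dedicated classes.
struct Config { std::string poolType; };
struct Layer { virtual ~Layer() = default; };
struct PoolProjection : Layer { explicit PoolProjection(const Config&) {} };
struct CudnnPool : Layer { explicit CudnnPool(const Config&) {} };

std::unique_ptr<Layer> createPool(const Config& cfg) {
  if (cfg.poolType == "max-projection" || cfg.poolType == "avg-projection") {
    return std::make_unique<PoolProjection>(cfg);  // mode resolved inside
  }
  if (cfg.poolType == "cudnn-max-pool" || cfg.poolType == "cudnn-avg-pool") {
    return std::make_unique<CudnnPool>(cfg);  // illustrative type strings
  }
  throw std::invalid_argument("unknown pool type: " + cfg.poolType);
}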
