PaddlePaddle
diff --git a/‎.gitignore
Lines changed: 1 addition & 1 deletion b/‎.gitignore
Lines changed: 1 addition & 1 deletion
diff --git a/‎paddle/capi/Matrix.cpp
Lines changed: 8 additions & 0 deletions b/‎paddle/capi/Matrix.cpp
Lines changed: 8 additions & 0 deletions
diff --git a/‎paddle/capi/matrix.h
Lines changed: 2 additions & 0 deletions b/‎paddle/capi/matrix.h
Lines changed: 2 additions & 0 deletions
diff --git a/‎paddle/cuda/CMakeLists.txt
Lines changed: 2 additions & 0 deletions b/‎paddle/cuda/CMakeLists.txt
Lines changed: 2 additions & 0 deletions
diff --git a/‎paddle/cuda/include/hl_cnn.h
Lines changed: 4 additions & 3 deletions b/‎paddle/cuda/include/hl_cnn.h
Lines changed: 4 additions & 3 deletions
diff --git a/‎paddle/cuda/include/stub/hl_cnn_stub.h
Lines changed: 2 additions & 1 deletion b/‎paddle/cuda/include/stub/hl_cnn_stub.h
Lines changed: 2 additions & 1 deletion
diff --git a/‎paddle/cuda/src/hl_cuda_cnn.cu
Lines changed: 14 additions & 5 deletions b/‎paddle/cuda/src/hl_cuda_cnn.cu
Lines changed: 14 additions & 5 deletions
diff --git a/‎paddle/framework/backward.cc
Lines changed: 27 additions & 6 deletions b/‎paddle/framework/backward.cc
Lines changed: 27 additions & 6 deletions
diff --git a/‎paddle/framework/var_type.h
Lines changed: 22 additions & 0 deletions b/‎paddle/framework/var_type.h
Lines changed: 22 additions & 0 deletions
diff --git a/‎paddle/function/ConvOp.h
Lines changed: 6 additions & 0 deletions b/‎paddle/function/ConvOp.h
Lines changed: 6 additions & 0 deletions
@@ -21,7 +21,7 @@ third_party/
 cmake-build-*
 
 # generated while compiling
-python/paddle/v2/framework/core.so
+python/paddle/v2/fluid/core.so
 paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake
 
@@ -121,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat,
 
 paddle_matrix paddle_matrix_create_sparse(
     uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+#ifndef PADDLE_MOBILE_INFERENCE
   auto ptr = new paddle::capi::CMatrix();
   ptr->mat = paddle::Matrix::createSparseMatrix(
       height,
@@ -131,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse(
       false,
       useGpu);
   return ptr;
+#else
+  return nullptr;
+#endif
 }
 
 paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
@@ -140,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                             uint64_t colSize,
                                             float* valueArray,
                                             uint64_t valueSize) {
+#ifndef PADDLE_MOBILE_INFERENCE
   if (mat == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (rowArray == nullptr || colArray == nullptr ||
@@ -160,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
   } else {
     return kPD_NOT_SUPPORTED;
   }
+#else
+  return kPD_NOT_SUPPORTED;
+#endif
 }
@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height,
  * @param isBinary is binary (either 1 or 0 in matrix) or not.
  * @param useGpu is using GPU or not.
  * @return paddle_matrix.
+ * @note Mobile inference does not support this interface.
  */
 PD_API paddle_matrix paddle_matrix_create_sparse(
     uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
@@ -129,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
  * NULL if the matrix is binary.
  * @param [in] valueSize length of value array. Zero if the matrix is binary.
  * @return paddle_error
+ * @note Mobile inference does not support this interface.
  */
 PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                                    int* rowArray,
 
@@ -27,7 +27,9 @@ if(WITH_GPU)
     set_source_files_properties(${CUDA_CXX_SOURCES}
                                 PROPERTIES COMPILE_FLAGS "-D__NVCC__")
 else()
+    if (NOT MOBILE_INFERENCE)
     set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
+    endif()
 endif()
 
 set(CUDA_CU_SOURCES
 
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "hl_base.h"
 
 /**
- * @brief   Maximum pool forward.
+ * @brief   Maximum pool forward with Mask output.
  *
  * @param[in]   frameCnt    batch size of input image.
  * @param[in]   inputData   input data.
@@ -35,7 +35,7 @@ limitations under the License. */
  * @param[in]   paddingW    padding width.
  * @param[out]  tgtData     output data.
  * @param[in]   tgtStride   stride between output data samples.
- *
+ * @param[out]  maskData    the location indices of the selected max data.
  */
 extern void hl_maxpool_forward(const int frameCnt,
                                const real* inputData,
@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               real* maskData = NULL);
 
 /**
  * @brief   Maximum pool backward.
 
@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               real* MaskData) {}
 
 inline void hl_maxpool_backward(const int frameCnt,
                                 const real* inputData,
 
@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads,
                                  const int offsetH,
                                  const int offsetW,
                                  real* tgtData,
-                                 const int tgtStride) {
+                                 const int tgtStride,
+                                 real* maskData) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads,
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     real maxval = -FLT_MAX;
+    int max_index = -1;
     inputData += (frameNum * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        if (maxval < inputData[h * width + w])
-          maxval = inputData[h * width + w];
+        if (maxval < inputData[h * width + w]) {
+          max_index = h * width + w;
+          maxval = inputData[max_index];
+        }
       }
     }
     int tgtIndex =
         index % (pooledW * pooledH * channels) + frameNum * tgtStride;
     tgtData[tgtIndex] = maxval;
+    if (maskData != NULL) {
+      maskData[tgtIndex] = max_index;
+    }
   }
 }
 
@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt,
                         const int paddingH,
                         const int paddingW,
                         real* tgtData,
-                        const int tgtStride) {
+                        const int tgtStride,
+                        real* maskData) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt,
                                                          paddingH,
                                                          paddingW,
                                                          tgtData,
-                                                         tgtStride);
+                                                         tgtStride,
+                                                         maskData);
   CHECK_SYNC("hl_maxpool_forward failed");
 }
 
 
@@ -377,6 +377,12 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
   return grad_op_descs;
 }
 
+// Forward declaration: CreateStepBlock is defined after MakeBlockBackward,
+// which calls it, and it in turn calls MakeBlockBackward.
+static BlockDescBind* CreateStepBlock(
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx);
+
 std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     ProgramDescBind& program_desc, int block_idx,
     std::unordered_set<std::string>* no_grad_vars,
@@ -392,13 +398,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
 
     if ((*it)->Type() == "recurrent") {
       int step_block_idx = (*it)->GetBlockAttr("step_block");
-      auto backward_block_op_descs = MakeBlockBackward(
-          program_desc, step_block_idx, no_grad_vars, grad_to_var);
+      BlockDescBind* backward_block = CreateStepBlock(
+          program_desc, no_grad_vars, grad_to_var, step_block_idx);
+      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
+    } else if ((*it)->Type() == "conditional_block") {
       BlockDescBind* backward_block =
-          program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
-      for (auto& ptr : backward_block_op_descs) {
-        backward_block->AppendAllocatedOp(std::move(ptr));
-      }
+          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
+                          (*it)->GetBlockAttr("block"));
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
     } else {
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
@@ -449,6 +455,21 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
   return backward_descs;
 }
 
+// Builds the backward block for the sub-block `step_block_idx` of
+// `program_desc` and returns it.
+//
+// The backward op descs are generated by MakeBlockBackward and then moved
+// into a block newly appended to `program_desc`; the block at
+// `step_block_idx` is the argument handed to AppendBlock. `no_grad_vars` and
+// `grad_to_var` are forwarded unchanged to MakeBlockBackward.
+static BlockDescBind* CreateStepBlock(
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx) {
+  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
+                                                   no_grad_vars, grad_to_var);
+  BlockDescBind* backward_block =
+      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
+  for (auto& ptr : backward_block_op_descs) {
+    // Qualify std::move explicitly rather than relying on ADL to find it.
+    backward_block->AppendAllocatedOp(std::move(ptr));
+  }
+  return backward_block;
+}
+
 ParamGradInfoMap AppendBackward(
     ProgramDescBind& program_desc, const VarDescBind& target,
     const std::unordered_set<std::string>& no_grad_vars) {
 
@@ -27,10 +27,32 @@ inline VarDesc::VarType ToVarType(std::type_index type) {
     return VarDesc_VarType_LOD_RANK_TABLE;
   } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
     return VarDesc_VarType_LOD_TENSOR_ARRAY;
+  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
+    return VarDesc_VarType_SELECTED_ROWS;
   } else {
     PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
   }
 }
 
+// Calls `visitor` with the object stored in `var`, fetched via var.Get<T>()
+// where T is chosen from the tag returned by ToVarType(var.Type()).
+// Handled tags: LOD_TENSOR, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, SELECTED_ROWS.
+// Any other tag is reported through PADDLE_THROW.
+template <typename Visitor>
+inline void VisitVarType(const Variable& var, Visitor visitor) {
+  const auto var_type = ToVarType(var.Type());
+  if (var_type == VarDesc_VarType_LOD_TENSOR) {
+    visitor(var.Get<framework::LoDTensor>());
+  } else if (var_type == VarDesc_VarType_LOD_RANK_TABLE) {
+    visitor(var.Get<LoDRankTable>());
+  } else if (var_type == VarDesc_VarType_LOD_TENSOR_ARRAY) {
+    visitor(var.Get<LoDTensorArray>());
+  } else if (var_type == VarDesc_VarType_SELECTED_ROWS) {
+    visitor(var.Get<SelectedRows>());
+  } else {
+    PADDLE_THROW("Not supported visit type, %d", var_type);
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
@@ -61,6 +61,7 @@ class ConvFunctionBase : public FunctionBase {
     // function arguments
     strides_ = config.get<std::vector<size_t>>("strides");
     paddings_ = config.get<std::vector<size_t>>("paddings");
+    dilations_ = config.get<std::vector<size_t>>("dilations");
     groups_ = config.get<size_t>("groups");
 
     // number of inputs and outputs
@@ -118,6 +119,7 @@ class ConvFunctionBase : public FunctionBase {
 
   std::vector<size_t> strides_;
   std::vector<size_t> paddings_;
+  std::vector<size_t> dilations_;
 
   /// Group size, refer to grouped convolution in
   /// Alex Krizhevsky's paper: when group=2, the first half of the
@@ -133,6 +135,10 @@ class ConvFunctionBase : public FunctionBase {
 
   inline int paddingW() const { return paddings_[1]; }
 
+  inline int dilationH() const { return dilations_[0]; }
+
+  inline int dilationW() const { return dilations_[1]; }
+
   // A temporary memory in convolution calculation.
   MemoryHandlePtr memory_;