From 179e100f533cc8225e7b01e23d7b53b05024f88a Mon Sep 17 00:00:00 2001 From: zhangliujie Date: Wed, 31 Jul 2019 14:10:01 +0800 Subject: [PATCH 1/3] add ops from content dnn --- .../fluid/operators/match_matrix_tensor_op.cc | 644 ++++++++++++++ .../fluid/operators/match_matrix_tensor_op.h | 53 ++ .../operators/search_aligned_mat_mul_op.cc | 373 ++++++++ .../search_attention_padding_mask_op.cc | 263 ++++++ paddle/fluid/operators/search_compute.h | 422 +++++++++ paddle/fluid/operators/search_embedding_op.cc | 370 ++++++++ paddle/fluid/operators/search_fc_op.cc | 331 +++++++ paddle/fluid/operators/search_grnn_op.cc | 807 ++++++++++++++++++ paddle/fluid/operators/search_grnn_op.h | 27 + .../operators/search_group_padding_op.cc | 222 +++++ .../operators/search_seq_arithmetic_op.cc | 255 ++++++ .../operators/search_seq_depadding_op.cc | 207 +++++ paddle/fluid/operators/search_seq_fc_op.cc | 217 +++++ .../fluid/operators/search_seq_softmax_op.cc | 211 +++++ .../sequence_topk_avg_pooling_op.cc | 134 +++ .../sequence_topk_avg_pooling_op.h | 264 ++++++ .../sequence_ops/sequence_topk_pooling_op.cc | 137 +++ .../sequence_ops/sequence_topk_pooling_op.h | 175 ++++ paddle/fluid/operators/uniform_random_op.cc | 21 + paddle/fluid/operators/var_conv_2d_op.cc | 470 ++++++++++ paddle/fluid/operators/var_conv_2d_op.h | 46 + python/paddle/fluid/initializer.py | 49 +- python/paddle/fluid/layers/nn.py | 501 +++++++++++ 23 files changed, 6181 insertions(+), 18 deletions(-) create mode 100644 paddle/fluid/operators/match_matrix_tensor_op.cc create mode 100644 paddle/fluid/operators/match_matrix_tensor_op.h create mode 100644 paddle/fluid/operators/search_aligned_mat_mul_op.cc create mode 100644 paddle/fluid/operators/search_attention_padding_mask_op.cc create mode 100644 paddle/fluid/operators/search_compute.h create mode 100644 paddle/fluid/operators/search_embedding_op.cc create mode 100644 paddle/fluid/operators/search_fc_op.cc create mode 100644 paddle/fluid/operators/search_grnn_op.cc create mode 100644 paddle/fluid/operators/search_grnn_op.h create mode 100644 paddle/fluid/operators/search_group_padding_op.cc create mode 100644 paddle/fluid/operators/search_seq_arithmetic_op.cc create mode 100644 paddle/fluid/operators/search_seq_depadding_op.cc create mode 100644 paddle/fluid/operators/search_seq_fc_op.cc create mode 100644 paddle/fluid/operators/search_seq_softmax_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h create mode 100644 paddle/fluid/operators/var_conv_2d_op.cc create mode 100644 paddle/fluid/operators/var_conv_2d_op.h diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc new file mode 100644 index 00000000000000..e0108f69a82641 --- /dev/null +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -0,0 +1,644 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include +//#include "naive_gemm.h" + +#include "paddle/fluid/operators/match_matrix_tensor_op.h" + +#ifndef WIN32 +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" +#include "paddle/fluid/operators/math/blas.h" + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif +#endif + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +void 
MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), + "X(Input) of MatchMatrix should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Y(Input) of MatchMatrix should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "W(Input) of MatchMatrix should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of Fully Connected should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Tmp"), + "Tmp(Output) of Fully Connected should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + // for (int i = 0; i < x_dims.size(); i++) { + // LOG(ERROR) << "match_matrix_tensor: x_dims[" << i << "]:" << x_dims << + // "]"; + //} + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of Input(X) can't be less than 2."); + + auto y_dims = ctx->GetInputDim("Y"); + /* + for (int i = 0; i < y_dims.size(); i++) { + LOG(ERROR) << "match_matrix_tensor: y_dims[" << i << "]:" << y_dims << "]"; + } + */ + PADDLE_ENFORCE_EQ(y_dims.size(), 2, + "The rank of Input(Y) can't be less than 2."); + + auto w_dims = ctx->GetInputDim("W"); + PADDLE_ENFORCE_EQ(w_dims.size(), 3UL, "W should be 3-D tensor"); + /* + for (int i = 0; i < w_dims.size(); i++) { + LOG(ERROR) << "match_matrix_tensor: w_dims[" << i << "]:" << w_dims << "]"; + } + */ + int dim_t = ctx->Attrs().Get("dim_t"); + + PADDLE_ENFORCE( + x_dims[1] == w_dims[0] && y_dims[1] == w_dims[2] && w_dims[1] == dim_t, + "W 's shape must be X[1] * dim_t * Y[1]."); + + int out_dim_0 = -1; + int tmp_dim_0 = -1; + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + framework::Variable* y_var = + boost::get(ctx->GetInputVarPtrs("Y")[0]); + const auto& y_lod = y_var->Get().lod(); + PADDLE_ENFORCE(!y_lod.empty(), "The Input(Y) must hold lod info."); + const auto& y_lod_0 = y_lod[0]; + PADDLE_ENFORCE_GE(y_lod_0.size(), 2, + "The Input(Y)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + y_dims[0], static_cast(y_lod_0.back()), + "The Input(Y)'s lod info mismatches the actual tensor shape."); + + PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(), + "The Length of X and Y must be equal."); + + out_dim_0 = 0; + for (size_t i = 1; i < x_lod_0.size(); i++) { + int x_len = x_lod_0[i] - x_lod_0[i - 1]; + int y_len = y_lod_0[i] - y_lod_0[i - 1]; + out_dim_0 += (x_len * y_len); + } + out_dim_0 *= dim_t; + + tmp_dim_0 = x_dims[0] * dim_t * x_dims[1]; + } else { + // compile time + framework::VarDesc* x_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + framework::VarDesc* y_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(y_desc->GetLoDLevel(), 1); + } + + std::vector out_dims_vec{out_dim_0}; + out_dims_vec.push_back(1); + std::vector tmp_dims_vec{tmp_dim_0}; + tmp_dims_vec.push_back(1); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Tmp", framework::make_ddim(tmp_dims_vec)); +} + +void MatchMatrixTensorOpGrad::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + 
"Input(Y) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Tmp")), + // "Input(Tmp@GRAD) of SequencePadGradOp should not be + // null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Y"))) { + ctx->SetOutputDim(framework::GradVarName("Y"), ctx->GetInputDim("Y")); + ctx->ShareLoD("Y", /*->*/ framework::GradVarName("Y")); + } + if (ctx->HasOutput(framework::GradVarName("W"))) { + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + } +} + +void MatchMatrixTensorOpMaker::Make() { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Y", + "Y (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor), The weight of X and Y."); + AddAttr("dim_t", "the dim of W").SetDefault(1); + AddOutput("Out", + "(LoDTensor, default LoDTensor) Output variable which " + "is X * W * Y"); + AddOutput("Tmp", + "(LoDTensor, default LoDTensor) tmp variable which is " + "used for X * W"); + AddComment(R"DOC( + Match Matrix Tensor Operator + + This operator calculate X * W * Y, only support 2-D for X and Y. + the output is a level-3 LodTensor: + level_0: dim_t + level_1: query length + level_2: title length + + NOTE: only support 'float32' data type now. + + )DOC"); +} +#ifndef WIN32 + +template +void lego_cpu_gemm(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, + const T* A, const T* B, const T beta, T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + //#ifdef LEGO_USE_FLOAT +#ifndef __NAIVE_GEMM__ + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm(true, (TransA == CblasTrans), (TransB == CblasTrans), M, N, K, + alpha, A, lda, B, ldb, beta, C, N); +#endif // !__NAIVE_GEMM__ + + + + // platform::dynload::cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, + // alpha, + // A, lda, B, ldb, beta, C, N); + // cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + // ldb, + // beta, C, N); + // #else + // cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + // ldb, + // beta, C, N); + // #endif +} + +template +void lego_cpu_gemm_with_lda(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C, int lda) { + // int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + // #ifdef LEGO_USE_FLOAT + +#ifndef __NAIVE_GEMM__ + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm(true, (TransA == CblasTrans), (TransB == CblasTrans), M, N, K, + alpha, A, lda, B, ldb, beta, C, N); +#endif // !__NAIVE_GEMM__ + + + // platform::dynload::cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, + // alpha, + // A, lda, B, ldb, beta, C, N); + // #else + // cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + // ldb, + // beta, C, N); + // #endif +} + +template +inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + + // #if defined(LEGO_AVX) + // lll = len & ~AVX_CUT_LEN_MASK; + // __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + // for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + // _mm256_store_px( + // y + jjj, + // _mm256_add_px(_mm256_load_px(y + jjj), + // _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); + // } + // + // #elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, + _mm_add_px(_mm_load_px(y + jjj), + _mm_mul_px(mm_alpha, _mm_load_px(x + jjj)))); + } + + // #endif + for (; jjj < len; jjj++) { + y[jjj] += alpha * x[jjj]; + } +} +#endif + +template +class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#ifndef WIN32 + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* w = ctx.Input("W"); + auto* out = ctx.Output("Out"); + auto* tmp = ctx.Output("Tmp"); + + int dim_t = ctx.Attr("dim_t"); + + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + auto* out_data = out->mutable_data(ctx.GetPlace()); + memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T)); + + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* t_data = w->data(); + auto* bottom_l_trans_data = tmp->mutable_data(ctx.GetPlace()); + memset(bottom_l_trans_data, 0.0, + tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + auto blas = math::GetBlas(ctx); + + // int M = x->dims()[0], N = dim_t * dim_in, K = dim_in; + lego_cpu_gemm(blas, CblasNoTrans, CblasNoTrans, x->dims()[0], + dim_t * dim_in, dim_in, 1.0f, bottom_l_data, t_data, 0.0f, + bottom_l_trans_data); + // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1.0, + // bottom_l_data, K, t_data, N, 0.0, bottom_l_trans_data, + // N); +/* +if (top_size == 9792) +{ + std::ofstream out_to_file("out.model", std::ios::binary); + out_to_file.write((char*)bottom_l_trans_data, tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + out_to_file.close(); + std::ofstream out_to_file_r("out.model.bottom_r_data", std::ios::binary); + out_to_file_r.write((char*)bottom_r_data, y->dims()[0] * y->dims()[1]*sizeof(float)); + out_to_file_r.close(); + + float * p = new float[9792]; + Tensor out_tensor; + out_tensor.Resize(out->dims()); + auto* out_tensor_data = out_tensor.mutable_data(ctx.GetPlace()); + blas.GEMM(CblasNoTrans, CblasTrans, 4, 51, 128, 1.0f, + bottom_l_trans_data, + 384, + bottom_r_data, + 128, + 0.0f, + 
out_tensor_data, + 51); + + LOG(ERROR) << "check_mkl: p[28] = " << out_tensor_data[28] << " "; + LOG(ERROR) << "check_mkl: p[39] = " << out_tensor_data[39] << " "; + LOG(ERROR) << "check_mkl: p[49] = " << out_tensor_data[49] << " "; + + int n; + n = memcmp(bottom_r_data + 39*128, bottom_r_data + 49*128, 128 ); + LOG(ERROR) << "memcmp = " << n; + + float f28 = 0; + blas.GEMM(CblasNoTrans, CblasTrans, 1, 1, 128, 1.0f, + bottom_l_trans_data, + 384, + bottom_r_data + 28*128, + 128, + 0.0f, + &f28, + 1); + LOG(ERROR) << "check_mkl: single f28 = " << f28 << " "; + + float f39 = 0; + blas.GEMM(CblasNoTrans, CblasTrans, 1, 1, 128, 1.0f, + bottom_l_trans_data, + 384, + bottom_r_data + 39*128, + 128, + 0.0f, + &f39, + 1); + LOG(ERROR) << "check_mkl: single f39 = " << f39 << " "; + + float f49 = 0; + blas.GEMM(CblasNoTrans, CblasTrans, 1, 1, 128, 1.0f, + bottom_l_trans_data, + 384, + bottom_r_data + 49*128, + 128, + 0.0f, + &f49, + 1); + LOG(ERROR) << "check_mkl: single f49 = " << f49 << " "; + + for (int tt= 0; tt < 4*51; tt++) + { + LOG(ERROR) << p[tt] << " "; + } + LOG(ERROR) << "check_end"; +} +*/ + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + auto blas_2 = math::GetBlas(ctx); + lego_cpu_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, len_l, len_r, + dim_in, 1.0f, l_t_data, r_data, 0.0f, top_data, + dim_t * dim_in); + /* + if (top_size == 9792) + { + LOG(ERROR) << "top_565 = " << out_data[565] << " " ; + } + */ + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + framework::LoD out_lod; + // out_lod.push_back(out_lod0); + // out_lod.push_back(out_lod1); + // out_lod.push_back(out_lod2); + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + + out->set_lod(out_lod); + +#endif + } +}; + +template +class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#ifndef WIN32 + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* w = ctx.Input("W"); + auto* tmp = ctx.Input("Tmp"); + + int dim_t = ctx.Attr("dim_t"); + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->data(); + auto* 
bottom_r_data = y->data(); + auto* bottom_l_trans_data = tmp->data(); + + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_y = ctx.Output(framework::GradVarName("Y")); + // auto* d_tmp = ctx.Input(framework::GradVarName("Tmp")); + + Tensor tmp_grad; + tmp_grad.Resize(tmp->dims()); + auto* d_tmp_data = tmp_grad.mutable_data(ctx.GetPlace()); + auto* top_diff = d_out->data(); + auto* bottom_l_diff = d_x->mutable_data(ctx.GetPlace()); + auto* bottom_r_diff = d_y->mutable_data(ctx.GetPlace()); + // auto* d_tmp_data = d_tmp->data(); + auto* bottom_l_trans_diff = const_cast(d_tmp_data); + memset(bottom_l_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + memset(bottom_r_diff, 0.0, y->dims()[0] * y->dims()[1] * sizeof(T)); + memset(bottom_l_trans_diff, 0.0, + tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + + for (int i = 0; i < len_l; i++) { + for (int j = 0; j < len_r; j++) { + auto diff = + top_diff[top_offset[b] + t * len_l * len_r + i * len_r + j]; + auto* l_trans_data = bottom_l_trans_data + + (offset_l[b] + i) * dim_in * dim_t + + t * dim_in; + auto* l_trans_diff = bottom_l_trans_diff + + (offset_l[b] + i) * dim_in * dim_t + + t * dim_in; + auto* r_data = bottom_r_data + (offset_r[b] + j) * dim_in; + auto* r_diff = bottom_r_diff + (offset_r[b] + j) * dim_in; + if (diff != 0.0) { + sse_axpy(r_data, l_trans_diff, dim_in, diff); + sse_axpy(l_trans_data, r_diff, dim_in, diff); + } + } + } + } + } + + auto blas = math::GetBlas(ctx); + + auto* t_data = w->data(); + auto* d_w = ctx.Output(framework::GradVarName("W")); + auto* t_diff = d_w->mutable_data(ctx.GetPlace()); + memset(t_diff, 0.0, w->dims()[0] * w->dims()[1] * w->dims()[2] * sizeof(T)); + // bottom_diff + lego_cpu_gemm(blas, CblasNoTrans, CblasTrans, x->dims()[0], dim_in, + dim_t * dim_in, 1.0f, bottom_l_trans_diff, t_data, 1.0f, + bottom_l_diff); + + // t_diff + lego_cpu_gemm(blas, CblasTrans, CblasNoTrans, dim_in, dim_t * dim_in, + x->dims()[0], 1.0f, bottom_l_data, bottom_l_trans_diff, 1.0f, + t_diff); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(match_matrix_tensor, ops::MatchMatrixTensorOP, + ops::MatchMatrixTensorOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(match_matrix_tensor_grad, ops::MatchMatrixTensorOpGrad); + +REGISTER_OP_CPU_KERNEL( + match_matrix_tensor, + ops::CPUMatchMatrixTensorOPKernel + // ops::CPUMatchMatrixTensorOPKernel +); +REGISTER_OP_CPU_KERNEL( + match_matrix_tensor_grad, + ops::CPUMatchMatrixTensorOPGradKernel + // ops::CPUMatchMatrixTensorOPGradKernel +); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.h b/paddle/fluid/operators/match_matrix_tensor_op.h new file mode 100644 index 00000000000000..847f34f1c71388 --- /dev/null +++ b/paddle/fluid/operators/match_matrix_tensor_op.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +class MatchMatrixTensorOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: +// framework::OpKernelType GetExpectedKernelType( +// const framework::ExecutionContext& ctx) const override; + + private: +}; + +class MatchMatrixTensorOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + + protected: +// framework::OpKernelType GetExpectedKernelType( +// const framework::ExecutionContext& ctx) const override; +}; + +class MatchMatrixTensorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; +} // namespace operators +} // namespace paddle + diff --git a/paddle/fluid/operators/search_aligned_mat_mul_op.cc b/paddle/fluid/operators/search_aligned_mat_mul_op.cc new file mode 100644 index 00000000000000..594771476bb254 --- /dev/null +++ b/paddle/fluid/operators/search_aligned_mat_mul_op.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "search_compute.h" +#include + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; +using DDim = framework::DDim; + +void assign_dims(int64_t x_dims_1, int64_t x_aligned_size, int64_t y_dims_1, + int64_t y_aligned_size, CBLAS_TRANSPOSE trans_x, + CBLAS_TRANSPOSE trans_y, std::vector& _dims) { + std::vector _trans{trans_x, trans_y}; + _dims.resize(3); + + const auto bot0_aligned_size = x_aligned_size; + const auto bot1_aligned_size = y_aligned_size; + + _dims[0] = (_trans[0] == CblasTrans) ? x_dims_1 : bot0_aligned_size; + _dims[1] = (_trans[0] == CblasTrans) ? bot0_aligned_size : x_dims_1; + _dims[2] = (_trans[1] == CblasTrans) ? bot1_aligned_size : y_dims_1; + + int bot1_row_num = (_trans[1] == CblasTrans) ? 
y_dims_1 : bot1_aligned_size; + PADDLE_ENFORCE_EQ(_dims[1], bot1_row_num, + "Mismatch size, bot0_final_cols=[%d] bot1_final_rows=[%d]", + _dims[1], bot1_row_num); +} + +void assign_dims(const DDim& x_dims, const LoD& x_lod, const DDim& y_dims, + const LoD& y_lod, CBLAS_TRANSPOSE trans_x, CBLAS_TRANSPOSE trans_y, + std::vector& _dims) { + + std::vector _trans{trans_x, trans_y}; + _dims.resize(3); + + const auto bot0_aligned_size = static_cast(x_lod[0][1]); + const auto bot1_aligned_size = static_cast(y_lod[0][1]); + + _dims[0] = (_trans[0] == CblasTrans) ? x_dims[1] : bot0_aligned_size; + _dims[1] = (_trans[0] == CblasTrans) ? bot0_aligned_size : x_dims[1]; + _dims[2] = (_trans[1] == CblasTrans) ? bot1_aligned_size : y_dims[1]; + + int bot1_row_num = (_trans[1] == CblasTrans) ? y_dims[1] : bot1_aligned_size; + PADDLE_ENFORCE_EQ(_dims[1], bot1_row_num, + "Mismatch size, bot0_final_cols=[%d] bot1_final_rows=[%d]", _dims[1], + bot1_row_num); +} + +class SearchAlignedMatMulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Y", + "Y (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("transpose_X", "If true, use the transpose of `X`.") + .SetDefault(false); + AddAttr("transpose_Y", "If true, use the transpose of `Y`.") + .SetDefault(false); + AddAttr("alpha", "The scale of Out").SetDefault(1.0f); + + AddOutput("Out", "Out (Tensor, default Tensor) Output variable"); + AddOutput("_a_addr", + "_a_addr (Tensor, default Tensor) Output variable"); + AddOutput("_b_addr", + "_b_addr (Tensor, default Tensor) Output variable"); + AddOutput("_c_addr", + "_c_addr (Tensor, default Tensor) Output variable"); + + AddComment(R"DOC( + SearchAlignedMatMul + + NOTE: only support 'float32' data type now. 
+ +)DOC"); + } +}; + +class SearchAlignedMatMulOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("_a_addr"), + "_a_addr(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("_b_addr"), + "_b_addr(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("_c_addr"), + "_c_addr(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "X should be 2-D tensor"); + + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Y should be 2-D tensor"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + framework::Variable* y_var = + boost::get(ctx->GetInputVarPtrs("Y")[0]); + const auto& y_lod = y_var->Get().lod(); + PADDLE_ENFORCE(!y_lod.empty(), "The Input(Y) must hold lod info."); + const auto& y_lod_0 = y_lod[0]; + PADDLE_ENFORCE_GE(y_lod_0.size(), 2, + "The Input(Y)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + y_dims[0], static_cast(y_lod_0.back()), + "The Input(Y)'s lod info mismatches the actual tensor shape."); + + PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(), + "The Length of X and Y must be equal."); + } else { + // compile time + bool trans_x = ctx->Attrs().Get("transpose_X"); + bool trans_y = ctx->Attrs().Get("transpose_Y"); + + std::vector _trans{CblasNoTrans, CblasNoTrans}; + _trans[0] = trans_x ? CblasTrans : CblasNoTrans; + _trans[1] = trans_y ? 
CblasTrans : CblasNoTrans; + + std::vector _dims; + assign_dims(x_dims[1], -1, y_dims[1], -1, _trans[0], _trans[1], _dims); + ctx->SetOutputDim("Out", framework::make_ddim({-1, _dims[2]})); + } + } +}; + +template +class CPUSearchAlignedMatMulOPKernel : public framework::OpKernel { + public: + void prepare_ff(const framework::ExecutionContext& ctx, + std::vector& _dims) const { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* top = ctx.Output("Out"); + auto* _a_addr = ctx.Output("_a_addr"); + auto* _b_addr = ctx.Output("_b_addr"); + auto* _c_addr = ctx.Output("_c_addr"); + + const int batch = bottom0->lod()[0].size() - 1; + _a_addr->Resize(framework::make_ddim({batch})); + _b_addr->Resize(framework::make_ddim({batch})); + _c_addr->Resize(framework::make_ddim({batch})); + + T** a_addr_data = (T**)_a_addr->mutable_data(ctx.GetPlace()); + T** b_addr_data = (T**)_b_addr->mutable_data(ctx.GetPlace()); + T** c_addr_data = (T**)_c_addr->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_EQ(_dims.size(), 3, "_dims.size() should be eq 3."); + const int bot0_size = _dims[0] * _dims[1]; + const int bot1_size = _dims[1] * _dims[2]; + const int top_size = _dims[0] * _dims[2]; + + for (int i = 0; i < batch; ++i) { + a_addr_data[i] = const_cast(bottom0->data()) + bot0_size * i; + b_addr_data[i] = const_cast(bottom1->data()) + bot1_size * i; + c_addr_data[i] = top->mutable_data(ctx.GetPlace()) + top_size * i; + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* top = ctx.Output("Out"); + auto* _a_addr = ctx.Output("_a_addr"); + auto* _b_addr = ctx.Output("_b_addr"); + auto* _c_addr = ctx.Output("_c_addr"); + float _scale = ctx.Attr("alpha"); + + bool trans_x = ctx.Attr("transpose_X"); + bool trans_y = ctx.Attr("transpose_Y"); + + std::vector _trans{CblasNoTrans, CblasNoTrans}; + _trans[0] = trans_x ? CblasTrans : CblasNoTrans; + _trans[1] = trans_y ? 
CblasTrans : CblasNoTrans; + + std::vector _dims; + assign_dims(bottom0->dims(), bottom0->lod(), bottom1->dims(), + bottom1->lod(), _trans[0], _trans[1], _dims); + + const int batch = bottom0->lod()[0].size() - 1; + std::vector offset(batch + 1); + for (int i = 0; i <= batch; ++i) { + offset[i] = _dims[0] * i; + } + + framework::LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + top->Resize(framework::make_ddim({static_cast(offset[batch]), _dims[2]})); + + prepare_ff(ctx, _dims); + + call_gemm_batched(ctx, _trans[0], _trans[1], static_cast(_dims[0]), static_cast(_dims[2]), static_cast(_dims[1]), + _scale, (const T**)_a_addr->data(), + (const T**)_b_addr->data(), (T)0.0, + (T**)_c_addr->mutable_data(ctx.GetPlace()), batch); + } +}; + +class SearchAlignedMatMulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("_a_addr"), + "_a_addr(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("_b_addr"), + "_b_addr(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("_c_addr"), + "_c_addr(Output) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Y"))) { + ctx->SetOutputDim(framework::GradVarName("Y"), ctx->GetInputDim("Y")); + ctx->ShareLoD("Y", /*->*/ framework::GradVarName("Y")); + } + } +}; + +template +class CPUSearchAlignedMatMulOPGradKernel : public framework::OpKernel { + public: + void prepare_bp(const framework::ExecutionContext& ctx, + std::vector& _dims, Tensor& _a_addr_diff, + Tensor& _b_addr_diff, Tensor& _c_addr_diff) const { + auto* bottom0 = ctx.Input("X"); + auto* _a_addr = ctx.Input("_a_addr"); + auto* _b_addr = ctx.Input("_b_addr"); + auto* _c_addr = ctx.Input("_c_addr"); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_y = ctx.Output(framework::GradVarName("Y")); + + const int batch = bottom0->lod()[0].size() - 1; + PADDLE_ENFORCE_EQ(_a_addr->dims()[0], batch, "blob should be initialized before bp"); + + _a_addr_diff.Resize(_a_addr->dims()); + _b_addr_diff.Resize(_b_addr->dims()); + _c_addr_diff.Resize(_c_addr->dims()); + T** a_addr_diff = (T**)_a_addr_diff.mutable_data(ctx.GetPlace()); + T** b_addr_diff = (T**)_b_addr_diff.mutable_data(ctx.GetPlace()); + T** c_addr_diff = (T**)_c_addr_diff.mutable_data(ctx.GetPlace()); + + + + const int bot0_size = _dims[0] * _dims[1]; + const int bot1_size = _dims[1] * _dims[2]; + const int top_size = _dims[0] * _dims[2]; + + for (int i = 0; i < batch; ++i) { + a_addr_diff[i] = d_x->mutable_data(ctx.GetPlace()) + bot0_size * i; + b_addr_diff[i] = d_y->mutable_data(ctx.GetPlace()) + bot1_size * i; + c_addr_diff[i] = + const_cast(d_out->data()) + top_size * i; + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* _a_addr = ctx.Input("_a_addr"); + auto* _b_addr = ctx.Input("_b_addr"); + bool trans_x = ctx.Attr("transpose_X"); 
+ bool trans_y = ctx.Attr("transpose_Y"); + float _scale = ctx.Attr("alpha"); + + std::vector _trans{CblasNoTrans, CblasNoTrans}; + _trans[0] = trans_x ? CblasTrans : CblasNoTrans; + _trans[1] = trans_y ? CblasTrans : CblasNoTrans; + + std::vector _dims; + assign_dims(bottom0->dims(), bottom0->lod(), bottom1->dims(), + bottom1->lod(), _trans[0], _trans[1], _dims); + + Tensor _a_addr_diff, _b_addr_diff, _c_addr_diff; + prepare_bp(ctx, _dims, _a_addr_diff, _b_addr_diff, _c_addr_diff); + + const int batch = bottom0->lod()[0].size() - 1; + if (_trans[1] == CblasTrans) { + call_gemm_batched( + ctx, CblasTrans, _trans[0], _dims[2], _dims[1], _dims[0], _scale, + (const T**)_c_addr_diff.data(), (const T**)_a_addr->data(), + (T)0.0, (T**)_b_addr_diff.mutable_data(ctx.GetPlace()), batch); + } else { + CBLAS_TRANSPOSE bot0_trans = + _trans[0] == CblasTrans ? CblasNoTrans : CblasTrans; + call_gemm_batched( + ctx, bot0_trans, CblasNoTrans, static_cast(_dims[1]), static_cast(_dims[2]), static_cast(_dims[0]), _scale, + (const T**)_a_addr->data(), (const T**)_c_addr_diff.data(), + (T)0.0, (T**)_b_addr_diff.mutable_data(ctx.GetPlace()), batch); + } + + if (_trans[0] == CblasTrans) { + call_gemm_batched( + ctx, _trans[1], CblasTrans, _dims[1], _dims[0], _dims[2], _scale, + (const T**)_b_addr->data(), (const T**)_c_addr_diff.data(), + (T)0.0, (T**)_a_addr_diff.mutable_data(ctx.GetPlace()), batch); + } else { + CBLAS_TRANSPOSE bot1_trans = + (_trans[1] == CblasTrans) ? CblasNoTrans : CblasTrans; + call_gemm_batched( + ctx, CblasNoTrans, bot1_trans, _dims[0], _dims[1], _dims[2], _scale, + (const T**)_c_addr_diff.data(), (const T**)_b_addr->data(), + (T)0.0, (T**)_a_addr_diff.mutable_data(ctx.GetPlace()), batch); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_aligned_mat_mul, ops::SearchAlignedMatMulOP, + ops::SearchAlignedMatMulOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_aligned_mat_mul_grad, ops::SearchAlignedMatMulOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_aligned_mat_mul, + ops::CPUSearchAlignedMatMulOPKernel + // ops::CPUSearchAlignedMatMulOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_aligned_mat_mul_grad, + ops::CPUSearchAlignedMatMulOPGradKernel + // ops::CPUSearchAlignedMatMulOPGradKernel +); diff --git a/paddle/fluid/operators/search_attention_padding_mask_op.cc b/paddle/fluid/operators/search_attention_padding_mask_op.cc new file mode 100644 index 00000000000000..e7ac2224ab9dd2 --- /dev/null +++ b/paddle/fluid/operators/search_attention_padding_mask_op.cc @@ -0,0 +1,263 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchAttentionPaddingMaskOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Y", + "Y (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("pad_id", "pad_id").SetDefault(0).EqualGreaterThan(0); + AddAttr("mask", "mask").SetDefault(0.0); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + AddOutput( + "pad_begin", + "pad_begin (LoDTensor, default LoDTensor) Output variable"); + + AddComment(R"DOC( + SearchAttentionPaddingMask + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchAttentionPaddingMaskOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("pad_begin"), + "pad_begin(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Y should be 2-D tensor"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + framework::Variable* y_var = + boost::get(ctx->GetInputVarPtrs("Y")[0]); + const auto& y_lod = y_var->Get().lod(); + PADDLE_ENFORCE(!y_lod.empty(), "The Input(Y) must hold lod info."); + const auto& y_lod_0 = y_lod[0]; + PADDLE_ENFORCE_GE(y_lod_0.size(), 2, + "The Input(Y)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + y_dims[0], static_cast(y_lod_0.back()), + "The Input(Y)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, x_dims[1]})); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchAttentionPaddingMaskOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* top = ctx.Output("Out"); + auto* _pad_begin = ctx.Output("pad_begin"); + + int _pad_id = ctx.Attr("pad_id"); + float _mask = ctx.Attr("mask"); + + const auto src_len = static_cast(bottom1->lod()[0][1]); + PADDLE_ENFORCE_EQ(src_len, bottom0->dims()[1], + "Mismatch source length, expect: %d 
get: %d", src_len, + bottom0->dims()[1]); + const int att_batch = bottom0->lod()[0].size() - 1; + const int src_batch = bottom1->lod()[0].size() - 1; + PADDLE_ENFORCE_EQ(att_batch % src_batch, 0, + "Mismatch batch size, bottom0: %d, bottom1: %d", + att_batch, src_batch); + + _pad_begin->Resize(framework::make_ddim({src_batch})); + int* pad_begin = _pad_begin->mutable_data(ctx.GetPlace()); + for (int i = 0; i < src_batch; ++i) { + // bottom data is padded to be aligned + const auto* src_data = bottom1->data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = index + 1; + } + + top->Resize(bottom0->dims()); + const auto att_len = static_cast(bottom0->lod()[0][1]); + auto* top_data = top->mutable_data(ctx.GetPlace()); + memcpy(top_data, bottom0->data(), + bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); + for (int i = 0; i < att_batch; ++i) { + for (int j = 0; j < att_len; ++j) { + top_data = + top->mutable_data(ctx.GetPlace()) + src_len * (att_len * i + j); + int src_idx = i % src_batch; + for (int k = pad_begin[src_idx]; k < src_len; ++k) { + top_data[k] = _mask; + } + } + } + } +}; + +class SearchAttentionPaddingMaskGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("search_attention_padding_mask_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput("Y", Input("Y")); + op_desc_ptr->SetInput("pad_begin", Output("pad_begin")); + + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +class SearchAttentionPaddingMaskOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("pad_begin"), + "Input(pad_begin) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchAttentionPaddingMaskOPGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* _pad_begin = ctx.Input("pad_begin"); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + + const int* pad_begin = _pad_begin->data(); + const auto att_batch = bottom0->lod()[0].size() - 1; + const auto src_batch = bottom1->lod()[0].size() - 1; + + const auto att_len = bottom0->lod()[0][1]; + const auto src_len = 
bottom1->lod()[0][1]; + + auto* att_diff = d_x->mutable_data(ctx.GetPlace()); + memcpy(att_diff, d_out->data(), + d_out->dims()[0] * d_out->dims()[1] * sizeof(T)); + for (int i = 0; i < att_batch; ++i) { + for (int j = 0; j < att_len; ++j) { + int src_idx = i % src_batch; + att_diff = d_x->mutable_data(ctx.GetPlace()) + + src_len * (att_len * i + j) + pad_begin[src_idx]; + memset(att_diff, 0, (src_len - pad_begin[src_idx]) * sizeof(T)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_attention_padding_mask, + ops::SearchAttentionPaddingMaskOP, + ops::SearchAttentionPaddingMaskOpMaker, + ops::SearchAttentionPaddingMaskGradOpMaker); +REGISTER_OPERATOR(search_attention_padding_mask_grad, + ops::SearchAttentionPaddingMaskOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_attention_padding_mask, + ops::CPUSearchAttentionPaddingMaskOPKernel + // ops::CPUSearchAttentionPaddingMaskOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_attention_padding_mask_grad, + ops::CPUSearchAttentionPaddingMaskOPGradKernel + // ops::CPUSearchAttentionPaddingMaskOPGradKernel +); diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h new file mode 100644 index 00000000000000..ce72f18f72a0d6 --- /dev/null +++ b/paddle/fluid/operators/search_compute.h @@ -0,0 +1,422 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // sse +#include +#include //fabs +#include // memcpy + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" +//#include "naive_gemm.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +void call_gemm(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), (TransB == CblasTrans), M, N, K, alpha, A, + B, beta, C); +#endif // !__NAIVE_GEMM__ +} + +template +void call_gemm(const framework::ExecutionContext& ctx, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + auto blas = math::GetBlas(ctx); + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), (TransB == CblasTrans), M, N, K, alpha, A, + B, beta, C); +#endif // !__NAIVE_GEMM__ +} + +template +void call_gemm_batched(const framework::ExecutionContext& ctx, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const T alpha, const T** A, + const T** B, const T beta, T** C, + const int batch) { + for (int i = 0; i < batch; ++i) { + call_gemm(ctx, TransA, TransB, M, N, K, alpha, A[i], B[i], beta, C[i]); + } +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +#if defined(LEGO_USE_FLOAT) + +#define X_MIN FLT_MIN +#define X_MAX FLT_MAX + +#elif defined(LEGO_USE_DOUBLE) + +#define X_MIN DBL_MIN +#define X_MAX DBL_MAX + +#endif + +template +inline 
void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_add_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + y[jjj]; + } +} + +template +inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px( + y + jjj, + _mm256_add_px(_mm256_load_px(y + jjj), + _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, + _mm_add_px(_mm_load_px(y + jjj), + _mm_mul_px(mm_alpha, _mm_load_px(x + jjj)))); + } + +#endif + for (; jjj < len; jjj++) { + y[jjj] += alpha * x[jjj]; + } +} + +template +inline void sse_axpy_noadd(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, _mm_mul_px(mm_alpha, _mm_load_px(x + jjj))); + } + +#endif + for (; jjj < len; jjj++) { + y[jjj] = alpha * x[jjj]; + } +} + +template +inline void sse_eltmul(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_mul_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_mul_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] * y[jjj]; + } +} + +template +inline void sse_add_scalar(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(y + jjj, _mm256_add_px(mm_alpha, _mm256_load_px(x + jjj))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, _mm_add_px(mm_alpha, _mm_load_px(x + jjj))); + } + +#endif + for (; jjj < len; jjj++) { + y[jjj] = alpha + x[jjj]; + } +} + +template +inline void sse_sum(const T* x, T& y, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + y = 0.; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + + __m256x mm_result = _mm256_setzero_px(); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + mm_result = 
_mm256_add_px(mm_result, _mm256_load_px(x + jjj)); + } + +#if defined(LEGO_USE_FLOAT) + __m256x hsum = _mm256_hadd_px(mm_result, mm_result); +#elif defined(LEGO_USE_DOUBLE) + __m256x hsum = mm_result; +#endif + hsum = _mm256_add_px(hsum, _mm256_permute2f128_px(hsum, hsum, 0x1)); + _mm_store_sx(&y, _mm_hadd_px(_mm256_castpx256_px128(hsum), + _mm256_castpx256_px128(hsum))); + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + __m128x mm_result = _mm_setzero_px(); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + mm_result = _mm_add_px(mm_result, _mm_load_px(x + jjj)); + } + __m128x mm_tmp = _mm_hadd_px(mm_result, mm_result); + +#if defined(LEGO_USE_FLOAT) + _mm_store_sx(&y, _mm_hadd_px(mm_tmp, mm_tmp)); +#elif defined(LEGO_USE_DOUBLE) + _mm_store_sx(&y, mm_tmp); +#endif + +#endif + for (; jjj < len; jjj++) { + y += x[jjj]; + } +} + +template +inline void sse_scale(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, _mm_mul_px(mm_alpha, _mm_load_px(x + jjj))); + } +#endif + for (; jjj < len; jjj++) { + y[jjj] = alpha * x[jjj]; + } +} + +template +inline void sse_ip(const T* vec1, const T* vec2, size_t len, T& result) { + unsigned int jjj, lll; + jjj = lll = 0; + result = 0.; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + + __m256x mm_result = _mm256_setzero_px(); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + mm_result = _mm256_add_px( + mm_result, + _mm256_mul_px(_mm256_load_px(vec1 + jjj), _mm256_load_px(vec2 + jjj))); + } + + // result = mm_result[0]+mm_result[1]+mm_result[2]+mm_result[3]+ + // mm_result[4]+mm_result[5]+mm_result[6]+mm_result[7]; + +#if defined(LEGO_USE_FLOAT) + __m256x hsum = _mm256_hadd_px(mm_result, mm_result); +#elif defined(LEGO_USE_DOUBLE) + __m256x hsum = mm_result; +#endif + + hsum = _mm256_add_px(hsum, _mm256_permute2f128_px(hsum, hsum, 0x1)); + + _mm_store_sx(&result, _mm_hadd_px(_mm256_castpx256_px128(hsum), + _mm256_castpx256_px128(hsum))); + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_result = _mm_setzero_px(); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + mm_result = _mm_add_px(mm_result, _mm_mul_px(_mm_load_px(vec1 + jjj), + _mm_load_px(vec2 + jjj))); + } + __m128x mm_tmp = _mm_hadd_px(mm_result, mm_result); +#if defined(LEGO_USE_FLOAT) + _mm_store_sx(&result, _mm_hadd_px(mm_tmp, mm_tmp)); +#elif defined(LEGO_USE_DOUBLE) + _mm_store_sx(&result, mm_tmp); +#endif + +#endif + for (; jjj < len; jjj++) { + result += vec1[jjj] * vec2[jjj]; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/search_embedding_op.cc b/paddle/fluid/operators/search_embedding_op.cc new file mode 100644 index 00000000000000..2ae37288debd94 --- /dev/null +++ b/paddle/fluid/operators/search_embedding_op.cc @@ -0,0 +1,370 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < 
lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_add_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + y[jjj]; + } +} + +template +inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px( + y + jjj, + _mm256_add_px(_mm256_load_px(y + jjj), + _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, + _mm_add_px(_mm_load_px(y + jjj), + _mm_mul_px(mm_alpha, _mm_load_px(x + jjj)))); + } + +#endif + for (; jjj < len; jjj++) { + y[jjj] += alpha * x[jjj]; + } +} + +class SearchEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (Tensor, default Tensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor)"); + AddAttr("num_voc", "num_voc").SetDefault(0).EqualGreaterThan(0); + AddAttr("num_emb", "num_emb").SetDefault(0).EqualGreaterThan(0); + AddAttr("lr", "learning rate").SetDefault(0.0).EqualGreaterThan(0.0); + + AddOutput("Out", "Out (Tensor, default Tensor) Output variable"); + + AddComment(R"DOC( + SearchEmbedding + + NOTE: only support 'float32' data type now. + + )DOC"); + } +}; + +class SearchEmbeddingOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "W(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto w_dims = ctx->GetInputDim("W"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + + int num_voc = ctx->Attrs().Get("num_voc"); + int num_emb = ctx->Attrs().Get("num_emb"); + + PADDLE_ENFORCE_EQ(w_dims[0], num_voc, + "w_dims[0] should be equal to num_voc"); + PADDLE_ENFORCE_EQ(w_dims[1], num_emb, + "w_dims[1] should be equal to num_emb"); + + ctx->SetOutputDim("Out", framework::make_ddim({-1, num_emb})); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchEmbeddingOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* _blobs = ctx.Input("W"); + auto* top = ctx.Output("Out"); + + int _cap_e = ctx.Attr("num_emb"); + + int _cap_l = bottom->dims()[0]; + auto& offset = bottom->lod()[0]; + std::vector top_offset; + top_offset.resize(offset.size()); + top_offset[0] = 0; + + for (int i = 0; i < top_offset.size() 
- 1; ++i) { + int w = offset[i + 1] - offset[i]; + if (w == 0) { + top_offset[i + 1] = top_offset[i] + 1; + } else { + top_offset[i + 1] = top_offset[i] + w; + } + } + + int top_l = top_offset[top_offset.size() - 1]; + framework::LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + top->Resize(framework::make_ddim({top_l, _cap_e})); + + PADDLE_ENFORCE_EQ(top_l, _cap_l, + "top_l should be equal to _cap_l"); + + auto* top_data = top->mutable_data(ctx.GetPlace()); + const auto* bottom_data = bottom->data(); + const auto* weights = _blobs->data(); + + for (int i = 0; i < offset.size() - 1; ++i) { + int w = offset[i + 1] - offset[i]; + if (w == 1 && bottom_data[offset[i]] == -1) { + //LOG (ERROR) << "zero len sequence " << i << "/" << top_offset.size() - 1; + memset(top_data + top_offset[i] * _cap_e, 0, _cap_e * sizeof(T)); + } else { + for (int j = 0; j < w; ++j) { + unsigned int word_idx = + static_cast(bottom_data[offset[i] + j]); + memcpy((void*)(top_data + (top_offset[i] + j) * _cap_e), + (void*)(weights + word_idx * _cap_e), _cap_e * sizeof(T)); + } + } + } + } +}; + +class SearchEmbeddingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SearchEmbeddingGradOp should not be null."); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } + +}; + +class SearchEmbeddingGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("search_embedding_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput("W", Input("W")); + + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +template +class CPUSearchEmbeddingOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* _blobs = ctx.Input("W"); + auto* top = ctx.Input(framework::GradVarName("Out")); + + int _cap_e = ctx.Attr("num_emb"); + float _lr = ctx.Attr("lr"); + + auto& offset = bottom->lod()[0]; + auto& top_offset = top->lod()[0]; + + const auto* top_diff = top->data(); + const auto* bottom_data = bottom->data(); + T* weights = (T*) (_blobs->data()); + + T mlr = -1.0 * _lr; + + for (int i = 0; i < offset.size() - 1; ++i) { + int w = offset[i + 1] - offset[i]; + if (!(w == 1 && bottom_data[offset[i]] == -1)) { + for (int j = 0; j < w; ++j) { + unsigned int word_idx = + static_cast(bottom_data[offset[i] + j]); + sse_axpy((const T*)top_diff + (top_offset[i] + j) * _cap_e, + weights + word_idx * _cap_e, _cap_e, mlr); + } + } else { + //LOG(ERROR) << "bp: zero len sequence " << i << "/" + // << top_offset.size() - 1; + } + } + } +}; + +} // namespace operators +} // 
namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_embedding, ops::SearchEmbeddingOP, + ops::SearchEmbeddingOpMaker, ops::SearchEmbeddingGradOpMaker); +REGISTER_OPERATOR(search_embedding_grad, ops::SearchEmbeddingOpGrad); + +REGISTER_OP_CPU_KERNEL(search_embedding, + ops::CPUSearchEmbeddingOPKernel + // ops::CPUSearchEmbeddingOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_embedding_grad, ops::CPUSearchEmbeddingOPGradKernel + // ops::CPUSearchEmbeddingOPGradKernel +); diff --git a/paddle/fluid/operators/search_fc_op.cc b/paddle/fluid/operators/search_fc_op.cc new file mode 100644 index 00000000000000..6752564c05573d --- /dev/null +++ b/paddle/fluid/operators/search_fc_op.cc @@ -0,0 +1,331 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +//#include "naive_gemm.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +void call_gemm(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), (TransB == CblasTrans), M, N, K, alpha, A, + B, beta, C); +#endif // !__NAIVE_GEMM__ +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_add_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + y[jjj]; + } +} + 
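+// The forward kernel below (CPUSearchFCOPKernel) computes Out = X * W^T + b:
+// one GEMM with the weight stored row-major as [out_size, in_size] (hence
+// CblasTrans on the weight operand), followed by a per-row bias add through
+// sse_eltadd. The helper below is only an illustrative sketch of that math
+// for readers of this file; it is hypothetical and is not called anywhere
+// in this operator.
+template <typename T>
+inline void naive_fc_ref(const T* x, const T* w, const T* b, T* out,
+                         int batch, int in_size, int out_size) {
+  for (int i = 0; i < batch; ++i) {
+    for (int o = 0; o < out_size; ++o) {
+      T acc = b[o];  // start from the bias term
+      for (int k = 0; k < in_size; ++k) {
+        acc += x[i * in_size + k] * w[o * in_size + k];  // row of X dot row of W
+      }
+      out[i * out_size + o] = acc;
+    }
+  }
+}
+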
+class SearchFCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (Tensor, default Tensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor)"); + AddInput("b", "b (Tensor)"); + AddAttr("out_size", "out_size: the output size") + .SetDefault(0) + .EqualGreaterThan(1); + + AddOutput("Out", "Out (Tensor, default Tensor) Output variable"); + + AddComment(R"DOC( + SearchFC + + NOTE: only support 'float32' data type now. + + )DOC"); + } +}; + +class SearchFCOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "W(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("b"), "b(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto w_dims = ctx->GetInputDim("W"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + + auto b_dims = ctx->GetInputDim("b"); + PADDLE_ENFORCE_EQ(b_dims.size(), 1, "b should be 1-D tensor"); + + int out_size = ctx->Attrs().Get("out_size"); + + ctx->SetOutputDim("Out", framework::make_ddim({-1, out_size})); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(w_dims[1], x_dims[1], "wrong shape: w_dims[1] != x_dims[1]"); + } + else { + // compile time + } + } +}; + +template +class CPUSearchFCOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* b = ctx.Input("b"); + auto* top = ctx.Output("Out"); + + int out_size = ctx.Attr("out_size"); // 100 + int batch = bottom->dims()[0]; + + int _out = w->dims()[0]; // 100 + int _in = w->dims()[1]; // 228 + //PADDLE_ENFORCE_EQ(out_size, _out, "out_size should equal to w->dims()[1]"); + //PADDLE_ENFORCE_EQ(bottom->dims()[1], _in, + // "x.dims()[1] should equal to w->dims()[0]"); + + top->Resize(framework::make_ddim({bottom->dims()[0], out_size})); + + const auto* bottom_data = bottom->data(); + auto* top_data = top->mutable_data(ctx.GetPlace()); + const auto* weights = w->data(); + auto blas = math::GetBlas(ctx); + call_gemm(blas, CblasNoTrans, CblasTrans, batch, _out, _in, 1.0f, + bottom_data, weights, 0.0f, top_data); + if (true) { + const auto* bias_data = b->data(); + for (int i = 0; i < batch; ++i) { + // add bias here + sse_eltadd(top_data + i * _out, bias_data, top_data + i * _out, _out); + } + } + } +}; + +class SearchFCOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("b"), "Input(b) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + if (ctx->HasOutput(framework::GradVarName("W"))) { + ctx->SetOutputDim(framework::GradVarName("W"), 
ctx->GetInputDim("W")); + } + if (ctx->HasOutput(framework::GradVarName("b"))) { + ctx->SetOutputDim(framework::GradVarName("b"), ctx->GetInputDim("b")); + } + } +}; + +template +class CPUSearchFCOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // auto* d_x = ctx.Output(framework::GradVarName("X")); + // + // auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + // auto* x = ctx.Input("X"); + // memset(bottom_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + int _out = w->dims()[0]; // 100 + int _in = w->dims()[1]; // 228 + + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_w = ctx.Output(framework::GradVarName("W")); + + int batch = bottom->dims()[0]; + const auto* top_diff = d_out->data(); + const auto* bottom_data = bottom->data(); + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + + const auto* weights = w->data(); + auto* weights_diff = d_w->mutable_data(ctx.GetPlace()); + + auto blas = math::GetBlas(ctx); + //call_gemm(blas, CblasTrans, CblasNoTrans, _in, _out, batch, 1.0f, + // bottom_data, top_diff, 0.0f, weights_diff); + call_gemm(blas, CblasTrans, CblasNoTrans, _out, _in, batch, (T)1.0, + top_diff, bottom_data, (T)0.0, weights_diff); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, batch, _in, _out, (T)1.0, top_diff, + weights, (T)0.0, bottom_diff); + + if (true) { + auto* d_b = ctx.Output(framework::GradVarName("b")); + auto* bias_diff = d_b->mutable_data(ctx.GetPlace()); + memset(bias_diff, 0.0, _out * sizeof(T)); + for (int i = 0; i < batch; ++i) { + sse_eltadd(bias_diff, top_diff + i * _out, bias_diff, _out); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_fc, ops::SearchFCOP, ops::SearchFCOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_fc_grad, ops::SearchFCOpGrad); + +REGISTER_OP_CPU_KERNEL(search_fc, + ops::CPUSearchFCOPKernel + // ops::CPUSearchFCOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_fc_grad, ops::CPUSearchFCOPGradKernel + // ops::CPUSearchFCOPGradKernel +); diff --git a/paddle/fluid/operators/search_grnn_op.cc b/paddle/fluid/operators/search_grnn_op.cc new file mode 100644 index 00000000000000..118a12c69f3044 --- /dev/null +++ b/paddle/fluid/operators/search_grnn_op.cc @@ -0,0 +1,807 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/search_grnn_op.h" +#include +#ifndef WIN32 +//#include "naive_gemm.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifndef _DEBUGGING +#define _DEBUGGING +#endif + +//#include "debug.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +#define SIGMOID(z) (sigmoid(z)) +#define SIGMOID_D(a) ((a) * (1 - (a))) +#define TANHD(a) (1 - (a) * (a)) + +template +void call_gemm(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), (TransB == CblasTrans), M, N, K, alpha, A, + B, beta, C); +#endif // !__NAIVE_GEMM__ +} + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define 
_mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_add_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + y[jjj]; + } +} + +template +inline void sse_eltmul(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_mul_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_mul_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] * y[jjj]; + } +} + +class SearchGrnnOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Wi", "Wi (Tensor)"); + AddInput("Wh", "Wh (Tensor)"); + AddAttr("num_input", "num_input: the embedding size").SetDefault(0); + AddAttr("num_hidden", "num_hidden: the hidden size").SetDefault(0); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + AddOutput("tmp_buffer", + "tmp_buffer (LoDTensor, default LoDTensor) tmp variable"); + AddOutput("idx_sorted_by_width", + "idx_sorted_by_width (Tensor, Tensor) tmp variable"); + AddOutput( + "layout_input", + "layout_input (LoDTensor, default LoDTensor) tmp variable"); + + AddComment(R"DOC( + SearchGrnn + + NOTE: only support 'float32' data type now. 
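+
+      Wi packs the input-to-hidden weights [W; Wr; Wz] with shape
+      [3, num_hidden, num_input]; Wh packs the hidden-to-hidden weights
+      [U; Ur; Uz] with shape [3, num_hidden, num_hidden]. For each sequence,
+      the CPU kernel below evaluates the standard GRU recurrence:
+
+        r_t = sigmoid(Wr * x_t + Ur * h_{t-1})
+        z_t = sigmoid(Wz * x_t + Uz * h_{t-1})
+        h~_t = tanh(W * x_t + r_t .* (U * h_{t-1}))
+        h_t = z_t .* h_{t-1} + (1 - z_t) .* h~_t
+
+      with h_0 = (1 - z_0) .* tanh(W * x_0), where .* denotes element-wise
+      multiplication.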
+ + )DOC"); + } +}; + +class SearchGrnnOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Wi"), "Wi(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Wh"), "Wh(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("tmp_buffer"), + "tmp_buffer(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("idx_sorted_by_width"), + "idx_sorted_by_width(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("layout_input"), + "layout_input(Output) should not be null."); + + int _cap_h = ctx->Attrs().Get("num_hidden"); + int _cap_e = ctx->Attrs().Get("num_input"); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of X(Input) can't be less than 2."); + PADDLE_ENFORCE_EQ(x_dims[1], _cap_e, "x_dims[1] should be equal to _cap_e"); + + auto wi_dims = ctx->GetInputDim("Wi"); + PADDLE_ENFORCE_EQ(wi_dims.size(), 3, "Wi should be 3-D tensor"); + PADDLE_ENFORCE_EQ(wi_dims[0], 3, "Wi dim[0] should be equal to 3"); + PADDLE_ENFORCE_EQ(wi_dims[1], _cap_h, + "wi_dims[1] should be equal to _cap_h"); + PADDLE_ENFORCE_EQ(wi_dims[2], _cap_e, + "wi_dims[2] should be equal to _cap_e"); + + auto wh_dims = ctx->GetInputDim("Wh"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 3, "Wi should be 3-D tensor"); + PADDLE_ENFORCE_EQ(wh_dims[0], 3, "Wh dim[0] should be equal to 3"); + PADDLE_ENFORCE_EQ(wh_dims[1], _cap_h, + "wh_dims[1] should be equal to _cap_h"); + PADDLE_ENFORCE_EQ(wh_dims[2], _cap_h, + "wh_dims[2] should be equal to _cap_h"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod[0].back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + std::vector out_dims_vec{-1}; + out_dims_vec.push_back(_cap_h); + std::vector tmp_buffer_shape{20}; + tmp_buffer_shape.push_back(-1); + tmp_buffer_shape.push_back(_cap_h); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + ctx->SetOutputDim("tmp_buffer", framework::make_ddim(tmp_buffer_shape)); + } + + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchGrnnOPKernel : public framework::OpKernel { + public: + void prepare_layout(const framework::ExecutionContext& ctx, + const LoDTensor* input_blob) const { + auto* _idx_sorted_by_width = ctx.Output("idx_sorted_by_width"); + auto* _layout_input = ctx.Output("layout_input"); + + auto _input = input_blob; + + // usually total length + int dim0 = _input->dims()[0]; + // if it is id only sequence + int dim1 = 1; + + // if its a embedding like sequence (dim1 would be embedding_size) + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + + int batch = _input->lod()[0].size() - 1; + + auto& offset = _input->lod()[0]; + + Tensor _width; + _width.Resize(framework::make_ddim({batch})); + 
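+    // The rest of this function builds a time-major layout: sequences are
+    // sorted by length in descending order, and new_offset[i] marks the block
+    // of rows that belong to time step i (one row per sequence that is still
+    // active at that step). The recurrence in Compute can then advance one
+    // time step with a single GEMM over all sequences that have not ended.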
_idx_sorted_by_width->Resize(framework::make_ddim({batch})); + int* width_data = _width.mutable_data(ctx.GetPlace()); + int* idx_sorted_by_width_data = + _idx_sorted_by_width->mutable_data(ctx.GetPlace()); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_data[i] = i; + } + std::sort(idx_sorted_by_width_data, idx_sorted_by_width_data + batch, + [&_width](int a, int b) { + return _width.data()[a] > _width.data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_data[k]] - last_width; + sub_col = k + 1; + + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + + // move on + last_width = width_data[idx_sorted_by_width_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (_input->dims().size() == 1) { + //_layout_input.reshape_batch_sequence({dim0}, new_offset); + } else { + //_layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + + framework::LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize(framework::make_ddim({dim0, dim1})); + } + + auto* new_emb = _layout_input->mutable_data(ctx.GetPlace()); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + _input->data() + dim1 * offset[idx_sorted_by_width_data[j]] + + dim1 * i, + dim1 * sizeof(T)); + } + } + // end of reorganizing the input + } + + void copy_back(const framework::ExecutionContext& ctx, T* from, T* to, + int step) const { + auto* _input = ctx.Input("X"); + auto* _layout_input = ctx.Output("layout_input"); + auto* _idx_sorted_by_width = ctx.Output("idx_sorted_by_width"); + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_data = _idx_sorted_by_width->data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + step * (offset[idx_sorted_by_width_data[j]] + i), + from + (new_offset[i] + j) * step, step * sizeof(T)); + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* wi = ctx.Input("Wi"); + auto* wh = ctx.Input("Wh"); + auto* top = ctx.Output("Out"); + auto* _buffer = ctx.Output("tmp_buffer"); + + // std::vector _blobs{wi, wh}; + + int _cap_h = ctx.Attr("num_hidden"); + int _cap_e = ctx.Attr("num_input"); + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + framework::LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + auto* top_hidden = top->mutable_data(framework::make_ddim(top_dims_vec), + ctx.GetPlace()); + + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + const auto* e2h = dense_e2h; + const auto* e2hr 
= dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + prepare_layout(ctx, bottom); + auto* _layout_input = ctx.Output("layout_input"); + auto* new_emb = _layout_input->mutable_data(ctx.GetPlace()); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize(framework::make_ddim({20, _cap_l, _cap_h})); + auto* buffer_data = _buffer->mutable_data(ctx.GetPlace()); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + // precompute embedding to hidden + auto blas = math::GetBlas(ctx); + call_gemm(blas, CblasNoTrans, CblasTrans, _cap_l, _cap_h, _cap_e, 1.0f, + new_emb, e2h, 0.0f, w_x_e); + call_gemm(blas, CblasNoTrans, CblasTrans, _cap_l, _cap_h, _cap_e, 1.0f, + new_emb, e2hr, 0.0f, wr_x_e); + call_gemm(blas, CblasNoTrans, CblasTrans, _cap_l, _cap_h, _cap_e, 1.0f, + new_emb, e2hz, 0.0f, wz_x_e); + + // precompute hidden0 + for (int i = 0; i < batch * _cap_h; i++) { + tilde[i] = std::tanh(w_x_e[i]); + z[i] = sigmoid(wz_x_e[i]); + hidden[i] = (1. 
- z[i]) * tilde[i]; + } + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + call_gemm(blas, CblasNoTrans, CblasTrans, w, _cap_h, _cap_h, 1.0f, htm1, + h2h, 0.0f, u_x_h + new_offset[i] * _cap_h); + call_gemm(blas, CblasNoTrans, CblasTrans, w, _cap_h, _cap_h, 1.0f, htm1, + h2hr, 0.0f, ur_x_h + new_offset[i] * _cap_h); + call_gemm(blas, CblasNoTrans, CblasTrans, w, _cap_h, _cap_h, 1.0f, htm1, + h2hz, 0.0f, uz_x_h + new_offset[i] * _cap_h); + + // compute the gate and hidden + for (size_t j = new_offset[i] * _cap_h; j < (new_offset[i] + w) * _cap_h; + j++) { + r[j] = sigmoid(wr_x_e[j] + ur_x_h[j]); + z[j] = sigmoid(wz_x_e[j] + uz_x_h[j]); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + + hidden[j] = z[j] * hidden[j - _cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } + } + + // copy back to top + copy_back(ctx, hidden, top_hidden, _cap_h); + } +}; + +class SearchGrnnOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Wi"), "Input(Wi) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Wh"), "Input(Wh) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Wi"))) { + ctx->SetOutputDim(framework::GradVarName("Wi"), ctx->GetInputDim("Wi")); + } + if (ctx->HasOutput(framework::GradVarName("Wh"))) { + ctx->SetOutputDim(framework::GradVarName("Wh"), ctx->GetInputDim("Wh")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchGrnnOPGradKernel : public framework::OpKernel { + public: + void do_same_layout(const framework::ExecutionContext& ctx, const T* from, + T* to, int step) const { + auto* _input = ctx.Input("X"); + auto* _layout_input = ctx.Input("layout_input"); + auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + auto* _idx_sorted_by_width = ctx.Input("idx_sorted_by_width"); + const int* idx_sorted_by_width_data = _idx_sorted_by_width->data(); + + for (int i = 0; i < _layout_input->lod()[0].size() - 1; i++) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + (new_offset[i] + j) * step, + from + step * (offset[idx_sorted_by_width_data[j]] + i), + step * sizeof(T)); + } + } + } + + void copy_back(const framework::ExecutionContext& ctx, T* from, T* to, + int step) const { + auto* _input = ctx.Input("X"); + auto* _layout_input = ctx.Input("layout_input"); + auto* _idx_sorted_by_width = ctx.Input("idx_sorted_by_width"); + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_data = _idx_sorted_by_width->data(); + for (size_t i = 0; i < 
_layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + step * (offset[idx_sorted_by_width_data[j]] + i), + from + (new_offset[i] + j) * step, step * sizeof(T)); + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* wi = ctx.Input("Wi"); + auto* wh = ctx.Input("Wh"); + auto* _buffer = ctx.Input("tmp_buffer"); + auto* _layout_input = ctx.Input("layout_input"); + + // std::vector _blobs{wi, wh}; + + int _cap_h = ctx.Attr("num_hidden"); + int _cap_e = ctx.Attr("num_input"); + int _cap_l = bottom->dims()[0]; + + auto* d_bottom = ctx.Output(framework::GradVarName("X")); + auto* d_top = ctx.Input(framework::GradVarName("Out")); + auto* d_wi = ctx.Output(framework::GradVarName("Wi")); + auto* d_wh = ctx.Output(framework::GradVarName("Wh")); + + int batch = bottom->lod()[0].size() - 1; + + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // the original top and bottom pointers + auto* top_diff = d_top->data(); + auto* ediff = d_bottom->mutable_data(ctx.GetPlace()); + + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + auto* dense_e2h_diff = d_wi->mutable_data(ctx.GetPlace()); + auto* dense_h2h_diff = d_wh->mutable_data(ctx.GetPlace()); + // init parameter's diff + memset(dense_e2h_diff, 0, 3 * _cap_e * _cap_h * sizeof(T)); + memset(dense_h2h_diff, 0, 3 * _cap_h * _cap_h * sizeof(T)); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + auto* e2h_diff = dense_e2h_diff; + auto* e2hr_diff = dense_e2h_diff + 1 * _cap_e * _cap_h; + auto* e2hz_diff = dense_e2h_diff + 2 * _cap_e * _cap_h; + auto* h2h_diff = dense_h2h_diff; + auto* h2hr_diff = dense_h2h_diff + 1 * _cap_h * _cap_h; + auto* h2hz_diff = dense_h2h_diff + 2 * _cap_h * _cap_h; + + auto u_x_h = _buffer->data() + 3 * _cap_l * _cap_h; + + Tensor buffer_diff; + buffer_diff.Resize(framework::make_ddim({20, _cap_l, _cap_h})); + auto* buffer_diff_data = buffer_diff.mutable_data(ctx.GetPlace()); + + auto e2hdiff = buffer_diff_data + 0 * _cap_l * _cap_h; + auto e2hrdiff = buffer_diff_data + 1 * _cap_l * _cap_h; + auto e2hzdiff = buffer_diff_data + 2 * _cap_l * _cap_h; + + auto h2hdiff = buffer_diff_data + 3 * _cap_l * _cap_h; + auto h2hrdiff = buffer_diff_data + 4 * _cap_l * _cap_h; + auto h2hzdiff = buffer_diff_data + 5 * _cap_l * _cap_h; + + auto* buffer_data = _buffer->data(); + auto r = buffer_data + 6 * _cap_l * _cap_h; + auto z = buffer_data + 7 * _cap_l * _cap_h; + auto tilde = buffer_data + 8 * _cap_l * _cap_h; + + auto d_r = buffer_diff_data + 9 * _cap_l * _cap_h; + auto d_z = buffer_diff_data + 10 * _cap_l * _cap_h; + auto d_tilde = buffer_diff_data + 11 * _cap_l * _cap_h; + + auto tmp_buffer = buffer_diff_data + 12 * _cap_l * _cap_h; + + auto hidden = buffer_data + 19 * _cap_l * _cap_h; + auto hidden_diff = buffer_diff_data + 19 * _cap_l * _cap_h; + auto embedding = _layout_input->data(); + Tensor _layout_input_grad; + _layout_input_grad.Resize(_layout_input->dims()); + auto embedding_diff = _layout_input_grad.mutable_data(ctx.GetPlace()); + + // copy top_hiddden diff back to the reorganized hidden, so we can use + // segemm to back-prop the sequence + do_same_layout(ctx, top_diff, 
hidden_diff, _cap_h); + + // precompute nonlinear diff + for (int k = 0; k < new_offset[1] * _cap_h; k++) { + d_z[k] = SIGMOID_D(z[k]); + d_tilde[k] = TANHD(tilde[k]); + } + + for (int k = new_offset[1] * _cap_h; k < new_offset[max_width] * _cap_h; + k++) { + d_r[k] = SIGMOID_D(r[k]); + d_z[k] = SIGMOID_D(z[k]); + d_tilde[k] = TANHD(tilde[k]); + } + + auto blas = math::GetBlas(ctx); + // back prop + for (int i = max_width - 1; i > 0; i--) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + for (int j = new_offset[i]; j < (new_offset[i] + w); j++) { + for (int k = 0; k < _cap_h; k++) { + int ht = j * _cap_h + k; + int htm1 = ht - _cap_h * w_tm1; + + T common = (1.0 - z[ht]) * d_tilde[ht] * hidden_diff[ht]; + + h2hdiff[htm1] = common * r[ht]; + h2hrdiff[htm1] = common * u_x_h[ht] * d_r[ht]; + h2hzdiff[htm1] = + (hidden[htm1] - tilde[ht]) * d_z[ht] * hidden_diff[ht]; + + e2hdiff[ht] = common; + e2hrdiff[ht] = h2hrdiff[htm1]; + e2hzdiff[ht] = h2hzdiff[htm1]; + } + } + + auto* hidden_htm1 = hidden + new_offset[i - 1] * _cap_h; + auto* h2hdiff_htm1 = h2hdiff + new_offset[i - 1] * _cap_h; + auto* h2hrdiff_htm1 = h2hrdiff + new_offset[i - 1] * _cap_h; + auto* h2hzdiff_htm1 = h2hzdiff + new_offset[i - 1] * _cap_h; + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_h, w, (T)1.0, + h2hdiff_htm1, hidden_htm1, (T)1.0, h2h_diff); + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_h, w, (T)1.0, + h2hrdiff_htm1, hidden_htm1, (T)1.0, h2hr_diff); + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_h, w, (T)1.0, + h2hzdiff_htm1, hidden_htm1, (T)1.0, h2hz_diff); + + auto* embedding_et = embedding + new_offset[i] * _cap_e; + auto* e2hdiff_ht = e2hdiff + new_offset[i] * _cap_h; + auto* e2hrdiff_ht = e2hrdiff + new_offset[i] * _cap_h; + auto* e2hzdiff_ht = e2hzdiff + new_offset[i] * _cap_h; + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, w, (T)1.0, + e2hdiff_ht, embedding_et, (T)1.0, e2h_diff); + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, w, (T)1.0, + e2hrdiff_ht, embedding_et, (T)1.0, e2hr_diff); + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, w, (T)1.0, + e2hzdiff_ht, embedding_et, (T)1.0, e2hz_diff); + + sse_eltmul(z + new_offset[i] * _cap_h, + hidden_diff + new_offset[i] * _cap_h, + tmp_buffer + new_offset[i - 1] * _cap_h, _cap_h * w); + // add this with diff from top + sse_eltadd(hidden_diff + new_offset[i - 1] * _cap_h, + tmp_buffer + new_offset[i - 1] * _cap_h, + hidden_diff + new_offset[i - 1] * _cap_h, _cap_h * w); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_h, _cap_h, (T)1.0, + h2hdiff_htm1, h2h, (T)1.0, + hidden_diff + new_offset[i - 1] * _cap_h); + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_h, _cap_h, (T)1.0, + h2hrdiff_htm1, h2hr, (T)1.0, + hidden_diff + new_offset[i - 1] * _cap_h); + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_h, _cap_h, (T)1.0, + h2hzdiff_htm1, h2hz, (T)1.0, + hidden_diff + new_offset[i - 1] * _cap_h); + + // bp embedding diff + auto* embedding_diff_et = embedding_diff + new_offset[i] * _cap_e; + + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_e, _cap_h, (T)1.0, + e2hdiff_ht, e2h, (T)0.0, embedding_diff_et); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_e, _cap_h, (T)1.0, + e2hrdiff_ht, e2hr, (T)1.0, embedding_diff_et); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_e, _cap_h, (T)1.0, + e2hzdiff_ht, e2hz, (T)1.0, embedding_diff_et); + } + + for (int i = 0; i < batch * _cap_h; i++) { + e2hdiff[i] = (1. 
- z[i]) * d_tilde[i] * hidden_diff[i]; + e2hzdiff[i] = (-tilde[i]) * d_z[i] * hidden_diff[i]; + } + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, batch, (T)1.0, + e2hdiff, embedding, (T)1.0, e2h_diff); + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, batch, (T)1.0, + e2hzdiff, embedding, (T)1.0, e2hz_diff); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, batch, _cap_e, _cap_h, (T)1.0, + e2hdiff, e2h, (T)0.0, embedding_diff); + call_gemm(blas, CblasNoTrans, CblasNoTrans, batch, _cap_e, _cap_h, (T)1.0, + e2hzdiff, e2hz, (T)1.0, embedding_diff); + + // copy back to original embedding diff, and hidden diff (probablly no use, + // but for safety) + copy_back(ctx, embedding_diff, ediff, _cap_e); + //_layout_helper.copy_back(hidden_diff, top_diff, _cap_h); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_grnn, ops::SearchGrnnOP, ops::SearchGrnnOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_grnn_grad, ops::SearchGrnnOpGrad); + +REGISTER_OP_CPU_KERNEL(search_grnn, + ops::CPUSearchGrnnOPKernel + // ops::CPUSearchGrnnOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_grnn_grad, + ops::CPUSearchGrnnOPGradKernel + // ops::CPUSearchGrnnOPGradKernel +); diff --git a/paddle/fluid/operators/search_grnn_op.h b/paddle/fluid/operators/search_grnn_op.h new file mode 100644 index 00000000000000..80411bb0262894 --- /dev/null +++ b/paddle/fluid/operators/search_grnn_op.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/search_group_padding_op.cc b/paddle/fluid/operators/search_group_padding_op.cc new file mode 100644 index 00000000000000..7ebadebed5322a --- /dev/null +++ b/paddle/fluid/operators/search_group_padding_op.cc @@ -0,0 +1,222 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchGroupPaddingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("pad_id", "pad_id").SetDefault(0).EqualGreaterThan(0); + + AddOutput("Out_emb_padding", "Out_emb_padding"); + AddOutput("Out_new", "Out_new"); + AddOutput("Out_padding", "Out_padding"); + + AddComment(R"DOC( + SearchGroupPadding + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchGroupPaddingOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out_emb_padding"), + "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out_new"), + "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out_padding"), + "Out(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + framework::VarDesc* x_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + } + + ctx->SetOutputDim("Out_emb_padding", framework::make_ddim({-1, x_dims[1]})); + ctx->SetOutputDim("Out_new", framework::make_ddim({x_dims[0], 1})); + // ctx->ShareLoD("X", /*->*/ "Out_new"); + ctx->SetOutputDim("Out_padding", framework::make_ddim({-1, 1})); + } +}; + +template +class CPUSearchGroupPaddingOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* top0 = ctx.Output("Out_emb_padding"); + auto* top1 = ctx.Output("Out_new"); + auto* top2 = ctx.Output("Out_padding"); + + int _pad_id = ctx.Attr("pad_id"); + + int batch = bottom0->lod()[0].size() - 1; + int dim0 = bottom0->dims()[0]; + int dim1 = bottom0->dims()[1]; // dim1 is usually the embedding size + + const auto offset = bottom0->lod()[0]; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + + std::vector new_offset; + new_offset.resize(batch + 1); + + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + + // for padding data + framework::LoD top0_lod; + top0_lod.push_back(new_offset); + top0->set_lod(top0_lod); + top0->Resize(framework::make_ddim({batch * max_seq, dim1})); + + // for origin input id + // already set by ShareLoD in InferShape + framework::LoD top1_lod; + top1_lod.push_back(offset); + top1->set_lod(top1_lod); + top1->Resize(framework::make_ddim({dim0, 1})); + memset(top1->mutable_data(ctx.GetPlace()), 0, + 
top1->dims()[0] * top1->dims()[1] * sizeof(T)); + + // for padding input id + framework::LoD top2_lod; + top2_lod.push_back(new_offset); + top2->set_lod(top2_lod); + top2->Resize(framework::make_ddim({batch * max_seq, 1})); + + // copy data + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(ctx.GetPlace()); + auto* top_padding_input_data = top2->mutable_data(ctx.GetPlace()); + for (int i = 0; i < batch; i++) { + const int copy_step = offset[i + 1] - offset[i]; + const int start = i * max_seq; + memcpy(top_data + start * dim1, bottom_data + offset[i] * dim1, + copy_step * dim1 * sizeof(T)); + memset(top_data + (start + copy_step) * dim1, 0, + (max_seq - copy_step) * dim1 * sizeof(T)); + // for padding input id + memset(top_padding_input_data + start, 0, copy_step * sizeof(T)); + for (int j = start + copy_step; j < start + max_seq; j++) { + top_padding_input_data[j] = static_cast(_pad_id); + } + } + } +}; + +class SearchGroupPaddingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out_emb_padding")), + "Input(Out_emb_padding@GRAD) of SequencePadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } +}; + +template +class CPUSearchGroupPaddingOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* top0 = ctx.Input("Out_emb_padding"); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_out = + ctx.Input(framework::GradVarName("Out_emb_padding")); + + int batch = bottom0->lod()[0].size() - 1; + int dim1 = bottom0->dims()[1]; // dim1 is usually the embedding size + + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + const auto* top_diff = d_out->data(); + const auto offset = bottom0->lod()[0]; + const auto top_offset = top0->lod()[0]; + for (int i = 0; i < batch; i++) { + const int step = offset[i + 1] - offset[i]; + memcpy(bottom_diff + offset[i] * dim1, top_diff + top_offset[i] * dim1, + step * dim1 * sizeof(T)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_group_padding, ops::SearchGroupPaddingOP, + ops::SearchGroupPaddingOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_group_padding_grad, ops::SearchGroupPaddingOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_group_padding, + ops::CPUSearchGroupPaddingOPKernel + // ops::CPUSearchGroupPaddingOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_group_padding_grad, + ops::CPUSearchGroupPaddingOPGradKernel + // ops::CPUSearchGroupPaddingOPGradKernel +); diff --git a/paddle/fluid/operators/search_seq_arithmetic_op.cc b/paddle/fluid/operators/search_seq_arithmetic_op.cc new file mode 100644 index 00000000000000..8b9048150f520e --- /dev/null +++ b/paddle/fluid/operators/search_seq_arithmetic_op.cc @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "search_compute.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchSeqArithmeticOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Y", + "Y (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("op_type", "operation type: 1: add; 2: sub; 3: mul") + .SetDefault(0) + .EqualGreaterThan(1); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + + AddComment(R"DOC( + SearchSeqArithmetic + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchSeqArithmeticOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Y should be 2-D tensor"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + framework::Variable* y_var = + boost::get(ctx->GetInputVarPtrs("Y")[0]); + const auto& y_lod = y_var->Get().lod(); + PADDLE_ENFORCE(!y_lod.empty(), "The Input(Y) must hold lod info."); + const auto& y_lod_0 = y_lod[0]; + PADDLE_ENFORCE_GE(y_lod_0.size(), 2, + "The Input(Y)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + y_dims[0], static_cast(y_lod_0.back()), + "The Input(Y)'s lod info mismatches the actual tensor shape."); + + PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(), + "The Length of X and Y must be equal."); + } else { + // compile time + framework::VarDesc* x_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + framework::VarDesc* y_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(y_desc->GetLoDLevel(), 1); + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, x_dims[1]})); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class CPUSearchSeqArithmeticOPKernel : public framework::OpKernel { + public: + void 
Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* top = ctx.Output("Out"); + + int _op_type = ctx.Attr("op_type"); + + auto len1 = bottom0->dims()[0] * bottom0->dims()[1]; + auto len2 = bottom1->dims()[0] * bottom1->dims()[1]; + const auto* bottom_data0 = bottom0->data(); + const auto* bottom_data1 = bottom1->data(); + // already set by ShareLoD in InferShape + // framework::LoD top_lod; + // top_lod.push_back(offset); + // top->set_lod(top_lod); + top->Resize(framework::make_ddim({bottom0->dims()[0], bottom0->dims()[1]})); + auto* top_data = top->mutable_data(ctx.GetPlace()); + + switch (_op_type) { + case 1: // addition: top[0] = bottom[0] + bottom[1] + if (len1 > len2) { + sse_eltadd(bottom_data0, bottom_data1, top_data, len2); + memcpy(&top_data[len2], &bottom_data0[len2], + (len1 - len2) * sizeof(T)); + } else { + sse_eltadd(bottom_data0, bottom_data1, top_data, len1); + } + break; + case 2: // substraction: top[0] = bottom[0] - bottom[1] + memcpy(top_data, bottom_data0, len1 * sizeof(T)); + if (len1 > len2) { + sse_axpy(bottom_data1, top_data, len2, (T)-1.0); + } else { + sse_axpy(bottom_data1, top_data, len1, (T)-1.0); + } + break; + case 3: // multiplication: top[0] = bottom[0] * bottom[1] + if (len1 > len2) { + sse_eltmul(bottom_data0, bottom_data1, top_data, len2); + memcpy(&top_data[len2], &bottom_data0[len2], + (len1 - len2) * sizeof(T)); + } else { + sse_eltmul(bottom_data0, bottom_data1, top_data, len1); + } + break; + } + } +}; + +class SearchSeqArithmeticOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Y"))) { + ctx->SetOutputDim(framework::GradVarName("Y"), ctx->GetInputDim("Y")); + ctx->ShareLoD("Y", /*->*/ framework::GradVarName("Y")); + } + } +}; + +template +class CPUSearchSeqArithmeticOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_y = ctx.Output(framework::GradVarName("Y")); + int _op_type = ctx.Attr("op_type"); + + auto len1 = bottom0->dims()[0] * bottom0->dims()[1]; + auto len2 = bottom1->dims()[0] * bottom1->dims()[1]; + auto* bottom_diff0 = d_x->mutable_data(ctx.GetPlace()); + auto* bottom_diff1 = d_y->mutable_data(ctx.GetPlace()); + const auto* top_diff = d_out->data(); + const auto* bottom_data0 = bottom0->data(); + const auto* bottom_data1 = bottom1->data(); + + switch (_op_type) { + case 1: // addition + memcpy(bottom_diff0, top_diff, len1 * sizeof(T)); + if (len1 >= len2) { + memcpy(bottom_diff1, top_diff, len2 * sizeof(T)); + } else { + memset(bottom_diff1, 0, len2 * sizeof(T)); + memcpy(bottom_diff1, top_diff, len1 * sizeof(T)); + } + break; + case 2: // substraction + memcpy(bottom_diff0, 
top_diff, len1 * sizeof(T)); + if (len1 >= len2) { + sse_axpy_noadd(top_diff, bottom_diff1, len2, (T)-1.0); + } else { + memset(bottom_diff1, 0, len2 * sizeof(T)); + sse_axpy_noadd(top_diff, bottom_diff1, len1, (T)-1.0); + } + break; + case 3: // multiplication + if (len1 >= len2) { + memcpy(bottom_diff0, top_diff, len1 * sizeof(T)); + sse_eltmul(top_diff, bottom_data1, bottom_diff0, len2); + sse_eltmul(top_diff, bottom_data0, bottom_diff1, len2); + } else { + sse_eltmul(top_diff, bottom_data1, bottom_diff0, len1); + memset(bottom_diff1, 0, len2 * sizeof(T)); + sse_eltmul(top_diff, bottom_data0, bottom_diff1, len1); + } + break; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_seq_arithmetic, ops::SearchSeqArithmeticOP, + ops::SearchSeqArithmeticOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_seq_arithmetic_grad, ops::SearchSeqArithmeticOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_seq_arithmetic, + ops::CPUSearchSeqArithmeticOPKernel + // ops::CPUSearchSeqArithmeticOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_seq_arithmetic_grad, + ops::CPUSearchSeqArithmeticOPGradKernel + // ops::CPUSearchSeqArithmeticOPGradKernel +); diff --git a/paddle/fluid/operators/search_seq_depadding_op.cc b/paddle/fluid/operators/search_seq_depadding_op.cc new file mode 100644 index 00000000000000..8cf0e5a2b6f6e7 --- /dev/null +++ b/paddle/fluid/operators/search_seq_depadding_op.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchSeqDepaddingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Pad", + "Pad (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Src", + "Src (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddOutput("Out", "Out"); + + AddComment(R"DOC( + SearchSeqDepadding + + NOTE: only support 'float32' data type now. 
+ +)DOC"); + } +}; + +class SearchSeqDepaddingOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Pad"), "Pad(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Src"), "Src(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + + auto pad_dims = ctx->GetInputDim("Pad"); + PADDLE_ENFORCE_EQ(pad_dims.size(), 2, + "The rank of Pad(Input) should be 2."); + + auto src_dims = ctx->GetInputDim("Src"); + PADDLE_ENFORCE_EQ(src_dims.size(), 2, + "The rank of Src(Input) should be 2."); + + if (ctx->IsRuntime()) { + framework::Variable* pad_var = + boost::get(ctx->GetInputVarPtrs("Pad")[0]); + const auto& pad_lod = pad_var->Get().lod(); + PADDLE_ENFORCE(!pad_lod.empty(), "The Input(Pad) must hold lod info."); + const auto& pad_lod_0 = pad_lod[0]; + PADDLE_ENFORCE_GE(pad_lod_0.size(), 2, + "The Input(Pad)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + pad_dims[0], static_cast(pad_lod_0.back()), + "The Input(Pad)'s lod info mismatches the actual tensor shape."); + + framework::Variable* src_var = + boost::get(ctx->GetInputVarPtrs("Src")[0]); + const auto& src_lod = src_var->Get().lod(); + PADDLE_ENFORCE(!src_lod.empty(), "The Input(Src) must hold lod info."); + const auto& src_lod_0 = src_lod[0]; + PADDLE_ENFORCE_GE(src_lod_0.size(), 2, + "The Input(Src)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + src_dims[0], static_cast(src_lod_0.back()), + "The Input(Src)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, pad_dims[1]})); + //ctx->ShareLoD("Src", /*->*/ "Out"); + } +}; + +template +class CPUSearchSeqDepaddingOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("Pad"); + auto* bottom1 = ctx.Input("Src"); + auto* top0 = ctx.Output("Out"); + + const int pad_batch = bottom0->lod()[0].size() - 1; + const int src_batch = bottom1->lod()[0].size() - 1; + PADDLE_ENFORCE_EQ(pad_batch % src_batch, 0, + "Mismatch batch size, bottom0: %d, bottom1: %d", + pad_batch, src_batch); + + const auto& src_offset = bottom1->lod()[0]; + const auto& pad_offset = bottom0->lod()[0]; + const int src_cap_l = bottom1->dims()[0]; + const int pad_cap_e = bottom0->dims()[1]; + + framework::LoD top0_lod; + top0_lod.push_back(src_offset); + top0->set_lod(top0_lod); + top0->Resize(framework::make_ddim({src_cap_l, pad_cap_e})); + + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(ctx.GetPlace()); + for (int i = 0; i < src_batch; ++i) { + const int src_i_l = src_offset[i + 1] - src_offset[i]; + const int pad_i_l = pad_offset[i + 1] - pad_offset[i]; + PADDLE_ENFORCE_GE( + pad_i_l, src_i_l, + "the length of padding seq input is less than source seq input."); + memcpy(top_data + src_offset[i] * pad_cap_e, + bottom_data + pad_offset[i] * pad_cap_e, + src_i_l * pad_cap_e * sizeof(T)); + } + } +}; + +class SearchSeqDepaddingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Pad"), "Input(Pad) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Src"), "Input(Src) should not be null."); + 
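+    // The forward inputs Pad and Src are kept as grad-op inputs because the
+    // CPU grad kernel below re-uses their LoD: Out@GRAD is scattered back
+    // into the padded layout and the tail of each padded sequence is
+    // zero-filled.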
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("Pad"))) { + ctx->SetOutputDim(framework::GradVarName("Pad"), ctx->GetInputDim("Pad")); + ctx->ShareLoD("Pad", /*->*/ framework::GradVarName("Pad")); + } + if (ctx->HasOutput(framework::GradVarName("Src"))) { + ctx->SetOutputDim(framework::GradVarName("Src"), ctx->GetInputDim("Src")); + ctx->ShareLoD("Src", /*->*/ framework::GradVarName("Src")); + } + } +}; + +template +class CPUSearchSeqDepaddingOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("Pad"); + auto* bottom1 = ctx.Input("Src"); + auto* d_bottom0 = ctx.Output(framework::GradVarName("Pad")); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + + const int src_batch = bottom1->lod()[0].size() - 1; + const auto& src_offset = bottom1->lod()[0]; + const auto& pad_offset = bottom0->lod()[0]; + const int pad_cap_e = bottom0->dims()[1]; + + const auto* top_diff = d_out->data(); + auto* bottom_diff = d_bottom0->mutable_data(ctx.GetPlace()); + for (int i = 0; i < src_batch; i++) { + const int src_i_l = src_offset[i + 1] - src_offset[i]; + const int pad_i_l = pad_offset[i + 1] - pad_offset[i]; + PADDLE_ENFORCE_GE( + pad_i_l, src_i_l, + "the length of padding seq input is less than source seq input."); + + memcpy(bottom_diff + pad_offset[i] * pad_cap_e, + top_diff + src_offset[i] * pad_cap_e, + src_i_l * pad_cap_e * sizeof(T)); + memset(bottom_diff + (pad_offset[i] + src_i_l) * pad_cap_e, 0, + (pad_i_l - src_i_l) * pad_cap_e * sizeof(T)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_seq_depadding, ops::SearchSeqDepaddingOP, + ops::SearchSeqDepaddingOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_seq_depadding_grad, ops::SearchSeqDepaddingOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_seq_depadding, + ops::CPUSearchSeqDepaddingOPKernel + // ops::CPUSearchSeqDepaddingOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_seq_depadding_grad, + ops::CPUSearchSeqDepaddingOPGradKernel + // ops::CPUSearchSeqDepaddingOPGradKernel +); diff --git a/paddle/fluid/operators/search_seq_fc_op.cc b/paddle/fluid/operators/search_seq_fc_op.cc new file mode 100644 index 00000000000000..b007a44e701d0f --- /dev/null +++ b/paddle/fluid/operators/search_seq_fc_op.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "search_compute.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchSeqFCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor)"); + AddInput("b", "b (LoDTensor)"); + AddAttr("out_size", "out_size: the output size") + .SetDefault(0) + .EqualGreaterThan(1); + AddAttr("has_bias", "true or false").SetDefault(true); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + + AddComment(R"DOC( + SearchSeqFC + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchSeqFCOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "W(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + int out_size = ctx->Attrs().Get("out_size"); + bool has_bias = ctx->Attrs().Get("has_bias"); + + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto w_dims = ctx->GetInputDim("W"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + + PADDLE_ENFORCE_EQ(w_dims[0], out_size, + "wrong shape: w_dims[0] != out_size"); + + PADDLE_ENFORCE_EQ(w_dims[1], x_dims[1], + "wrong shape: w_dims[1] != x_dims[1]"); + + if (has_bias) { + PADDLE_ENFORCE(ctx->HasInput("b"), "b(Input) should not be null."); + auto b_dims = ctx->GetInputDim("b"); + PADDLE_ENFORCE_EQ(b_dims.size(), 1, "b should be 1-D tensor"); + } + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, out_size})); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class CPUSearchSeqFCOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* b = ctx.Input("b"); + auto* top = ctx.Output("Out"); + bool _bias_term = ctx.Attr("has_bias"); + + int _out = w->dims()[0]; + int _in = w->dims()[1]; + int res_num = bottom->dims()[0]; + + top->Resize(framework::make_ddim({res_num, _out})); + const auto* bottom_data = bottom->data(); + auto* top_data = top->mutable_data(ctx.GetPlace()); + const auto* weights = w->data(); + + call_gemm(ctx, CblasNoTrans, CblasTrans, res_num, _out, _in, (T)1.0, + bottom_data, weights, (T)0.0, top_data); + + if (_bias_term) { + const auto* bias = b->data();; + for (int i = 0; i < res_num; ++i) { + sse_eltadd(top_data + i * _out, bias, top_data + i * _out, _out); + } + } + } +}; + +class SearchSeqFCOpGrad : public framework::OperatorWithKernel { + public: + using 
framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("b"), "Input(b) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X"))); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W"))); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + + bool has_bias = ctx->Attrs().Get("has_bias"); + if (has_bias) { + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("b"))); + ctx->SetOutputDim(framework::GradVarName("b"), ctx->GetInputDim("b")); + } + } +}; + +template +class CPUSearchSeqFCOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + bool _bias_term = ctx.Attr("has_bias"); + + int _out = w->dims()[0]; + int _in = w->dims()[1]; + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_w = ctx.Output(framework::GradVarName("W")); + + int res_num = bottom->dims()[0]; + + const auto* top_diff = d_out->data(); + const auto* bottom_data = bottom->data(); + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + const auto* weights = w->data(); + auto* weights_diff = d_w->mutable_data(ctx.GetPlace()); + + call_gemm(ctx, CblasTrans, CblasNoTrans, _out, _in, res_num, (T)1.0, + top_diff, bottom_data, (T)0.0, weights_diff); + call_gemm(ctx, CblasNoTrans, CblasNoTrans, res_num, _in, _out, (T)1.0, + top_diff, weights, (T)0.0, bottom_diff); + + if (_bias_term) { + auto* d_b = ctx.Output(framework::GradVarName("b")); + auto* bias_diff = d_b->mutable_data(ctx.GetPlace()); + memset(bias_diff, (T)0.0, _out * sizeof(T)); + for (int i = 0; i < res_num; ++i) { + sse_eltadd(bias_diff, top_diff + i * _out, bias_diff, _out); + } + } + + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_seq_fc, ops::SearchSeqFCOP, ops::SearchSeqFCOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_seq_fc_grad, ops::SearchSeqFCOpGrad); + +REGISTER_OP_CPU_KERNEL(search_seq_fc, + ops::CPUSearchSeqFCOPKernel + // ops::CPUSearchSeqFCOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_seq_fc_grad, + ops::CPUSearchSeqFCOPGradKernel + // ops::CPUSearchSeqFCOPGradKernel +); diff --git a/paddle/fluid/operators/search_seq_softmax_op.cc b/paddle/fluid/operators/search_seq_softmax_op.cc new file mode 100644 index 00000000000000..3829710c5e98f4 --- /dev/null +++ b/paddle/fluid/operators/search_seq_softmax_op.cc @@ -0,0 +1,211 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "search_compute.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchSeqSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("alg", "operation type: 0: accurate; 1: log; others: invalid") + .SetDefault(0) + .EqualGreaterThan(0); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + AddOutput("Out_log", + "Out_log (LoDTensor, default LoDTensor) Output variable"); + + AddComment(R"DOC( + SearchSeqSoftmax + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchSeqSoftmaxOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out_log"), "Out_log(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, x_dims[1]})); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class CPUSearchSeqSoftmaxOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* top0 = ctx.Output("Out"); + auto* _prob = ctx.Output("Out_log"); + int _output_log = ctx.Attr("alg"); + + int seq_size = bottom0->dims()[0]; + int dim = bottom0->dims()[1]; + const auto offset_vec = bottom0->lod()[0]; + top0->Resize(framework::make_ddim({seq_size, dim})); + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(ctx.GetPlace()); + + for (int i = 0; i < seq_size; ++i) { + int offset = i * dim; + auto max_val = + *std::max_element(bottom_data + offset, bottom_data + offset + dim); + max_val *= -1; + sse_add_scalar(bottom_data + offset, top_data + offset, dim, max_val); + for (int j = 0; j < dim; ++j) { + top_data[offset + j] = std::exp(top_data[offset + j]); + } + T sum; + sse_sum(top_data + offset, sum, dim); + sum = 1.0 / sum; + sse_scale(top_data + offset, top_data + offset, dim, sum); + } + + if (_output_log) { + const int size = top0->dims()[0] * top0->dims()[1]; + 
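+      // alg != 0: keep the raw softmax probabilities in Out_log and
+      // overwrite Out with their logarithm, clamping each probability at
+      // X_MIN first so that zero probabilities do not turn into -inf.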
_prob->Resize(framework::make_ddim({size})); + auto* prob_data = _prob->mutable_data(ctx.GetPlace()); + memcpy(prob_data, top_data, size * sizeof(T)); + for (int i = 0; i < size; ++i) { + top_data[i] = std::log(std::max(prob_data[i], X_MIN)); + } + } else { + _prob->Resize(framework::make_ddim({1})); + _prob->mutable_data(ctx.GetPlace()); + } + } +}; + +class SearchSeqSoftmaxOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } +}; + +template +class CPUSearchSeqSoftmaxOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* top0 = ctx.Input("Out"); + auto* _prob = ctx.Input("Out_log"); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + int _output_log = ctx.Attr("alg"); + + int seq_size = bottom0->dims()[0]; + int dim = bottom0->dims()[1]; + const auto* top_diff = d_out->data(); + const auto* top_data = top0->data(); + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + + if (_output_log) { + const auto* prob_data = _prob->data(); + Tensor buffer_diff; + buffer_diff.Resize(_prob->dims()); + auto* prob_diff = buffer_diff.mutable_data(ctx.GetPlace()); + + const int size = top0->dims()[0] * top0->dims()[1]; + PADDLE_ENFORCE_EQ(size, _prob->dims()[0] * _prob->dims()[1], "top_size should be eq to _prob_size"); + for (int i = 0; i < size; ++i) { + prob_diff[i] = top_diff[i] / std::max(prob_data[i], X_MIN); + } + top_diff = prob_diff; + top_data = prob_data; + } + + for (int i = 0; i < seq_size; ++i) { + int offset = i * dim; + T ip_d_t; + sse_ip(top_diff + offset, top_data + offset, dim, ip_d_t); + ip_d_t *= -1; + sse_add_scalar(top_diff + offset, bottom_diff + offset, dim, ip_d_t); + sse_eltmul(top_data + offset, bottom_diff + offset, bottom_diff + offset, + dim); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_seq_softmax, ops::SearchSeqSoftmaxOP, + ops::SearchSeqSoftmaxOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_seq_softmax_grad, ops::SearchSeqSoftmaxOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_seq_softmax, + ops::CPUSearchSeqSoftmaxOPKernel + // ops::CPUSearchSeqSoftmaxOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_seq_softmax_grad, + ops::CPUSearchSeqSoftmaxOPGradKernel + // ops::CPUSearchSeqSoftmaxOPGradKernel +); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc new file mode 100644 index 00000000000000..eb553d0e2a1bdf --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h" +#include + +namespace paddle { +namespace operators { + +class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROW"), + "Input(ROW) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("COLUMN"), + "Input(COLUMN) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("pos"), "pos(out) should not be null"); + + auto attr = ctx->Attrs(); + auto channel_num = attr.Get("channel_num"); + auto topks = attr.Get>("topks"); + + auto row_dim = ctx->GetInputDim("ROW"); + + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + ctx->SetOutputDim("Out", framework::make_ddim(vec_out_shape)); + ctx->ShareLoD("X", "Out"); + } +}; + +class SequenceTopkAvgPoolingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor) The variable-length input of SequenceTopkPoolingOp"); + AddInput("ROW", "(LoDTensor) the row info"); + AddInput("COLUMN", "(LoDTensor) the column info"); + AddOutput( + "Out", + "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD " + "infomation."); + AddOutput("pos", "(Tensor) store the topk index ").AsIntermediate(); + AddAttr>("topks", "topks"); + AddAttr("channel_num", "channel number"); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. 
Some layers may run faster when this is true.") + .SetDefault(false); + AddComment(R"DOC( + sequecen topk average pooling op + )DOC"); + } +}; + +class SequenceTopkAvgPoolingGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null."); + + ctx->ShareDim("X", /*->*/ framework::GradVarName("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // return framework::OpKernelType( + // framework::ToDataType(ctx.Input("X")->type()), + // ctx.device_context()); + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceTopkAvgPoolGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_topk_avg_pooling_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput("ROW", Input("ROW")); + op_desc_ptr->SetInput("COLUMN", Input("COLUMN")); + op_desc_ptr->SetInput("pos", Output("pos")); + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_topk_avg_pooling, ops::SequenceTopkAvgPoolingOp, + ops::SequenceTopkAvgPoolingOpMaker, + ops::SequenceTopkAvgPoolGradOpMaker); +REGISTER_OPERATOR(sequence_topk_avg_pooling_grad, + ops::SequenceTopkAvgPoolingGradOp); +REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling, + ops::SequenceTopkAvgPoolingKernel< + paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling_grad, + ops::SequenceTopkAvgPoolingGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h new file mode 100644 index 00000000000000..d6f7dff0535728 --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -0,0 +1,264 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace { +template +void get_topk_pos(const T* data, int length, int k, int* pos, + bool debug = false) { + size_t real_k = k < length ? k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = -10000000.0; + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} +} // namespace + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceTopkAvgPoolingKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* row = context.Input("ROW"); + auto* col = context.Input("COLUMN"); + auto* out = context.Output("Out"); + auto* pos = context.Output("pos"); + + auto channel_num = context.Attr("channel_num"); + auto topks = context.Attr>("topks"); + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + std::vector vec_pos_shape; + auto in_lod = in->lod()[0]; + + auto row_lod = row->lod()[0]; + auto col_lod = col->lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + vec_pos_shape.push_back(pos_total_size); + pos->Resize({framework::make_ddim(vec_pos_shape)}); + auto pos_data = pos->mutable_data(context.GetPlace()); + + int offset = 0; + framework::Vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + + /* + std::vector vec_out_shape; + vec_out_shape.push_back( offset ); + out->Resize( {framework::make_ddim(vec_out_shape)} ); + */ + + framework::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in->data(); + auto out_data = out->mutable_data(context.GetPlace()); + + T* sum_data = new T[max_k]; + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + PADDLE_ENFORCE_EQ(total_size, channel_num * row_size * col_size, "size wrong in sequence_topk_avg_pooling_op!"); + + int feature_num = row_size * col_size; + for (int j = 0; j < channel_num; ++j) { + auto input_offset_feature_data = in_data + in_lod[i] + j * feature_num; + + for (int r = 0; r < row_size; ++r) { + auto row_data = input_offset_feature_data + r * col_size; + /* + if( ( in_lod[i] + j * feature_num + r * col_size ) <= + 4871 && ( in_lod[i] + j * feature_num + r * col_size ) <= 4908 && + ( in_lod[i] + j * feature_num + ( r + 1 ) * + col_size ) >= 4908 ) + { + LOG(ERROR) << "in same row " << col_size; + LOG(ERROR) << "edge " << in_lod[i] + j * + feature_num + r * col_size << "\t" + << in_lod[i] + j * feature_num + ( r + 1 + ) * col_size; + + + LOG(ERROR) << "max k " << max_k; + for( size_t h = 0; h < col_size; ++h ) + { + LOG(ERROR) << "element " << row_data[h]; + } + + LOG(ERROR) << "31 " << 
row_data[31]; + LOG(ERROR) << "68 " << row_data[68]; + + LOG(ERROR) << "cmp " << ( row_data[31] > + row_data[68] ); LOG(ERROR) << "cmp " << ( row_data[31] == + row_data[68] ); + } + */ + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto out_slice_data = out_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + + get_topk_pos(row_data, col_size, max_k, pos_slice_data); + if (pos_slice_data[0] == -1) { + sum_data[0] = 0.0; + } else { + sum_data[0] = row_data[pos_slice_data[0]]; + } + for (int k = 1; k < max_k; ++k) { + if (pos_slice_data[k] == -1) { + sum_data[k] = sum_data[k - 1]; + } else { + sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; + } + } + /* + LOG(ERROR) << "topk_avg_debug row: " << row_lod.size() + << ", col:" << col_lod.size() << ", k_size:" << k_num << "\n"; for( + size_t k = 0; k < k_num; ++k ) + { + LOG(ERROR) << "pos in ff: " << in_lod[i] + j * + feature_num + r * col_size + pos_slice_data[k]; + } + */ + for (size_t k = 0; k < k_num; ++k) { + out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; + } + } + } + } + delete[] sum_data; + } +}; + +template +class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* in_grad = context.Output(framework::GradVarName("X")); + auto* pos_input = context.Input("pos"); + auto* row_input = context.Input("ROW"); + auto* col_input = context.Input("COLUMN"); + auto* forward_input = context.Input("X"); + + int batch_size = row_input->lod()[0].size() - 1; + auto channel_num = context.Attr("channel_num"); + auto topks = context.Attr>("topks"); + auto k_num = topks.size(); + auto max_k = topks[k_num - 1]; + + auto out_lod = forward_input->lod(); + in_grad->set_lod(out_lod); + + in_grad->mutable_data(context.GetPlace()); + auto pos_data = pos_input->data(); + auto out_data = out_grad->data(); + + auto& dev_ctx = + context.template device_context(); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + + auto in_data = in_grad->data(); + + auto out_offset = out_lod[0]; + auto row_lod = row_input->lod()[0]; + auto col_lod = col_input->lod()[0]; + + // LOG( ERROR ) << "----------------------------------"; + for (int i = 0; i < batch_size; ++i) { + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + int feature_num = row_size * col_size; + + for (int j = 0; j < channel_num; ++j) { + auto in_offset_feature_data = in_data + out_offset[i] + j * feature_num; + + for (int r = 0; r < row_size; r++) { + auto row_data = out_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto in_slice_data = in_offset_feature_data + r * col_size; + + for (size_t m = 0; m < k_num; ++m) { + for (int k = 0; k < topks[m]; ++k) { + if (pos_slice_data[k] == -1) { + break; + } else { + in_slice_data[pos_slice_data[k]] += row_data[m] / topks[m]; + /* + if ( out_offset[i] + j * feature_num + + r * col_size + pos_slice_data[k] == 4909 ) + { + LOG(ERROR) << "pos in bp " << + out_offset[i] + j * feature_num + r * col_size + + pos_slice_data[k] << "\t" << row_data[m] / topks[m]; + } + */ + } + } + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git 
a/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.cc new file mode 100644 index 00000000000000..3243e2a496fe02 --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h" +#include + +namespace paddle { +namespace operators { + +class SequenceTopkPoolingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("pos"), + "pos(out) should not be null"); + + auto attr = ctx->Attrs(); + auto channel_num = attr.Get("channel_num"); + auto topk = attr.Get("topk"); + + std::vector vec_out_shape; + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + vec_out_shape.push_back(x_lod_0.size() - 1); + } + else + { + vec_out_shape.push_back( -1 ); + } + + vec_out_shape.push_back( channel_num * topk ); + + ctx->SetOutputDim("Out", framework::make_ddim(vec_out_shape)); + ctx->ShareLoD("X", "Out"); + } +}; + +class SequenceTopkPoolingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(LoDTensor) The variable-length input of SequenceTopkPoolingOp"); + AddOutput("Out", + "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD " + "infomation."); + AddOutput("pos", + "(Tensor) store the topk index " ).AsIntermediate(); + AddAttr("topk", + "topk attr"); + AddAttr("channel_num", + "channel number"); + AddAttr("batch_size", + "batch size"); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. 
Some layers may run faster when this is true.") + .SetDefault(false); + AddComment(R"DOC( + sequecen topk pooling op + )DOC"); + } +}; + +class SequenceTopkPoolingGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null."); + + ctx->ShareDim("X", /*->*/ framework::GradVarName("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + //return framework::OpKernelType( + // framework::ToDataType(ctx.Input("X")->type()), + // ctx.device_context()); + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceTopkPoolingGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_topk_pooling_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput("pos", Output("pos")); + + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_topk_pooling, ops::SequenceTopkPoolingOp, ops::SequenceTopkPoolingOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_topk_pooling_grad, ops::SequenceTopkPoolingGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_topk_pooling, + ops::SequenceTopkPoolingKernel); +REGISTER_OP_CPU_KERNEL( + sequence_topk_pooling_grad, + ops::SequenceTopkPoolingGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h new file mode 100644 index 00000000000000..122ec0b8e35c03 --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace { +template +void get_topk_pos(const T* data, int length, int k, int* pos) { + size_t real_k = k < length ? 
k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = -10000000.0; + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} +} // namespace + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceTopkPoolingKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + auto* pos = context.Output("pos"); + + auto channel_num = context.Attr("channel_num"); + auto topk = context.Attr("topk"); + std::vector vec_pos_shape; + auto batch_size = in->lod()[0].size() - 1; + vec_pos_shape.push_back(batch_size * channel_num * topk); + pos->Resize({framework::make_ddim(vec_pos_shape)}); + auto pos_data = pos->mutable_data(context.GetPlace()); + + auto in_lod = in->lod()[0]; + + framework::Vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + vec_out_lod.push_back(i * channel_num * topk); + } + framework::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in->data(); + auto out_data = out->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + if (total_size % channel_num != 0) { + LOG(ERROR) << "input cannot mod channel num"; + } + + int feature_num = total_size / channel_num; + auto in_offset_data = in_data + in_lod[i]; + for (int j = 0; j < channel_num; ++j) { + auto input_slice_data = in_offset_data + j * feature_num; + auto pos_slice_data = pos_data + i * topk * channel_num + j * topk; + auto out_slice_data = out_data + i * topk * channel_num + j * topk; + + get_topk_pos(input_slice_data, feature_num, topk, pos_slice_data); + for (int k = 0; k < topk; ++k) { + if (pos_slice_data[k] == -1) { + out_slice_data[k] = 0.0; + } else { + out_slice_data[k] = input_slice_data[pos_slice_data[k]]; + } + } + } + } + } +}; + +template +class SequenceTopkPoolingGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* in_grad = context.Output(framework::GradVarName("X")); + auto* pos_input = context.Input("pos"); + auto* real_input = context.Input("X"); + + auto channel_num = context.Attr("channel_num"); + auto topk = context.Attr("topk"); + + auto out_lod = real_input->lod(); + in_grad->set_lod(out_lod); + + in_grad->mutable_data(context.GetPlace()); + auto pos_data = pos_input->data(); + + auto out_data = out_grad->data(); + + auto& dev_ctx = + context.template device_context(); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + + auto in_data = in_grad->data(); + + auto out_offset = out_lod[0]; + + auto batch_size = real_input->lod()[0].size() - 1; + for (int i = 0; i < batch_size; ++i) { + auto in_offset_data = in_data + out_offset[i]; + int total_size = out_offset[i + 1] - out_offset[i]; + int feature_num = total_size / channel_num; + + 
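+      // Scatter pass: for each (channel, k) slot of this sequence, route the
+      // incoming gradient back to the input position recorded in "pos"
+      // during the forward top-k selection; slots padded with -1 (fewer than
+      // topk valid elements) are skipped.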
for (int j = 0; j < channel_num; ++j) { + auto in_slice_data = in_offset_data + j * feature_num; + auto pos_slice_data = pos_data + i * channel_num * topk + j * topk; + auto out_slice_data = out_data + i * channel_num * topk + j * topk; + + for (int k = 0; k < topk; ++k) { + if (pos_slice_data[k] == -1) { + continue; + } else { + // LOG(ERROR) << i << " " << j << " " << k << " " << + // pos_slice_data[k] << " " << out_slice_data[k]; + in_slice_data[pos_slice_data[k]] = out_slice_data[k]; + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index bb6a1c5b165693..bcc1dd89e562a7 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -53,6 +53,16 @@ class CPUUniformRandomKernel : public framework::OpKernel { for (int64_t i = 0; i < size; ++i) { data[i] = dist(engine); } + // Init diag element + unsigned int diag_num = static_cast(ctx.Attr("diag_num")); + unsigned int diag_step = static_cast(ctx.Attr("diag_step")); + auto diag_val = static_cast(ctx.Attr("diag_val")); + if (diag_num > 0) { + for (int64_t i = 0; i < diag_num; ++i) { + int64_t pos = i*diag_step + i; + data[pos] = diag_val; + } + } } }; @@ -67,6 +77,10 @@ class UniformRandomOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), "uniform_random's min must less then max"); + PADDLE_ENFORCE_GE(ctx->Attrs().Get("diag_num"), 0, + "diag_num must greater than 0"); + PADDLE_ENFORCE_GE(ctx->Attrs().Get("diag_step"), 0, + "diag_step must greater than 0"); auto &shape = ctx->Attrs().Get>("shape"); std::vector temp; temp.reserve(shape.size()); @@ -105,6 +119,13 @@ uniform distribution. The random result is in set [min, max]. "Note that if seed is not 0, this operator will always " "generate the same random numbers every time. [default 0].") .SetDefault(0); + AddAttr("diag_num", "The number of diag elements. Note that if " + "diag_num is 0, it means without diag init.[default 0].") + .SetDefault(0); + AddAttr("diag_step", "The step between two diag element.[default 0].") + .SetDefault(0); + AddAttr("diag_val", "The value of diag initialition. [default 1.0].") + .SetDefault(1.0f); AddAttr("dtype", "Output tensor data type. [default 5(FP32)].") .SetDefault(framework::proto::VarType::FP32); } diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc new file mode 100644 index 00000000000000..78c2aa713eba6f --- /dev/null +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -0,0 +1,470 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/var_conv_2d_op.h" +#ifndef WIN32 +//#include "naive_gemm.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class VarConv2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor), the filter."); + AddAttr("InputChannel", "the input filter num").SetDefault(1); + AddAttr("OutputChannel", "the output filter num").SetDefault(1); + AddAttr("StrideH", "the height of Stride").SetDefault(1); + AddAttr("StrideW", "the width of Stride").SetDefault(1); + AddAttr("KernelH", "the height of Kernel").SetDefault(1); + AddAttr("KernelW", "the width of Kernel").SetDefault(1); + + AddOutput("Out", "(LoDTensor, default LoDTensor) Output variable"); + AddOutput("Col", + "(LoDTensor, default LoDTensor) the intermediate result " + "variable"); + + AddComment(R"DOC( + Var Size Conv Operator + + This operator calculate Out = \sigma \left ( W * X + b \right ), + only support 2-D for X. + the input is a level-3 LodTensor: + + NOTE: only support 'float32' data type now. + + )DOC"); + } +}; + +class VarConv2dOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "X(Input) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "W(Input) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Col"), + "Col(Output) of VarConv2dOP should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of X(Input) can't be less than 2."); + + auto w_dims = ctx->GetInputDim("W"); + /* + for (int i = 0; i < w_dims.size(); i++) { + LOG(ERROR) << "var_conv_2d: w_dims[" << i << "]:" << w_dims << "]"; + } + */ + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + int output_channel = ctx->Attrs().Get("OutputChannel"); + int input_channel = ctx->Attrs().Get("InputChannel"); + int kernel_h = ctx->Attrs().Get("KernelH"); + int kernel_w = ctx->Attrs().Get("KernelW"); + PADDLE_ENFORCE_EQ(w_dims[0], output_channel, + "W dim[0] should be equal to OutputChannel"); + PADDLE_ENFORCE_EQ( + w_dims[1], input_channel * kernel_h * kernel_w, + "W dim[1] should be equal to InputChannel * StrideH * StrideW"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + PADDLE_ENFORCE_GE(x_lod.size(), 3, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod[0].back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + std::vector out_dims_vec{-1}; + out_dims_vec.push_back(1); + std::vector col_dims_vec{-1}; + col_dims_vec.push_back(1); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Col", framework::make_ddim(col_dims_vec)); + } + } +}; + +template +class 
CPUVarConv2dOPKernel : public framework::OpKernel { + public: + void Im2Col(const framework::ExecutionContext& ctx, const LoDTensor& input, + LoDTensor* col) const { + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + int stride_h = ctx.Attr("StrideH"); + int stride_w = ctx.Attr("StrideW"); + + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + const auto& offset_x = input.lod()[2]; + const auto& offset_y = input.lod()[1]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_y * top_im_x; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + framework::LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + auto* top_data = col->mutable_data(framework::make_ddim(col_dims_vec), + ctx.GetPlace()); + auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* top = ctx.Output("Out"); + auto* col = ctx.Output("Col"); + + int output_channel = ctx.Attr("OutputChannel"); + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + int stride_h = ctx.Attr("StrideH"); + int stride_w = ctx.Attr("StrideW"); + + Im2Col(ctx, *bottom, col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = bottom->lod()[2]; + const auto& offset_y = bottom->lod()[1]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = 
offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + framework::LoD top_lod; + top_lod.push_back(top_offset); + top_lod.push_back(bottom->lod()[1]); + top_lod.push_back(bottom->lod()[2]); + + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + auto* top_data = top->mutable_data(framework::make_ddim(top_dims_vec), + ctx.GetPlace()); + + auto* w_data = w->data(); + auto* col_data = col->data(); + +#ifndef WIN32 +#ifndef __NAIVE_GEMM__ + auto blas = math::GetBlas(ctx); +#endif // !__NAIVE_GEMM__ + +#endif + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } +#ifndef WIN32 + +#ifndef __NAIVE_GEMM__ + blas.GEMM(CblasNoTrans, CblasNoTrans, output_channel, top_im_size, + input_channel * kernel_h * kernel_w, 1.0, w_data, + col_data + col_offset[b], 0.0, top_data + top_offset[b]); +#else + naive::gemm(false, false, output_channel, top_im_size, + input_channel * kernel_h * kernel_w, 1.0, w_data, + col_data + col_offset[b], 0.0, top_data + top_offset[b]); + +#endif // !__NAIVE_GEMM__ + +#endif + } + } +}; + +class VarConv2dOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Col")), + // "Input(Col@GRAD) of SequencePadGradOp should not be + // null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("W"))) { + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + } + } +}; + +template +class CPUVarConv2dOPGradKernel : public framework::OpKernel { + public: + void Im2ColGrad(const framework::ExecutionContext& ctx, T* top_diff) const { + auto* x = ctx.Input("X"); + auto* col = ctx.Input("Col"); + + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + int stride_h = ctx.Attr("StrideH"); + int stride_w = ctx.Attr("StrideW"); + + auto* d_x = ctx.Output(framework::GradVarName("X")); + + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + memset(bottom_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + + const auto& bottom_offset = x->lod()[0]; + const auto& offset_x = x->lod()[2]; + const auto& offset_y = x->lod()[1]; + const auto& top_offset = col->lod()[0]; + int batch = x->lod()[0].size() - 1; + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height 
= offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + bottom_diff[b_offset + im_offset + im_y * width + im_x] += + top_diff[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset]; + } + } + } + } + } + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* col = ctx.Input("Col"); + auto* out = ctx.Input("Out"); + + int output_channel = ctx.Attr("OutputChannel"); + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + + auto* d_out = ctx.Input(framework::GradVarName("Out")); + // auto* d_col = ctx.Input(framework::GradVarName("Col")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_w = ctx.Output(framework::GradVarName("W")); + + Tensor col_grad; + col_grad.Resize(col->dims()); + auto* col_diff = col_grad.mutable_data(ctx.GetPlace()); + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + auto* w_diff = d_w->mutable_data(ctx.GetPlace()); + // auto* col_diff = const_cast(d_col->data()); + memset(bottom_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + memset(w_diff, 0.0, w->dims()[0] * w->dims()[1] * sizeof(T)); + memset(col_diff, 0.0, col->dims()[0] * col->dims()[1] * sizeof(T)); + auto* top_diff = d_out->data(); + auto* w_data = w->data(); + auto* col_data = col->data(); + int batch = x->lod()[0].size() - 1; + const auto& top_offset = out->lod()[0]; + const auto& col_offset = col->lod()[0]; +#ifndef WIN32 +#ifndef __NAIVE_GEMM__ + auto blas = math::GetBlas(ctx); +#endif // !__NAIVE_GEMM__ +#endif + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } +#ifndef WIN32 + +#ifndef __NAIVE_GEMM__ + blas.GEMM(CblasTrans, CblasNoTrans, input_channel * kernel_h * kernel_w, + top_im_size, output_channel, 1.0, w_data, + top_diff + top_offset[b], 1.0, col_diff + col_offset[b]); +#else + naive::gemm(true, false, input_channel * kernel_h * kernel_w, + top_im_size, output_channel, 1.0, w_data, + top_diff + top_offset[b], 1.0, col_diff + col_offset[b]); +#endif // !__NAIVE_GEMM__ + +#ifndef __NAIVE_GEMM__ + blas.GEMM(CblasNoTrans, CblasTrans, output_channel, + input_channel * kernel_h * kernel_w, top_im_size, 1.0, + top_diff + top_offset[b], col_data + col_offset[b], 1.0, + w_diff); +#else + naive::gemm(false, true, output_channel, + input_channel * kernel_h * kernel_w, top_im_size, 1.0, + top_diff + top_offset[b], col_data + col_offset[b], 1.0, + w_diff); +#endif // !__NAIVE_GEMM__ + +#endif + } + Im2ColGrad(ctx, col_diff); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(var_conv_2d, ops::VarConv2dOP, ops::VarConv2dOpMaker, + 
frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(var_conv_2d_grad, ops::VarConv2dOpGrad); + +REGISTER_OP_CPU_KERNEL(var_conv_2d, + ops::CPUVarConv2dOPKernel + // ops::CPUVarConv2dOPKernel +); +REGISTER_OP_CPU_KERNEL( + var_conv_2d_grad, + ops::CPUVarConv2dOPGradKernel + // ops::CPUVarConv2dOPGradKernel +); diff --git a/paddle/fluid/operators/var_conv_2d_op.h b/paddle/fluid/operators/var_conv_2d_op.h new file mode 100644 index 00000000000000..80ccb94fbaf664 --- /dev/null +++ b/paddle/fluid/operators/var_conv_2d_op.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; +/* +class MatchMatrixTensorOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class MatchMatrixTensorOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class MatchMatrixTensorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; +*/ +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index da2591b98058a2..4b9c9d2cd59a94 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -250,32 +250,41 @@ def __call__(self, var, block): return op -class NormalInitializer(Initializer): - """Implements the Random Normal(Gaussian) distribution initializer +class UniformInitializer(Initializer): + """Implements the random uniform distribution initializer Args: - loc (float): mean of the normal distribution - scale (float): standard deviation of the normal distribution + low (float): lower boundary of the uniform distribution + high (float): upper boundary of the uniform distribution seed (int): random seed Examples: .. 
code-block:: python + import paddle.fluid as fluid + x = fluid.layers.data(name='x', shape=[1], dtype='float32') fc = fluid.layers.fc(input=x, size=10, - param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) + param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5)) """ - def __init__(self, loc=0.0, scale=1.0, seed=0): - assert loc is not None - assert scale is not None + def __init__(self, low=-1.0, high=1.0, seed=0, diag_num=0, diag_step=0, diag_val=1.0): + assert low is not None + assert high is not None + assert high >= low assert seed is not None - super(NormalInitializer, self).__init__() - self._mean = loc - self._std_dev = scale + assert diag_num is not None + assert diag_step is not None + assert diag_val is not None + super(UniformInitializer, self).__init__() + self._low = low + self._high = high self._seed = seed + self._diag_num = diag_num + self._diag_step = diag_step + self._diag_val = diag_val def __call__(self, var, block): - """Add normal distribution initialization ops for a variable + """Add uniform distribution initialization ops for a variable Args: var: Variable that needs to be initialized @@ -291,11 +300,12 @@ def __call__(self, var, block): if self._seed == 0: self._seed = block.program.random_seed - # to be compatible of fp16 initalizers + # to be compatible of fp16 initializers if var.dtype == VarDesc.VarType.FP16: out_dtype = VarDesc.VarType.FP32 out_var = block.create_var( - name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + name=unique_name.generate(".".join( + ['uniform_random', var.name, 'tmp'])), shape=var.shape, dtype=out_dtype, type=VarDesc.VarType.LOD_TENSOR, @@ -305,15 +315,17 @@ def __call__(self, var, block): out_var = var op = block._prepend_op( - type="gaussian_random", + type="uniform_random", outputs={"Out": out_var}, attrs={ "shape": var.shape, "dtype": out_dtype, - "mean": self._mean, - "std": self._std_dev, + "min": self._low, + "max": self._high, "seed": self._seed, - "use_mkldnn": False + "diag_num": self._diag_num, + "diag_step": self._diag_step, + "diag_val": self._diag_val }, stop_gradient=True) @@ -324,6 +336,7 @@ def __call__(self, var, block): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) + if not framework.in_dygraph_mode(): var.op = op return op diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2bac9dd9a46b1b..9befbcfe424872 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -197,6 +197,20 @@ 'pixel_shuffle', 'fsp_matrix', 'continuous_value_model', + 'match_matrix_tensor', + 'var_conv_2d', + 'sequence_topk_avg_pooling', + 'sequence_topk_pooling', + 'search_fc', + 'search_seq_fc', + 'search_grnn', + 'search_embedding', + 'search_seq_arithmetic', + 'search_aligned_mat_mul', + 'search_attention_padding_mask', + 'search_group_padding', + 'search_seq_depadding', + 'search_seq_softmax', ] kIgnoreIndex = -100 @@ -11267,3 +11281,490 @@ def continuous_value_model(input, cvm, use_cvm=True): outputs={'Y': [out]}, attrs={"use_cvm": use_cvm}) return out + +def sequence_topk_pooling(input, topk, batch_size, channel_num): + """ + + TODO: + """ + helper = LayerHelper('sequence_topk_pooling', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + pos = helper.create_variable_for_type_inference(dtype=helper.input_dtype(), + stop_gradient=True) + helper.append_op( + type='sequence_topk_pooling', + inputs={'X': input}, + outputs={'Out': out, + 'pos': pos}, + attrs={'topk': topk, + 
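+               # batch_size is forwarded here as an op attribute; note that the
+               # CPU kernel itself derives the batch count from the input LoD.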
'batch_size': batch_size, + 'channel_num': channel_num}) + return out + + +def sequence_topk_avg_pooling(input, row, col, topks, channel_num): + """ + + TODO: + """ + helper = LayerHelper('sequence_topk_avg_pooling', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + pos = helper.create_variable_for_type_inference(dtype=helper.input_dtype(), + stop_gradient=True) + helper.append_op( + type='sequence_topk_avg_pooling', + inputs={'X': input, + 'ROW': row, + 'COLUMN': col}, + outputs={'Out': out, + 'pos': pos}, + attrs={'topks': topks, + 'channel_num': channel_num}) + return out + + +def var_conv_2d(input, + input_channel, + output_channel, + filter_size, + stride=1, + param_attr=None, + act=None, + dtype='float32', + name=None): + """ + + :param input: + :param input_channel: + :param output_channel: + :param filter_size: + :param stride: + :param param_attr: + :param act: + :param dtype: + :param name: + :return: + """ + helper = LayerHelper('var_conv_2d', **locals()) + x_shape = list(input.shape) + assert len(x_shape) == 2 + + filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') + stride = utils.convert_to_list(stride, 2, 'stride') + + filter_shape = [int(output_channel), + int(input_channel) * filter_size[0] * filter_size[1]] + filter_param = helper.create_parameter( + attr=helper.param_attr, + shape=filter_shape, + dtype=dtype, + ) + + conv_res = helper.create_variable_for_type_inference(dtype) + tmp_res = helper.create_variable_for_type_inference(dtype, stop_gradient=True) + + helper.append_op( + type='var_conv_2d', + inputs={ + 'X': input, + 'W': filter_param, + }, + outputs={"Out": conv_res, "Col": tmp_res}, + attrs={ + 'InputChannel': input_channel, + 'OutputChannel': output_channel, + 'StrideH': stride[0], + 'StrideW': stride[1], + 'KernelH': filter_size[0], + 'KernelW': filter_size[1], + } + ) + + return helper.append_activation(conv_res) + + +def match_matrix_tensor( + input_x, + input_y, + dim_t, + act=None, + param_attr=None, + dtype='float32', + is_test=False, + name=None): + """ + + :param input_x: + :param input_y: + :param dim_t: + :param act: + :param param_attr: + :param dtype: + :param is_test: + :param name: + :return: + """ + helper = LayerHelper('match_matrix_tensor', **locals()) + + x_shape = list(input_x.shape) + y_shape = list(input_y.shape) + assert len(x_shape) == 2 and len(y_shape) == 2 and x_shape[-1] == y_shape[-1] + + weight_shape = [x_shape[-1], dim_t, y_shape[-1]] + w = helper.create_parameter( + attr=helper.param_attr, shape=weight_shape, dtype=dtype, is_bias=False) + mm_res = helper.create_variable_for_type_inference(dtype) + tmp_res = helper.create_variable_for_type_inference(dtype, stop_gradient=True) + helper.append_op( + type='match_matrix_tensor', + inputs={ + 'X': input_x, + 'Y': input_y, + 'W': w, + }, + outputs={"Out": mm_res, "Tmp": tmp_res}, + attrs={'dim_t': dim_t} + ) + + return helper.append_activation(mm_res), tmp_res + + +def search_fc( + input, + size, + param_attr=None, + bias_attr=None, + act=None, + is_test=False, + name=None): + """ + + TODO: + """ + helper = LayerHelper('search_fc', **locals()) + dtype = input.dtype + input_shape = list(input.shape) + assert len(input_shape) == 2 + w_shape = [size, input_shape[1]] + w = helper.create_parameter(attr=param_attr, shape=w_shape, dtype=dtype, is_bias=False) + b_shape = [size] + b = helper.create_parameter(attr=bias_attr, shape=b_shape, dtype=dtype, is_bias=False) + res = helper.create_variable_for_type_inference(dtype) + 
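+    # A minimal FC-style projection: W has shape [size, in_dim] and b has shape
+    # [size], so the op presumably computes Out = X * W^T + b (the exact
+    # behavior is defined by the kernel in search_fc_op.cc).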
helper.append_op( + type='search_fc', + inputs={ + 'X': input, + 'W': w, + 'b': b, + }, + outputs={"Out": res, }, + attrs={'out_size': size, } + ) + + return res + + +def search_seq_fc( + input, + size, + param_attr=None, + bias_attr=None, + act=None, + is_test=False, + name=None): + """ + + TODO: + """ + helper = LayerHelper('search_seq_fc', **locals()) + dtype = input.dtype + input_shape = list(input.shape) + assert len(input_shape) == 2 + w_shape = [size, input_shape[1]] + w = helper.create_parameter(attr=param_attr, shape=w_shape, dtype=dtype, is_bias=False) + input_dict = {'X': input, 'W': w,} + has_bias = False + if bias_attr is not None: + b_shape = [size] + b = helper.create_parameter(attr=bias_attr, shape=b_shape, dtype=dtype, is_bias=False) + input_dict['b'] = b + has_bias = True + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_seq_fc', + inputs=input_dict, + outputs={"Out": res, }, + attrs={'out_size': size, 'has_bias': has_bias} + ) + + return res + + +def search_grnn( + input, + num_input, + num_hidden, + param_attr_in, + param_attr_hidden, + dtype='float32', + is_test=False, + name=None): + """ + + TODO: + """ + + helper = LayerHelper('search_grnn', **locals()) + + input_shape = list(input.shape) + assert len(input_shape) == 2 and input_shape[-1] == num_input + + _cap_h = num_hidden + _cap_e = input_shape[-1] + wi_shape = [3, _cap_h, _cap_e] + wh_shape = [3, _cap_h, _cap_h] + wi = helper.create_parameter( + attr=param_attr_in, shape=wi_shape, dtype=dtype, is_bias=False) + wh = helper.create_parameter( + attr=param_attr_hidden, shape=wh_shape, dtype=dtype, is_bias=False) + + grnn_res = helper.create_variable_for_type_inference(dtype) + grnn_buffer = helper.create_variable_for_type_inference(dtype) + grnn_idx_sorted_by_width = helper.create_variable_for_type_inference(dtype) + grnn_layout_input = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type='search_grnn', + inputs={ + 'X': input, + 'Wi': wi, + 'Wh': wh, + }, + outputs={"Out": grnn_res, + "tmp_buffer": grnn_buffer, + 'idx_sorted_by_width': grnn_idx_sorted_by_width, + 'layout_input': grnn_layout_input + }, + attrs={'num_input': num_input, 'num_hidden': num_hidden} + ) + + return grnn_res + + +def search_embedding( + input, + num_voc, + num_emb, + lr, + param_attr=None, + name=None, + dtype='float32'): + """ + + :param input: + :param num_voc: + :param num_emb: + :param lr: + :param param_attr: + :param name: + :param dtype: + :return: + """ + helper = LayerHelper('search_embedding', **locals()) + + w_shape = [num_voc, num_emb] + w = helper.create_parameter(attr=param_attr, shape=w_shape, dtype=dtype, is_bias=False) + w.stop_gradient = True + + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_embedding', + inputs={ + 'X': input, + 'W': w, + }, + outputs={"Out": res, }, + attrs={'num_voc': num_voc, 'num_emb': num_emb, 'lr': lr, } + ) + + return res + + +def search_seq_arithmetic( + input_x, + input_y, + op_type, + name=None): + """ + :param input_x: + :param input_y: + :param op_type: + :param name: + :return: + """ + helper = LayerHelper('search_seq_arithmetic', **locals()) + dtype = input_x.dtype + + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_seq_arithmetic', + inputs={ + 'X': input_x, + 'Y': input_y, + }, + outputs={"Out": res}, + attrs={'op_type': op_type} + ) + + return res + + +def search_aligned_mat_mul( + input_x, + input_y, + transpose_x, + transpose_y, + alpha, 
+ name=None): + """ + :param input_x: + :param input_y: + :param transpose_x: + :param transpose_y: + :param alpha: + :param name: + :return: + """ + helper = LayerHelper('search_aligned_mat_mul', **locals()) + dtype = input_x.dtype + + out = helper.create_variable_for_type_inference(dtype) + _a_addr = helper.create_variable_for_type_inference(dtype) + _b_addr = helper.create_variable_for_type_inference(dtype) + _c_addr = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_aligned_mat_mul', + inputs={ + 'X': input_x, + 'Y': input_y, + }, + outputs={"Out": out, '_a_addr': _a_addr, '_b_addr': _b_addr, '_c_addr': _c_addr}, + attrs={'transpose_X': transpose_x, 'transpose_Y': transpose_y, + 'alpha': alpha} + ) + + return out + + +def search_attention_padding_mask( + input_x, + input_y, + pad_id, + mask, + name=None): + """ + :param input_x: + :param input_y: + :param pad_id: + :param mask: + :param name: + :return: + """ + helper = LayerHelper('search_attention_padding_mask', **locals()) + dtype = input_x.dtype + + out = helper.create_variable_for_type_inference(dtype) + pad_begin = helper.create_variable_for_type_inference('int') + helper.append_op( + type='search_attention_padding_mask', + inputs={ + 'X': input_x, + 'Y': input_y, + }, + outputs={"Out": out, 'pad_begin': pad_begin}, + attrs={'pad_id': pad_id, 'mask': mask} + ) + + return out + + +def search_group_padding( + input, + pad_id, + name=None): + """ + :param input: + :param pad_id: + :param name: + :return: + """ + helper = LayerHelper('search_group_padding', **locals()) + dtype = input.dtype + + out_emb_padding = helper.create_variable_for_type_inference(dtype) + out_new = helper.create_variable_for_type_inference(dtype, stop_gradient=True) + out_padding = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_group_padding', + inputs={ + 'X': input, + }, + outputs={"Out_emb_padding": out_emb_padding, + 'Out_new': out_new, + 'Out_padding': out_padding, + }, + attrs={'pad_id': pad_id} + ) + + return [out_emb_padding, out_new, out_padding] + + +def search_seq_depadding( + input_pad, + input_src, + name=None): + """ + :param input_pad: + :param input_src: + :param name: + :return: + """ + helper = LayerHelper('search_seq_depadding', **locals()) + dtype = input_pad.dtype + + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_seq_depadding', + inputs={ + 'Pad': input_pad, + 'Src': input_src, + }, + outputs={"Out": out}, + ) + + return out + + +def search_seq_softmax( + input_x, + alg, + name=None): + """ + :param input_x: + :param alg: + :param name: + :return: + """ + helper = LayerHelper('search_seq_softmax', **locals()) + dtype = input_x.dtype + + out = helper.create_variable_for_type_inference(dtype) + out_log = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_seq_softmax', + inputs={ + 'X': input_x, + }, + outputs={"Out": out, 'Out_log': out_log}, + attrs={'alg': alg} + ) From b7b8daf3422f84864fc720a04b22995cc4308fb9 Mon Sep 17 00:00:00 2001 From: zhangliujie Date: Thu, 1 Aug 2019 13:23:40 +0800 Subject: [PATCH 2/3] fix noraml init bug --- python/paddle/fluid/initializer.py | 77 ++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 4b9c9d2cd59a94..3c6c30069e5916 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -250,6 +250,83 @@ def __call__(self, var, 
block): return op +class NormalInitializer(Initializer): + """Implements the Random Normal(Gaussian) distribution initializer + Args: + loc (float): mean of the normal distribution + scale (float): standard deviation of the normal distribution + seed (int): random seed + Examples: + .. code-block:: python + import paddle.fluid as fluid + x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) + """ + + def __init__(self, loc=0.0, scale=1.0, seed=0): + assert loc is not None + assert scale is not None + assert seed is not None + super(NormalInitializer, self).__init__() + self._mean = loc + self._std_dev = scale + self._seed = seed + + def __call__(self, var, block): + """Add normal distribution initialization ops for a variable + Args: + var: Variable that needs to be initialized + block: The block in which initialization ops + should be added + Returns: + the initialization op + """ + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + # Initialization Ops should be prepended and not appended + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['gaussian_random', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + + op = block._prepend_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": var.shape, + "dtype": out_dtype, + "mean": self._mean, + "std": self._std_dev, + "seed": self._seed, + "use_mkldnn": False + }, + stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + if not framework.in_dygraph_mode(): + var.op = op + return op + + class UniformInitializer(Initializer): """Implements the random uniform distribution initializer From d3c5bfb4d506013bcf03c63b87fa9c1844888091 Mon Sep 17 00:00:00 2001 From: zhangliujie Date: Thu, 1 Aug 2019 14:41:43 +0800 Subject: [PATCH 3/3] fix backward.py by qiaolongfei --- python/paddle/fluid/backward.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 9fd53a74bf5192..bdd95b529db970 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -232,15 +232,19 @@ def _op_can_be_removed_(op_desc, no_grad_set): for arg in op_desc.input_arg_names(): if core.grad_var_suffix() in arg and arg in no_grad_set: x_in = _strip_grad_suffix_(arg) - x_in_var_desc = op_desc.block().find_var_recursive( - cpt.to_bytes(x_in)) - assert x_in_var_desc is not None, "Variable {} not found".format( - x_in) - dtype = x_in_var_desc.dtype() - to_insert.append( - (_create_op_desc_("fill_zeros_like2", {"X": [x_in]}, - {"Out": [arg]}, {"dtype": dtype}), idx)) + (_create_op_desc_("fill_zeros_like", {"X": [x_in]}, + {"Out": [arg]}, {}), idx)) + + # x_in_var_desc = op_desc.block().find_var_recursive( + # cpt.to_bytes(x_in)) + # assert x_in_var_desc is not None, "Variable {} not found".format( + # x_in) + # dtype = x_in_var_desc.dtype() + + # to_insert.append( + # (_create_op_desc_("fill_zeros_like2", {"X": [x_in]}, + # {"Out": 
[arg]}, {"dtype": dtype}), idx)) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)])