From 179e100f533cc8225e7b01e23d7b53b05024f88a Mon Sep 17 00:00:00 2001 From: zhangliujie Date: Wed, 31 Jul 2019 14:10:01 +0800 Subject: [PATCH 1/3] add ops from content dnn --- .../fluid/operators/match_matrix_tensor_op.cc | 644 ++++++++++++++ .../fluid/operators/match_matrix_tensor_op.h | 53 ++ .../operators/search_aligned_mat_mul_op.cc | 373 ++++++++ .../search_attention_padding_mask_op.cc | 263 ++++++ paddle/fluid/operators/search_compute.h | 422 +++++++++ paddle/fluid/operators/search_embedding_op.cc | 370 ++++++++ paddle/fluid/operators/search_fc_op.cc | 331 +++++++ paddle/fluid/operators/search_grnn_op.cc | 807 ++++++++++++++++++ paddle/fluid/operators/search_grnn_op.h | 27 + .../operators/search_group_padding_op.cc | 222 +++++ .../operators/search_seq_arithmetic_op.cc | 255 ++++++ .../operators/search_seq_depadding_op.cc | 207 +++++ paddle/fluid/operators/search_seq_fc_op.cc | 217 +++++ .../fluid/operators/search_seq_softmax_op.cc | 211 +++++ .../sequence_topk_avg_pooling_op.cc | 134 +++ .../sequence_topk_avg_pooling_op.h | 264 ++++++ .../sequence_ops/sequence_topk_pooling_op.cc | 137 +++ .../sequence_ops/sequence_topk_pooling_op.h | 175 ++++ paddle/fluid/operators/uniform_random_op.cc | 21 + paddle/fluid/operators/var_conv_2d_op.cc | 470 ++++++++++ paddle/fluid/operators/var_conv_2d_op.h | 46 + python/paddle/fluid/initializer.py | 49 +- python/paddle/fluid/layers/nn.py | 501 +++++++++++ 23 files changed, 6181 insertions(+), 18 deletions(-) create mode 100644 paddle/fluid/operators/match_matrix_tensor_op.cc create mode 100644 paddle/fluid/operators/match_matrix_tensor_op.h create mode 100644 paddle/fluid/operators/search_aligned_mat_mul_op.cc create mode 100644 paddle/fluid/operators/search_attention_padding_mask_op.cc create mode 100644 paddle/fluid/operators/search_compute.h create mode 100644 paddle/fluid/operators/search_embedding_op.cc create mode 100644 paddle/fluid/operators/search_fc_op.cc create mode 100644 paddle/fluid/operators/search_grnn_op.cc create mode 100644 paddle/fluid/operators/search_grnn_op.h create mode 100644 paddle/fluid/operators/search_group_padding_op.cc create mode 100644 paddle/fluid/operators/search_seq_arithmetic_op.cc create mode 100644 paddle/fluid/operators/search_seq_depadding_op.cc create mode 100644 paddle/fluid/operators/search_seq_fc_op.cc create mode 100644 paddle/fluid/operators/search_seq_softmax_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h create mode 100644 paddle/fluid/operators/var_conv_2d_op.cc create mode 100644 paddle/fluid/operators/var_conv_2d_op.h diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc new file mode 100644 index 00000000000000..e0108f69a82641 --- /dev/null +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -0,0 +1,644 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include +//#include "naive_gemm.h" + +#include "paddle/fluid/operators/match_matrix_tensor_op.h" + +#ifndef WIN32 +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" +#include "paddle/fluid/operators/math/blas.h" + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif +#endif + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +void 
MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), + "X(Input) of MatchMatrix should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Y(Input) of MatchMatrix should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "W(Input) of MatchMatrix should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of Fully Connected should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Tmp"), + "Tmp(Output) of Fully Connected should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + // for (int i = 0; i < x_dims.size(); i++) { + // LOG(ERROR) << "match_matrix_tensor: x_dims[" << i << "]:" << x_dims << + // "]"; + //} + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of Input(X) can't be less than 2."); + + auto y_dims = ctx->GetInputDim("Y"); + /* + for (int i = 0; i < y_dims.size(); i++) { + LOG(ERROR) << "match_matrix_tensor: y_dims[" << i << "]:" << y_dims << "]"; + } + */ + PADDLE_ENFORCE_EQ(y_dims.size(), 2, + "The rank of Input(Y) can't be less than 2."); + + auto w_dims = ctx->GetInputDim("W"); + PADDLE_ENFORCE_EQ(w_dims.size(), 3UL, "W should be 3-D tensor"); + /* + for (int i = 0; i < w_dims.size(); i++) { + LOG(ERROR) << "match_matrix_tensor: w_dims[" << i << "]:" << w_dims << "]"; + } + */ + int dim_t = ctx->Attrs().Get("dim_t"); + + PADDLE_ENFORCE( + x_dims[1] == w_dims[0] && y_dims[1] == w_dims[2] && w_dims[1] == dim_t, + "W 's shape must be X[1] * dim_t * Y[1]."); + + int out_dim_0 = -1; + int tmp_dim_0 = -1; + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + framework::Variable* y_var = + boost::get(ctx->GetInputVarPtrs("Y")[0]); + const auto& y_lod = y_var->Get().lod(); + PADDLE_ENFORCE(!y_lod.empty(), "The Input(Y) must hold lod info."); + const auto& y_lod_0 = y_lod[0]; + PADDLE_ENFORCE_GE(y_lod_0.size(), 2, + "The Input(Y)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + y_dims[0], static_cast(y_lod_0.back()), + "The Input(Y)'s lod info mismatches the actual tensor shape."); + + PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(), + "The Length of X and Y must be equal."); + + out_dim_0 = 0; + for (size_t i = 1; i < x_lod_0.size(); i++) { + int x_len = x_lod_0[i] - x_lod_0[i - 1]; + int y_len = y_lod_0[i] - y_lod_0[i - 1]; + out_dim_0 += (x_len * y_len); + } + out_dim_0 *= dim_t; + + tmp_dim_0 = x_dims[0] * dim_t * x_dims[1]; + } else { + // compile time + framework::VarDesc* x_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + framework::VarDesc* y_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(y_desc->GetLoDLevel(), 1); + } + + std::vector out_dims_vec{out_dim_0}; + out_dims_vec.push_back(1); + std::vector tmp_dims_vec{tmp_dim_0}; + tmp_dims_vec.push_back(1); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Tmp", framework::make_ddim(tmp_dims_vec)); +} + +void MatchMatrixTensorOpGrad::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + 
"Input(Y) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Tmp")), + // "Input(Tmp@GRAD) of SequencePadGradOp should not be + // null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Y"))) { + ctx->SetOutputDim(framework::GradVarName("Y"), ctx->GetInputDim("Y")); + ctx->ShareLoD("Y", /*->*/ framework::GradVarName("Y")); + } + if (ctx->HasOutput(framework::GradVarName("W"))) { + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + } +} + +void MatchMatrixTensorOpMaker::Make() { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Y", + "Y (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor), The weight of X and Y."); + AddAttr("dim_t", "the dim of W").SetDefault(1); + AddOutput("Out", + "(LoDTensor, default LoDTensor) Output variable which " + "is X * W * Y"); + AddOutput("Tmp", + "(LoDTensor, default LoDTensor) tmp variable which is " + "used for X * W"); + AddComment(R"DOC( + Match Matrix Tensor Operator + + This operator calculate X * W * Y, only support 2-D for X and Y. + the output is a level-3 LodTensor: + level_0: dim_t + level_1: query length + level_2: title length + + NOTE: only support 'float32' data type now. + + )DOC"); +} +#ifndef WIN32 + +template +void lego_cpu_gemm(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, + const T* A, const T* B, const T beta, T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + //#ifdef LEGO_USE_FLOAT +#ifndef __NAIVE_GEMM__ + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm(true, (TransA == CblasTrans), (TransB == CblasTrans), M, N, K, + alpha, A, lda, B, ldb, beta, C, N); +#endif // !__NAIVE_GEMM__ + + + + // platform::dynload::cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, + // alpha, + // A, lda, B, ldb, beta, C, N); + // cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + // ldb, + // beta, C, N); + // #else + // cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + // ldb, + // beta, C, N); + // #endif +} + +template +void lego_cpu_gemm_with_lda(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C, int lda) { + // int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + // #ifdef LEGO_USE_FLOAT + +#ifndef __NAIVE_GEMM__ + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm(true, (TransA == CblasTrans), (TransB == CblasTrans), M, N, K, + alpha, A, lda, B, ldb, beta, C, N); +#endif // !__NAIVE_GEMM__ + + + // platform::dynload::cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, + // alpha, + // A, lda, B, ldb, beta, C, N); + // #else + // cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + // ldb, + // beta, C, N); + // #endif +} + +template +inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + + // #if defined(LEGO_AVX) + // lll = len & ~AVX_CUT_LEN_MASK; + // __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + // for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + // _mm256_store_px( + // y + jjj, + // _mm256_add_px(_mm256_load_px(y + jjj), + // _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); + // } + // + // #elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, + _mm_add_px(_mm_load_px(y + jjj), + _mm_mul_px(mm_alpha, _mm_load_px(x + jjj)))); + } + + // #endif + for (; jjj < len; jjj++) { + y[jjj] += alpha * x[jjj]; + } +} +#endif + +template +class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#ifndef WIN32 + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* w = ctx.Input("W"); + auto* out = ctx.Output("Out"); + auto* tmp = ctx.Output("Tmp"); + + int dim_t = ctx.Attr("dim_t"); + + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + auto* out_data = out->mutable_data(ctx.GetPlace()); + memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T)); + + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* t_data = w->data(); + auto* bottom_l_trans_data = tmp->mutable_data(ctx.GetPlace()); + memset(bottom_l_trans_data, 0.0, + tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + auto blas = math::GetBlas(ctx); + + // int M = x->dims()[0], N = dim_t * dim_in, K = dim_in; + lego_cpu_gemm(blas, CblasNoTrans, CblasNoTrans, x->dims()[0], + dim_t * dim_in, dim_in, 1.0f, bottom_l_data, t_data, 0.0f, + bottom_l_trans_data); + // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1.0, + // bottom_l_data, K, t_data, N, 0.0, bottom_l_trans_data, + // N); +/* +if (top_size == 9792) +{ + std::ofstream out_to_file("out.model", std::ios::binary); + out_to_file.write((char*)bottom_l_trans_data, tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + out_to_file.close(); + std::ofstream out_to_file_r("out.model.bottom_r_data", std::ios::binary); + out_to_file_r.write((char*)bottom_r_data, y->dims()[0] * y->dims()[1]*sizeof(float)); + out_to_file_r.close(); + + float * p = new float[9792]; + Tensor out_tensor; + out_tensor.Resize(out->dims()); + auto* out_tensor_data = out_tensor.mutable_data(ctx.GetPlace()); + blas.GEMM(CblasNoTrans, CblasTrans, 4, 51, 128, 1.0f, + bottom_l_trans_data, + 384, + bottom_r_data, + 128, + 0.0f, + 
out_tensor_data, + 51); + + LOG(ERROR) << "check_mkl: p[28] = " << out_tensor_data[28] << " "; + LOG(ERROR) << "check_mkl: p[39] = " << out_tensor_data[39] << " "; + LOG(ERROR) << "check_mkl: p[49] = " << out_tensor_data[49] << " "; + + int n; + n = memcmp(bottom_r_data + 39*128, bottom_r_data + 49*128, 128 ); + LOG(ERROR) << "memcmp = " << n; + + float f28 = 0; + blas.GEMM(CblasNoTrans, CblasTrans, 1, 1, 128, 1.0f, + bottom_l_trans_data, + 384, + bottom_r_data + 28*128, + 128, + 0.0f, + &f28, + 1); + LOG(ERROR) << "check_mkl: single f28 = " << f28 << " "; + + float f39 = 0; + blas.GEMM(CblasNoTrans, CblasTrans, 1, 1, 128, 1.0f, + bottom_l_trans_data, + 384, + bottom_r_data + 39*128, + 128, + 0.0f, + &f39, + 1); + LOG(ERROR) << "check_mkl: single f39 = " << f39 << " "; + + float f49 = 0; + blas.GEMM(CblasNoTrans, CblasTrans, 1, 1, 128, 1.0f, + bottom_l_trans_data, + 384, + bottom_r_data + 49*128, + 128, + 0.0f, + &f49, + 1); + LOG(ERROR) << "check_mkl: single f49 = " << f49 << " "; + + for (int tt= 0; tt < 4*51; tt++) + { + LOG(ERROR) << p[tt] << " "; + } + LOG(ERROR) << "check_end"; +} +*/ + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + auto blas_2 = math::GetBlas(ctx); + lego_cpu_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, len_l, len_r, + dim_in, 1.0f, l_t_data, r_data, 0.0f, top_data, + dim_t * dim_in); + /* + if (top_size == 9792) + { + LOG(ERROR) << "top_565 = " << out_data[565] << " " ; + } + */ + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + framework::LoD out_lod; + // out_lod.push_back(out_lod0); + // out_lod.push_back(out_lod1); + // out_lod.push_back(out_lod2); + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + + out->set_lod(out_lod); + +#endif + } +}; + +template +class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#ifndef WIN32 + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* w = ctx.Input("W"); + auto* tmp = ctx.Input("Tmp"); + + int dim_t = ctx.Attr("dim_t"); + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->data(); + auto* 
bottom_r_data = y->data(); + auto* bottom_l_trans_data = tmp->data(); + + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_y = ctx.Output(framework::GradVarName("Y")); + // auto* d_tmp = ctx.Input(framework::GradVarName("Tmp")); + + Tensor tmp_grad; + tmp_grad.Resize(tmp->dims()); + auto* d_tmp_data = tmp_grad.mutable_data(ctx.GetPlace()); + auto* top_diff = d_out->data(); + auto* bottom_l_diff = d_x->mutable_data(ctx.GetPlace()); + auto* bottom_r_diff = d_y->mutable_data(ctx.GetPlace()); + // auto* d_tmp_data = d_tmp->data(); + auto* bottom_l_trans_diff = const_cast(d_tmp_data); + memset(bottom_l_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + memset(bottom_r_diff, 0.0, y->dims()[0] * y->dims()[1] * sizeof(T)); + memset(bottom_l_trans_diff, 0.0, + tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + + for (int i = 0; i < len_l; i++) { + for (int j = 0; j < len_r; j++) { + auto diff = + top_diff[top_offset[b] + t * len_l * len_r + i * len_r + j]; + auto* l_trans_data = bottom_l_trans_data + + (offset_l[b] + i) * dim_in * dim_t + + t * dim_in; + auto* l_trans_diff = bottom_l_trans_diff + + (offset_l[b] + i) * dim_in * dim_t + + t * dim_in; + auto* r_data = bottom_r_data + (offset_r[b] + j) * dim_in; + auto* r_diff = bottom_r_diff + (offset_r[b] + j) * dim_in; + if (diff != 0.0) { + sse_axpy(r_data, l_trans_diff, dim_in, diff); + sse_axpy(l_trans_data, r_diff, dim_in, diff); + } + } + } + } + } + + auto blas = math::GetBlas(ctx); + + auto* t_data = w->data(); + auto* d_w = ctx.Output(framework::GradVarName("W")); + auto* t_diff = d_w->mutable_data(ctx.GetPlace()); + memset(t_diff, 0.0, w->dims()[0] * w->dims()[1] * w->dims()[2] * sizeof(T)); + // bottom_diff + lego_cpu_gemm(blas, CblasNoTrans, CblasTrans, x->dims()[0], dim_in, + dim_t * dim_in, 1.0f, bottom_l_trans_diff, t_data, 1.0f, + bottom_l_diff); + + // t_diff + lego_cpu_gemm(blas, CblasTrans, CblasNoTrans, dim_in, dim_t * dim_in, + x->dims()[0], 1.0f, bottom_l_data, bottom_l_trans_diff, 1.0f, + t_diff); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(match_matrix_tensor, ops::MatchMatrixTensorOP, + ops::MatchMatrixTensorOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(match_matrix_tensor_grad, ops::MatchMatrixTensorOpGrad); + +REGISTER_OP_CPU_KERNEL( + match_matrix_tensor, + ops::CPUMatchMatrixTensorOPKernel + // ops::CPUMatchMatrixTensorOPKernel +); +REGISTER_OP_CPU_KERNEL( + match_matrix_tensor_grad, + ops::CPUMatchMatrixTensorOPGradKernel + // ops::CPUMatchMatrixTensorOPGradKernel +); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.h b/paddle/fluid/operators/match_matrix_tensor_op.h new file mode 100644 index 00000000000000..847f34f1c71388 --- /dev/null +++ b/paddle/fluid/operators/match_matrix_tensor_op.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +class MatchMatrixTensorOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: +// framework::OpKernelType GetExpectedKernelType( +// const framework::ExecutionContext& ctx) const override; + + private: +}; + +class MatchMatrixTensorOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + + protected: +// framework::OpKernelType GetExpectedKernelType( +// const framework::ExecutionContext& ctx) const override; +}; + +class MatchMatrixTensorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; +} // namespace operators +} // namespace paddle + diff --git a/paddle/fluid/operators/search_aligned_mat_mul_op.cc b/paddle/fluid/operators/search_aligned_mat_mul_op.cc new file mode 100644 index 00000000000000..594771476bb254 --- /dev/null +++ b/paddle/fluid/operators/search_aligned_mat_mul_op.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "search_compute.h" +#include + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; +using DDim = framework::DDim; + +void assign_dims(int64_t x_dims_1, int64_t x_aligned_size, int64_t y_dims_1, + int64_t y_aligned_size, CBLAS_TRANSPOSE trans_x, + CBLAS_TRANSPOSE trans_y, std::vector& _dims) { + std::vector _trans{trans_x, trans_y}; + _dims.resize(3); + + const auto bot0_aligned_size = x_aligned_size; + const auto bot1_aligned_size = y_aligned_size; + + _dims[0] = (_trans[0] == CblasTrans) ? x_dims_1 : bot0_aligned_size; + _dims[1] = (_trans[0] == CblasTrans) ? bot0_aligned_size : x_dims_1; + _dims[2] = (_trans[1] == CblasTrans) ? bot1_aligned_size : y_dims_1; + + int bot1_row_num = (_trans[1] == CblasTrans) ? 
y_dims_1 : bot1_aligned_size; + PADDLE_ENFORCE_EQ(_dims[1], bot1_row_num, + "Mismatch size, bot0_final_cols=[%d] bot1_final_rows=[%d]", + _dims[1], bot1_row_num); +} + +void assign_dims(const DDim& x_dims, const LoD& x_lod, const DDim& y_dims, + const LoD& y_lod, CBLAS_TRANSPOSE trans_x, CBLAS_TRANSPOSE trans_y, + std::vector& _dims) { + + std::vector _trans{trans_x, trans_y}; + _dims.resize(3); + + const auto bot0_aligned_size = static_cast(x_lod[0][1]); + const auto bot1_aligned_size = static_cast(y_lod[0][1]); + + _dims[0] = (_trans[0] == CblasTrans) ? x_dims[1] : bot0_aligned_size; + _dims[1] = (_trans[0] == CblasTrans) ? bot0_aligned_size : x_dims[1]; + _dims[2] = (_trans[1] == CblasTrans) ? bot1_aligned_size : y_dims[1]; + + int bot1_row_num = (_trans[1] == CblasTrans) ? y_dims[1] : bot1_aligned_size; + PADDLE_ENFORCE_EQ(_dims[1], bot1_row_num, + "Mismatch size, bot0_final_cols=[%d] bot1_final_rows=[%d]", _dims[1], + bot1_row_num); +} + +class SearchAlignedMatMulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Y", + "Y (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("transpose_X", "If true, use the transpose of `X`.") + .SetDefault(false); + AddAttr("transpose_Y", "If true, use the transpose of `Y`.") + .SetDefault(false); + AddAttr("alpha", "The scale of Out").SetDefault(1.0f); + + AddOutput("Out", "Out (Tensor, default Tensor) Output variable"); + AddOutput("_a_addr", + "_a_addr (Tensor, default Tensor) Output variable"); + AddOutput("_b_addr", + "_b_addr (Tensor, default Tensor) Output variable"); + AddOutput("_c_addr", + "_c_addr (Tensor, default Tensor) Output variable"); + + AddComment(R"DOC( + SearchAlignedMatMul + + NOTE: only support 'float32' data type now. 
+ +)DOC"); + } +}; + +class SearchAlignedMatMulOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("_a_addr"), + "_a_addr(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("_b_addr"), + "_b_addr(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("_c_addr"), + "_c_addr(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "X should be 2-D tensor"); + + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Y should be 2-D tensor"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + framework::Variable* y_var = + boost::get(ctx->GetInputVarPtrs("Y")[0]); + const auto& y_lod = y_var->Get().lod(); + PADDLE_ENFORCE(!y_lod.empty(), "The Input(Y) must hold lod info."); + const auto& y_lod_0 = y_lod[0]; + PADDLE_ENFORCE_GE(y_lod_0.size(), 2, + "The Input(Y)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + y_dims[0], static_cast(y_lod_0.back()), + "The Input(Y)'s lod info mismatches the actual tensor shape."); + + PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(), + "The Length of X and Y must be equal."); + } else { + // compile time + bool trans_x = ctx->Attrs().Get("transpose_X"); + bool trans_y = ctx->Attrs().Get("transpose_Y"); + + std::vector _trans{CblasNoTrans, CblasNoTrans}; + _trans[0] = trans_x ? CblasTrans : CblasNoTrans; + _trans[1] = trans_y ? 
CblasTrans : CblasNoTrans; + + std::vector _dims; + assign_dims(x_dims[1], -1, y_dims[1], -1, _trans[0], _trans[1], _dims); + ctx->SetOutputDim("Out", framework::make_ddim({-1, _dims[2]})); + } + } +}; + +template +class CPUSearchAlignedMatMulOPKernel : public framework::OpKernel { + public: + void prepare_ff(const framework::ExecutionContext& ctx, + std::vector& _dims) const { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* top = ctx.Output("Out"); + auto* _a_addr = ctx.Output("_a_addr"); + auto* _b_addr = ctx.Output("_b_addr"); + auto* _c_addr = ctx.Output("_c_addr"); + + const int batch = bottom0->lod()[0].size() - 1; + _a_addr->Resize(framework::make_ddim({batch})); + _b_addr->Resize(framework::make_ddim({batch})); + _c_addr->Resize(framework::make_ddim({batch})); + + T** a_addr_data = (T**)_a_addr->mutable_data(ctx.GetPlace()); + T** b_addr_data = (T**)_b_addr->mutable_data(ctx.GetPlace()); + T** c_addr_data = (T**)_c_addr->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_EQ(_dims.size(), 3, "_dims.size() should be eq 3."); + const int bot0_size = _dims[0] * _dims[1]; + const int bot1_size = _dims[1] * _dims[2]; + const int top_size = _dims[0] * _dims[2]; + + for (int i = 0; i < batch; ++i) { + a_addr_data[i] = const_cast(bottom0->data()) + bot0_size * i; + b_addr_data[i] = const_cast(bottom1->data()) + bot1_size * i; + c_addr_data[i] = top->mutable_data(ctx.GetPlace()) + top_size * i; + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* top = ctx.Output("Out"); + auto* _a_addr = ctx.Output("_a_addr"); + auto* _b_addr = ctx.Output("_b_addr"); + auto* _c_addr = ctx.Output("_c_addr"); + float _scale = ctx.Attr("alpha"); + + bool trans_x = ctx.Attr("transpose_X"); + bool trans_y = ctx.Attr("transpose_Y"); + + std::vector _trans{CblasNoTrans, CblasNoTrans}; + _trans[0] = trans_x ? CblasTrans : CblasNoTrans; + _trans[1] = trans_y ? 
CblasTrans : CblasNoTrans; + + std::vector _dims; + assign_dims(bottom0->dims(), bottom0->lod(), bottom1->dims(), + bottom1->lod(), _trans[0], _trans[1], _dims); + + const int batch = bottom0->lod()[0].size() - 1; + std::vector offset(batch + 1); + for (int i = 0; i <= batch; ++i) { + offset[i] = _dims[0] * i; + } + + framework::LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + top->Resize(framework::make_ddim({static_cast(offset[batch]), _dims[2]})); + + prepare_ff(ctx, _dims); + + call_gemm_batched(ctx, _trans[0], _trans[1], static_cast(_dims[0]), static_cast(_dims[2]), static_cast(_dims[1]), + _scale, (const T**)_a_addr->data(), + (const T**)_b_addr->data(), (T)0.0, + (T**)_c_addr->mutable_data(ctx.GetPlace()), batch); + } +}; + +class SearchAlignedMatMulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("_a_addr"), + "_a_addr(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("_b_addr"), + "_b_addr(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("_c_addr"), + "_c_addr(Output) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Y"))) { + ctx->SetOutputDim(framework::GradVarName("Y"), ctx->GetInputDim("Y")); + ctx->ShareLoD("Y", /*->*/ framework::GradVarName("Y")); + } + } +}; + +template +class CPUSearchAlignedMatMulOPGradKernel : public framework::OpKernel { + public: + void prepare_bp(const framework::ExecutionContext& ctx, + std::vector& _dims, Tensor& _a_addr_diff, + Tensor& _b_addr_diff, Tensor& _c_addr_diff) const { + auto* bottom0 = ctx.Input("X"); + auto* _a_addr = ctx.Input("_a_addr"); + auto* _b_addr = ctx.Input("_b_addr"); + auto* _c_addr = ctx.Input("_c_addr"); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_y = ctx.Output(framework::GradVarName("Y")); + + const int batch = bottom0->lod()[0].size() - 1; + PADDLE_ENFORCE_EQ(_a_addr->dims()[0], batch, "blob should be initialized before bp"); + + _a_addr_diff.Resize(_a_addr->dims()); + _b_addr_diff.Resize(_b_addr->dims()); + _c_addr_diff.Resize(_c_addr->dims()); + T** a_addr_diff = (T**)_a_addr_diff.mutable_data(ctx.GetPlace()); + T** b_addr_diff = (T**)_b_addr_diff.mutable_data(ctx.GetPlace()); + T** c_addr_diff = (T**)_c_addr_diff.mutable_data(ctx.GetPlace()); + + + + const int bot0_size = _dims[0] * _dims[1]; + const int bot1_size = _dims[1] * _dims[2]; + const int top_size = _dims[0] * _dims[2]; + + for (int i = 0; i < batch; ++i) { + a_addr_diff[i] = d_x->mutable_data(ctx.GetPlace()) + bot0_size * i; + b_addr_diff[i] = d_y->mutable_data(ctx.GetPlace()) + bot1_size * i; + c_addr_diff[i] = + const_cast(d_out->data()) + top_size * i; + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* _a_addr = ctx.Input("_a_addr"); + auto* _b_addr = ctx.Input("_b_addr"); + bool trans_x = ctx.Attr("transpose_X"); 
+ bool trans_y = ctx.Attr("transpose_Y"); + float _scale = ctx.Attr("alpha"); + + std::vector _trans{CblasNoTrans, CblasNoTrans}; + _trans[0] = trans_x ? CblasTrans : CblasNoTrans; + _trans[1] = trans_y ? CblasTrans : CblasNoTrans; + + std::vector _dims; + assign_dims(bottom0->dims(), bottom0->lod(), bottom1->dims(), + bottom1->lod(), _trans[0], _trans[1], _dims); + + Tensor _a_addr_diff, _b_addr_diff, _c_addr_diff; + prepare_bp(ctx, _dims, _a_addr_diff, _b_addr_diff, _c_addr_diff); + + const int batch = bottom0->lod()[0].size() - 1; + if (_trans[1] == CblasTrans) { + call_gemm_batched( + ctx, CblasTrans, _trans[0], _dims[2], _dims[1], _dims[0], _scale, + (const T**)_c_addr_diff.data(), (const T**)_a_addr->data(), + (T)0.0, (T**)_b_addr_diff.mutable_data(ctx.GetPlace()), batch); + } else { + CBLAS_TRANSPOSE bot0_trans = + _trans[0] == CblasTrans ? CblasNoTrans : CblasTrans; + call_gemm_batched( + ctx, bot0_trans, CblasNoTrans, static_cast(_dims[1]), static_cast(_dims[2]), static_cast(_dims[0]), _scale, + (const T**)_a_addr->data(), (const T**)_c_addr_diff.data(), + (T)0.0, (T**)_b_addr_diff.mutable_data(ctx.GetPlace()), batch); + } + + if (_trans[0] == CblasTrans) { + call_gemm_batched( + ctx, _trans[1], CblasTrans, _dims[1], _dims[0], _dims[2], _scale, + (const T**)_b_addr->data(), (const T**)_c_addr_diff.data(), + (T)0.0, (T**)_a_addr_diff.mutable_data(ctx.GetPlace()), batch); + } else { + CBLAS_TRANSPOSE bot1_trans = + (_trans[1] == CblasTrans) ? CblasNoTrans : CblasTrans; + call_gemm_batched( + ctx, CblasNoTrans, bot1_trans, _dims[0], _dims[1], _dims[2], _scale, + (const T**)_c_addr_diff.data(), (const T**)_b_addr->data(), + (T)0.0, (T**)_a_addr_diff.mutable_data(ctx.GetPlace()), batch); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_aligned_mat_mul, ops::SearchAlignedMatMulOP, + ops::SearchAlignedMatMulOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_aligned_mat_mul_grad, ops::SearchAlignedMatMulOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_aligned_mat_mul, + ops::CPUSearchAlignedMatMulOPKernel + // ops::CPUSearchAlignedMatMulOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_aligned_mat_mul_grad, + ops::CPUSearchAlignedMatMulOPGradKernel + // ops::CPUSearchAlignedMatMulOPGradKernel +); diff --git a/paddle/fluid/operators/search_attention_padding_mask_op.cc b/paddle/fluid/operators/search_attention_padding_mask_op.cc new file mode 100644 index 00000000000000..e7ac2224ab9dd2 --- /dev/null +++ b/paddle/fluid/operators/search_attention_padding_mask_op.cc @@ -0,0 +1,263 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchAttentionPaddingMaskOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Y", + "Y (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("pad_id", "pad_id").SetDefault(0).EqualGreaterThan(0); + AddAttr("mask", "mask").SetDefault(0.0); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + AddOutput( + "pad_begin", + "pad_begin (LoDTensor, default LoDTensor) Output variable"); + + AddComment(R"DOC( + SearchAttentionPaddingMask + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchAttentionPaddingMaskOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("pad_begin"), + "pad_begin(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Y should be 2-D tensor"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + framework::Variable* y_var = + boost::get(ctx->GetInputVarPtrs("Y")[0]); + const auto& y_lod = y_var->Get().lod(); + PADDLE_ENFORCE(!y_lod.empty(), "The Input(Y) must hold lod info."); + const auto& y_lod_0 = y_lod[0]; + PADDLE_ENFORCE_GE(y_lod_0.size(), 2, + "The Input(Y)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + y_dims[0], static_cast(y_lod_0.back()), + "The Input(Y)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, x_dims[1]})); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchAttentionPaddingMaskOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* top = ctx.Output("Out"); + auto* _pad_begin = ctx.Output("pad_begin"); + + int _pad_id = ctx.Attr("pad_id"); + float _mask = ctx.Attr("mask"); + + const auto src_len = static_cast(bottom1->lod()[0][1]); + PADDLE_ENFORCE_EQ(src_len, bottom0->dims()[1], + "Mismatch source length, expect: %d 
get: %d", src_len, + bottom0->dims()[1]); + const int att_batch = bottom0->lod()[0].size() - 1; + const int src_batch = bottom1->lod()[0].size() - 1; + PADDLE_ENFORCE_EQ(att_batch % src_batch, 0, + "Mismatch batch size, bottom0: %d, bottom1: %d", + att_batch, src_batch); + + _pad_begin->Resize(framework::make_ddim({src_batch})); + int* pad_begin = _pad_begin->mutable_data(ctx.GetPlace()); + for (int i = 0; i < src_batch; ++i) { + // bottom data is padded to be aligned + const auto* src_data = bottom1->data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = index + 1; + } + + top->Resize(bottom0->dims()); + const auto att_len = static_cast(bottom0->lod()[0][1]); + auto* top_data = top->mutable_data(ctx.GetPlace()); + memcpy(top_data, bottom0->data(), + bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); + for (int i = 0; i < att_batch; ++i) { + for (int j = 0; j < att_len; ++j) { + top_data = + top->mutable_data(ctx.GetPlace()) + src_len * (att_len * i + j); + int src_idx = i % src_batch; + for (int k = pad_begin[src_idx]; k < src_len; ++k) { + top_data[k] = _mask; + } + } + } + } +}; + +class SearchAttentionPaddingMaskGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("search_attention_padding_mask_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput("Y", Input("Y")); + op_desc_ptr->SetInput("pad_begin", Output("pad_begin")); + + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +class SearchAttentionPaddingMaskOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("pad_begin"), + "Input(pad_begin) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchAttentionPaddingMaskOPGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* _pad_begin = ctx.Input("pad_begin"); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + + const int* pad_begin = _pad_begin->data(); + const auto att_batch = bottom0->lod()[0].size() - 1; + const auto src_batch = bottom1->lod()[0].size() - 1; + + const auto att_len = bottom0->lod()[0][1]; + const auto src_len = 
bottom1->lod()[0][1]; + + auto* att_diff = d_x->mutable_data(ctx.GetPlace()); + memcpy(att_diff, d_out->data(), + d_out->dims()[0] * d_out->dims()[1] * sizeof(T)); + for (int i = 0; i < att_batch; ++i) { + for (int j = 0; j < att_len; ++j) { + int src_idx = i % src_batch; + att_diff = d_x->mutable_data(ctx.GetPlace()) + + src_len * (att_len * i + j) + pad_begin[src_idx]; + memset(att_diff, 0, (src_len - pad_begin[src_idx]) * sizeof(T)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_attention_padding_mask, + ops::SearchAttentionPaddingMaskOP, + ops::SearchAttentionPaddingMaskOpMaker, + ops::SearchAttentionPaddingMaskGradOpMaker); +REGISTER_OPERATOR(search_attention_padding_mask_grad, + ops::SearchAttentionPaddingMaskOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_attention_padding_mask, + ops::CPUSearchAttentionPaddingMaskOPKernel + // ops::CPUSearchAttentionPaddingMaskOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_attention_padding_mask_grad, + ops::CPUSearchAttentionPaddingMaskOPGradKernel + // ops::CPUSearchAttentionPaddingMaskOPGradKernel +); diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h new file mode 100644 index 00000000000000..ce72f18f72a0d6 --- /dev/null +++ b/paddle/fluid/operators/search_compute.h @@ -0,0 +1,422 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // sse +#include +#include //fabs +#include // memcpy + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" +//#include "naive_gemm.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +void call_gemm(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), (TransB == CblasTrans), M, N, K, alpha, A, + B, beta, C); +#endif // !__NAIVE_GEMM__ +} + +template +void call_gemm(const framework::ExecutionContext& ctx, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + auto blas = math::GetBlas(ctx); + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), (TransB == CblasTrans), M, N, K, alpha, A, + B, beta, C); +#endif // !__NAIVE_GEMM__ +} + +template +void call_gemm_batched(const framework::ExecutionContext& ctx, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const T alpha, const T** A, + const T** B, const T beta, T** C, + const int batch) { + for (int i = 0; i < batch; ++i) { + call_gemm(ctx, TransA, TransB, M, N, K, alpha, A[i], B[i], beta, C[i]); + } +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +#if defined(LEGO_USE_FLOAT) + +#define X_MIN FLT_MIN +#define X_MAX FLT_MAX + +#elif defined(LEGO_USE_DOUBLE) + +#define X_MIN DBL_MIN +#define X_MAX DBL_MAX + +#endif + +template +inline 
void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_add_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + y[jjj]; + } +} + +template +inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px( + y + jjj, + _mm256_add_px(_mm256_load_px(y + jjj), + _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, + _mm_add_px(_mm_load_px(y + jjj), + _mm_mul_px(mm_alpha, _mm_load_px(x + jjj)))); + } + +#endif + for (; jjj < len; jjj++) { + y[jjj] += alpha * x[jjj]; + } +} + +template +inline void sse_axpy_noadd(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, _mm_mul_px(mm_alpha, _mm_load_px(x + jjj))); + } + +#endif + for (; jjj < len; jjj++) { + y[jjj] = alpha * x[jjj]; + } +} + +template +inline void sse_eltmul(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_mul_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_mul_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] * y[jjj]; + } +} + +template +inline void sse_add_scalar(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(y + jjj, _mm256_add_px(mm_alpha, _mm256_load_px(x + jjj))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, _mm_add_px(mm_alpha, _mm_load_px(x + jjj))); + } + +#endif + for (; jjj < len; jjj++) { + y[jjj] = alpha + x[jjj]; + } +} + +template +inline void sse_sum(const T* x, T& y, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + y = 0.; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + + __m256x mm_result = _mm256_setzero_px(); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + mm_result = 
_mm256_add_px(mm_result, _mm256_load_px(x + jjj)); + } + +#if defined(LEGO_USE_FLOAT) + __m256x hsum = _mm256_hadd_px(mm_result, mm_result); +#elif defined(LEGO_USE_DOUBLE) + __m256x hsum = mm_result; +#endif + hsum = _mm256_add_px(hsum, _mm256_permute2f128_px(hsum, hsum, 0x1)); + _mm_store_sx(&y, _mm_hadd_px(_mm256_castpx256_px128(hsum), + _mm256_castpx256_px128(hsum))); + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + __m128x mm_result = _mm_setzero_px(); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + mm_result = _mm_add_px(mm_result, _mm_load_px(x + jjj)); + } + __m128x mm_tmp = _mm_hadd_px(mm_result, mm_result); + +#if defined(LEGO_USE_FLOAT) + _mm_store_sx(&y, _mm_hadd_px(mm_tmp, mm_tmp)); +#elif defined(LEGO_USE_DOUBLE) + _mm_store_sx(&y, mm_tmp); +#endif + +#endif + for (; jjj < len; jjj++) { + y += x[jjj]; + } +} + +template +inline void sse_scale(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, _mm_mul_px(mm_alpha, _mm_load_px(x + jjj))); + } +#endif + for (; jjj < len; jjj++) { + y[jjj] = alpha * x[jjj]; + } +} + +template +inline void sse_ip(const T* vec1, const T* vec2, size_t len, T& result) { + unsigned int jjj, lll; + jjj = lll = 0; + result = 0.; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + + __m256x mm_result = _mm256_setzero_px(); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + mm_result = _mm256_add_px( + mm_result, + _mm256_mul_px(_mm256_load_px(vec1 + jjj), _mm256_load_px(vec2 + jjj))); + } + + // result = mm_result[0]+mm_result[1]+mm_result[2]+mm_result[3]+ + // mm_result[4]+mm_result[5]+mm_result[6]+mm_result[7]; + +#if defined(LEGO_USE_FLOAT) + __m256x hsum = _mm256_hadd_px(mm_result, mm_result); +#elif defined(LEGO_USE_DOUBLE) + __m256x hsum = mm_result; +#endif + + hsum = _mm256_add_px(hsum, _mm256_permute2f128_px(hsum, hsum, 0x1)); + + _mm_store_sx(&result, _mm_hadd_px(_mm256_castpx256_px128(hsum), + _mm256_castpx256_px128(hsum))); + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_result = _mm_setzero_px(); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + mm_result = _mm_add_px(mm_result, _mm_mul_px(_mm_load_px(vec1 + jjj), + _mm_load_px(vec2 + jjj))); + } + __m128x mm_tmp = _mm_hadd_px(mm_result, mm_result); +#if defined(LEGO_USE_FLOAT) + _mm_store_sx(&result, _mm_hadd_px(mm_tmp, mm_tmp)); +#elif defined(LEGO_USE_DOUBLE) + _mm_store_sx(&result, mm_tmp); +#endif + +#endif + for (; jjj < len; jjj++) { + result += vec1[jjj] * vec2[jjj]; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/search_embedding_op.cc b/paddle/fluid/operators/search_embedding_op.cc new file mode 100644 index 00000000000000..2ae37288debd94 --- /dev/null +++ b/paddle/fluid/operators/search_embedding_op.cc @@ -0,0 +1,370 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < 
lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_add_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + y[jjj]; + } +} + +template +inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px( + y + jjj, + _mm256_add_px(_mm256_load_px(y + jjj), + _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); + } + +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, + _mm_add_px(_mm_load_px(y + jjj), + _mm_mul_px(mm_alpha, _mm_load_px(x + jjj)))); + } + +#endif + for (; jjj < len; jjj++) { + y[jjj] += alpha * x[jjj]; + } +} + +class SearchEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (Tensor, default Tensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor)"); + AddAttr("num_voc", "num_voc").SetDefault(0).EqualGreaterThan(0); + AddAttr("num_emb", "num_emb").SetDefault(0).EqualGreaterThan(0); + AddAttr("lr", "learning rate").SetDefault(0.0).EqualGreaterThan(0.0); + + AddOutput("Out", "Out (Tensor, default Tensor) Output variable"); + + AddComment(R"DOC( + SearchEmbedding + + NOTE: only support 'float32' data type now. + + )DOC"); + } +}; + +class SearchEmbeddingOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "W(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto w_dims = ctx->GetInputDim("W"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + + int num_voc = ctx->Attrs().Get("num_voc"); + int num_emb = ctx->Attrs().Get("num_emb"); + + PADDLE_ENFORCE_EQ(w_dims[0], num_voc, + "w_dims[0] should be equal to num_voc"); + PADDLE_ENFORCE_EQ(w_dims[1], num_emb, + "w_dims[1] should be equal to num_emb"); + + ctx->SetOutputDim("Out", framework::make_ddim({-1, num_emb})); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchEmbeddingOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* _blobs = ctx.Input("W"); + auto* top = ctx.Output("Out"); + + int _cap_e = ctx.Attr("num_emb"); + + int _cap_l = bottom->dims()[0]; + auto& offset = bottom->lod()[0]; + std::vector top_offset; + top_offset.resize(offset.size()); + top_offset[0] = 0; + + for (int i = 0; i < top_offset.size() 
- 1; ++i) { + int w = offset[i + 1] - offset[i]; + if (w == 0) { + top_offset[i + 1] = top_offset[i] + 1; + } else { + top_offset[i + 1] = top_offset[i] + w; + } + } + + int top_l = top_offset[top_offset.size() - 1]; + framework::LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + top->Resize(framework::make_ddim({top_l, _cap_e})); + + PADDLE_ENFORCE_EQ(top_l, _cap_l, + "top_l should be equal to _cap_l"); + + auto* top_data = top->mutable_data(ctx.GetPlace()); + const auto* bottom_data = bottom->data(); + const auto* weights = _blobs->data(); + + for (int i = 0; i < offset.size() - 1; ++i) { + int w = offset[i + 1] - offset[i]; + if (w == 1 && bottom_data[offset[i]] == -1) { + //LOG (ERROR) << "zero len sequence " << i << "/" << top_offset.size() - 1; + memset(top_data + top_offset[i] * _cap_e, 0, _cap_e * sizeof(T)); + } else { + for (int j = 0; j < w; ++j) { + unsigned int word_idx = + static_cast(bottom_data[offset[i] + j]); + memcpy((void*)(top_data + (top_offset[i] + j) * _cap_e), + (void*)(weights + word_idx * _cap_e), _cap_e * sizeof(T)); + } + } + } + } +}; + +class SearchEmbeddingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SearchEmbeddingGradOp should not be null."); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } + +}; + +class SearchEmbeddingGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("search_embedding_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput("W", Input("W")); + + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +template +class CPUSearchEmbeddingOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* _blobs = ctx.Input("W"); + auto* top = ctx.Input(framework::GradVarName("Out")); + + int _cap_e = ctx.Attr("num_emb"); + float _lr = ctx.Attr("lr"); + + auto& offset = bottom->lod()[0]; + auto& top_offset = top->lod()[0]; + + const auto* top_diff = top->data(); + const auto* bottom_data = bottom->data(); + T* weights = (T*) (_blobs->data()); + + T mlr = -1.0 * _lr; + + for (int i = 0; i < offset.size() - 1; ++i) { + int w = offset[i + 1] - offset[i]; + if (!(w == 1 && bottom_data[offset[i]] == -1)) { + for (int j = 0; j < w; ++j) { + unsigned int word_idx = + static_cast(bottom_data[offset[i] + j]); + sse_axpy((const T*)top_diff + (top_offset[i] + j) * _cap_e, + weights + word_idx * _cap_e, _cap_e, mlr); + } + } else { + //LOG(ERROR) << "bp: zero len sequence " << i << "/" + // << top_offset.size() - 1; + } + } + } +}; + +} // namespace operators +} // 
namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_embedding, ops::SearchEmbeddingOP, + ops::SearchEmbeddingOpMaker, ops::SearchEmbeddingGradOpMaker); +REGISTER_OPERATOR(search_embedding_grad, ops::SearchEmbeddingOpGrad); + +REGISTER_OP_CPU_KERNEL(search_embedding, + ops::CPUSearchEmbeddingOPKernel + // ops::CPUSearchEmbeddingOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_embedding_grad, ops::CPUSearchEmbeddingOPGradKernel + // ops::CPUSearchEmbeddingOPGradKernel +); diff --git a/paddle/fluid/operators/search_fc_op.cc b/paddle/fluid/operators/search_fc_op.cc new file mode 100644 index 00000000000000..6752564c05573d --- /dev/null +++ b/paddle/fluid/operators/search_fc_op.cc @@ -0,0 +1,331 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +//#include "naive_gemm.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +void call_gemm(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), (TransB == CblasTrans), M, N, K, alpha, A, + B, beta, C); +#endif // !__NAIVE_GEMM__ +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_add_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + y[jjj]; + } +} + 
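+// The forward kernel below (CPUSearchFCOPKernel) computes Out = X * W^T + b:
+// one GEMM with the weight stored row-major as [out_size, in_size] (hence
+// CblasTrans on the weight operand), followed by a per-row bias add through
+// sse_eltadd. The helper below is only an illustrative sketch of that math
+// for readers of this file; it is hypothetical and is not called anywhere
+// in this operator.
+template <typename T>
+inline void naive_fc_ref(const T* x, const T* w, const T* b, T* out,
+                         int batch, int in_size, int out_size) {
+  for (int i = 0; i < batch; ++i) {
+    for (int o = 0; o < out_size; ++o) {
+      T acc = b[o];  // start from the bias term
+      for (int k = 0; k < in_size; ++k) {
+        acc += x[i * in_size + k] * w[o * in_size + k];  // row of X dot row of W
+      }
+      out[i * out_size + o] = acc;
+    }
+  }
+}
+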
+class SearchFCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (Tensor, default Tensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor)"); + AddInput("b", "b (Tensor)"); + AddAttr("out_size", "out_size: the output size") + .SetDefault(0) + .EqualGreaterThan(1); + + AddOutput("Out", "Out (Tensor, default Tensor) Output variable"); + + AddComment(R"DOC( + SearchFC + + NOTE: only support 'float32' data type now. + + )DOC"); + } +}; + +class SearchFCOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "W(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("b"), "b(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto w_dims = ctx->GetInputDim("W"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + + auto b_dims = ctx->GetInputDim("b"); + PADDLE_ENFORCE_EQ(b_dims.size(), 1, "b should be 1-D tensor"); + + int out_size = ctx->Attrs().Get("out_size"); + + ctx->SetOutputDim("Out", framework::make_ddim({-1, out_size})); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(w_dims[1], x_dims[1], "wrong shape: w_dims[1] != x_dims[1]"); + } + else { + // compile time + } + } +}; + +template +class CPUSearchFCOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* b = ctx.Input("b"); + auto* top = ctx.Output("Out"); + + int out_size = ctx.Attr("out_size"); // 100 + int batch = bottom->dims()[0]; + + int _out = w->dims()[0]; // 100 + int _in = w->dims()[1]; // 228 + //PADDLE_ENFORCE_EQ(out_size, _out, "out_size should equal to w->dims()[1]"); + //PADDLE_ENFORCE_EQ(bottom->dims()[1], _in, + // "x.dims()[1] should equal to w->dims()[0]"); + + top->Resize(framework::make_ddim({bottom->dims()[0], out_size})); + + const auto* bottom_data = bottom->data(); + auto* top_data = top->mutable_data(ctx.GetPlace()); + const auto* weights = w->data(); + auto blas = math::GetBlas(ctx); + call_gemm(blas, CblasNoTrans, CblasTrans, batch, _out, _in, 1.0f, + bottom_data, weights, 0.0f, top_data); + if (true) { + const auto* bias_data = b->data(); + for (int i = 0; i < batch; ++i) { + // add bias here + sse_eltadd(top_data + i * _out, bias_data, top_data + i * _out, _out); + } + } + } +}; + +class SearchFCOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("b"), "Input(b) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + if (ctx->HasOutput(framework::GradVarName("W"))) { + ctx->SetOutputDim(framework::GradVarName("W"), 
ctx->GetInputDim("W")); + } + if (ctx->HasOutput(framework::GradVarName("b"))) { + ctx->SetOutputDim(framework::GradVarName("b"), ctx->GetInputDim("b")); + } + } +}; + +template +class CPUSearchFCOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // auto* d_x = ctx.Output(framework::GradVarName("X")); + // + // auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + // auto* x = ctx.Input("X"); + // memset(bottom_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + int _out = w->dims()[0]; // 100 + int _in = w->dims()[1]; // 228 + + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_w = ctx.Output(framework::GradVarName("W")); + + int batch = bottom->dims()[0]; + const auto* top_diff = d_out->data(); + const auto* bottom_data = bottom->data(); + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + + const auto* weights = w->data(); + auto* weights_diff = d_w->mutable_data(ctx.GetPlace()); + + auto blas = math::GetBlas(ctx); + //call_gemm(blas, CblasTrans, CblasNoTrans, _in, _out, batch, 1.0f, + // bottom_data, top_diff, 0.0f, weights_diff); + call_gemm(blas, CblasTrans, CblasNoTrans, _out, _in, batch, (T)1.0, + top_diff, bottom_data, (T)0.0, weights_diff); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, batch, _in, _out, (T)1.0, top_diff, + weights, (T)0.0, bottom_diff); + + if (true) { + auto* d_b = ctx.Output(framework::GradVarName("b")); + auto* bias_diff = d_b->mutable_data(ctx.GetPlace()); + memset(bias_diff, 0.0, _out * sizeof(T)); + for (int i = 0; i < batch; ++i) { + sse_eltadd(bias_diff, top_diff + i * _out, bias_diff, _out); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_fc, ops::SearchFCOP, ops::SearchFCOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_fc_grad, ops::SearchFCOpGrad); + +REGISTER_OP_CPU_KERNEL(search_fc, + ops::CPUSearchFCOPKernel + // ops::CPUSearchFCOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_fc_grad, ops::CPUSearchFCOPGradKernel + // ops::CPUSearchFCOPGradKernel +); diff --git a/paddle/fluid/operators/search_grnn_op.cc b/paddle/fluid/operators/search_grnn_op.cc new file mode 100644 index 00000000000000..118a12c69f3044 --- /dev/null +++ b/paddle/fluid/operators/search_grnn_op.cc @@ -0,0 +1,807 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/search_grnn_op.h" +#include +#ifndef WIN32 +//#include "naive_gemm.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifndef _DEBUGGING +#define _DEBUGGING +#endif + +//#include "debug.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +#define SIGMOID(z) (sigmoid(z)) +#define SIGMOID_D(a) ((a) * (1 - (a))) +#define TANHD(a) (1 - (a) * (a)) + +template +void call_gemm(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), (TransB == CblasTrans), M, N, K, alpha, A, + B, beta, C); +#endif // !__NAIVE_GEMM__ +} + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define 
_mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_add_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + y[jjj]; + } +} + +template +inline void sse_eltmul(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px(z + jjj, _mm256_mul_px(_mm256_load_px(x + jjj), + _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_mul_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] * y[jjj]; + } +} + +class SearchGrnnOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Wi", "Wi (Tensor)"); + AddInput("Wh", "Wh (Tensor)"); + AddAttr("num_input", "num_input: the embedding size").SetDefault(0); + AddAttr("num_hidden", "num_hidden: the hidden size").SetDefault(0); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + AddOutput("tmp_buffer", + "tmp_buffer (LoDTensor, default LoDTensor) tmp variable"); + AddOutput("idx_sorted_by_width", + "idx_sorted_by_width (Tensor, Tensor) tmp variable"); + AddOutput( + "layout_input", + "layout_input (LoDTensor, default LoDTensor) tmp variable"); + + AddComment(R"DOC( + SearchGrnn + + NOTE: only support 'float32' data type now. 
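+
+      Wi packs the input-to-hidden weights [W; Wr; Wz] with shape
+      [3, num_hidden, num_input]; Wh packs the hidden-to-hidden weights
+      [U; Ur; Uz] with shape [3, num_hidden, num_hidden]. For each sequence,
+      the CPU kernel below evaluates the standard GRU recurrence:
+
+        r_t = sigmoid(Wr * x_t + Ur * h_{t-1})
+        z_t = sigmoid(Wz * x_t + Uz * h_{t-1})
+        h~_t = tanh(W * x_t + r_t .* (U * h_{t-1}))
+        h_t = z_t .* h_{t-1} + (1 - z_t) .* h~_t
+
+      with h_0 = (1 - z_0) .* tanh(W * x_0), where .* denotes element-wise
+      multiplication.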
+ + )DOC"); + } +}; + +class SearchGrnnOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Wi"), "Wi(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Wh"), "Wh(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("tmp_buffer"), + "tmp_buffer(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("idx_sorted_by_width"), + "idx_sorted_by_width(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("layout_input"), + "layout_input(Output) should not be null."); + + int _cap_h = ctx->Attrs().Get("num_hidden"); + int _cap_e = ctx->Attrs().Get("num_input"); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of X(Input) can't be less than 2."); + PADDLE_ENFORCE_EQ(x_dims[1], _cap_e, "x_dims[1] should be equal to _cap_e"); + + auto wi_dims = ctx->GetInputDim("Wi"); + PADDLE_ENFORCE_EQ(wi_dims.size(), 3, "Wi should be 3-D tensor"); + PADDLE_ENFORCE_EQ(wi_dims[0], 3, "Wi dim[0] should be equal to 3"); + PADDLE_ENFORCE_EQ(wi_dims[1], _cap_h, + "wi_dims[1] should be equal to _cap_h"); + PADDLE_ENFORCE_EQ(wi_dims[2], _cap_e, + "wi_dims[2] should be equal to _cap_e"); + + auto wh_dims = ctx->GetInputDim("Wh"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 3, "Wi should be 3-D tensor"); + PADDLE_ENFORCE_EQ(wh_dims[0], 3, "Wh dim[0] should be equal to 3"); + PADDLE_ENFORCE_EQ(wh_dims[1], _cap_h, + "wh_dims[1] should be equal to _cap_h"); + PADDLE_ENFORCE_EQ(wh_dims[2], _cap_h, + "wh_dims[2] should be equal to _cap_h"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod[0].back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + std::vector out_dims_vec{-1}; + out_dims_vec.push_back(_cap_h); + std::vector tmp_buffer_shape{20}; + tmp_buffer_shape.push_back(-1); + tmp_buffer_shape.push_back(_cap_h); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + ctx->SetOutputDim("tmp_buffer", framework::make_ddim(tmp_buffer_shape)); + } + + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchGrnnOPKernel : public framework::OpKernel { + public: + void prepare_layout(const framework::ExecutionContext& ctx, + const LoDTensor* input_blob) const { + auto* _idx_sorted_by_width = ctx.Output("idx_sorted_by_width"); + auto* _layout_input = ctx.Output("layout_input"); + + auto _input = input_blob; + + // usually total length + int dim0 = _input->dims()[0]; + // if it is id only sequence + int dim1 = 1; + + // if its a embedding like sequence (dim1 would be embedding_size) + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + + int batch = _input->lod()[0].size() - 1; + + auto& offset = _input->lod()[0]; + + Tensor _width; + _width.Resize(framework::make_ddim({batch})); + 
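+    // The rest of this function builds a time-major layout: sequences are
+    // sorted by length in descending order, and new_offset[i] marks the block
+    // of rows that belong to time step i (one row per sequence that is still
+    // active at that step). The recurrence in Compute can then advance one
+    // time step with a single GEMM over all sequences that have not ended.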
_idx_sorted_by_width->Resize(framework::make_ddim({batch})); + int* width_data = _width.mutable_data(ctx.GetPlace()); + int* idx_sorted_by_width_data = + _idx_sorted_by_width->mutable_data(ctx.GetPlace()); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_data[i] = i; + } + std::sort(idx_sorted_by_width_data, idx_sorted_by_width_data + batch, + [&_width](int a, int b) { + return _width.data()[a] > _width.data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_data[k]] - last_width; + sub_col = k + 1; + + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + + // move on + last_width = width_data[idx_sorted_by_width_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (_input->dims().size() == 1) { + //_layout_input.reshape_batch_sequence({dim0}, new_offset); + } else { + //_layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + + framework::LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize(framework::make_ddim({dim0, dim1})); + } + + auto* new_emb = _layout_input->mutable_data(ctx.GetPlace()); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + _input->data() + dim1 * offset[idx_sorted_by_width_data[j]] + + dim1 * i, + dim1 * sizeof(T)); + } + } + // end of reorganizing the input + } + + void copy_back(const framework::ExecutionContext& ctx, T* from, T* to, + int step) const { + auto* _input = ctx.Input("X"); + auto* _layout_input = ctx.Output("layout_input"); + auto* _idx_sorted_by_width = ctx.Output("idx_sorted_by_width"); + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_data = _idx_sorted_by_width->data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + step * (offset[idx_sorted_by_width_data[j]] + i), + from + (new_offset[i] + j) * step, step * sizeof(T)); + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* wi = ctx.Input("Wi"); + auto* wh = ctx.Input("Wh"); + auto* top = ctx.Output("Out"); + auto* _buffer = ctx.Output("tmp_buffer"); + + // std::vector _blobs{wi, wh}; + + int _cap_h = ctx.Attr("num_hidden"); + int _cap_e = ctx.Attr("num_input"); + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + framework::LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + auto* top_hidden = top->mutable_data(framework::make_ddim(top_dims_vec), + ctx.GetPlace()); + + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + const auto* e2h = dense_e2h; + const auto* e2hr 
= dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + prepare_layout(ctx, bottom); + auto* _layout_input = ctx.Output("layout_input"); + auto* new_emb = _layout_input->mutable_data(ctx.GetPlace()); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize(framework::make_ddim({20, _cap_l, _cap_h})); + auto* buffer_data = _buffer->mutable_data(ctx.GetPlace()); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + // precompute embedding to hidden + auto blas = math::GetBlas(ctx); + call_gemm(blas, CblasNoTrans, CblasTrans, _cap_l, _cap_h, _cap_e, 1.0f, + new_emb, e2h, 0.0f, w_x_e); + call_gemm(blas, CblasNoTrans, CblasTrans, _cap_l, _cap_h, _cap_e, 1.0f, + new_emb, e2hr, 0.0f, wr_x_e); + call_gemm(blas, CblasNoTrans, CblasTrans, _cap_l, _cap_h, _cap_e, 1.0f, + new_emb, e2hz, 0.0f, wz_x_e); + + // precompute hidden0 + for (int i = 0; i < batch * _cap_h; i++) { + tilde[i] = std::tanh(w_x_e[i]); + z[i] = sigmoid(wz_x_e[i]); + hidden[i] = (1. 
- z[i]) * tilde[i]; + } + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + call_gemm(blas, CblasNoTrans, CblasTrans, w, _cap_h, _cap_h, 1.0f, htm1, + h2h, 0.0f, u_x_h + new_offset[i] * _cap_h); + call_gemm(blas, CblasNoTrans, CblasTrans, w, _cap_h, _cap_h, 1.0f, htm1, + h2hr, 0.0f, ur_x_h + new_offset[i] * _cap_h); + call_gemm(blas, CblasNoTrans, CblasTrans, w, _cap_h, _cap_h, 1.0f, htm1, + h2hz, 0.0f, uz_x_h + new_offset[i] * _cap_h); + + // compute the gate and hidden + for (size_t j = new_offset[i] * _cap_h; j < (new_offset[i] + w) * _cap_h; + j++) { + r[j] = sigmoid(wr_x_e[j] + ur_x_h[j]); + z[j] = sigmoid(wz_x_e[j] + uz_x_h[j]); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + + hidden[j] = z[j] * hidden[j - _cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } + } + + // copy back to top + copy_back(ctx, hidden, top_hidden, _cap_h); + } +}; + +class SearchGrnnOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Wi"), "Input(Wi) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Wh"), "Input(Wh) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Wi"))) { + ctx->SetOutputDim(framework::GradVarName("Wi"), ctx->GetInputDim("Wi")); + } + if (ctx->HasOutput(framework::GradVarName("Wh"))) { + ctx->SetOutputDim(framework::GradVarName("Wh"), ctx->GetInputDim("Wh")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class CPUSearchGrnnOPGradKernel : public framework::OpKernel { + public: + void do_same_layout(const framework::ExecutionContext& ctx, const T* from, + T* to, int step) const { + auto* _input = ctx.Input("X"); + auto* _layout_input = ctx.Input("layout_input"); + auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + auto* _idx_sorted_by_width = ctx.Input("idx_sorted_by_width"); + const int* idx_sorted_by_width_data = _idx_sorted_by_width->data(); + + for (int i = 0; i < _layout_input->lod()[0].size() - 1; i++) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + (new_offset[i] + j) * step, + from + step * (offset[idx_sorted_by_width_data[j]] + i), + step * sizeof(T)); + } + } + } + + void copy_back(const framework::ExecutionContext& ctx, T* from, T* to, + int step) const { + auto* _input = ctx.Input("X"); + auto* _layout_input = ctx.Input("layout_input"); + auto* _idx_sorted_by_width = ctx.Input("idx_sorted_by_width"); + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_data = _idx_sorted_by_width->data(); + for (size_t i = 0; i < 
_layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + step * (offset[idx_sorted_by_width_data[j]] + i), + from + (new_offset[i] + j) * step, step * sizeof(T)); + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* wi = ctx.Input("Wi"); + auto* wh = ctx.Input("Wh"); + auto* _buffer = ctx.Input("tmp_buffer"); + auto* _layout_input = ctx.Input("layout_input"); + + // std::vector _blobs{wi, wh}; + + int _cap_h = ctx.Attr("num_hidden"); + int _cap_e = ctx.Attr("num_input"); + int _cap_l = bottom->dims()[0]; + + auto* d_bottom = ctx.Output(framework::GradVarName("X")); + auto* d_top = ctx.Input(framework::GradVarName("Out")); + auto* d_wi = ctx.Output(framework::GradVarName("Wi")); + auto* d_wh = ctx.Output(framework::GradVarName("Wh")); + + int batch = bottom->lod()[0].size() - 1; + + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // the original top and bottom pointers + auto* top_diff = d_top->data(); + auto* ediff = d_bottom->mutable_data(ctx.GetPlace()); + + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + auto* dense_e2h_diff = d_wi->mutable_data(ctx.GetPlace()); + auto* dense_h2h_diff = d_wh->mutable_data(ctx.GetPlace()); + // init parameter's diff + memset(dense_e2h_diff, 0, 3 * _cap_e * _cap_h * sizeof(T)); + memset(dense_h2h_diff, 0, 3 * _cap_h * _cap_h * sizeof(T)); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + auto* e2h_diff = dense_e2h_diff; + auto* e2hr_diff = dense_e2h_diff + 1 * _cap_e * _cap_h; + auto* e2hz_diff = dense_e2h_diff + 2 * _cap_e * _cap_h; + auto* h2h_diff = dense_h2h_diff; + auto* h2hr_diff = dense_h2h_diff + 1 * _cap_h * _cap_h; + auto* h2hz_diff = dense_h2h_diff + 2 * _cap_h * _cap_h; + + auto u_x_h = _buffer->data() + 3 * _cap_l * _cap_h; + + Tensor buffer_diff; + buffer_diff.Resize(framework::make_ddim({20, _cap_l, _cap_h})); + auto* buffer_diff_data = buffer_diff.mutable_data(ctx.GetPlace()); + + auto e2hdiff = buffer_diff_data + 0 * _cap_l * _cap_h; + auto e2hrdiff = buffer_diff_data + 1 * _cap_l * _cap_h; + auto e2hzdiff = buffer_diff_data + 2 * _cap_l * _cap_h; + + auto h2hdiff = buffer_diff_data + 3 * _cap_l * _cap_h; + auto h2hrdiff = buffer_diff_data + 4 * _cap_l * _cap_h; + auto h2hzdiff = buffer_diff_data + 5 * _cap_l * _cap_h; + + auto* buffer_data = _buffer->data(); + auto r = buffer_data + 6 * _cap_l * _cap_h; + auto z = buffer_data + 7 * _cap_l * _cap_h; + auto tilde = buffer_data + 8 * _cap_l * _cap_h; + + auto d_r = buffer_diff_data + 9 * _cap_l * _cap_h; + auto d_z = buffer_diff_data + 10 * _cap_l * _cap_h; + auto d_tilde = buffer_diff_data + 11 * _cap_l * _cap_h; + + auto tmp_buffer = buffer_diff_data + 12 * _cap_l * _cap_h; + + auto hidden = buffer_data + 19 * _cap_l * _cap_h; + auto hidden_diff = buffer_diff_data + 19 * _cap_l * _cap_h; + auto embedding = _layout_input->data(); + Tensor _layout_input_grad; + _layout_input_grad.Resize(_layout_input->dims()); + auto embedding_diff = _layout_input_grad.mutable_data(ctx.GetPlace()); + + // copy top_hiddden diff back to the reorganized hidden, so we can use + // segemm to back-prop the sequence + do_same_layout(ctx, top_diff, 
hidden_diff, _cap_h); + + // precompute nonlinear diff + for (int k = 0; k < new_offset[1] * _cap_h; k++) { + d_z[k] = SIGMOID_D(z[k]); + d_tilde[k] = TANHD(tilde[k]); + } + + for (int k = new_offset[1] * _cap_h; k < new_offset[max_width] * _cap_h; + k++) { + d_r[k] = SIGMOID_D(r[k]); + d_z[k] = SIGMOID_D(z[k]); + d_tilde[k] = TANHD(tilde[k]); + } + + auto blas = math::GetBlas(ctx); + // back prop + for (int i = max_width - 1; i > 0; i--) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + for (int j = new_offset[i]; j < (new_offset[i] + w); j++) { + for (int k = 0; k < _cap_h; k++) { + int ht = j * _cap_h + k; + int htm1 = ht - _cap_h * w_tm1; + + T common = (1.0 - z[ht]) * d_tilde[ht] * hidden_diff[ht]; + + h2hdiff[htm1] = common * r[ht]; + h2hrdiff[htm1] = common * u_x_h[ht] * d_r[ht]; + h2hzdiff[htm1] = + (hidden[htm1] - tilde[ht]) * d_z[ht] * hidden_diff[ht]; + + e2hdiff[ht] = common; + e2hrdiff[ht] = h2hrdiff[htm1]; + e2hzdiff[ht] = h2hzdiff[htm1]; + } + } + + auto* hidden_htm1 = hidden + new_offset[i - 1] * _cap_h; + auto* h2hdiff_htm1 = h2hdiff + new_offset[i - 1] * _cap_h; + auto* h2hrdiff_htm1 = h2hrdiff + new_offset[i - 1] * _cap_h; + auto* h2hzdiff_htm1 = h2hzdiff + new_offset[i - 1] * _cap_h; + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_h, w, (T)1.0, + h2hdiff_htm1, hidden_htm1, (T)1.0, h2h_diff); + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_h, w, (T)1.0, + h2hrdiff_htm1, hidden_htm1, (T)1.0, h2hr_diff); + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_h, w, (T)1.0, + h2hzdiff_htm1, hidden_htm1, (T)1.0, h2hz_diff); + + auto* embedding_et = embedding + new_offset[i] * _cap_e; + auto* e2hdiff_ht = e2hdiff + new_offset[i] * _cap_h; + auto* e2hrdiff_ht = e2hrdiff + new_offset[i] * _cap_h; + auto* e2hzdiff_ht = e2hzdiff + new_offset[i] * _cap_h; + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, w, (T)1.0, + e2hdiff_ht, embedding_et, (T)1.0, e2h_diff); + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, w, (T)1.0, + e2hrdiff_ht, embedding_et, (T)1.0, e2hr_diff); + + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, w, (T)1.0, + e2hzdiff_ht, embedding_et, (T)1.0, e2hz_diff); + + sse_eltmul(z + new_offset[i] * _cap_h, + hidden_diff + new_offset[i] * _cap_h, + tmp_buffer + new_offset[i - 1] * _cap_h, _cap_h * w); + // add this with diff from top + sse_eltadd(hidden_diff + new_offset[i - 1] * _cap_h, + tmp_buffer + new_offset[i - 1] * _cap_h, + hidden_diff + new_offset[i - 1] * _cap_h, _cap_h * w); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_h, _cap_h, (T)1.0, + h2hdiff_htm1, h2h, (T)1.0, + hidden_diff + new_offset[i - 1] * _cap_h); + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_h, _cap_h, (T)1.0, + h2hrdiff_htm1, h2hr, (T)1.0, + hidden_diff + new_offset[i - 1] * _cap_h); + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_h, _cap_h, (T)1.0, + h2hzdiff_htm1, h2hz, (T)1.0, + hidden_diff + new_offset[i - 1] * _cap_h); + + // bp embedding diff + auto* embedding_diff_et = embedding_diff + new_offset[i] * _cap_e; + + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_e, _cap_h, (T)1.0, + e2hdiff_ht, e2h, (T)0.0, embedding_diff_et); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_e, _cap_h, (T)1.0, + e2hrdiff_ht, e2hr, (T)1.0, embedding_diff_et); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, w, _cap_e, _cap_h, (T)1.0, + e2hzdiff_ht, e2hz, (T)1.0, embedding_diff_et); + } + + for (int i = 0; i < batch * _cap_h; i++) { + e2hdiff[i] = (1. 
- z[i]) * d_tilde[i] * hidden_diff[i]; + e2hzdiff[i] = (-tilde[i]) * d_z[i] * hidden_diff[i]; + } + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, batch, (T)1.0, + e2hdiff, embedding, (T)1.0, e2h_diff); + call_gemm(blas, CblasTrans, CblasNoTrans, _cap_h, _cap_e, batch, (T)1.0, + e2hzdiff, embedding, (T)1.0, e2hz_diff); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, batch, _cap_e, _cap_h, (T)1.0, + e2hdiff, e2h, (T)0.0, embedding_diff); + call_gemm(blas, CblasNoTrans, CblasNoTrans, batch, _cap_e, _cap_h, (T)1.0, + e2hzdiff, e2hz, (T)1.0, embedding_diff); + + // copy back to original embedding diff, and hidden diff (probablly no use, + // but for safety) + copy_back(ctx, embedding_diff, ediff, _cap_e); + //_layout_helper.copy_back(hidden_diff, top_diff, _cap_h); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_grnn, ops::SearchGrnnOP, ops::SearchGrnnOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_grnn_grad, ops::SearchGrnnOpGrad); + +REGISTER_OP_CPU_KERNEL(search_grnn, + ops::CPUSearchGrnnOPKernel + // ops::CPUSearchGrnnOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_grnn_grad, + ops::CPUSearchGrnnOPGradKernel + // ops::CPUSearchGrnnOPGradKernel +); diff --git a/paddle/fluid/operators/search_grnn_op.h b/paddle/fluid/operators/search_grnn_op.h new file mode 100644 index 00000000000000..80411bb0262894 --- /dev/null +++ b/paddle/fluid/operators/search_grnn_op.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/search_group_padding_op.cc b/paddle/fluid/operators/search_group_padding_op.cc new file mode 100644 index 00000000000000..7ebadebed5322a --- /dev/null +++ b/paddle/fluid/operators/search_group_padding_op.cc @@ -0,0 +1,222 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchGroupPaddingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("pad_id", "pad_id").SetDefault(0).EqualGreaterThan(0); + + AddOutput("Out_emb_padding", "Out_emb_padding"); + AddOutput("Out_new", "Out_new"); + AddOutput("Out_padding", "Out_padding"); + + AddComment(R"DOC( + SearchGroupPadding + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchGroupPaddingOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out_emb_padding"), + "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out_new"), + "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out_padding"), + "Out(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + framework::VarDesc* x_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + } + + ctx->SetOutputDim("Out_emb_padding", framework::make_ddim({-1, x_dims[1]})); + ctx->SetOutputDim("Out_new", framework::make_ddim({x_dims[0], 1})); + // ctx->ShareLoD("X", /*->*/ "Out_new"); + ctx->SetOutputDim("Out_padding", framework::make_ddim({-1, 1})); + } +}; + +template +class CPUSearchGroupPaddingOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* top0 = ctx.Output("Out_emb_padding"); + auto* top1 = ctx.Output("Out_new"); + auto* top2 = ctx.Output("Out_padding"); + + int _pad_id = ctx.Attr("pad_id"); + + int batch = bottom0->lod()[0].size() - 1; + int dim0 = bottom0->dims()[0]; + int dim1 = bottom0->dims()[1]; // dim1 is usually the embedding size + + const auto offset = bottom0->lod()[0]; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + + std::vector new_offset; + new_offset.resize(batch + 1); + + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + + // for padding data + framework::LoD top0_lod; + top0_lod.push_back(new_offset); + top0->set_lod(top0_lod); + top0->Resize(framework::make_ddim({batch * max_seq, dim1})); + + // for origin input id + // already set by ShareLoD in InferShape + framework::LoD top1_lod; + top1_lod.push_back(offset); + top1->set_lod(top1_lod); + top1->Resize(framework::make_ddim({dim0, 1})); + memset(top1->mutable_data(ctx.GetPlace()), 0, + 
top1->dims()[0] * top1->dims()[1] * sizeof(T)); + + // for padding input id + framework::LoD top2_lod; + top2_lod.push_back(new_offset); + top2->set_lod(top2_lod); + top2->Resize(framework::make_ddim({batch * max_seq, 1})); + + // copy data + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(ctx.GetPlace()); + auto* top_padding_input_data = top2->mutable_data(ctx.GetPlace()); + for (int i = 0; i < batch; i++) { + const int copy_step = offset[i + 1] - offset[i]; + const int start = i * max_seq; + memcpy(top_data + start * dim1, bottom_data + offset[i] * dim1, + copy_step * dim1 * sizeof(T)); + memset(top_data + (start + copy_step) * dim1, 0, + (max_seq - copy_step) * dim1 * sizeof(T)); + // for padding input id + memset(top_padding_input_data + start, 0, copy_step * sizeof(T)); + for (int j = start + copy_step; j < start + max_seq; j++) { + top_padding_input_data[j] = static_cast(_pad_id); + } + } + } +}; + +class SearchGroupPaddingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out_emb_padding")), + "Input(Out_emb_padding@GRAD) of SequencePadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } +}; + +template +class CPUSearchGroupPaddingOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* top0 = ctx.Input("Out_emb_padding"); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_out = + ctx.Input(framework::GradVarName("Out_emb_padding")); + + int batch = bottom0->lod()[0].size() - 1; + int dim1 = bottom0->dims()[1]; // dim1 is usually the embedding size + + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + const auto* top_diff = d_out->data(); + const auto offset = bottom0->lod()[0]; + const auto top_offset = top0->lod()[0]; + for (int i = 0; i < batch; i++) { + const int step = offset[i + 1] - offset[i]; + memcpy(bottom_diff + offset[i] * dim1, top_diff + top_offset[i] * dim1, + step * dim1 * sizeof(T)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_group_padding, ops::SearchGroupPaddingOP, + ops::SearchGroupPaddingOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_group_padding_grad, ops::SearchGroupPaddingOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_group_padding, + ops::CPUSearchGroupPaddingOPKernel + // ops::CPUSearchGroupPaddingOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_group_padding_grad, + ops::CPUSearchGroupPaddingOPGradKernel + // ops::CPUSearchGroupPaddingOPGradKernel +); diff --git a/paddle/fluid/operators/search_seq_arithmetic_op.cc b/paddle/fluid/operators/search_seq_arithmetic_op.cc new file mode 100644 index 00000000000000..8b9048150f520e --- /dev/null +++ b/paddle/fluid/operators/search_seq_arithmetic_op.cc @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "search_compute.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchSeqArithmeticOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Y", + "Y (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("op_type", "operation type: 1: add; 2: sub; 3: mul") + .SetDefault(0) + .EqualGreaterThan(1); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + + AddComment(R"DOC( + SearchSeqArithmetic + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchSeqArithmeticOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Y should be 2-D tensor"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + framework::Variable* y_var = + boost::get(ctx->GetInputVarPtrs("Y")[0]); + const auto& y_lod = y_var->Get().lod(); + PADDLE_ENFORCE(!y_lod.empty(), "The Input(Y) must hold lod info."); + const auto& y_lod_0 = y_lod[0]; + PADDLE_ENFORCE_GE(y_lod_0.size(), 2, + "The Input(Y)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + y_dims[0], static_cast(y_lod_0.back()), + "The Input(Y)'s lod info mismatches the actual tensor shape."); + + PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(), + "The Length of X and Y must be equal."); + } else { + // compile time + framework::VarDesc* x_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + framework::VarDesc* y_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(y_desc->GetLoDLevel(), 1); + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, x_dims[1]})); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class CPUSearchSeqArithmeticOPKernel : public framework::OpKernel { + public: + void 
Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* top = ctx.Output("Out"); + + int _op_type = ctx.Attr("op_type"); + + auto len1 = bottom0->dims()[0] * bottom0->dims()[1]; + auto len2 = bottom1->dims()[0] * bottom1->dims()[1]; + const auto* bottom_data0 = bottom0->data(); + const auto* bottom_data1 = bottom1->data(); + // already set by ShareLoD in InferShape + // framework::LoD top_lod; + // top_lod.push_back(offset); + // top->set_lod(top_lod); + top->Resize(framework::make_ddim({bottom0->dims()[0], bottom0->dims()[1]})); + auto* top_data = top->mutable_data(ctx.GetPlace()); + + switch (_op_type) { + case 1: // addition: top[0] = bottom[0] + bottom[1] + if (len1 > len2) { + sse_eltadd(bottom_data0, bottom_data1, top_data, len2); + memcpy(&top_data[len2], &bottom_data0[len2], + (len1 - len2) * sizeof(T)); + } else { + sse_eltadd(bottom_data0, bottom_data1, top_data, len1); + } + break; + case 2: // substraction: top[0] = bottom[0] - bottom[1] + memcpy(top_data, bottom_data0, len1 * sizeof(T)); + if (len1 > len2) { + sse_axpy(bottom_data1, top_data, len2, (T)-1.0); + } else { + sse_axpy(bottom_data1, top_data, len1, (T)-1.0); + } + break; + case 3: // multiplication: top[0] = bottom[0] * bottom[1] + if (len1 > len2) { + sse_eltmul(bottom_data0, bottom_data1, top_data, len2); + memcpy(&top_data[len2], &bottom_data0[len2], + (len1 - len2) * sizeof(T)); + } else { + sse_eltmul(bottom_data0, bottom_data1, top_data, len1); + } + break; + } + } +}; + +class SearchSeqArithmeticOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Y"))) { + ctx->SetOutputDim(framework::GradVarName("Y"), ctx->GetInputDim("Y")); + ctx->ShareLoD("Y", /*->*/ framework::GradVarName("Y")); + } + } +}; + +template +class CPUSearchSeqArithmeticOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* bottom1 = ctx.Input("Y"); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_y = ctx.Output(framework::GradVarName("Y")); + int _op_type = ctx.Attr("op_type"); + + auto len1 = bottom0->dims()[0] * bottom0->dims()[1]; + auto len2 = bottom1->dims()[0] * bottom1->dims()[1]; + auto* bottom_diff0 = d_x->mutable_data(ctx.GetPlace()); + auto* bottom_diff1 = d_y->mutable_data(ctx.GetPlace()); + const auto* top_diff = d_out->data(); + const auto* bottom_data0 = bottom0->data(); + const auto* bottom_data1 = bottom1->data(); + + switch (_op_type) { + case 1: // addition + memcpy(bottom_diff0, top_diff, len1 * sizeof(T)); + if (len1 >= len2) { + memcpy(bottom_diff1, top_diff, len2 * sizeof(T)); + } else { + memset(bottom_diff1, 0, len2 * sizeof(T)); + memcpy(bottom_diff1, top_diff, len1 * sizeof(T)); + } + break; + case 2: // substraction + memcpy(bottom_diff0, 
top_diff, len1 * sizeof(T)); + if (len1 >= len2) { + sse_axpy_noadd(top_diff, bottom_diff1, len2, (T)-1.0); + } else { + memset(bottom_diff1, 0, len2 * sizeof(T)); + sse_axpy_noadd(top_diff, bottom_diff1, len1, (T)-1.0); + } + break; + case 3: // multiplication + if (len1 >= len2) { + memcpy(bottom_diff0, top_diff, len1 * sizeof(T)); + sse_eltmul(top_diff, bottom_data1, bottom_diff0, len2); + sse_eltmul(top_diff, bottom_data0, bottom_diff1, len2); + } else { + sse_eltmul(top_diff, bottom_data1, bottom_diff0, len1); + memset(bottom_diff1, 0, len2 * sizeof(T)); + sse_eltmul(top_diff, bottom_data0, bottom_diff1, len1); + } + break; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_seq_arithmetic, ops::SearchSeqArithmeticOP, + ops::SearchSeqArithmeticOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_seq_arithmetic_grad, ops::SearchSeqArithmeticOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_seq_arithmetic, + ops::CPUSearchSeqArithmeticOPKernel + // ops::CPUSearchSeqArithmeticOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_seq_arithmetic_grad, + ops::CPUSearchSeqArithmeticOPGradKernel + // ops::CPUSearchSeqArithmeticOPGradKernel +); diff --git a/paddle/fluid/operators/search_seq_depadding_op.cc b/paddle/fluid/operators/search_seq_depadding_op.cc new file mode 100644 index 00000000000000..8cf0e5a2b6f6e7 --- /dev/null +++ b/paddle/fluid/operators/search_seq_depadding_op.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchSeqDepaddingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Pad", + "Pad (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("Src", + "Src (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddOutput("Out", "Out"); + + AddComment(R"DOC( + SearchSeqDepadding + + NOTE: only support 'float32' data type now. 
+ +)DOC"); + } +}; + +class SearchSeqDepaddingOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Pad"), "Pad(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Src"), "Src(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + + auto pad_dims = ctx->GetInputDim("Pad"); + PADDLE_ENFORCE_EQ(pad_dims.size(), 2, + "The rank of Pad(Input) should be 2."); + + auto src_dims = ctx->GetInputDim("Src"); + PADDLE_ENFORCE_EQ(src_dims.size(), 2, + "The rank of Src(Input) should be 2."); + + if (ctx->IsRuntime()) { + framework::Variable* pad_var = + boost::get(ctx->GetInputVarPtrs("Pad")[0]); + const auto& pad_lod = pad_var->Get().lod(); + PADDLE_ENFORCE(!pad_lod.empty(), "The Input(Pad) must hold lod info."); + const auto& pad_lod_0 = pad_lod[0]; + PADDLE_ENFORCE_GE(pad_lod_0.size(), 2, + "The Input(Pad)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + pad_dims[0], static_cast(pad_lod_0.back()), + "The Input(Pad)'s lod info mismatches the actual tensor shape."); + + framework::Variable* src_var = + boost::get(ctx->GetInputVarPtrs("Src")[0]); + const auto& src_lod = src_var->Get().lod(); + PADDLE_ENFORCE(!src_lod.empty(), "The Input(Src) must hold lod info."); + const auto& src_lod_0 = src_lod[0]; + PADDLE_ENFORCE_GE(src_lod_0.size(), 2, + "The Input(Src)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + src_dims[0], static_cast(src_lod_0.back()), + "The Input(Src)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, pad_dims[1]})); + //ctx->ShareLoD("Src", /*->*/ "Out"); + } +}; + +template +class CPUSearchSeqDepaddingOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("Pad"); + auto* bottom1 = ctx.Input("Src"); + auto* top0 = ctx.Output("Out"); + + const int pad_batch = bottom0->lod()[0].size() - 1; + const int src_batch = bottom1->lod()[0].size() - 1; + PADDLE_ENFORCE_EQ(pad_batch % src_batch, 0, + "Mismatch batch size, bottom0: %d, bottom1: %d", + pad_batch, src_batch); + + const auto& src_offset = bottom1->lod()[0]; + const auto& pad_offset = bottom0->lod()[0]; + const int src_cap_l = bottom1->dims()[0]; + const int pad_cap_e = bottom0->dims()[1]; + + framework::LoD top0_lod; + top0_lod.push_back(src_offset); + top0->set_lod(top0_lod); + top0->Resize(framework::make_ddim({src_cap_l, pad_cap_e})); + + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(ctx.GetPlace()); + for (int i = 0; i < src_batch; ++i) { + const int src_i_l = src_offset[i + 1] - src_offset[i]; + const int pad_i_l = pad_offset[i + 1] - pad_offset[i]; + PADDLE_ENFORCE_GE( + pad_i_l, src_i_l, + "the length of padding seq input is less than source seq input."); + memcpy(top_data + src_offset[i] * pad_cap_e, + bottom_data + pad_offset[i] * pad_cap_e, + src_i_l * pad_cap_e * sizeof(T)); + } + } +}; + +class SearchSeqDepaddingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Pad"), "Input(Pad) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Src"), "Input(Src) should not be null."); + 
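+    // The forward inputs Pad and Src are kept as grad-op inputs because the
+    // CPU grad kernel below re-uses their LoD: Out@GRAD is scattered back
+    // into the padded layout and the tail of each padded sequence is
+    // zero-filled.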
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("Pad"))) { + ctx->SetOutputDim(framework::GradVarName("Pad"), ctx->GetInputDim("Pad")); + ctx->ShareLoD("Pad", /*->*/ framework::GradVarName("Pad")); + } + if (ctx->HasOutput(framework::GradVarName("Src"))) { + ctx->SetOutputDim(framework::GradVarName("Src"), ctx->GetInputDim("Src")); + ctx->ShareLoD("Src", /*->*/ framework::GradVarName("Src")); + } + } +}; + +template +class CPUSearchSeqDepaddingOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("Pad"); + auto* bottom1 = ctx.Input("Src"); + auto* d_bottom0 = ctx.Output(framework::GradVarName("Pad")); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + + const int src_batch = bottom1->lod()[0].size() - 1; + const auto& src_offset = bottom1->lod()[0]; + const auto& pad_offset = bottom0->lod()[0]; + const int pad_cap_e = bottom0->dims()[1]; + + const auto* top_diff = d_out->data(); + auto* bottom_diff = d_bottom0->mutable_data(ctx.GetPlace()); + for (int i = 0; i < src_batch; i++) { + const int src_i_l = src_offset[i + 1] - src_offset[i]; + const int pad_i_l = pad_offset[i + 1] - pad_offset[i]; + PADDLE_ENFORCE_GE( + pad_i_l, src_i_l, + "the length of padding seq input is less than source seq input."); + + memcpy(bottom_diff + pad_offset[i] * pad_cap_e, + top_diff + src_offset[i] * pad_cap_e, + src_i_l * pad_cap_e * sizeof(T)); + memset(bottom_diff + (pad_offset[i] + src_i_l) * pad_cap_e, 0, + (pad_i_l - src_i_l) * pad_cap_e * sizeof(T)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_seq_depadding, ops::SearchSeqDepaddingOP, + ops::SearchSeqDepaddingOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_seq_depadding_grad, ops::SearchSeqDepaddingOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_seq_depadding, + ops::CPUSearchSeqDepaddingOPKernel + // ops::CPUSearchSeqDepaddingOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_seq_depadding_grad, + ops::CPUSearchSeqDepaddingOPGradKernel + // ops::CPUSearchSeqDepaddingOPGradKernel +); diff --git a/paddle/fluid/operators/search_seq_fc_op.cc b/paddle/fluid/operators/search_seq_fc_op.cc new file mode 100644 index 00000000000000..b007a44e701d0f --- /dev/null +++ b/paddle/fluid/operators/search_seq_fc_op.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "search_compute.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchSeqFCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor)"); + AddInput("b", "b (LoDTensor)"); + AddAttr("out_size", "out_size: the output size") + .SetDefault(0) + .EqualGreaterThan(1); + AddAttr("has_bias", "true or false").SetDefault(true); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + + AddComment(R"DOC( + SearchSeqFC + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchSeqFCOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "W(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + int out_size = ctx->Attrs().Get("out_size"); + bool has_bias = ctx->Attrs().Get("has_bias"); + + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + auto w_dims = ctx->GetInputDim("W"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + + PADDLE_ENFORCE_EQ(w_dims[0], out_size, + "wrong shape: w_dims[0] != out_size"); + + PADDLE_ENFORCE_EQ(w_dims[1], x_dims[1], + "wrong shape: w_dims[1] != x_dims[1]"); + + if (has_bias) { + PADDLE_ENFORCE(ctx->HasInput("b"), "b(Input) should not be null."); + auto b_dims = ctx->GetInputDim("b"); + PADDLE_ENFORCE_EQ(b_dims.size(), 1, "b should be 1-D tensor"); + } + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, out_size})); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class CPUSearchSeqFCOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* b = ctx.Input("b"); + auto* top = ctx.Output("Out"); + bool _bias_term = ctx.Attr("has_bias"); + + int _out = w->dims()[0]; + int _in = w->dims()[1]; + int res_num = bottom->dims()[0]; + + top->Resize(framework::make_ddim({res_num, _out})); + const auto* bottom_data = bottom->data(); + auto* top_data = top->mutable_data(ctx.GetPlace()); + const auto* weights = w->data(); + + call_gemm(ctx, CblasNoTrans, CblasTrans, res_num, _out, _in, (T)1.0, + bottom_data, weights, (T)0.0, top_data); + + if (_bias_term) { + const auto* bias = b->data();; + for (int i = 0; i < res_num; ++i) { + sse_eltadd(top_data + i * _out, bias, top_data + i * _out, _out); + } + } + } +}; + +class SearchSeqFCOpGrad : public framework::OperatorWithKernel { + public: + using 
framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("b"), "Input(b) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X"))); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W"))); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + + bool has_bias = ctx->Attrs().Get("has_bias"); + if (has_bias) { + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("b"))); + ctx->SetOutputDim(framework::GradVarName("b"), ctx->GetInputDim("b")); + } + } +}; + +template +class CPUSearchSeqFCOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + bool _bias_term = ctx.Attr("has_bias"); + + int _out = w->dims()[0]; + int _in = w->dims()[1]; + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_w = ctx.Output(framework::GradVarName("W")); + + int res_num = bottom->dims()[0]; + + const auto* top_diff = d_out->data(); + const auto* bottom_data = bottom->data(); + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + const auto* weights = w->data(); + auto* weights_diff = d_w->mutable_data(ctx.GetPlace()); + + call_gemm(ctx, CblasTrans, CblasNoTrans, _out, _in, res_num, (T)1.0, + top_diff, bottom_data, (T)0.0, weights_diff); + call_gemm(ctx, CblasNoTrans, CblasNoTrans, res_num, _in, _out, (T)1.0, + top_diff, weights, (T)0.0, bottom_diff); + + if (_bias_term) { + auto* d_b = ctx.Output(framework::GradVarName("b")); + auto* bias_diff = d_b->mutable_data(ctx.GetPlace()); + memset(bias_diff, (T)0.0, _out * sizeof(T)); + for (int i = 0; i < res_num; ++i) { + sse_eltadd(bias_diff, top_diff + i * _out, bias_diff, _out); + } + } + + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_seq_fc, ops::SearchSeqFCOP, ops::SearchSeqFCOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_seq_fc_grad, ops::SearchSeqFCOpGrad); + +REGISTER_OP_CPU_KERNEL(search_seq_fc, + ops::CPUSearchSeqFCOPKernel + // ops::CPUSearchSeqFCOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_seq_fc_grad, + ops::CPUSearchSeqFCOPGradKernel + // ops::CPUSearchSeqFCOPGradKernel +); diff --git a/paddle/fluid/operators/search_seq_softmax_op.cc b/paddle/fluid/operators/search_seq_softmax_op.cc new file mode 100644 index 00000000000000..3829710c5e98f4 --- /dev/null +++ b/paddle/fluid/operators/search_seq_softmax_op.cc @@ -0,0 +1,211 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "search_compute.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class SearchSeqSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + + AddAttr("alg", "operation type: 0: accurate; 1: log; others: invalid") + .SetDefault(0) + .EqualGreaterThan(0); + + AddOutput("Out", + "Out (LoDTensor, default LoDTensor) Output variable"); + AddOutput("Out_log", + "Out_log (LoDTensor, default LoDTensor) Output variable"); + + AddComment(R"DOC( + SearchSeqSoftmax + + NOTE: only support 'float32' data type now. + +)DOC"); + } +}; + +class SearchSeqSoftmaxOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Out(Output) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out_log"), "Out_log(Output) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of X(Input) should be 2."); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + // compile time + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1, x_dims[1]})); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class CPUSearchSeqSoftmaxOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* top0 = ctx.Output("Out"); + auto* _prob = ctx.Output("Out_log"); + int _output_log = ctx.Attr("alg"); + + int seq_size = bottom0->dims()[0]; + int dim = bottom0->dims()[1]; + const auto offset_vec = bottom0->lod()[0]; + top0->Resize(framework::make_ddim({seq_size, dim})); + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(ctx.GetPlace()); + + for (int i = 0; i < seq_size; ++i) { + int offset = i * dim; + auto max_val = + *std::max_element(bottom_data + offset, bottom_data + offset + dim); + max_val *= -1; + sse_add_scalar(bottom_data + offset, top_data + offset, dim, max_val); + for (int j = 0; j < dim; ++j) { + top_data[offset + j] = std::exp(top_data[offset + j]); + } + T sum; + sse_sum(top_data + offset, sum, dim); + sum = 1.0 / sum; + sse_scale(top_data + offset, top_data + offset, dim, sum); + } + + if (_output_log) { + const int size = top0->dims()[0] * top0->dims()[1]; + 
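+      // alg != 0: keep the raw softmax probabilities in Out_log and
+      // overwrite Out with their logarithm, clamping each probability at
+      // X_MIN first so that zero probabilities do not turn into -inf.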
_prob->Resize(framework::make_ddim({size})); + auto* prob_data = _prob->mutable_data(ctx.GetPlace()); + memcpy(prob_data, top_data, size * sizeof(T)); + for (int i = 0; i < size; ++i) { + top_data[i] = std::log(std::max(prob_data[i], X_MIN)); + } + } else { + _prob->Resize(framework::make_ddim({1})); + _prob->mutable_data(ctx.GetPlace()); + } + } +}; + +class SearchSeqSoftmaxOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } +}; + +template +class CPUSearchSeqSoftmaxOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom0 = ctx.Input("X"); + auto* top0 = ctx.Input("Out"); + auto* _prob = ctx.Input("Out_log"); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + int _output_log = ctx.Attr("alg"); + + int seq_size = bottom0->dims()[0]; + int dim = bottom0->dims()[1]; + const auto* top_diff = d_out->data(); + const auto* top_data = top0->data(); + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + + if (_output_log) { + const auto* prob_data = _prob->data(); + Tensor buffer_diff; + buffer_diff.Resize(_prob->dims()); + auto* prob_diff = buffer_diff.mutable_data(ctx.GetPlace()); + + const int size = top0->dims()[0] * top0->dims()[1]; + PADDLE_ENFORCE_EQ(size, _prob->dims()[0] * _prob->dims()[1], "top_size should be eq to _prob_size"); + for (int i = 0; i < size; ++i) { + prob_diff[i] = top_diff[i] / std::max(prob_data[i], X_MIN); + } + top_diff = prob_diff; + top_data = prob_data; + } + + for (int i = 0; i < seq_size; ++i) { + int offset = i * dim; + T ip_d_t; + sse_ip(top_diff + offset, top_data + offset, dim, ip_d_t); + ip_d_t *= -1; + sse_add_scalar(top_diff + offset, bottom_diff + offset, dim, ip_d_t); + sse_eltmul(top_data + offset, bottom_diff + offset, bottom_diff + offset, + dim); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(search_seq_softmax, ops::SearchSeqSoftmaxOP, + ops::SearchSeqSoftmaxOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(search_seq_softmax_grad, ops::SearchSeqSoftmaxOpGrad); + +REGISTER_OP_CPU_KERNEL( + search_seq_softmax, + ops::CPUSearchSeqSoftmaxOPKernel + // ops::CPUSearchSeqSoftmaxOPKernel +); +REGISTER_OP_CPU_KERNEL( + search_seq_softmax_grad, + ops::CPUSearchSeqSoftmaxOPGradKernel + // ops::CPUSearchSeqSoftmaxOPGradKernel +); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc new file mode 100644 index 00000000000000..eb553d0e2a1bdf --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h" +#include + +namespace paddle { +namespace operators { + +class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROW"), + "Input(ROW) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("COLUMN"), + "Input(COLUMN) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("pos"), "pos(out) should not be null"); + + auto attr = ctx->Attrs(); + auto channel_num = attr.Get("channel_num"); + auto topks = attr.Get>("topks"); + + auto row_dim = ctx->GetInputDim("ROW"); + + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + ctx->SetOutputDim("Out", framework::make_ddim(vec_out_shape)); + ctx->ShareLoD("X", "Out"); + } +}; + +class SequenceTopkAvgPoolingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor) The variable-length input of SequenceTopkPoolingOp"); + AddInput("ROW", "(LoDTensor) the row info"); + AddInput("COLUMN", "(LoDTensor) the column info"); + AddOutput( + "Out", + "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD " + "infomation."); + AddOutput("pos", "(Tensor) store the topk index ").AsIntermediate(); + AddAttr>("topks", "topks"); + AddAttr("channel_num", "channel number"); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. 
Some layers may run faster when this is true.") + .SetDefault(false); + AddComment(R"DOC( + sequecen topk average pooling op + )DOC"); + } +}; + +class SequenceTopkAvgPoolingGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null."); + + ctx->ShareDim("X", /*->*/ framework::GradVarName("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // return framework::OpKernelType( + // framework::ToDataType(ctx.Input("X")->type()), + // ctx.device_context()); + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceTopkAvgPoolGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_topk_avg_pooling_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput("ROW", Input("ROW")); + op_desc_ptr->SetInput("COLUMN", Input("COLUMN")); + op_desc_ptr->SetInput("pos", Output("pos")); + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_topk_avg_pooling, ops::SequenceTopkAvgPoolingOp, + ops::SequenceTopkAvgPoolingOpMaker, + ops::SequenceTopkAvgPoolGradOpMaker); +REGISTER_OPERATOR(sequence_topk_avg_pooling_grad, + ops::SequenceTopkAvgPoolingGradOp); +REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling, + ops::SequenceTopkAvgPoolingKernel< + paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling_grad, + ops::SequenceTopkAvgPoolingGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h new file mode 100644 index 00000000000000..d6f7dff0535728 --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -0,0 +1,264 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace { +template +void get_topk_pos(const T* data, int length, int k, int* pos, + bool debug = false) { + size_t real_k = k < length ? k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = -10000000.0; + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} +} // namespace + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceTopkAvgPoolingKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* row = context.Input("ROW"); + auto* col = context.Input("COLUMN"); + auto* out = context.Output("Out"); + auto* pos = context.Output("pos"); + + auto channel_num = context.Attr("channel_num"); + auto topks = context.Attr>("topks"); + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + std::vector vec_pos_shape; + auto in_lod = in->lod()[0]; + + auto row_lod = row->lod()[0]; + auto col_lod = col->lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + vec_pos_shape.push_back(pos_total_size); + pos->Resize({framework::make_ddim(vec_pos_shape)}); + auto pos_data = pos->mutable_data(context.GetPlace()); + + int offset = 0; + framework::Vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + + /* + std::vector vec_out_shape; + vec_out_shape.push_back( offset ); + out->Resize( {framework::make_ddim(vec_out_shape)} ); + */ + + framework::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in->data(); + auto out_data = out->mutable_data(context.GetPlace()); + + T* sum_data = new T[max_k]; + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + PADDLE_ENFORCE_EQ(total_size, channel_num * row_size * col_size, "size wrong in sequence_topk_avg_pooling_op!"); + + int feature_num = row_size * col_size; + for (int j = 0; j < channel_num; ++j) { + auto input_offset_feature_data = in_data + in_lod[i] + j * feature_num; + + for (int r = 0; r < row_size; ++r) { + auto row_data = input_offset_feature_data + r * col_size; + /* + if( ( in_lod[i] + j * feature_num + r * col_size ) <= + 4871 && ( in_lod[i] + j * feature_num + r * col_size ) <= 4908 && + ( in_lod[i] + j * feature_num + ( r + 1 ) * + col_size ) >= 4908 ) + { + LOG(ERROR) << "in same row " << col_size; + LOG(ERROR) << "edge " << in_lod[i] + j * + feature_num + r * col_size << "\t" + << in_lod[i] + j * feature_num + ( r + 1 + ) * col_size; + + + LOG(ERROR) << "max k " << max_k; + for( size_t h = 0; h < col_size; ++h ) + { + LOG(ERROR) << "element " << row_data[h]; + } + + LOG(ERROR) << "31 " << 
row_data[31]; + LOG(ERROR) << "68 " << row_data[68]; + + LOG(ERROR) << "cmp " << ( row_data[31] > + row_data[68] ); LOG(ERROR) << "cmp " << ( row_data[31] == + row_data[68] ); + } + */ + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto out_slice_data = out_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + + get_topk_pos(row_data, col_size, max_k, pos_slice_data); + if (pos_slice_data[0] == -1) { + sum_data[0] = 0.0; + } else { + sum_data[0] = row_data[pos_slice_data[0]]; + } + for (int k = 1; k < max_k; ++k) { + if (pos_slice_data[k] == -1) { + sum_data[k] = sum_data[k - 1]; + } else { + sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; + } + } + /* + LOG(ERROR) << "topk_avg_debug row: " << row_lod.size() + << ", col:" << col_lod.size() << ", k_size:" << k_num << "\n"; for( + size_t k = 0; k < k_num; ++k ) + { + LOG(ERROR) << "pos in ff: " << in_lod[i] + j * + feature_num + r * col_size + pos_slice_data[k]; + } + */ + for (size_t k = 0; k < k_num; ++k) { + out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; + } + } + } + } + delete[] sum_data; + } +}; + +template +class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* in_grad = context.Output(framework::GradVarName("X")); + auto* pos_input = context.Input("pos"); + auto* row_input = context.Input("ROW"); + auto* col_input = context.Input("COLUMN"); + auto* forward_input = context.Input("X"); + + int batch_size = row_input->lod()[0].size() - 1; + auto channel_num = context.Attr("channel_num"); + auto topks = context.Attr>("topks"); + auto k_num = topks.size(); + auto max_k = topks[k_num - 1]; + + auto out_lod = forward_input->lod(); + in_grad->set_lod(out_lod); + + in_grad->mutable_data(context.GetPlace()); + auto pos_data = pos_input->data(); + auto out_data = out_grad->data(); + + auto& dev_ctx = + context.template device_context(); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + + auto in_data = in_grad->data(); + + auto out_offset = out_lod[0]; + auto row_lod = row_input->lod()[0]; + auto col_lod = col_input->lod()[0]; + + // LOG( ERROR ) << "----------------------------------"; + for (int i = 0; i < batch_size; ++i) { + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + int feature_num = row_size * col_size; + + for (int j = 0; j < channel_num; ++j) { + auto in_offset_feature_data = in_data + out_offset[i] + j * feature_num; + + for (int r = 0; r < row_size; r++) { + auto row_data = out_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto in_slice_data = in_offset_feature_data + r * col_size; + + for (size_t m = 0; m < k_num; ++m) { + for (int k = 0; k < topks[m]; ++k) { + if (pos_slice_data[k] == -1) { + break; + } else { + in_slice_data[pos_slice_data[k]] += row_data[m] / topks[m]; + /* + if ( out_offset[i] + j * feature_num + + r * col_size + pos_slice_data[k] == 4909 ) + { + LOG(ERROR) << "pos in bp " << + out_offset[i] + j * feature_num + r * col_size + + pos_slice_data[k] << "\t" << row_data[m] / topks[m]; + } + */ + } + } + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git 
a/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.cc new file mode 100644 index 00000000000000..3243e2a496fe02 --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h" +#include + +namespace paddle { +namespace operators { + +class SequenceTopkPoolingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("pos"), + "pos(out) should not be null"); + + auto attr = ctx->Attrs(); + auto channel_num = attr.Get("channel_num"); + auto topk = attr.Get("topk"); + + std::vector vec_out_shape; + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + vec_out_shape.push_back(x_lod_0.size() - 1); + } + else + { + vec_out_shape.push_back( -1 ); + } + + vec_out_shape.push_back( channel_num * topk ); + + ctx->SetOutputDim("Out", framework::make_ddim(vec_out_shape)); + ctx->ShareLoD("X", "Out"); + } +}; + +class SequenceTopkPoolingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(LoDTensor) The variable-length input of SequenceTopkPoolingOp"); + AddOutput("Out", + "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD " + "infomation."); + AddOutput("pos", + "(Tensor) store the topk index " ).AsIntermediate(); + AddAttr("topk", + "topk attr"); + AddAttr("channel_num", + "channel number"); + AddAttr("batch_size", + "batch size"); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. 
Some layers may run faster when this is true.") + .SetDefault(false); + AddComment(R"DOC( + sequecen topk pooling op + )DOC"); + } +}; + +class SequenceTopkPoolingGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null."); + + ctx->ShareDim("X", /*->*/ framework::GradVarName("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + //return framework::OpKernelType( + // framework::ToDataType(ctx.Input("X")->type()), + // ctx.device_context()); + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceTopkPoolingGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_topk_pooling_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput("pos", Output("pos")); + + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_topk_pooling, ops::SequenceTopkPoolingOp, ops::SequenceTopkPoolingOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_topk_pooling_grad, ops::SequenceTopkPoolingGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_topk_pooling, + ops::SequenceTopkPoolingKernel); +REGISTER_OP_CPU_KERNEL( + sequence_topk_pooling_grad, + ops::SequenceTopkPoolingGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h new file mode 100644 index 00000000000000..122ec0b8e35c03 --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_pooling_op.h @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace { +template +void get_topk_pos(const T* data, int length, int k, int* pos) { + size_t real_k = k < length ? 
k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = -10000000.0; + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} +} // namespace + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceTopkPoolingKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + auto* pos = context.Output("pos"); + + auto channel_num = context.Attr("channel_num"); + auto topk = context.Attr("topk"); + std::vector vec_pos_shape; + auto batch_size = in->lod()[0].size() - 1; + vec_pos_shape.push_back(batch_size * channel_num * topk); + pos->Resize({framework::make_ddim(vec_pos_shape)}); + auto pos_data = pos->mutable_data(context.GetPlace()); + + auto in_lod = in->lod()[0]; + + framework::Vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + vec_out_lod.push_back(i * channel_num * topk); + } + framework::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in->data(); + auto out_data = out->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + if (total_size % channel_num != 0) { + LOG(ERROR) << "input cannot mod channel num"; + } + + int feature_num = total_size / channel_num; + auto in_offset_data = in_data + in_lod[i]; + for (int j = 0; j < channel_num; ++j) { + auto input_slice_data = in_offset_data + j * feature_num; + auto pos_slice_data = pos_data + i * topk * channel_num + j * topk; + auto out_slice_data = out_data + i * topk * channel_num + j * topk; + + get_topk_pos(input_slice_data, feature_num, topk, pos_slice_data); + for (int k = 0; k < topk; ++k) { + if (pos_slice_data[k] == -1) { + out_slice_data[k] = 0.0; + } else { + out_slice_data[k] = input_slice_data[pos_slice_data[k]]; + } + } + } + } + } +}; + +template +class SequenceTopkPoolingGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* in_grad = context.Output(framework::GradVarName("X")); + auto* pos_input = context.Input("pos"); + auto* real_input = context.Input("X"); + + auto channel_num = context.Attr("channel_num"); + auto topk = context.Attr("topk"); + + auto out_lod = real_input->lod(); + in_grad->set_lod(out_lod); + + in_grad->mutable_data(context.GetPlace()); + auto pos_data = pos_input->data(); + + auto out_data = out_grad->data(); + + auto& dev_ctx = + context.template device_context(); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + + auto in_data = in_grad->data(); + + auto out_offset = out_lod[0]; + + auto batch_size = real_input->lod()[0].size() - 1; + for (int i = 0; i < batch_size; ++i) { + auto in_offset_data = in_data + out_offset[i]; + int total_size = out_offset[i + 1] - out_offset[i]; + int feature_num = total_size / channel_num; + + 
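+      // Scatter pass: for each (channel, k) slot of this sequence, route the
+      // incoming gradient back to the input position recorded in "pos"
+      // during the forward top-k selection; slots padded with -1 (fewer than
+      // topk valid elements) are skipped.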
for (int j = 0; j < channel_num; ++j) { + auto in_slice_data = in_offset_data + j * feature_num; + auto pos_slice_data = pos_data + i * channel_num * topk + j * topk; + auto out_slice_data = out_data + i * channel_num * topk + j * topk; + + for (int k = 0; k < topk; ++k) { + if (pos_slice_data[k] == -1) { + continue; + } else { + // LOG(ERROR) << i << " " << j << " " << k << " " << + // pos_slice_data[k] << " " << out_slice_data[k]; + in_slice_data[pos_slice_data[k]] = out_slice_data[k]; + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index bb6a1c5b165693..bcc1dd89e562a7 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -53,6 +53,16 @@ class CPUUniformRandomKernel : public framework::OpKernel { for (int64_t i = 0; i < size; ++i) { data[i] = dist(engine); } + // Init diag element + unsigned int diag_num = static_cast(ctx.Attr("diag_num")); + unsigned int diag_step = static_cast(ctx.Attr("diag_step")); + auto diag_val = static_cast(ctx.Attr("diag_val")); + if (diag_num > 0) { + for (int64_t i = 0; i < diag_num; ++i) { + int64_t pos = i*diag_step + i; + data[pos] = diag_val; + } + } } }; @@ -67,6 +77,10 @@ class UniformRandomOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), "uniform_random's min must less then max"); + PADDLE_ENFORCE_GE(ctx->Attrs().Get("diag_num"), 0, + "diag_num must greater than 0"); + PADDLE_ENFORCE_GE(ctx->Attrs().Get("diag_step"), 0, + "diag_step must greater than 0"); auto &shape = ctx->Attrs().Get>("shape"); std::vector temp; temp.reserve(shape.size()); @@ -105,6 +119,13 @@ uniform distribution. The random result is in set [min, max]. "Note that if seed is not 0, this operator will always " "generate the same random numbers every time. [default 0].") .SetDefault(0); + AddAttr("diag_num", "The number of diag elements. Note that if " + "diag_num is 0, it means without diag init.[default 0].") + .SetDefault(0); + AddAttr("diag_step", "The step between two diag element.[default 0].") + .SetDefault(0); + AddAttr("diag_val", "The value of diag initialition. [default 1.0].") + .SetDefault(1.0f); AddAttr("dtype", "Output tensor data type. [default 5(FP32)].") .SetDefault(framework::proto::VarType::FP32); } diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc new file mode 100644 index 00000000000000..78c2aa713eba6f --- /dev/null +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -0,0 +1,470 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/var_conv_2d_op.h" +#ifndef WIN32 +//#include "naive_gemm.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class VarConv2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "X (LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("W", "W (Tensor), the filter."); + AddAttr("InputChannel", "the input filter num").SetDefault(1); + AddAttr("OutputChannel", "the output filter num").SetDefault(1); + AddAttr("StrideH", "the height of Stride").SetDefault(1); + AddAttr("StrideW", "the width of Stride").SetDefault(1); + AddAttr("KernelH", "the height of Kernel").SetDefault(1); + AddAttr("KernelW", "the width of Kernel").SetDefault(1); + + AddOutput("Out", "(LoDTensor, default LoDTensor) Output variable"); + AddOutput("Col", + "(LoDTensor, default LoDTensor) the intermediate result " + "variable"); + + AddComment(R"DOC( + Var Size Conv Operator + + This operator calculate Out = \sigma \left ( W * X + b \right ), + only support 2-D for X. + the input is a level-3 LodTensor: + + NOTE: only support 'float32' data type now. + + )DOC"); + } +}; + +class VarConv2dOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "X(Input) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "W(Input) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Col"), + "Col(Output) of VarConv2dOP should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of X(Input) can't be less than 2."); + + auto w_dims = ctx->GetInputDim("W"); + /* + for (int i = 0; i < w_dims.size(); i++) { + LOG(ERROR) << "var_conv_2d: w_dims[" << i << "]:" << w_dims << "]"; + } + */ + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + int output_channel = ctx->Attrs().Get("OutputChannel"); + int input_channel = ctx->Attrs().Get("InputChannel"); + int kernel_h = ctx->Attrs().Get("KernelH"); + int kernel_w = ctx->Attrs().Get("KernelW"); + PADDLE_ENFORCE_EQ(w_dims[0], output_channel, + "W dim[0] should be equal to OutputChannel"); + PADDLE_ENFORCE_EQ( + w_dims[1], input_channel * kernel_h * kernel_w, + "W dim[1] should be equal to InputChannel * StrideH * StrideW"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + PADDLE_ENFORCE_GE(x_lod.size(), 3, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod[0].back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + } else { + std::vector out_dims_vec{-1}; + out_dims_vec.push_back(1); + std::vector col_dims_vec{-1}; + col_dims_vec.push_back(1); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Col", framework::make_ddim(col_dims_vec)); + } + } +}; + +template +class 
CPUVarConv2dOPKernel : public framework::OpKernel { + public: + void Im2Col(const framework::ExecutionContext& ctx, const LoDTensor& input, + LoDTensor* col) const { + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + int stride_h = ctx.Attr("StrideH"); + int stride_w = ctx.Attr("StrideW"); + + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + const auto& offset_x = input.lod()[2]; + const auto& offset_y = input.lod()[1]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_y * top_im_x; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + framework::LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + auto* top_data = col->mutable_data(framework::make_ddim(col_dims_vec), + ctx.GetPlace()); + auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* top = ctx.Output("Out"); + auto* col = ctx.Output("Col"); + + int output_channel = ctx.Attr("OutputChannel"); + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + int stride_h = ctx.Attr("StrideH"); + int stride_w = ctx.Attr("StrideW"); + + Im2Col(ctx, *bottom, col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = bottom->lod()[2]; + const auto& offset_y = bottom->lod()[1]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = 
offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + framework::LoD top_lod; + top_lod.push_back(top_offset); + top_lod.push_back(bottom->lod()[1]); + top_lod.push_back(bottom->lod()[2]); + + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + auto* top_data = top->mutable_data(framework::make_ddim(top_dims_vec), + ctx.GetPlace()); + + auto* w_data = w->data(); + auto* col_data = col->data(); + +#ifndef WIN32 +#ifndef __NAIVE_GEMM__ + auto blas = math::GetBlas(ctx); +#endif // !__NAIVE_GEMM__ + +#endif + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } +#ifndef WIN32 + +#ifndef __NAIVE_GEMM__ + blas.GEMM(CblasNoTrans, CblasNoTrans, output_channel, top_im_size, + input_channel * kernel_h * kernel_w, 1.0, w_data, + col_data + col_offset[b], 0.0, top_data + top_offset[b]); +#else + naive::gemm(false, false, output_channel, top_im_size, + input_channel * kernel_h * kernel_w, 1.0, w_data, + col_data + col_offset[b], 0.0, top_data + top_offset[b]); + +#endif // !__NAIVE_GEMM__ + +#endif + } + } +}; + +class VarConv2dOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Col")), + // "Input(Col@GRAD) of SequencePadGradOp should not be + // null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("W"))) { + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + } + } +}; + +template +class CPUVarConv2dOPGradKernel : public framework::OpKernel { + public: + void Im2ColGrad(const framework::ExecutionContext& ctx, T* top_diff) const { + auto* x = ctx.Input("X"); + auto* col = ctx.Input("Col"); + + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + int stride_h = ctx.Attr("StrideH"); + int stride_w = ctx.Attr("StrideW"); + + auto* d_x = ctx.Output(framework::GradVarName("X")); + + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + memset(bottom_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + + const auto& bottom_offset = x->lod()[0]; + const auto& offset_x = x->lod()[2]; + const auto& offset_y = x->lod()[1]; + const auto& top_offset = col->lod()[0]; + int batch = x->lod()[0].size() - 1; + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height 
= offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + bottom_diff[b_offset + im_offset + im_y * width + im_x] += + top_diff[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset]; + } + } + } + } + } + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* col = ctx.Input("Col"); + auto* out = ctx.Input("Out"); + + int output_channel = ctx.Attr("OutputChannel"); + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + + auto* d_out = ctx.Input(framework::GradVarName("Out")); + // auto* d_col = ctx.Input(framework::GradVarName("Col")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_w = ctx.Output(framework::GradVarName("W")); + + Tensor col_grad; + col_grad.Resize(col->dims()); + auto* col_diff = col_grad.mutable_data(ctx.GetPlace()); + auto* bottom_diff = d_x->mutable_data(ctx.GetPlace()); + auto* w_diff = d_w->mutable_data(ctx.GetPlace()); + // auto* col_diff = const_cast(d_col->data()); + memset(bottom_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + memset(w_diff, 0.0, w->dims()[0] * w->dims()[1] * sizeof(T)); + memset(col_diff, 0.0, col->dims()[0] * col->dims()[1] * sizeof(T)); + auto* top_diff = d_out->data(); + auto* w_data = w->data(); + auto* col_data = col->data(); + int batch = x->lod()[0].size() - 1; + const auto& top_offset = out->lod()[0]; + const auto& col_offset = col->lod()[0]; +#ifndef WIN32 +#ifndef __NAIVE_GEMM__ + auto blas = math::GetBlas(ctx); +#endif // !__NAIVE_GEMM__ +#endif + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } +#ifndef WIN32 + +#ifndef __NAIVE_GEMM__ + blas.GEMM(CblasTrans, CblasNoTrans, input_channel * kernel_h * kernel_w, + top_im_size, output_channel, 1.0, w_data, + top_diff + top_offset[b], 1.0, col_diff + col_offset[b]); +#else + naive::gemm(true, false, input_channel * kernel_h * kernel_w, + top_im_size, output_channel, 1.0, w_data, + top_diff + top_offset[b], 1.0, col_diff + col_offset[b]); +#endif // !__NAIVE_GEMM__ + +#ifndef __NAIVE_GEMM__ + blas.GEMM(CblasNoTrans, CblasTrans, output_channel, + input_channel * kernel_h * kernel_w, top_im_size, 1.0, + top_diff + top_offset[b], col_data + col_offset[b], 1.0, + w_diff); +#else + naive::gemm(false, true, output_channel, + input_channel * kernel_h * kernel_w, top_im_size, 1.0, + top_diff + top_offset[b], col_data + col_offset[b], 1.0, + w_diff); +#endif // !__NAIVE_GEMM__ + +#endif + } + Im2ColGrad(ctx, col_diff); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(var_conv_2d, ops::VarConv2dOP, ops::VarConv2dOpMaker, + 
frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(var_conv_2d_grad, ops::VarConv2dOpGrad); + +REGISTER_OP_CPU_KERNEL(var_conv_2d, + ops::CPUVarConv2dOPKernel + // ops::CPUVarConv2dOPKernel +); +REGISTER_OP_CPU_KERNEL( + var_conv_2d_grad, + ops::CPUVarConv2dOPGradKernel + // ops::CPUVarConv2dOPGradKernel +); diff --git a/paddle/fluid/operators/var_conv_2d_op.h b/paddle/fluid/operators/var_conv_2d_op.h new file mode 100644 index 00000000000000..80ccb94fbaf664 --- /dev/null +++ b/paddle/fluid/operators/var_conv_2d_op.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; +/* +class MatchMatrixTensorOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class MatchMatrixTensorOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class MatchMatrixTensorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; +*/ +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index da2591b98058a2..4b9c9d2cd59a94 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -250,32 +250,41 @@ def __call__(self, var, block): return op -class NormalInitializer(Initializer): - """Implements the Random Normal(Gaussian) distribution initializer +class UniformInitializer(Initializer): + """Implements the random uniform distribution initializer Args: - loc (float): mean of the normal distribution - scale (float): standard deviation of the normal distribution + low (float): lower boundary of the uniform distribution + high (float): upper boundary of the uniform distribution seed (int): random seed Examples: .. 
code-block:: python + import paddle.fluid as fluid + x = fluid.layers.data(name='x', shape=[1], dtype='float32') fc = fluid.layers.fc(input=x, size=10, - param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) + param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5)) """ - def __init__(self, loc=0.0, scale=1.0, seed=0): - assert loc is not None - assert scale is not None + def __init__(self, low=-1.0, high=1.0, seed=0, diag_num=0, diag_step=0, diag_val=1.0): + assert low is not None + assert high is not None + assert high >= low assert seed is not None - super(NormalInitializer, self).__init__() - self._mean = loc - self._std_dev = scale + assert diag_num is not None + assert diag_step is not None + assert diag_val is not None + super(UniformInitializer, self).__init__() + self._low = low + self._high = high self._seed = seed + self._diag_num = diag_num + self._diag_step = diag_step + self._diag_val = diag_val def __call__(self, var, block): - """Add normal distribution initialization ops for a variable + """Add uniform distribution initialization ops for a variable Args: var: Variable that needs to be initialized @@ -291,11 +300,12 @@ def __call__(self, var, block): if self._seed == 0: self._seed = block.program.random_seed - # to be compatible of fp16 initalizers + # to be compatible of fp16 initializers if var.dtype == VarDesc.VarType.FP16: out_dtype = VarDesc.VarType.FP32 out_var = block.create_var( - name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + name=unique_name.generate(".".join( + ['uniform_random', var.name, 'tmp'])), shape=var.shape, dtype=out_dtype, type=VarDesc.VarType.LOD_TENSOR, @@ -305,15 +315,17 @@ def __call__(self, var, block): out_var = var op = block._prepend_op( - type="gaussian_random", + type="uniform_random", outputs={"Out": out_var}, attrs={ "shape": var.shape, "dtype": out_dtype, - "mean": self._mean, - "std": self._std_dev, + "min": self._low, + "max": self._high, "seed": self._seed, - "use_mkldnn": False + "diag_num": self._diag_num, + "diag_step": self._diag_step, + "diag_val": self._diag_val }, stop_gradient=True) @@ -324,6 +336,7 @@ def __call__(self, var, block): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) + if not framework.in_dygraph_mode(): var.op = op return op diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2bac9dd9a46b1b..9befbcfe424872 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -197,6 +197,20 @@ 'pixel_shuffle', 'fsp_matrix', 'continuous_value_model', + 'match_matrix_tensor', + 'var_conv_2d', + 'sequence_topk_avg_pooling', + 'sequence_topk_pooling', + 'search_fc', + 'search_seq_fc', + 'search_grnn', + 'search_embedding', + 'search_seq_arithmetic', + 'search_aligned_mat_mul', + 'search_attention_padding_mask', + 'search_group_padding', + 'search_seq_depadding', + 'search_seq_softmax', ] kIgnoreIndex = -100 @@ -11267,3 +11281,490 @@ def continuous_value_model(input, cvm, use_cvm=True): outputs={'Y': [out]}, attrs={"use_cvm": use_cvm}) return out + +def sequence_topk_pooling(input, topk, batch_size, channel_num): + """ + + TODO: + """ + helper = LayerHelper('sequence_topk_pooling', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + pos = helper.create_variable_for_type_inference(dtype=helper.input_dtype(), + stop_gradient=True) + helper.append_op( + type='sequence_topk_pooling', + inputs={'X': input}, + outputs={'Out': out, + 'pos': pos}, + attrs={'topk': topk, + 
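+               # batch_size is forwarded here as an op attribute; note that the
+               # CPU kernel itself derives the batch count from the input LoD.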
'batch_size': batch_size, + 'channel_num': channel_num}) + return out + + +def sequence_topk_avg_pooling(input, row, col, topks, channel_num): + """ + + TODO: + """ + helper = LayerHelper('sequence_topk_avg_pooling', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + pos = helper.create_variable_for_type_inference(dtype=helper.input_dtype(), + stop_gradient=True) + helper.append_op( + type='sequence_topk_avg_pooling', + inputs={'X': input, + 'ROW': row, + 'COLUMN': col}, + outputs={'Out': out, + 'pos': pos}, + attrs={'topks': topks, + 'channel_num': channel_num}) + return out + + +def var_conv_2d(input, + input_channel, + output_channel, + filter_size, + stride=1, + param_attr=None, + act=None, + dtype='float32', + name=None): + """ + + :param input: + :param input_channel: + :param output_channel: + :param filter_size: + :param stride: + :param param_attr: + :param act: + :param dtype: + :param name: + :return: + """ + helper = LayerHelper('var_conv_2d', **locals()) + x_shape = list(input.shape) + assert len(x_shape) == 2 + + filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') + stride = utils.convert_to_list(stride, 2, 'stride') + + filter_shape = [int(output_channel), + int(input_channel) * filter_size[0] * filter_size[1]] + filter_param = helper.create_parameter( + attr=helper.param_attr, + shape=filter_shape, + dtype=dtype, + ) + + conv_res = helper.create_variable_for_type_inference(dtype) + tmp_res = helper.create_variable_for_type_inference(dtype, stop_gradient=True) + + helper.append_op( + type='var_conv_2d', + inputs={ + 'X': input, + 'W': filter_param, + }, + outputs={"Out": conv_res, "Col": tmp_res}, + attrs={ + 'InputChannel': input_channel, + 'OutputChannel': output_channel, + 'StrideH': stride[0], + 'StrideW': stride[1], + 'KernelH': filter_size[0], + 'KernelW': filter_size[1], + } + ) + + return helper.append_activation(conv_res) + + +def match_matrix_tensor( + input_x, + input_y, + dim_t, + act=None, + param_attr=None, + dtype='float32', + is_test=False, + name=None): + """ + + :param input_x: + :param input_y: + :param dim_t: + :param act: + :param param_attr: + :param dtype: + :param is_test: + :param name: + :return: + """ + helper = LayerHelper('match_matrix_tensor', **locals()) + + x_shape = list(input_x.shape) + y_shape = list(input_y.shape) + assert len(x_shape) == 2 and len(y_shape) == 2 and x_shape[-1] == y_shape[-1] + + weight_shape = [x_shape[-1], dim_t, y_shape[-1]] + w = helper.create_parameter( + attr=helper.param_attr, shape=weight_shape, dtype=dtype, is_bias=False) + mm_res = helper.create_variable_for_type_inference(dtype) + tmp_res = helper.create_variable_for_type_inference(dtype, stop_gradient=True) + helper.append_op( + type='match_matrix_tensor', + inputs={ + 'X': input_x, + 'Y': input_y, + 'W': w, + }, + outputs={"Out": mm_res, "Tmp": tmp_res}, + attrs={'dim_t': dim_t} + ) + + return helper.append_activation(mm_res), tmp_res + + +def search_fc( + input, + size, + param_attr=None, + bias_attr=None, + act=None, + is_test=False, + name=None): + """ + + TODO: + """ + helper = LayerHelper('search_fc', **locals()) + dtype = input.dtype + input_shape = list(input.shape) + assert len(input_shape) == 2 + w_shape = [size, input_shape[1]] + w = helper.create_parameter(attr=param_attr, shape=w_shape, dtype=dtype, is_bias=False) + b_shape = [size] + b = helper.create_parameter(attr=bias_attr, shape=b_shape, dtype=dtype, is_bias=False) + res = helper.create_variable_for_type_inference(dtype) + 
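+    # A minimal FC-style projection: W has shape [size, in_dim] and b has shape
+    # [size], so the op presumably computes Out = X * W^T + b (the exact
+    # behavior is defined by the kernel in search_fc_op.cc).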
helper.append_op( + type='search_fc', + inputs={ + 'X': input, + 'W': w, + 'b': b, + }, + outputs={"Out": res, }, + attrs={'out_size': size, } + ) + + return res + + +def search_seq_fc( + input, + size, + param_attr=None, + bias_attr=None, + act=None, + is_test=False, + name=None): + """ + + TODO: + """ + helper = LayerHelper('search_seq_fc', **locals()) + dtype = input.dtype + input_shape = list(input.shape) + assert len(input_shape) == 2 + w_shape = [size, input_shape[1]] + w = helper.create_parameter(attr=param_attr, shape=w_shape, dtype=dtype, is_bias=False) + input_dict = {'X': input, 'W': w,} + has_bias = False + if bias_attr is not None: + b_shape = [size] + b = helper.create_parameter(attr=bias_attr, shape=b_shape, dtype=dtype, is_bias=False) + input_dict['b'] = b + has_bias = True + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_seq_fc', + inputs=input_dict, + outputs={"Out": res, }, + attrs={'out_size': size, 'has_bias': has_bias} + ) + + return res + + +def search_grnn( + input, + num_input, + num_hidden, + param_attr_in, + param_attr_hidden, + dtype='float32', + is_test=False, + name=None): + """ + + TODO: + """ + + helper = LayerHelper('search_grnn', **locals()) + + input_shape = list(input.shape) + assert len(input_shape) == 2 and input_shape[-1] == num_input + + _cap_h = num_hidden + _cap_e = input_shape[-1] + wi_shape = [3, _cap_h, _cap_e] + wh_shape = [3, _cap_h, _cap_h] + wi = helper.create_parameter( + attr=param_attr_in, shape=wi_shape, dtype=dtype, is_bias=False) + wh = helper.create_parameter( + attr=param_attr_hidden, shape=wh_shape, dtype=dtype, is_bias=False) + + grnn_res = helper.create_variable_for_type_inference(dtype) + grnn_buffer = helper.create_variable_for_type_inference(dtype) + grnn_idx_sorted_by_width = helper.create_variable_for_type_inference(dtype) + grnn_layout_input = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type='search_grnn', + inputs={ + 'X': input, + 'Wi': wi, + 'Wh': wh, + }, + outputs={"Out": grnn_res, + "tmp_buffer": grnn_buffer, + 'idx_sorted_by_width': grnn_idx_sorted_by_width, + 'layout_input': grnn_layout_input + }, + attrs={'num_input': num_input, 'num_hidden': num_hidden} + ) + + return grnn_res + + +def search_embedding( + input, + num_voc, + num_emb, + lr, + param_attr=None, + name=None, + dtype='float32'): + """ + + :param input: + :param num_voc: + :param num_emb: + :param lr: + :param param_attr: + :param name: + :param dtype: + :return: + """ + helper = LayerHelper('search_embedding', **locals()) + + w_shape = [num_voc, num_emb] + w = helper.create_parameter(attr=param_attr, shape=w_shape, dtype=dtype, is_bias=False) + w.stop_gradient = True + + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_embedding', + inputs={ + 'X': input, + 'W': w, + }, + outputs={"Out": res, }, + attrs={'num_voc': num_voc, 'num_emb': num_emb, 'lr': lr, } + ) + + return res + + +def search_seq_arithmetic( + input_x, + input_y, + op_type, + name=None): + """ + :param input_x: + :param input_y: + :param op_type: + :param name: + :return: + """ + helper = LayerHelper('search_seq_arithmetic', **locals()) + dtype = input_x.dtype + + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_seq_arithmetic', + inputs={ + 'X': input_x, + 'Y': input_y, + }, + outputs={"Out": res}, + attrs={'op_type': op_type} + ) + + return res + + +def search_aligned_mat_mul( + input_x, + input_y, + transpose_x, + transpose_y, + alpha, 
+ name=None): + """ + :param input_x: + :param input_y: + :param transpose_x: + :param transpose_y: + :param alpha: + :param name: + :return: + """ + helper = LayerHelper('search_aligned_mat_mul', **locals()) + dtype = input_x.dtype + + out = helper.create_variable_for_type_inference(dtype) + _a_addr = helper.create_variable_for_type_inference(dtype) + _b_addr = helper.create_variable_for_type_inference(dtype) + _c_addr = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_aligned_mat_mul', + inputs={ + 'X': input_x, + 'Y': input_y, + }, + outputs={"Out": out, '_a_addr': _a_addr, '_b_addr': _b_addr, '_c_addr': _c_addr}, + attrs={'transpose_X': transpose_x, 'transpose_Y': transpose_y, + 'alpha': alpha} + ) + + return out + + +def search_attention_padding_mask( + input_x, + input_y, + pad_id, + mask, + name=None): + """ + :param input_x: + :param input_y: + :param pad_id: + :param mask: + :param name: + :return: + """ + helper = LayerHelper('search_attention_padding_mask', **locals()) + dtype = input_x.dtype + + out = helper.create_variable_for_type_inference(dtype) + pad_begin = helper.create_variable_for_type_inference('int') + helper.append_op( + type='search_attention_padding_mask', + inputs={ + 'X': input_x, + 'Y': input_y, + }, + outputs={"Out": out, 'pad_begin': pad_begin}, + attrs={'pad_id': pad_id, 'mask': mask} + ) + + return out + + +def search_group_padding( + input, + pad_id, + name=None): + """ + :param input: + :param pad_id: + :param name: + :return: + """ + helper = LayerHelper('search_group_padding', **locals()) + dtype = input.dtype + + out_emb_padding = helper.create_variable_for_type_inference(dtype) + out_new = helper.create_variable_for_type_inference(dtype, stop_gradient=True) + out_padding = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_group_padding', + inputs={ + 'X': input, + }, + outputs={"Out_emb_padding": out_emb_padding, + 'Out_new': out_new, + 'Out_padding': out_padding, + }, + attrs={'pad_id': pad_id} + ) + + return [out_emb_padding, out_new, out_padding] + + +def search_seq_depadding( + input_pad, + input_src, + name=None): + """ + :param input_pad: + :param input_src: + :param name: + :return: + """ + helper = LayerHelper('search_seq_depadding', **locals()) + dtype = input_pad.dtype + + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_seq_depadding', + inputs={ + 'Pad': input_pad, + 'Src': input_src, + }, + outputs={"Out": out}, + ) + + return out + + +def search_seq_softmax( + input_x, + alg, + name=None): + """ + :param input_x: + :param alg: + :param name: + :return: + """ + helper = LayerHelper('search_seq_softmax', **locals()) + dtype = input_x.dtype + + out = helper.create_variable_for_type_inference(dtype) + out_log = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='search_seq_softmax', + inputs={ + 'X': input_x, + }, + outputs={"Out": out, 'Out_log': out_log}, + attrs={'alg': alg} + ) From b7b8daf3422f84864fc720a04b22995cc4308fb9 Mon Sep 17 00:00:00 2001 From: zhangliujie Date: Thu, 1 Aug 2019 13:23:40 +0800 Subject: [PATCH 2/3] fix noraml init bug --- python/paddle/fluid/initializer.py | 77 ++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 4b9c9d2cd59a94..3c6c30069e5916 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -250,6 +250,83 @@ def __call__(self, var, 
block): return op +class NormalInitializer(Initializer): + """Implements the Random Normal(Gaussian) distribution initializer + Args: + loc (float): mean of the normal distribution + scale (float): standard deviation of the normal distribution + seed (int): random seed + Examples: + .. code-block:: python + import paddle.fluid as fluid + x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) + """ + + def __init__(self, loc=0.0, scale=1.0, seed=0): + assert loc is not None + assert scale is not None + assert seed is not None + super(NormalInitializer, self).__init__() + self._mean = loc + self._std_dev = scale + self._seed = seed + + def __call__(self, var, block): + """Add normal distribution initialization ops for a variable + Args: + var: Variable that needs to be initialized + block: The block in which initialization ops + should be added + Returns: + the initialization op + """ + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + # Initialization Ops should be prepended and not appended + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['gaussian_random', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + + op = block._prepend_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": var.shape, + "dtype": out_dtype, + "mean": self._mean, + "std": self._std_dev, + "seed": self._seed, + "use_mkldnn": False + }, + stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + if not framework.in_dygraph_mode(): + var.op = op + return op + + class UniformInitializer(Initializer): """Implements the random uniform distribution initializer From d3c5bfb4d506013bcf03c63b87fa9c1844888091 Mon Sep 17 00:00:00 2001 From: zhangliujie Date: Thu, 1 Aug 2019 14:41:43 +0800 Subject: [PATCH 3/3] fix backward.py by qiaolongfei --- python/paddle/fluid/backward.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 9fd53a74bf5192..bdd95b529db970 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -232,15 +232,19 @@ def _op_can_be_removed_(op_desc, no_grad_set): for arg in op_desc.input_arg_names(): if core.grad_var_suffix() in arg and arg in no_grad_set: x_in = _strip_grad_suffix_(arg) - x_in_var_desc = op_desc.block().find_var_recursive( - cpt.to_bytes(x_in)) - assert x_in_var_desc is not None, "Variable {} not found".format( - x_in) - dtype = x_in_var_desc.dtype() - to_insert.append( - (_create_op_desc_("fill_zeros_like2", {"X": [x_in]}, - {"Out": [arg]}, {"dtype": dtype}), idx)) + (_create_op_desc_("fill_zeros_like", {"X": [x_in]}, + {"Out": [arg]}, {}), idx)) + + # x_in_var_desc = op_desc.block().find_var_recursive( + # cpt.to_bytes(x_in)) + # assert x_in_var_desc is not None, "Variable {} not found".format( + # x_in) + # dtype = x_in_var_desc.dtype() + + # to_insert.append( + # (_create_op_desc_("fill_zeros_like2", {"X": [x_in]}, + # {"Out": 
[arg]}, {"dtype": dtype}), idx)) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)])