Commit 370adad

Add unified transformer decoding beam search and sampling (#739)
* add unified transformer decoding beam search and sampling
1 parent f753bb1 commit 370adad

11 files changed: +2721 −11 lines


paddlenlp/ops/CMakeLists.txt

Lines changed: 31 additions & 3 deletions
@@ -25,6 +25,7 @@ option(WITH_GPU "Compile with GPU/CPU, default use CPU."
 option(USE_TENSORRT "Compile with TensorRT." OFF)
 option(WITH_TRANSFORMER "Compile with Transformer" ON)
 option(WITH_GPT "Compile with GPT" OFF)
+option(WITH_UNIFIED "Compile with Unified Transformer" ON)
 
 if(NOT WITH_GPU)
   message(FATAL_ERROR "Faster transformer custom op doesn't support CPU. Please add the flag -DWITH_GPU=ON to use GPU. ")
@@ -38,6 +39,10 @@ if(WITH_GPT)
   list(APPEND decoding_op_files fusion_gpt_op.cc fusion_gpt_op.cu)
 endif()
 
+if(WITH_UNIFIED)
+  list(APPEND decoding_op_files fusion_unified_decoding_op.cc fusion_unified_decoding_op.cu)
+endif()
+
 if(NOT WITH_TRANSFORMER AND NOT WITH_GPT)
   message(FATAL_ERROR "-DWITH_TRANSFORMER=ON or/and -DWITH_GPT=ON must be set to use FasterTransformer. ")
 endif()
@@ -124,15 +129,38 @@ file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/cuda/topk_kernel
 file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/topk_kernels.cu topk_kernels_dst)
 
 file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/open_decoder.cu open_decoder_cu_dst)
-file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/open_decoder.h open_decoder_header_dst)
+file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/open_decoder.h open_decoder_h_dst)
+
+file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/cuda_kernels.h cuda_kernels_h_dst)
+file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/decoding_kernels.cu decoding_kernels_cu_dst)
 
 file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/cuda/transformer_decoder.cu trans_decoder_cu_src)
-file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/transformer_decoder.h trans_decoder_header_src)
+file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/transformer_decoder.h trans_decoder_h_src)
+
+file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/cuda/transformer_cuda_kernels.h cuda_kernels_h_src)
+file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/cuda/transformer_decoding_kernels.cu decoding_kernels_cu_src)
+
+file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/transformer_beamsearch.h beamsearch_h_src)
+file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/transformer_sampling.h sampling_h_src)
+file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/arguments.h arguments_h_src)
 set(trans_dst ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/)
 
 # TODO(guosheng): `find` seems meeting errors missing argument to `-exec', fix it
 set(MUTE_COMMAND grep -rl "printf(\"\\[WARNING\\]" ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/ | xargs -i{} sed -i "s/printf(\"\\WWARNING\\W decoding[^)]\\{1,\\})/ /" {})
-set(FT_PATCH_COMMAND cp ${allocator_src} ${allocator_dst} && cp ${common_src} ${common_dst} && cp ${cmakelists_src} ${cmakelists_dst} && cp ${topk_kernels_src} ${topk_kernels_dst} && cat ${trans_decoder_cu_src} >> ${open_decoder_cu_dst} && cat ${trans_decoder_header_src} >> ${open_decoder_header_dst} && ${MUTE_COMMAND})
+set(FT_PATCH_COMMAND
+    cp ${allocator_src} ${allocator_dst}
+    && cp ${common_src} ${common_dst}
+    && cp ${cmakelists_src} ${cmakelists_dst}
+    && cp ${topk_kernels_src} ${topk_kernels_dst}
+    && cp ${beamsearch_h_src} ${trans_dst}
+    && cp ${sampling_h_src} ${trans_dst}
+    && cp ${arguments_h_src} ${trans_dst}
+    && cat ${trans_decoder_cu_src} >> ${open_decoder_cu_dst}
+    && cat ${trans_decoder_h_src} >> ${open_decoder_h_dst}
+    && cat ${cuda_kernels_h_src} >> ${cuda_kernels_h_dst}
+    && cat ${decoding_kernels_cu_src} >> ${decoding_kernels_cu_dst}
+    && ${MUTE_COMMAND}
+    )
 
 ######################################################################################
 # A function for automatic detection of GPUs installed (if autodetection is enabled)

paddlenlp/ops/faster_transformer/src/demo/gpt.cc

Lines changed: 0 additions & 8 deletions
@@ -84,7 +84,6 @@ bool get_result_tensor(const std::unique_ptr<paddle_infer::Tensor>& seq_ids,
 
     for (int i = 0; i < tmp_result_q.length(); ++i) {
       char32_t tmp = tmp_result_q[i];
-      // std::cout << tmp << std::endl;
       if (byte_decoder.find(tmp) != byte_decoder.end()) {
         dataresultvec[bsz].result_q = dataresultvec[bsz].result_q +
                                       static_cast<wchar_t>(byte_decoder[tmp]);
@@ -126,13 +125,6 @@ std::unordered_map<char32_t, int> convert_unicode() {
     }
   }
 
-  // std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv32;
-  // for (int i=0; i<256; ++i) {
-  //   std::cout << "=====" << std::endl;
-  //   std::cout << conv32.to_bytes(cs[i]) << std::endl;
-  //   std::cout << bs[i] << std::endl;
-  // }
-
   return ret;
 }
 

fusion_unified_decoding_op.cc (new file)

Lines changed: 311 additions & 0 deletions
@@ -0,0 +1,311 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>

#include "fusion_unified_decoding_op.h"
#include "pd_traits.h"


std::vector<paddle::Tensor> UnifiedDecodingForward(
    const std::vector<paddle::Tensor>& cache_k,
    const std::vector<paddle::Tensor>& cache_v,
    const paddle::Tensor& mem_seq_len,
    const paddle::Tensor& logits_mask,
    const paddle::Tensor& word_embedding,
    const std::vector<paddle::Tensor>& self_ln_weight,
    const std::vector<paddle::Tensor>& self_ln_bias,
    const std::vector<paddle::Tensor>& self_q_weight,
    const std::vector<paddle::Tensor>& self_q_bias,
    const std::vector<paddle::Tensor>& self_k_weight,
    const std::vector<paddle::Tensor>& self_k_bias,
    const std::vector<paddle::Tensor>& self_v_weight,
    const std::vector<paddle::Tensor>& self_v_bias,
    const std::vector<paddle::Tensor>& self_out_weight,
    const std::vector<paddle::Tensor>& self_out_bias,
    const std::vector<paddle::Tensor>& ffn_ln_weight,
    const std::vector<paddle::Tensor>& ffn_ln_bias,
    const std::vector<paddle::Tensor>& ffn_inter_weight,
    const std::vector<paddle::Tensor>& ffn_inter_bias,
    const std::vector<paddle::Tensor>& ffn_out_weight,
    const std::vector<paddle::Tensor>& ffn_out_bias,
    const paddle::Tensor& decoder_ln_weight,
    const paddle::Tensor& decoder_ln_bias,
    const paddle::Tensor& trans_weight,
    const paddle::Tensor& trans_bias,
    const paddle::Tensor& lm_ln_weight,
    const paddle::Tensor& lm_ln_bias,
    const paddle::Tensor& embedding_weight,
    const paddle::Tensor& embedding_bias,
    const paddle::Tensor& positional_embedding_weight,
    const paddle::Tensor& type_embedding_weight,
    const std::string& decoding_strategy,
    const int& beam_size,
    const int& topk,
    const float& topp,
    const int& n_head,
    const int& size_per_head,
    const int& num_layer,
    const int& bos_id,
    const int& eos_id,
    const int64_t& max_len,
    const float& beam_search_diversity_rate,
    const int& type_id,
    const int& unk_id,
    const int& mask_id,
    const float& temperature,
    const float& len_penalty) {
  int batch_size = cache_k[0].shape()[0];

  std::vector<int64_t> output_dims;
  std::vector<int64_t> parent_ids_dims;
  std::vector<int64_t> sequence_length_dims({batch_size});
  if (decoding_strategy == "beam_search") {
    if (batch_size != -1) {
      batch_size /= beam_size;
    }
    output_dims = {max_len, batch_size, beam_size};
    parent_ids_dims = output_dims;
  } else if (decoding_strategy == "topk_sampling" ||
             decoding_strategy == "topp_sampling") {
    output_dims = {max_len, batch_size};
    parent_ids_dims = {1};
  } else {
    PD_THROW("Not supported decoding strategy. ");
  }
  auto output_ids = paddle::Tensor(cache_k[0].place(), output_dims);
  auto parent_ids = paddle::Tensor(cache_k[0].place(), parent_ids_dims);
  auto sequence_length =
      paddle::Tensor(cache_k[0].place(), sequence_length_dims);

  if (cache_k[0].place() == paddle::PlaceType::kGPU) {
    auto sequence_length = paddle::Tensor(paddle::PlaceType::kGPU);

    if (mem_seq_len.place() != paddle::PlaceType::kGPU) {
      sequence_length = mem_seq_len.copy_to<int>(paddle::PlaceType::kGPU);
    } else {
      sequence_length = mem_seq_len;
    }

    return UnifiedDecodingCUDAForward(cache_k,
                                      cache_v,
                                      sequence_length,
                                      logits_mask,
                                      word_embedding,
                                      self_ln_weight,
                                      self_ln_bias,
                                      self_q_weight,
                                      self_q_bias,
                                      self_k_weight,
                                      self_k_bias,
                                      self_v_weight,
                                      self_v_bias,
                                      self_out_weight,
                                      self_out_bias,
                                      ffn_ln_weight,
                                      ffn_ln_bias,
                                      ffn_inter_weight,
                                      ffn_inter_bias,
                                      ffn_out_weight,
                                      ffn_out_bias,
                                      decoder_ln_weight,
                                      decoder_ln_bias,
                                      trans_weight,
                                      trans_bias,
                                      lm_ln_weight,
                                      lm_ln_bias,
                                      embedding_weight,
                                      embedding_bias,
                                      positional_embedding_weight,
                                      type_embedding_weight,
                                      output_ids,
                                      parent_ids,
                                      sequence_length,
                                      decoding_strategy,
                                      beam_size,
                                      topk,
                                      topp,
                                      n_head,
                                      size_per_head,
                                      num_layer,
                                      bos_id,
                                      eos_id,
                                      max_len,
                                      beam_search_diversity_rate,
                                      type_id,
                                      unk_id,
                                      mask_id,
                                      temperature,
                                      len_penalty);
  } else {
    PD_THROW("Not implemented place. Only GPU is supported. ");
  }
}

std::vector<std::vector<int64_t>> UnifiedDecodingInferShape(
    const std::vector<std::vector<int64_t>>& cache_k_shapes,
    const std::vector<std::vector<int64_t>>& cache_v_shapes,
    const std::vector<int64_t>& mem_seq_len_shape,
    const std::vector<int64_t>& logits_mask_shape,
    const std::vector<int64_t>& word_embedding_shape,
    const std::vector<std::vector<int64_t>>& self_ln_weight_shapes,
    const std::vector<std::vector<int64_t>>& self_ln_bias_shapes,
    const std::vector<std::vector<int64_t>>& self_q_weight_shapes,
    const std::vector<std::vector<int64_t>>& self_q_bias_shapes,
    const std::vector<std::vector<int64_t>>& self_k_weight_shapes,
    const std::vector<std::vector<int64_t>>& self_k_bias_shapes,
    const std::vector<std::vector<int64_t>>& self_v_weight_shapes,
    const std::vector<std::vector<int64_t>>& self_v_bias_shapes,
    const std::vector<std::vector<int64_t>>& self_out_weight_shapes,
    const std::vector<std::vector<int64_t>>& self_out_bias_shapes,
    const std::vector<std::vector<int64_t>>& ffn_ln_weight_shapes,
    const std::vector<std::vector<int64_t>>& ffn_ln_bias_shapes,
    const std::vector<std::vector<int64_t>>& ffn_inter_weight_shapes,
    const std::vector<std::vector<int64_t>>& ffn_inter_bias_shapes,
    const std::vector<std::vector<int64_t>>& ffn_out_weight_shapes,
    const std::vector<std::vector<int64_t>>& ffn_out_bias_shapes,
    const std::vector<int64_t>& decoder_ln_weight_shape,
    const std::vector<int64_t>& decoder_ln_bias_shape,
    const std::vector<int64_t>& trans_weight_shape,
    const std::vector<int64_t>& trans_bias_shape,
    const std::vector<int64_t>& lm_ln_weight_shape,
    const std::vector<int64_t>& lm_ln_bias_shape,
    const std::vector<int64_t>& embedding_weight_shape,
    const std::vector<int64_t>& embedding_bias_shape,
    const std::vector<int64_t>& positional_embedding_weight_shape,
    const std::vector<int64_t>& type_embedding_weight_shape,
    const std::string& decoding_strategy,
    const int& beam_size,
    const int& topk,
    const float& topp,
    const int& n_head,
    const int& size_per_head,
    const int& num_layer,
    const int& bos_id,
    const int& eos_id,
    const int64_t& max_len,
    const float& beam_search_diversity_rate,
    const int& type_id,
    const int& unk_id,
    const int& mask_id,
    const float& temperature,
    const float& len_penalty) {
  int batch_size = cache_k_shapes[0][0];

  std::vector<int64_t> output_dims;
  std::vector<int64_t> sequence_length_dims({batch_size});
  if (decoding_strategy == "beam_search") {
    if (batch_size != -1) {
      batch_size /= beam_size;
    }
    output_dims = {max_len, batch_size, beam_size};
    return {output_dims, output_dims, sequence_length_dims};
  } else if (decoding_strategy == "topk_sampling" ||
             decoding_strategy == "topp_sampling") {
    output_dims = {max_len, batch_size};
    return {output_dims, {1}, sequence_length_dims};
  } else {
    PD_THROW("Not supported decoding strategy. ");
  }
}

std::vector<paddle::DataType> UnifiedDecodingInferDtype(
    const std::vector<paddle::DataType>& cache_k,
    const std::vector<paddle::DataType>& cache_v,
    const paddle::DataType& mem_seq_len,
    const paddle::DataType& logits_mask,
    const paddle::DataType& word_embedding,
    const std::vector<paddle::DataType>& self_ln_weight,
    const std::vector<paddle::DataType>& self_ln_bias,
    const std::vector<paddle::DataType>& self_q_weight,
    const std::vector<paddle::DataType>& self_q_bias,
    const std::vector<paddle::DataType>& self_k_weight,
    const std::vector<paddle::DataType>& self_k_bias,
    const std::vector<paddle::DataType>& self_v_weight,
    const std::vector<paddle::DataType>& self_v_bias,
    const std::vector<paddle::DataType>& self_out_weight,
    const std::vector<paddle::DataType>& self_out_bias,
    const std::vector<paddle::DataType>& ffn_ln_weight,
    const std::vector<paddle::DataType>& ffn_ln_bias,
    const std::vector<paddle::DataType>& ffn_inter_weight,
    const std::vector<paddle::DataType>& ffn_inter_bias,
    const std::vector<paddle::DataType>& ffn_out_weight,
    const std::vector<paddle::DataType>& ffn_out_bias,
    const paddle::DataType& decoder_ln_weight,
    const paddle::DataType& decoder_ln_bias,
    const paddle::DataType& trans_weight,
    const paddle::DataType& trans_bias,
    const paddle::DataType& lm_ln_weight,
    const paddle::DataType& lm_ln_bias,
    const paddle::DataType& embedding_weight,
    const paddle::DataType& embedding_bias,
    const paddle::DataType& positional_embedding_weight,
    const paddle::DataType& type_embedding_weight) {
  return {paddle::DataType::INT32,
          paddle::DataType::INT32,
          paddle::DataType::INT32};
}

PD_BUILD_OP(fusion_unified_decoding)
    .Inputs({paddle::Vec("CacheK"),
             paddle::Vec("CacheV"),
             "MemSeqLen",
             "LogitsMask",
             "WordEmbedding",
             paddle::Vec("SelfLayernormWeight"),
             paddle::Vec("SelfLayernormBias"),
             paddle::Vec("SelfQueryWeight"),
             paddle::Vec("SelfQueryBias"),
             paddle::Vec("SelfKeyWeight"),
             paddle::Vec("SelfKeyBias"),
             paddle::Vec("SelfValueWeight"),
             paddle::Vec("SelfValueBias"),
             paddle::Vec("SelfOutWeight"),
             paddle::Vec("SelfOutBias"),
             paddle::Vec("FFNLayernormWeight"),
             paddle::Vec("FFNLayernormBias"),
             paddle::Vec("FFNInterWeight"),
             paddle::Vec("FFNInterBias"),
             paddle::Vec("FFNOutWeight"),
             paddle::Vec("FFNOutBias"),
             "DecoderLayernormWeight",
             "DecoderLayernormBias",
             "TransWeight",
             "TransBias",
             "LMLayernormWeight",
             "LMLayernormBias",
             "EmbWeight",
             "EmbBias",
             "PositionEncEmb",
             "TypeEmb"})
    .Outputs({"OutputIds", "ParentIds", "SequenceLength"})
    .Attrs({"decoding_strategy: std::string",
            "beam_size: int",
            "topk: int",
            "topp: float",
            "n_head: int",
            "size_per_head: int",
            "num_layer: int",
            "bos_id: int",
            "eos_id: int",
            "max_len: int64_t",
            "beam_search_diversity_rate: float",
            "type_id: int",
            "unk_id: int",
            "mask_id: int",
            "temperature: float",
            "len_penalty: float"})
    .SetKernelFn(PD_KERNEL(UnifiedDecodingForward))
    .SetInferShapeFn(PD_INFER_SHAPE(UnifiedDecodingInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(UnifiedDecodingInferDtype));
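
At the op interface level, the three decoding strategies registered above differ only in the shapes of the OutputIds and ParentIds outputs. As a reading aid (not part of the commit), here is a minimal standalone C++ sketch that mirrors the shape rules of UnifiedDecodingInferShape; the function name ExpectedOutputDims and the sample numbers below are illustrative only:

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Returns {OutputIds dims, ParentIds dims, SequenceLength dims}, mirroring
// UnifiedDecodingInferShape above. batch_size is the leading dimension of
// CacheK[0]; for beam search it is expected to already be batch * beam_size.
std::vector<std::vector<int64_t>> ExpectedOutputDims(
    const std::string& decoding_strategy,
    int64_t batch_size,
    int64_t beam_size,
    int64_t max_len) {
  std::vector<int64_t> sequence_length_dims({batch_size});
  if (decoding_strategy == "beam_search") {
    // The incoming cache is tiled per beam, so the real batch is batch / beam.
    if (batch_size != -1) {
      batch_size /= beam_size;
    }
    std::vector<int64_t> output_dims({max_len, batch_size, beam_size});
    return {output_dims, output_dims, sequence_length_dims};
  } else if (decoding_strategy == "topk_sampling" ||
             decoding_strategy == "topp_sampling") {
    // Sampling keeps one hypothesis per example; ParentIds degenerates to {1}.
    return {{max_len, batch_size}, {1}, sequence_length_dims};
  }
  throw std::invalid_argument("Not supported decoding strategy.");
}

int main() {
  // 4 examples with beam_size 4 (cache batch dim 16) and max_len 64:
  // OutputIds and ParentIds come out as {64, 4, 4}; SequenceLength as {16}.
  for (const auto& dims : ExpectedOutputDims("beam_search", 16, 4, 64)) {
    for (int64_t d : dims) std::cout << d << ' ';
    std::cout << '\n';
  }
  return 0;
}

For beam_search, ParentIds records the selected beam index at each step so finished hypotheses can be back-traced after decoding, which is why it shares the OutputIds shape; for topk_sampling and topp_sampling there is a single hypothesis per example and ParentIds is only a placeholder.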
