
Commit 82c32eb

guoshengCS and ZeyuChen authored

Refine FasterTransformer (PaddlePaddle#1122)

* Expose diversity rate. Refine extension utility. Update topk_update in FT.
* Remove duplicate doc for diversity_rate.
* Fix FT jit compiling cmake args.
* Fix sources attribute of FasterTransformerExtension.
* Use UPDATE_COMMAND instead of PATCH_COMMAND to make re-run always use the latest patches. Fix beam_id_in_output calculation in topk_stage_1_opt3.
* Update FT BLEU report in README.
* Fix diversity in beam search when not fusing topK and softmax.

Co-authored-by: Zeyu Chen <[email protected]>

1 parent ec2333e, commit 82c32eb

File tree

8 files changed: +240 additions, −90 deletions

examples/machine_translation/transformer/configs/transformer.base.yaml

Lines changed: 6 additions & 1 deletion

```diff
@@ -69,14 +69,19 @@ label_smooth_eps: 0.1
 # decrease when meeting the end token. However, 'v2' always generates
 # longer results thus might do more calculation and be slower.
 beam_search_version: "v1"
-beam_size: 5
+beam_size: 4
 max_out_len: 256
 # Indicating whether max_out_len in configurations is the length relative to
 # that of source text. Only works in `v2` temporarily.
 use_rel_len: False
 # The power number in length penalty calculation. Only works in `v2` temporarily.
 # Please refer to GNMT <https://arxiv.org/pdf/1609.08144.pdf>.
 alpha: 0.6
+# Refer to `A Simple, Fast Diverse Decoding Algorithm for Neural Generation
+# <https://arxiv.org/abs/1611.08562>`_ for details. Bigger `diversity_rate`
+# would lead to more diversity. if `diversity_rate == 0` is equivalent to naive
+# BeamSearch. **NOTE**: Only works when using FasterTransformer temporarily.
+diversity_rate: 0.0
 # The number of decoded sentences to output.
 n_best: 1
```
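The `diversity_rate` knob exposed above comes from Li & Jurafsky's diverse decoding: within one parent beam, the k-th best candidate expansion is penalized in proportion to its sibling rank, so different parents win the global top-k more often. A minimal pure-Python sketch of the scoring rule (the function name and the rank-from-zero convention are illustrative, not from the repo; the real implementation lives in FasterTransformer's fused CUDA topK kernels):

```python
def diverse_sibling_scores(scores, diversity_rate=0.0):
    """Sibling-rank penalty from Li & Jurafsky (2016).

    `scores` holds one parent beam's candidate log-probs at the current
    step. Candidates are ranked best-to-worst; the k-th ranked candidate
    (k = 0 for the best) has diversity_rate * k subtracted before the
    beam-wide top-k selection.
    """
    order = sorted(range(len(scores)), key=lambda i: -scores[i])
    penalized = list(scores)
    for rank, idx in enumerate(order):
        penalized[idx] = scores[idx] - diversity_rate * rank
    return penalized

# diversity_rate == 0 leaves the scores untouched: naive beam search.
assert diverse_sibling_scores([-0.7, -1.2, -1.6], 0.0) == [-0.7, -1.2, -1.6]
```

With a positive rate, lower-ranked siblings of the same parent are pushed down, which is why larger values yield more diverse n-best lists.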

examples/machine_translation/transformer/configs/transformer.big.yaml

Lines changed: 6 additions & 1 deletion

```diff
@@ -69,14 +69,19 @@ label_smooth_eps: 0.1
 # decrease when meeting the end token. However, 'v2' always generates
 # longer results thus might do more calculation and be slower.
 beam_search_version: "v1"
-beam_size: 5
+beam_size: 4
 max_out_len: 1024
 # Indicating whether max_out_len in configurations is the length relative to
 # that of source text. Only works in `v2` temporarily.
 use_rel_len: False
 # The power number in length penalty calculation. Only works in `v2` temporarily.
 # Please refer to GNMT <https://arxiv.org/pdf/1609.08144.pdf>.
 alpha: 0.6
+# Refer to `A Simple, Fast Diverse Decoding Algorithm for Neural Generation
+# <https://arxiv.org/abs/1611.08562>`_ for details. Bigger `diversity_rate`
+# would lead to more diversity. if `diversity_rate == 0` is equivalent to naive
+# BeamSearch. **NOTE**: Only works when using FasterTransformer temporarily.
+diversity_rate: 0.0
 # The number of decoded sentences to output.
 n_best: 1
```

examples/machine_translation/transformer/faster_transformer/README.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -178,7 +178,7 @@ git clone https://github.com/moses-smt/mosesdecoder.git
 perl mosesdecoder/scripts/generic/multi-bleu.perl ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data/newstest2014.tok.de < predict.tok.txt
 ```
 
-After running the above, you should see output like the following; this is the BLEU result of the base model on newstest2014:
+After running the above, you should see output like the following; this is the BLEU result of the base model on newstest2014 with beam_size 5:
 ```
 BLEU = 26.89, 58.4/32.6/20.5/13.4 (BP=1.000, ratio=1.010, hyp_len=65166, ref_len=64506)
 ```
@@ -300,7 +300,7 @@ git clone https://github.com/moses-smt/mosesdecoder.git
 perl mosesdecoder/scripts/generic/multi-bleu.perl ~/.paddlenlp/datasets/WMT14ende/WMT14.en-de/wmt14_ende_data/newstest2014.tok.de < predict.tok.txt
 ```
 
-After running the above, you should see output like the following; this is the BLEU result of the base model on newstest2014:
+After running the above, you should see output like the following; this is the BLEU result of the base model on newstest2014 with beam_size 5:
 ```
 BLEU = 26.89, 58.4/32.6/20.5/13.4 (BP=1.000, ratio=1.010, hyp_len=65166, ref_len=64506)
 ```
````

examples/machine_translation/transformer/predict.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -110,6 +110,7 @@ def do_predict(args):
         beam_search_version=args.beam_search_version,
         rel_len=args.use_rel_len,  # only works when using FT or beam search v2
         alpha=args.alpha,  # only works when using beam search v2
+        diversity_rate=args.diversity_rate,  # only works when using FT
         use_fp16_decoding=False)  # only works when using FT
 
     # Load the trained model
```

paddlenlp/ops/CMakeLists.txt

Lines changed: 59 additions & 51 deletions

```diff
@@ -75,6 +75,59 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall")
 
+######################################################################################
+# A function for automatic detection of GPUs installed (if autodetection is enabled)
+# Usage:
+#   detect_installed_gpus(out_variable)
+function(detect_installed_gpus out_variable)
+  if(NOT CUDA_gpu_detect_output)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+
+    file(WRITE ${cufile} ""
+      "#include \"stdio.h\"\n"
+      "#include \"cuda.h\"\n"
+      "#include \"cuda_runtime.h\"\n"
+      "int main() {\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device) {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
+                    "--run" "${cufile}"
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(nvcc_res EQUAL 0)
+      # Only use last item of nvcc_out (the last device's compute capability).
+      string(REGEX REPLACE "\\." "" nvcc_out "${nvcc_out}")
+      string(REGEX MATCHALL "[0-9()]+" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
+    endif()
+  endif()
+
+  if(NOT CUDA_gpu_detect_output)
+    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+  else()
+    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+  endif()
+endfunction()
+
+if (NOT SM)
+  # TODO(guosheng): Remove it if `GetCUDAComputeCapability` is exposed by paddle.
+  # Currently, if `CUDA_gpu_detect_output` is not defined, use the detected arch.
+  detect_installed_gpus(SM)
+endif()
+
 if (SM STREQUAL 80 OR
     SM STREQUAL 86 OR
     SM STREQUAL 70 OR
@@ -217,64 +270,19 @@ set(FT_PATCH_COMMAND
     && ${MUTE_COMMAND}
 )
 
-######################################################################################
-# A function for automatic detection of GPUs installed (if autodetection is enabled)
-# Usage:
-#   detect_installed_gpus(out_variable)
-function(detect_installed_gpus out_variable)
-  if(NOT CUDA_gpu_detect_output)
-    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
-
-    file(WRITE ${cufile} ""
-      "#include \"stdio.h\"\n"
-      "#include \"cuda.h\"\n"
-      "#include \"cuda_runtime.h\"\n"
-      "int main() {\n"
-      "  int count = 0;\n"
-      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
-      "  if (count == 0) return -1;\n"
-      "  for (int device = 0; device < count; ++device) {\n"
-      "    cudaDeviceProp prop;\n"
-      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
-      "      printf(\"%d.%d \", prop.major, prop.minor);\n"
-      "  }\n"
-      "  return 0;\n"
-      "}\n")
-
-    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
-                    "--run" "${cufile}"
-                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
-                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
-                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-    if(nvcc_res EQUAL 0)
-      # Only use last item of nvcc_out (the last device's compute capability).
-      string(REGEX REPLACE "\\." "" nvcc_out "${nvcc_out}")
-      string(REGEX MATCHALL "[0-9()]+" nvcc_out "${nvcc_out}")
-      list(GET nvcc_out -1 nvcc_out)
-      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
-    endif()
-  endif()
-
-  if(NOT CUDA_gpu_detect_output)
-    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
-    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
-  else()
-    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
-  endif()
-endfunction()
-
-# TODO(guosheng): Remove it if `GetCUDAComputeCapability` is exposed by paddle.
-# Currently, if `CUDA_gpu_detect_output` is not defined, use the detected arch.
-detect_installed_gpus(SM)
+# TODO(guosheng): Use UPDATE_COMMAND instead of PATCH_COMMAND to make cmake
+# re-run always use the latest patches when the developer changes FT patch codes,
+# all patches rather than the changes would re-build, any better way to do this.
+# Or maybe hidden this function for simplicity.
+set(FT_UPDATE_COMMAND git checkout v3.1 && git checkout . && ${FT_PATCH_COMMAND})
 
 ExternalProject_Add(
     extern_${THIRD_PARTY_NAME}
     GIT_REPOSITORY  https://github.com/NVIDIA/FasterTransformer.git
     GIT_TAG         v3.1
    PREFIX          ${THIRD_PATH}
     SOURCE_DIR      ${THIRD_PATH}/source/${THIRD_PARTY_NAME}
-    PATCH_COMMAND   ${FT_PATCH_COMMAND}
+    UPDATE_COMMAND  ${FT_UPDATE_COMMAND} # PATCH_COMMAND ${FT_PATCH_COMMAND}
     BINARY_DIR      ${THIRD_PATH}/build/${THIRD_PARTY_NAME}
     INSTALL_COMMAND ""
     CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release -DSM=${SM} -DBUILD_PD=ON -DPY_CMD=${PY_CMD} -DON_INFER=${ON_INFER} -DPADDLE_LIB=${PADDLE_LIB} -DWITH_MKL=${WITH_MKL} -DWITH_STATIC_LIB=${WITH_STATIC_LIB}
```
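The `detect_installed_gpus` function above compiles and runs a small CUDA probe via `nvcc --run`, then squeezes the probe's stdout (e.g. `7.0 8.0 `) down to a single SM value: strip the dots, keep only the last device's capability. The string post-processing can be sketched in Python (a hypothetical helper, shown only to make the CMake regex pipeline concrete):

```python
def parse_sm(nvcc_out: str) -> str:
    """Mimic the CMake pipeline: for probe output like "6.1 7.5 ",
    drop the dots from each compute capability and return the last
    device's value, e.g. "75"."""
    caps = nvcc_out.strip().split()
    if not caps:
        # Mirrors the CMake fallback branch: detection failed.
        raise ValueError("Automatic GPU detection failed.")
    return caps[-1].replace(".", "")

assert parse_sm("6.1 7.5 ") == "75"
```

Taking the last device only is a deliberate simplification in the commit (noted in its own comment); multi-GPU hosts with mixed architectures would build for just one of them.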

paddlenlp/ops/ext_utils.py

Lines changed: 25 additions & 6 deletions

```diff
@@ -35,6 +35,20 @@
     CUDA_HOME = None
 
 
+def _get_files(path):
+    """
+    Helps to list all files under the given path.
+    """
+    if os.path.isfile(path):
+        return [path]
+    all_files = []
+    for root, _dirs, files in os.walk(path, followlinks=True):
+        for file in files:
+            file = os.path.join(root, file)
+            all_files.append(file)
+    return all_files
+
+
 class CMakeExtension(Extension):
     def __init__(self, name, source_dir=None):
         # A CMakeExtension needs a source_dir instead of a file list.
@@ -43,10 +57,7 @@ def __init__(self, name, source_dir=None):
             self.source_dir = str(Path(__file__).parent.resolve())
         else:
             self.source_dir = os.path.abspath(os.path.expanduser(source_dir))
-        self.sources = [
-            os.path.join(self.source_dir, f)
-            for f in os.listdir(self.source_dir)
-        ]
+        self.sources = _get_files(self.source_dir)
 
     def build_with_command(self, ext_builder):
         """
@@ -95,6 +106,10 @@ def get_target_filename(self):
 class FasterTransformerExtension(CMakeExtension):
     def __init__(self, name, source_dir=None):
         super(FasterTransformerExtension, self).__init__(name, source_dir)
+        self.sources = _get_files(
+            os.path.
+            join(self.source_dir, "faster_transformer", "src")) + _get_files(
+                os.path.join(self.source_dir, "patches", "FasterTransformer"))
         self._std_out_handle = None
         # Env variable may not work as expected, since jit compile by `load`
         # would not re-built if source code is not update.
@@ -114,7 +129,7 @@ def build_with_command(self, ext_builder):
         # `GetCUDAComputeCapability` is not exposed yet, and detect CUDA/GPU
         # version in cmake file.
         # self.cmake_args += [f"-DSM={self.sm}"] if self.sm is not None else []
-        self.cmake_args = [f"-DWITH_GPT=ON"]
+        self.cmake_args += [f"-DWITH_GPT=ON"]
         try:
             super(FasterTransformerExtension,
                   self).build_with_command(ext_builder)
@@ -207,7 +222,11 @@ def load(name, build_dir=None, force=False, verbose=False, **kwargs):
                 name)
             raise NotImplementedError
         if build_dir is None:
-            build_dir = os.path.join(PPNLP_HOME, 'extenstions')
+            # Maybe under package dir is better to avoid cmake source path conflict
+            # with different source path.
+            # build_dir = os.path.join(PPNLP_HOME, 'extenstions')
+            build_dir = os.path.join(
+                str(Path(__file__).parent.resolve()), 'extenstions')
         build_base_dir = os.path.abspath(
             os.path.expanduser(os.path.join(build_dir, name)))
         if not os.path.exists(build_base_dir):
```

paddlenlp/ops/faster_transformer/transformer/faster_transformer.py

Lines changed: 19 additions & 1 deletion

```diff
@@ -92,7 +92,10 @@ class FasterTransformer(TransformerModel):
         max_out_len (int, optional):
             The maximum output length. Defaults to 256.
         diversity_rate (float, optional):
-            The diversity rate for beam search. Defaults to 0.0.
+            Refer to `A Simple, Fast Diverse Decoding Algorithm for Neural Generation <https://arxiv.org/abs/1611.08562>`_
+            for details. Bigger `diversity_rate` would lead to more diversity.
+            if `diversity_rate == 0` is equivalent to naive BeamSearch. Default
+            to 0 if not set.
         use_fp16_decoding(bool, optional): Whether to use fp16 for decoding.
         rel_len(bool, optional):
             Indicating whether `max_out_len` in is the length relative to that
@@ -458,6 +461,13 @@ class TransformerGenerator(paddle.nn.Layer):
         - `alpha(float, optional)`: The power number in length penalty
           calculation. Refer to `GNMT <https://arxiv.org/pdf/1609.08144.pdf>`_.
           Only works in `v2` temporarily. Default to 0.6 if not set.
+
+        - diversity_rate(float, optional): Refer to `A Simple, Fast Diverse
+          Decoding Algorithm for Neural Generation <https://arxiv.org/abs/1611.08562>`_
+          for details. Bigger `diversity_rate` would lead to more diversity.
+          if `diversity_rate == 0` is equivalent to naive BeamSearch. Default
+          to 0 if not set. **NOTE**: Only works when using FasterTransformer
+          temporarily.
     """
 
     def __init__(self,
@@ -524,6 +534,10 @@ def __init__(self,
                 logger.warning(
                     "Exception occurs when using Faster Transformer. " \
                     "The original forward will be involved. ")
+                if diversity_rate != 0:
+                    logger.warning(
+                        "diversity_rate would not work since it is only " \
+                        "supported by FasterTransformer temporarily.")
                 self.transformer = InferTransformerModel(
                     src_vocab_size=src_vocab_size,
                     trg_vocab_size=trg_vocab_size,
@@ -544,6 +558,10 @@ def __init__(self,
                     rel_len=rel_len,
                     alpha=alpha)
         else:
+            if diversity_rate != 0:
+                logger.warning(
+                    "diversity_rate would not work since it is only " \
+                    "supported by FasterTransformer temporarily.")
             self.transformer = InferTransformerModel(
                 src_vocab_size=src_vocab_size,
                 trg_vocab_size=trg_vocab_size,
```
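Both warning sites added here follow one pattern: when an option honored only by the fused FasterTransformer path is set but the pure-Paddle `InferTransformerModel` fallback will actually run, log a warning rather than ignore the option silently. A generic sketch of that guard (names are hypothetical, not from the repo):

```python
import logging

logger = logging.getLogger("example")

def warn_ft_only_option(using_ft: bool, diversity_rate: float = 0.0) -> bool:
    """Return True (after logging) if an FT-only option would be
    silently ignored by the fallback decoding path."""
    if not using_ft and diversity_rate != 0:
        logger.warning(
            "diversity_rate would not work since it is only "
            "supported by FasterTransformer temporarily.")
        return True
    return False

# Fallback path with a non-default diversity_rate triggers the warning.
assert warn_ft_only_option(using_ft=False, diversity_rate=0.1) is True
assert warn_ft_only_option(using_ft=True, diversity_rate=0.1) is False
```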
