PaddlePaddle
diff --git a/‎examples/machine_translation/transformer/README.md‎
Lines changed: 2 additions & 0 deletions b/‎examples/machine_translation/transformer/README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎examples/machine_translation/transformer/predict.py‎
Lines changed: 8 additions & 12 deletions b/‎examples/machine_translation/transformer/predict.py‎
Lines changed: 8 additions & 12 deletions
diff --git a/‎paddlenlp/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎paddlenlp/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎paddlenlp/ops/CMakeLists.txt‎
Lines changed: 54 additions & 1 deletion b/‎paddlenlp/ops/CMakeLists.txt‎
Lines changed: 54 additions & 1 deletion
diff --git a/‎paddlenlp/ops/ext_utils.py‎
Lines changed: 230 additions & 0 deletions b/‎paddlenlp/ops/ext_utils.py‎
Lines changed: 230 additions & 0 deletions
@@ -99,6 +99,8 @@ python predict.py --config ./configs/transformer.base.yaml
 
  需要注意的是，目前预测仅实现了单卡的预测，原因在于，翻译后面需要的模型评估依赖于预测结果写入文件顺序，多卡情况下，目前暂未支持将结果按照指定顺序写入文件。
 
+ 另外 `predict.py` 中使用的 `TransformerGenerator` 接口对于 GPU 预测将在适配的条件下自动切换到 `FasterTransformer` 预测加速版本（期间会进行 jit 编译），`FasterTransformer` 的更多内容可以参考 `faster_transformer/README.md`。
+
 #### 导出静态图预测模型与预测引擎预测
 
 Transformer 同时提供了将训练的动态图的 checkpoint 转成静态图模型功能，并提供了对应的使用预测引擎进行预测推理的方法。具体的使用方式如下：
 
@@ -7,9 +7,9 @@
 from attrdict import AttrDict
 
 import paddle
+from paddlenlp.ops import TransformerGenerator
 
 import reader
-from paddlenlp.transformers import InferTransformerModel, position_encoding_init
 
 
 def parse_args():
@@ -56,7 +56,9 @@ def do_predict(args):
     test_loader, to_tokens = reader.create_infer_loader(args)
 
     # Define model
-    transformer = InferTransformerModel(
+    # `TransformerGenerator` automatically chooses between `FasterTransformer`
+    # (with jit building) and the slower version `InferTransformerModel`.
+    transformer = TransformerGenerator(
         src_vocab_size=args.src_vocab_size,
         trg_vocab_size=args.trg_vocab_size,
         max_length=args.max_length + 1,
@@ -75,25 +77,19 @@ def do_predict(args):
     assert args.init_from_params, (
         "Please set init_from_params to load the infer model.")
 
-    model_dict = paddle.load(
+    transformer.load(
         os.path.join(args.init_from_params, "transformer.pdparams"))
 
-    # To avoid a longer length than training, reset the size of position
-    # encoding to max_length
-    model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
-        args.max_length + 1, args.d_model)
-    model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
-        args.max_length + 1, args.d_model)
-    transformer.load_dict(model_dict)
-
     # Set evaluate mode
     transformer.eval()
 
     f = open(args.output_file, "w")
     with paddle.no_grad():
         for (src_word, ) in test_loader:
+            # The shape of finished_seq is `[seq_len, batch_size, beam_size]`
+            # when `output_time_major` argument is `True` for TransformerGenerator.
             finished_seq = transformer(src_word=src_word)
-            finished_seq = finished_seq.numpy().transpose([0, 2, 1])
+            finished_seq = finished_seq.numpy().transpose([1, 2, 0])
             for ins in finished_seq:
                 for beam_idx, beam in enumerate(ins):
                     if beam_idx >= args.n_best:
 
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = '2.0.0rc19'
+__version__ = '2.0.0rc19'  # Maybe dev is better
 
 from . import data
 from . import datasets
 
@@ -123,7 +123,60 @@ file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/${THIRD_PARTY_NAME}/CMakeL
 file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/topk_kernels.cu topk_kernels_src)
 file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/${THIRD_PARTY_NAME}/fastertransformer/cuda/topk_kernels.cu topk_kernels_dst)
 
-set(FT_PATCH_COMMAND cp ${allocator_src} ${allocator_dst} | cp ${common_src} ${common_dst} | cp ${cmakelists_src} ${cmakelists_dst} | cp ${topk_kernels_src} ${topk_kernels_dst})
+# TODO(guosheng): `find` seems to fail with the error "missing argument to `-exec'"; fix it.
+set(MUTE_COMMAND grep -rl "printf(\"\\[WARNING\\]" ${CMAKE_BINARY_DIR}/${THIRD_PATH}/${THIRD_PARTY_NAME}/ | xargs -i{} sed -i "s/printf(\"\\WWARNING\\W decoding[^)]\\{1,\\})/ /" {})
+set(FT_PATCH_COMMAND cp ${allocator_src} ${allocator_dst} && cp ${common_src} ${common_dst} && cp ${cmakelists_src} ${cmakelists_dst} && cp ${topk_kernels_src} ${topk_kernels_dst} && ${MUTE_COMMAND})
+
+######################################################################################
+# Detect the compute capability of the GPUs installed on the build machine.
+#
+# A small CUDA program is written to ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu
+# and executed with `nvcc --run`; it prints "<major>.<minor> " for every device
+# whose properties can be queried. The outcome is stored in the internal cache
+# variable `CUDA_gpu_detect_output`, and detection is skipped while that
+# variable is set.
+#
+# Usage:
+#   detect_installed_gpus(out_variable)
+#
+# `out_variable` is set in the caller's scope to the detected arch with the dot
+# removed (e.g. 7.5 -> 75), or to `${paddle_known_gpu_archs}` when detection
+# fails.
+function(detect_installed_gpus out_variable)
+  # Run the detection only when no cached result is available.
+  if(NOT CUDA_gpu_detect_output)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+
+    # Source of the probe program. It exits with -1 when the device count
+    # cannot be queried or no device is present.
+    file(WRITE ${cufile} ""
+      "#include \"stdio.h\"\n"
+      "#include \"cuda.h\"\n"
+      "#include \"cuda_runtime.h\"\n"
+      "int main() {\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device) {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+
+    # Compile and run the probe in one step; stderr is discarded and the
+    # exit code / stdout are captured in nvcc_res / nvcc_out.
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
+                    "--run" "${cufile}"
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(nvcc_res EQUAL 0)
+      # Drop the dots ("7.5" -> "75"), split the output into items and keep
+      # only the last one (the last device's compute capability).
+      string(REGEX REPLACE "\\." "" nvcc_out "${nvcc_out}")
+      string(REGEX MATCHALL "[0-9()]+" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      # Cache the result so later calls skip the probe.
+      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
+    endif()
+  endif()
+
+  # Still nothing cached: the probe failed, so fall back to the list of known
+  # architectures.
+  if(NOT CUDA_gpu_detect_output)
+    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+  else()
+    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+  endif()
+endfunction()
+
+# TODO(guosheng): Remove it if `GetCUDAComputeCapability` is exposed by paddle.
+# Currently, if `CUDA_gpu_detect_output` is not defined, use the detected arch.
+detect_installed_gpus(SM)
 
 ExternalProject_Add(
   extern_${THIRD_PARTY_NAME}
 
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+import subprocess
+import textwrap
+import inspect
+from pathlib import Path
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext
+from distutils.dep_util import newer_group
+
+from paddle.utils.cpp_extension import load_op_meta_info_and_register_op
+from paddle.utils.cpp_extension.extension_utils import _jit_compile, _import_module_from_library
+from paddle.utils.cpp_extension.cpp_extension import (
+    CUDA_HOME, CppExtension, BuildExtension as PaddleBuildExtension)
+from paddlenlp.utils.env import PPNLP_HOME
+from paddlenlp.utils.log import logger
+
+if not os.path.exists(CUDA_HOME):
+    # CUDA_HOME is only None when `core.is_compiled_with_cuda()` is True in
+    # find_cuda_home. Clear it for paddle cpu version.
+    CUDA_HOME = None
+
+
+class CMakeExtension(Extension):
+    def __init__(self, name, source_dir=None):
+        # A CMakeExtension needs a source_dir instead of a file list.
+        Extension.__init__(self, name, sources=[])
+        if source_dir is None:
+            self.source_dir = Path(__file__).parent.resolve()
+        else:
+            self.source_dir = os.path.abspath(os.path.expanduser(source_dir))
+        self.sources = [
+            os.path.join(self.source_dir, f)
+            for f in os.listdir(self.source_dir)
+        ]
+
+    def build_with_command(self, ext_builder):
+        """
+        Custom `build_ext.build_extension` in `Extension` instead of `Command`.
+        `ext_builder` is the instance of `build_ext` command.
+        """
+        # refer to https://github.com/pybind/cmake_example/blob/master/setup.py
+        if ext_builder.compiler.compiler_type == "msvc":
+            raise NotImplementedError
+        cmake_args = getattr(self, "cmake_args", []) + [
+            "-DCMAKE_BUILD_TYPE={}".format("Debug"
+                                           if ext_builder.debug else "Release"),
+            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}".format(ext_builder.build_lib),
+        ]
+        build_args = []
+
+        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
+        # across all generators.
+        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
+            # self.parallel is a Python 3 only way to set parallel jobs by hand
+            # using -j in the build_ext call, not supported by pip or PyPA-build.
+            if hasattr(ext_builder, "parallel") and ext_builder.parallel:
+                # CMake 3.12+ only.
+                build_args += ["-j{}".format(ext_builder.parallel)]
+
+        if not os.path.exists(ext_builder.build_temp):
+            os.makedirs(ext_builder.build_temp)
+
+        # Redirect stdout/stderr to mute, especially when allowing errors
+        stdout = getattr(self, "_std_out_handle", None)
+        subprocess.check_call(
+            ["cmake", self.source_dir] + cmake_args,
+            cwd=ext_builder.build_temp,
+            stdout=stdout,
+            stderr=stdout)
+        subprocess.check_call(
+            ["cmake", "--build", "."] + build_args,
+            cwd=ext_builder.build_temp,
+            stdout=stdout,
+            stderr=stdout)
+
+    def get_target_filename(self):
+        raise NotImplementedError
+
+
+class FasterTransformerExtension(CMakeExtension):
+    def __init__(self, name, source_dir=None):
+        super(FasterTransformerExtension, self).__init__(name, source_dir)
+        self._std_out_handle = None
+        # Env variable may not work as expected, since jit compile by `load`
+        # would not re-built if source code is not update.
+        # self.sm = os.environ.get("PPNLP_GENERATE_CODE", None)
+
+    def build_with_command(self, ext_builder):
+        if CUDA_HOME is None:  # GPU only
+            # TODO(guosheng): should we touch a dummy file or add a quick exit
+            # method to avoid meaningless process in `load`
+            logger.warning(
+                "FasterTransformer is not available because CUDA can not be found."
+            )
+            raise NotImplementedError
+        # TODO(guosheng): Multiple -std seems be passed in FasterTransformer,
+        # which is not allowed by NVCC. Fix it later.
+        self.cmake_args = [f"-DPY_CMD={sys.executable}"]
+        # `GetCUDAComputeCapability` is not exposed yet, and detect CUDA/GPU
+        # version in cmake file.
+        # self.cmake_args += [f"-DSM={self.sm}"] if self.sm is not None else []
+        self.cmake_args = [f"-DWITH_GPT=ON"]
+        try:
+            super(FasterTransformerExtension,
+                  self).build_with_command(ext_builder)
+            # FasterTransformer cmake file resets `CMAKE_LIBRARY_OUTPUT_DIRECTORY`
+            # to `CMAKE_BINARY_DIR/lib`, thus copy the lib back to `build_ext.build_lib`.
+            # Maybe move this copy to CMakeList.
+            # `copy_tree` or `copy_file`, boost lib might be included
+            ext_builder.copy_tree(
+                os.path.join(ext_builder.build_temp, "lib"),
+                ext_builder.build_lib)
+        except Exception as e:
+            logger.warning(
+                "FasterTransformer is not available due to build errors.")
+            raise e
+
+    def get_target_filename(self):
+        # CMake file has fixed the name of lib, maybe we can copy it as the name
+        # returned by `BuildExtension.get_ext_filename` after build.
+        return "libdecoding_op.so"
+
+
+class BuildExtension(PaddleBuildExtension):
+    """
+    Support both `CppExtention` of Paddle and custom extensions of PaddleNLP.
+    """
+
+    def build_extensions(self):
+        custom_exts = []  # for
+        no_custom_exts = []  # for normal extentions paddle.utils.cpp_extension
+        for ext in self.extensions:
+            if hasattr(ext, "build_with_command"):
+                # custom build in Extension
+                ext.build_with_command(self)
+                custom_exts.append(ext)
+            else:
+                no_custom_exts.append(ext)
+        if no_custom_exts:
+            # Build CppExtentio/CUDAExtension with `PaddleBuildExtension`
+            self.extensions = no_custom_exts
+            super(BuildExtension, self).build_extensions()
+        self.extensions = custom_exts + no_custom_exts
+
+
+EXTENSIONS = {"FasterTransformer": FasterTransformerExtension}
+
+
+def get_extension_maker(name):
+    # Use `paddle.utils.cpp_extension.CppExtension` as the default
+    # TODO(guosheng): Maybe register extension classes into `Extensions`.
+    return EXTENSIONS.get(name, CppExtension)
+
+
+def _write_setup_file(name, file_path, build_dir, **kwargs):
+    """
+    Automatically generate setup.py and write it into build directory.
+    `kwargws` is arguments for the corresponding Extension initialization.
+    Any type extension can be jit build.
+    """
+    template = textwrap.dedent("""
+    from setuptools import setup
+    from paddlenlp.ops.ext_utils import get_extension_maker, BuildExtension
+
+    setup(
+        name='{name}',
+        ext_modules=[
+            get_extension_maker('{name}')(
+                name='{name}',
+                {kwargs_str})],
+        cmdclass={{'build_ext' : BuildExtension.with_options(
+            output_dir=r'{build_dir}')
+        }})""").lstrip()
+    kwargs_str = ""
+    for key, value in kwargs.items():
+        kwargs_str += key + "=" + (f"'{value}'"
+                                   if isinstance(value, str) else value) + ","
+    content = template.format(
+        name=name, kwargs_str=kwargs_str, build_dir=build_dir)
+
+    with open(file_path, 'w') as f:
+        f.write(content)
+
+
+def load(name, build_dir=None, force=False, verbose=False, **kwargs):
+    # TODO(guosheng): Need better way to resolve unsupported such as CPU. Currently,
+    # raise NotImplementedError and skip `_jit_compile`. Otherwise, `_jit_compile`
+    # will output the error to stdout (when verbose is True) and raise `RuntimeError`,
+    # which is not friendly for users though no other bad effect.
+    if CUDA_HOME is None:
+        logger.warning("%s is not available because CUDA can not be found." %
+                       name)
+        raise NotImplementedError
+    if build_dir is None:
+        build_dir = os.path.join(PPNLP_HOME, 'extenstions')
+    build_base_dir = os.path.abspath(
+        os.path.expanduser(os.path.join(build_dir, name)))
+    if not os.path.exists(build_base_dir):
+        os.makedirs(build_base_dir)
+
+    extension = get_extension_maker(name)(name, **kwargs)
+    # Check if 'target' is out-of-date with respect to any file to avoid rebuild
+    if isinstance(extension, CMakeExtension):
+        # `CppExtention/CUDAExtension `has version manager by `PaddleBuildExtension`
+        # Maybe move this to CMakeExtension later.
+        # TODO(guosheng): flags/args changes may also trigger build, and maybe
+        # need version manager like `PaddleBuildExtension`.
+        ext_filename = extension.get_target_filename()
+        ext_filepath = os.path.join(build_base_dir, ext_filename)
+        if not force:
+            ext_sources = extension.sources
+            if os.path.exists(ext_filepath) and not newer_group(
+                    ext_sources, ext_filepath, 'newer'):
+                logger.debug("skipping '%s' extension (up-to-date) build" %
+                             name)
+                return load_op_meta_info_and_register_op(ext_filepath)
+
+    # write setup file and jit compile
+    file_path = os.path.join(build_dir, "{}_setup.py".format(name))
+    _write_setup_file(name, file_path, build_base_dir, **kwargs)
+    _jit_compile(file_path, verbose)
+    if isinstance(extension, CMakeExtension):
+        # Load a shared library (if exists) only to register op.
+        if os.path.exists(ext_filepath):
+            load_op_meta_info_and_register_op(ext_filepath)
+    else:
+        # Import as callable python api
+        return _import_module_from_library(name, build_base_dir, verbose)