Skip to content

Commit 8b8a07c

Browse files
authored
Add setup build for external ops. (#302)
* Add setup for custom_op. * Add setup build for external ops. * Add setup tmp commit. * Complete CMakeExtension, BuildExtension and load. * Add files in paddle.ops into package data. * Add jit compile sample for FasterTransformer. * Fix sources of CMakeExtension. * Remove the assert for decoding_lib argument. * Add patch command to mute FasterTransformer warnings. * Add cuda arch detection in cmake. * Add TransformerGenerator api. * Add docs for TransformerGenerator.forward * Add more package_data. * Remove print in transformer predict. * Add WITH_GPT=ON for jit build. * Fix FasterTransformerExtension break on CPU.
1 parent f2dbd81 commit 8b8a07c

File tree

9 files changed

+531
-41
lines changed

9 files changed

+531
-41
lines changed

examples/machine_translation/transformer/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ python predict.py --config ./configs/transformer.base.yaml
9999

100100
需要注意的是,目前预测仅实现了单卡的预测,原因在于,翻译后面需要的模型评估依赖于预测结果写入文件顺序,多卡情况下,目前暂未支持将结果按照指定顺序写入文件。
101101

102+
另外 `predict.py` 中使用的 `TransformerGenerator` 接口对于GPU预测将在适配的条件下自动切换到 `FasterTransformer` 预测加速版本(期间会进行jit编译), `FasterTransformer`的更多内容可以参考 `faster_transformer/README.md`
103+
102104
#### 导出静态图预测模型与预测引擎预测
103105

104106
Transformer 同时提供了将训练的动态图的 checkpoint 转成静态图模型功能,并提供了对应的使用预测引擎进行预测推理的方法。具体的使用方式如下:

examples/machine_translation/transformer/predict.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
from attrdict import AttrDict
88

99
import paddle
10+
from paddlenlp.ops import TransformerGenerator
1011

1112
import reader
12-
from paddlenlp.transformers import InferTransformerModel, position_encoding_init
1313

1414

1515
def parse_args():
@@ -56,7 +56,9 @@ def do_predict(args):
5656
test_loader, to_tokens = reader.create_infer_loader(args)
5757

5858
# Define model
59-
transformer = InferTransformerModel(
59+
# `TransformerGenerator` automatically chooses to use `FasterTransformer`
60+
# (with jit building) or the slower version `InferTransformerModel`.
61+
transformer = TransformerGenerator(
6062
src_vocab_size=args.src_vocab_size,
6163
trg_vocab_size=args.trg_vocab_size,
6264
max_length=args.max_length + 1,
@@ -75,25 +77,19 @@ def do_predict(args):
7577
assert args.init_from_params, (
7678
"Please set init_from_params to load the infer model.")
7779

78-
model_dict = paddle.load(
80+
transformer.load(
7981
os.path.join(args.init_from_params, "transformer.pdparams"))
8082

81-
# To avoid a longer length than training, reset the size of position
82-
# encoding to max_length
83-
model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
84-
args.max_length + 1, args.d_model)
85-
model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
86-
args.max_length + 1, args.d_model)
87-
transformer.load_dict(model_dict)
88-
8983
# Set evaluate mode
9084
transformer.eval()
9185

9286
f = open(args.output_file, "w")
9387
with paddle.no_grad():
9488
for (src_word, ) in test_loader:
89+
# The shape of finished_seq is `[seq_len, batch_size, beam_size]`
90+
# when `output_time_major` argument is `True` for TransformerGenerator.
9591
finished_seq = transformer(src_word=src_word)
96-
finished_seq = finished_seq.numpy().transpose([0, 2, 1])
92+
finished_seq = finished_seq.numpy().transpose([1, 2, 0])
9793
for ins in finished_seq:
9894
for beam_idx, beam in enumerate(ins):
9995
if beam_idx >= args.n_best:

paddlenlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
__version__ = '2.0.0rc19'
15+
__version__ = '2.0.0rc19' # Maybe dev is better
1616

1717
from . import data
1818
from . import datasets

paddlenlp/ops/CMakeLists.txt

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,60 @@ file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/${THIRD_PARTY_NAME}/CMakeL
123123
file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/topk_kernels.cu topk_kernels_src)
124124
file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/${THIRD_PARTY_NAME}/fastertransformer/cuda/topk_kernels.cu topk_kernels_dst)
125125

126-
set(FT_PATCH_COMMAND cp ${allocator_src} ${allocator_dst} | cp ${common_src} ${common_dst} | cp ${cmakelists_src} ${cmakelists_dst} | cp ${topk_kernels_src} ${topk_kernels_dst})
126+
# TODO(guosheng): `find` seems to fail with the error "missing argument to `-exec'"; fix it.
127+
set(MUTE_COMMAND grep -rl "printf(\"\\[WARNING\\]" ${CMAKE_BINARY_DIR}/${THIRD_PATH}/${THIRD_PARTY_NAME}/ | xargs -i{} sed -i "s/printf(\"\\WWARNING\\W decoding[^)]\\{1,\\})/ /" {})
128+
set(FT_PATCH_COMMAND cp ${allocator_src} ${allocator_dst} && cp ${common_src} ${common_dst} && cp ${cmakelists_src} ${cmakelists_dst} && cp ${topk_kernels_src} ${topk_kernels_dst} && ${MUTE_COMMAND})
129+
130+
######################################################################################
131+
# A function for automatic detection of GPUs installed (if autodetection is enabled)
132+
# Usage:
133+
# detect_installed_gpus(out_variable)
134+
function(detect_installed_gpus out_variable)
# Detect the compute capability of an installed GPU by compiling and running
# a small CUDA program with nvcc, then store the result in `out_variable` in
# the caller's scope (PARENT_SCOPE).
135+
# Skip the detection when `CUDA_gpu_detect_output` is already set (it is
# stored as an INTERNAL cache entry below).
if(NOT CUDA_gpu_detect_output)
136+
set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
137+
138+
# Generate a program that prints "<major>.<minor> " for every device whose
# properties can be queried; it returns -1 when the device count cannot be
# obtained or is zero.
file(WRITE ${cufile} ""
139+
"#include \"stdio.h\"\n"
140+
"#include \"cuda.h\"\n"
141+
"#include \"cuda_runtime.h\"\n"
142+
"int main() {\n"
143+
" int count = 0;\n"
144+
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
145+
" if (count == 0) return -1;\n"
146+
" for (int device = 0; device < count; ++device) {\n"
147+
" cudaDeviceProp prop;\n"
148+
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
149+
" printf(\"%d.%d \", prop.major, prop.minor);\n"
150+
" }\n"
151+
" return 0;\n"
152+
"}\n")
153+
154+
# Run the generated file through `nvcc --run`; stderr is discarded
# (ERROR_QUIET) and the exit status / stdout are captured in
# `nvcc_res` / `nvcc_out`.
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
155+
"--run" "${cufile}"
156+
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
157+
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
158+
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
159+
160+
if(nvcc_res EQUAL 0)
161+
# Only use last item of nvcc_out (the last device's compute capability).
162+
# Remove the dots first, e.g. "7.0" becomes "70".
string(REGEX REPLACE "\\." "" nvcc_out "${nvcc_out}")
163+
# Split the output into a list of the individual capability values.
string(REGEX MATCHALL "[0-9()]+" nvcc_out "${nvcc_out}")
164+
list(GET nvcc_out -1 nvcc_out)
165+
set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
166+
endif()
167+
endif()
168+
169+
# When nothing was detected, fall back to `paddle_known_gpu_archs`
# (NOTE(review): not defined in the visible part of this file -- confirm it
# is set by the including project).
if(NOT CUDA_gpu_detect_output)
170+
message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
171+
set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
172+
else()
173+
set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
174+
endif()
175+
endfunction()
176+
177+
# TODO(guosheng): Remove it if `GetCUDAComputeCapability` is exposed by paddle.
178+
# Currently, if `CUDA_gpu_detect_output` is not defined, use the detected arch.
179+
detect_installed_gpus(SM)
127180

128181
ExternalProject_Add(
129182
extern_${THIRD_PARTY_NAME}

paddlenlp/ops/ext_utils.py

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
# -*- coding: utf-8 -*-
2+
import os
3+
import sys
4+
import subprocess
5+
import textwrap
6+
import inspect
7+
from pathlib import Path
8+
from setuptools import setup, Extension
9+
from setuptools.command.build_ext import build_ext
10+
from distutils.dep_util import newer_group
11+
12+
from paddle.utils.cpp_extension import load_op_meta_info_and_register_op
13+
from paddle.utils.cpp_extension.extension_utils import _jit_compile, _import_module_from_library
14+
from paddle.utils.cpp_extension.cpp_extension import (
15+
CUDA_HOME, CppExtension, BuildExtension as PaddleBuildExtension)
16+
from paddlenlp.utils.env import PPNLP_HOME
17+
from paddlenlp.utils.log import logger
18+
19+
if CUDA_HOME is None or not os.path.exists(CUDA_HOME):
    # `find_cuda_home` only resets CUDA_HOME to None itself when
    # `core.is_compiled_with_cuda()` is True, so for the paddle cpu version a
    # non-existent path may still be reported; clear it here.
    # The explicit `is None` test keeps `os.path.exists` from being called
    # with None, which raises TypeError instead of returning False.
    CUDA_HOME = None
23+
24+
25+
class CMakeExtension(Extension):
    """
    An `Extension` that is built by running CMake on a source directory
    instead of compiling a list of source files.
    """

    def __init__(self, name, source_dir=None):
        """
        Args:
            name: Name of the extension.
            source_dir: Directory containing the CMake project. Defaults to
                the directory of this file.
        """
        # The base class gets an empty source list; `sources` is filled in
        # below from the content of `source_dir`.
        super(CMakeExtension, self).__init__(name, sources=[])
        if source_dir is not None:
            self.source_dir = os.path.abspath(os.path.expanduser(source_dir))
        else:
            self.source_dir = Path(__file__).parent.resolve()
        # Every entry of the directory (as returned by `os.listdir`) is
        # recorded as a source.
        entries = os.listdir(self.source_dir)
        self.sources = [
            os.path.join(self.source_dir, entry) for entry in entries
        ]

    def build_with_command(self, ext_builder):
        """
        Configure and build the CMake project.

        This is the per-extension replacement of `build_ext.build_extension`;
        `ext_builder` is the running `build_ext` command instance.

        Raises:
            NotImplementedError: If the compiler type is "msvc".
            subprocess.CalledProcessError: If a cmake invocation fails.
        """
        # refer to https://github.com/pybind/cmake_example/blob/master/setup.py
        if ext_builder.compiler.compiler_type == "msvc":
            raise NotImplementedError

        if ext_builder.debug:
            build_type = "Debug"
        else:
            build_type = "Release"
        # Subclasses may provide extra arguments through `self.cmake_args`.
        configure_args = getattr(self, "cmake_args", []) + [
            "-DCMAKE_BUILD_TYPE={}".format(build_type),
            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}".format(ext_builder.build_lib),
        ]

        # CMAKE_BUILD_PARALLEL_LEVEL, when set, controls the parallel build
        # level across all generators; otherwise honour the `-j` option given
        # to build_ext (Python 3 only, not supported by pip or PyPA-build).
        compile_args = []
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            if hasattr(ext_builder, "parallel") and ext_builder.parallel:
                # CMake 3.12+ only.
                compile_args.append("-j{}".format(ext_builder.parallel))

        work_dir = ext_builder.build_temp
        if not os.path.exists(work_dir):
            os.makedirs(work_dir)

        # Both output streams go to the optional `_std_out_handle` so that
        # the build can be muted, especially when errors are allowed.
        sink = getattr(self, "_std_out_handle", None)
        commands = (
            ["cmake", self.source_dir] + configure_args,
            ["cmake", "--build", "."] + compile_args,
        )
        for command in commands:
            subprocess.check_call(
                command, cwd=work_dir, stdout=sink, stderr=sink)

    def get_target_filename(self):
        """Return the file name of the built library; subclasses override."""
        raise NotImplementedError
80+
81+
82+
class FasterTransformerExtension(CMakeExtension):
    """
    `CMakeExtension` that builds the FasterTransformer decoding op.
    Building requires CUDA.
    """

    def __init__(self, name, source_dir=None):
        """
        Args:
            name: Name of the extension.
            source_dir: Directory containing the CMake project, forwarded to
                `CMakeExtension`.
        """
        super(FasterTransformerExtension, self).__init__(name, source_dir)
        # Handle used for stdout/stderr of the cmake subprocesses; None
        # leaves their output untouched.
        self._std_out_handle = None
        # Env variable may not work as expected, since jit compile by `load`
        # would not re-build if source code is not updated.
        # self.sm = os.environ.get("PPNLP_GENERATE_CODE", None)

    def build_with_command(self, ext_builder):
        """
        Build FasterTransformer with CMake and copy the produced libraries
        into `ext_builder.build_lib`.

        Raises:
            NotImplementedError: If CUDA can not be found.
            Exception: Any build error is logged and re-raised unchanged.
        """
        if CUDA_HOME is None:  # GPU only
            # TODO(guosheng): should we touch a dummy file or add a quick exit
            # method to avoid meaningless process in `load`
            logger.warning(
                "FasterTransformer is not available because CUDA can not be found."
            )
            raise NotImplementedError
        # TODO(guosheng): Multiple -std seems be passed in FasterTransformer,
        # which is not allowed by NVCC. Fix it later.
        # `GetCUDAComputeCapability` is not exposed yet, and the CUDA/GPU
        # version is detected in the cmake file, hence no `-DSM=...` here.
        # Both arguments are set in one list: assigning `cmake_args` twice
        # would drop `-DPY_CMD`.
        self.cmake_args = [
            f"-DPY_CMD={sys.executable}",
            "-DWITH_GPT=ON",
        ]
        try:
            super(FasterTransformerExtension,
                  self).build_with_command(ext_builder)
            # FasterTransformer cmake file resets `CMAKE_LIBRARY_OUTPUT_DIRECTORY`
            # to `CMAKE_BINARY_DIR/lib`, thus copy the lib back to
            # `build_ext.build_lib`. Maybe move this copy to CMakeList.
            # `copy_tree` rather than `copy_file`: boost lib might be included.
            ext_builder.copy_tree(
                os.path.join(ext_builder.build_temp, "lib"),
                ext_builder.build_lib)
        except Exception:
            logger.warning(
                "FasterTransformer is not available due to build errors.")
            # Bare `raise` re-raises the active exception with its traceback.
            raise

    def get_target_filename(self):
        """Return the library name fixed by the CMake file."""
        # Maybe we can copy it as the name returned by
        # `BuildExtension.get_ext_filename` after build.
        return "libdecoding_op.so"
124+
125+
126+
class BuildExtension(PaddleBuildExtension):
    """
    `build_ext` command supporting both the `CppExtension`/`CUDAExtension` of
    Paddle and the custom extensions of PaddleNLP.
    """

    def build_extensions(self):
        """
        Build every extension in `self.extensions`.

        Extensions exposing `build_with_command` build themselves; all the
        others are handed to `PaddleBuildExtension.build_extensions`.
        """
        self_built = []  # extensions carrying their own build logic
        regular = []  # normal extensions of paddle.utils.cpp_extension
        for extension in self.extensions:
            if not hasattr(extension, "build_with_command"):
                regular.append(extension)
                continue
            # custom build implemented by the Extension itself
            extension.build_with_command(self)
            self_built.append(extension)

        if regular:
            # The parent class builds whatever `self.extensions` holds, so
            # narrow it to the regular extensions for that call.
            self.extensions = regular
            super(BuildExtension, self).build_extensions()
        self.extensions = self_built + regular
146+
147+
148+
EXTENSIONS = {"FasterTransformer": FasterTransformerExtension}
149+
150+
151+
def get_extension_maker(name):
    """
    Return the extension class registered in `EXTENSIONS` under `name`, or
    `paddle.utils.cpp_extension.CppExtension` when `name` is not registered.
    """
    # TODO(guosheng): Maybe register extension classes into `Extensions`.
    if name in EXTENSIONS:
        return EXTENSIONS[name]
    return CppExtension
155+
156+
157+
def _write_setup_file(name, file_path, build_dir, **kwargs):
    """
    Automatically generate a setup.py for jit building and write it to
    `file_path`. Any type of extension can be jit built this way.

    Args:
        name: Name of the extension. The generated script also uses it to
            pick the extension class through `get_extension_maker`.
        file_path: Path of the setup script to write.
        build_dir: Directory passed as `output_dir` to
            `BuildExtension.with_options` in the generated script.
        **kwargs: Arguments for the corresponding Extension initialization.
            Each value is embedded with `repr`, so it must have a `repr`
            that evaluates back to the value (str, numbers, bool, None,
            lists of those, ...).
    """
    template = textwrap.dedent("""
    from setuptools import setup
    from paddlenlp.ops.ext_utils import get_extension_maker, BuildExtension

    setup(
        name='{name}',
        ext_modules=[
            get_extension_maker('{name}')(
                name='{name}',
                {kwargs_str})],
        cmdclass={{'build_ext' : BuildExtension.with_options(
            output_dir=r'{build_dir}')
        }})""").lstrip()
    # `repr` quotes and escapes strings correctly and also handles
    # non-string values, which plain string concatenation rejects with a
    # TypeError.
    kwargs_str = "".join(
        "{}={!r},".format(key, value) for key, value in kwargs.items())
    content = template.format(
        name=name, kwargs_str=kwargs_str, build_dir=build_dir)

    with open(file_path, 'w') as f:
        f.write(content)
185+
186+
187+
def load(name, build_dir=None, force=False, verbose=False, **kwargs):
    """
    Jit build the extension `name` (if needed) and load it.

    Args:
        name: Name of the extension. The extension class is looked up with
            `get_extension_maker`.
        build_dir: Root directory for builds. Defaults to the `extenstions`
            directory under `PPNLP_HOME`. The extension is built in the
            sub-directory `name` of it.
        force: Rebuild a `CMakeExtension` even if its library is up-to-date.
        verbose: Forwarded to `_jit_compile` and
            `_import_module_from_library`.
        **kwargs: Arguments for the extension initialization.

    Returns:
        For a `CMakeExtension`, the result of
        `load_op_meta_info_and_register_op` on the built library, or None if
        the library does not exist after the build. Otherwise the result of
        `_import_module_from_library`.

    Raises:
        NotImplementedError: If CUDA can not be found.
    """
    # TODO(guosheng): Need better way to resolve unsupported such as CPU. Currently,
    # raise NotImplementedError and skip `_jit_compile`. Otherwise, `_jit_compile`
    # will output the error to stdout (when verbose is True) and raise `RuntimeError`,
    # which is not friendly for users though no other bad effect.
    if CUDA_HOME is None:
        logger.warning("%s is not available because CUDA can not be found." %
                       name)
        raise NotImplementedError
    if build_dir is None:
        # NOTE: the directory name is misspelled but kept as is, so that
        # libraries built earlier are still found.
        build_dir = os.path.join(PPNLP_HOME, 'extenstions')
    # Normalize once, so that the generated setup script and the build
    # directory always live under the same, existing root (a `~` or relative
    # `build_dir` would otherwise yield two different locations).
    build_dir = os.path.abspath(os.path.expanduser(build_dir))
    build_base_dir = os.path.join(build_dir, name)
    os.makedirs(build_base_dir, exist_ok=True)

    extension = get_extension_maker(name)(name, **kwargs)
    is_cmake_ext = isinstance(extension, CMakeExtension)
    ext_filepath = None
    # Check if 'target' is out-of-date with respect to any file to avoid rebuild
    if is_cmake_ext:
        # `CppExtension/CUDAExtension` has version manager by `PaddleBuildExtension`
        # Maybe move this to CMakeExtension later.
        # TODO(guosheng): flags/args changes may also trigger build, and maybe
        # need version manager like `PaddleBuildExtension`.
        ext_filepath = os.path.join(build_base_dir,
                                    extension.get_target_filename())
        if (not force and os.path.exists(ext_filepath) and
                not newer_group(extension.sources, ext_filepath, 'newer')):
            logger.debug("skipping '%s' extension (up-to-date) build" % name)
            return load_op_meta_info_and_register_op(ext_filepath)

    # write setup file and jit compile
    file_path = os.path.join(build_dir, "{}_setup.py".format(name))
    _write_setup_file(name, file_path, build_base_dir, **kwargs)
    _jit_compile(file_path, verbose)
    if is_cmake_ext:
        # Load a shared library (if exists) only to register op. The result
        # is returned just like in the up-to-date case above.
        if os.path.exists(ext_filepath):
            return load_op_meta_info_and_register_op(ext_filepath)
        return None
    # Import as callable python api
    return _import_module_from_library(name, build_base_dir, verbose)

0 commit comments

Comments
 (0)