Skip to content

Commit b4697f0

Browse files
authored
Fix faster fp16 (#4423)
1 parent 105f572 commit b4697f0

File tree

4 files changed: +17 −5 lines changed

model_zoo/gpt/fast_gpt/export_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def do_predict(args):
     model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path]
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
     logger.info("Loading the model parameters, please wait...")
-    model = model_class.from_pretrained(args.model_name_or_path, max_predict_len=args.max_out_len)
+    model = model_class.from_pretrained(args.model_name_or_path)
 
     gpt = FasterGPT(model=model, decoding_lib=args.decoding_lib, use_fp16_decoding=args.use_fp16_decoding)

paddlenlp/ops/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ project(FasterTransformer LANGUAGES C CXX CUDA)
 
 find_package(CUDA 10.1 REQUIRED)
 
+find_program(CCACHE_PROGRAM ccache)
+if(CCACHE_PROGRAM)
+  set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+  set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
+endif()
+
 INCLUDE(ExternalProject)
 
 set(CXX_STD "14" CACHE STRING "C++ standard")

paddlenlp/ops/fast_transformer/transformer/fast_transformer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -812,7 +812,7 @@ def save_resources(self, tokenizer, path):
 
 class FasterGPT(GPTPretrainedModel):
     def __init__(self, model, decoding_lib=None, use_fp16_decoding=False):
-        super(FasterGPT, self).__init__()
+        super(FasterGPT, self).__init__(model.config)
         self._model = model
         self.use_fp16_decoding = use_fp16_decoding
         self.decoding = InferGptDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding)
@@ -1923,9 +1923,9 @@ def __init__(self, model, decoding_lib=None, use_fp16_decoding=False):
         self.use_fp16_decoding = use_fp16_decoding
         self._model = model
         if use_fp16_decoding:
-            weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.mbart.encoder.embed_tokens.weight))
-            model.mbart.encoder.embed_tokens = nn.Embedding(
-                *model.mbart.encoder.embed_tokens.weight.shape, weight_attr=weight_attr
+            weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.encoder.embed_tokens.weight))
+            model.encoder.embed_tokens = nn.Embedding(
+                *model.encoder.embed_tokens.weight.shape, weight_attr=weight_attr
             )
         self.encoder = model.t5.get_encoder()
         self.decoder = model.t5.get_decoder()

paddlenlp/ops/patches/FasterTransformer/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ project(FasterTransformer LANGUAGES CXX CUDA)
 
 find_package(CUDA 10.1 REQUIRED)
 
+find_program(CCACHE_PROGRAM ccache)
+if(CCACHE_PROGRAM)
+  set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+  set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
+endif()
+
 option(BUILD_PD "Build in PaddlePaddle mode" ON)
 option(BUILD_GPT "Build project with gpt" ON)
 option(BUILD_ENCODER "Build project with encoder" ON)

0 commit comments

Comments (0)