
Commit 4a7665d

Fix benchmark bug (#7002)
* fix bug
* fix wint4 convert

1 parent f560733 · commit 4a7665d

File tree: 4 files changed (+33, -12 lines)


llm/benchmark.sh

Lines changed: 16 additions & 2 deletions
@@ -1,3 +1,17 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH
 
 export FLAGS_control_flow_use_new_executor=1
@@ -6,10 +20,10 @@ export FLAGS_allocator_strategy=naive_best_fit
 export FLAGS_fraction_of_gpu_memory_to_use=0.92
 
 python predictor.py \
-    --model_name_or_path ./llama-13b-inference_model_fp16 \
+    --model_name_or_path ./llama7b-inference_model_fp16 \
     --dtype float16 \
     --src_length 300 \
-    --max_length 400 \
+    --max_length 100 \
     --output_file "infer.json" \
     --mode "static" \
     --batch_size 1 \
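
With src_length 300 and the new max_length of 100, the total sequence budget is 400 positions, which matches the resized position-id buffer in llm/utils.py below (max_length + src_length rather than max_length alone). A small sanity sketch, assuming that sizing:

    # Assumed sizing (see the utils.py diff): buffer = prompt slots + decode slots.
    src_length, max_length = 300, 100
    total_positions = src_length + max_length  # 400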

llm/predictor.py

Lines changed: 3 additions & 3 deletions
@@ -352,6 +352,7 @@ def _preprocess(self, source):
         inputs = dybatch_preprocess(
             self.tokenizer,
             source,
+            self.config.src_length,
             self.config.max_length,
             self.architectures,
             top_p=self.config.top_p,
@@ -369,6 +370,7 @@ def _preprocess(self, source):
         inputs = dybatch_preprocess(
             self.tokenizer,
             source,
+            self.config.src_length,
             self.config.max_length,
             self.architectures,
             top_p=self.config.top_p,
@@ -431,6 +433,7 @@ def _preprocess(self, source):
         inputs = dybatch_preprocess(
             self.tokenizer,
             source,
+            self.config.src_length,
             self.config.max_length,
             self.architectures,
             top_p=self.config.top_p,
@@ -790,9 +793,6 @@ def benchmark(predictor, predictor_args, model_args):
     test_texts = "hello world, how are you?"
     benchmark_texts = [test_texts + "<pad>" * predictor_args.src_length for _ in range(predictor_args.batch_size)]
 
-    benchmark_texts = [
-        "<pad>" * (predictor_args.src_length // 2 - 3) + "My name is " for _ in range(predictor_args.batch_size)
-    ]
     batch_benchmark_texts = batchfy_text(benchmark_texts, predictor_args.batch_size)
     print("***********Start Benchmark**********")
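
Two things change in this file: dybatch_preprocess gains src_length as a new positional argument at every call site, and the second benchmark_texts assignment is deleted. That assignment silently overwrote the full-length prompt built just above it, so the benchmark was timing prefill on roughly half-length inputs. Illustrative numbers, assuming each "<pad>" counts as one token:

    # Prompt padding before and after the fix (illustrative, src_length = 300):
    src_length = 300
    old_pads = src_length // 2 - 3  # 147 pads before "My name is " (deleted override)
    new_pads = src_length           # 300 pads appended to the greeting (kept line)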

llm/utils.py

Lines changed: 6 additions & 3 deletions
@@ -385,6 +385,7 @@ def pad_batch_data(insts, pad_id=0, return_seq_len=False, pad_style="right"):
 def dybatch_preprocess(
     tokenizer,
     texts: list[str],
+    src_length: int,
     max_length: int,
     architectures: str,
     top_p: float,
@@ -398,7 +399,7 @@ def dybatch_preprocess(
     position_ids = []
 
     for text in texts:
-        tokens = tokenizer(text, return_tensors="np", padding=True)
+        tokens = tokenizer(text, return_tensors="np", padding=True, max_length=src_length)
         input_ids.append(tokens["input_ids"][0])
         position_ids.append(tokens["position_ids"][0])

@@ -423,6 +424,7 @@ def dybatch_preprocess(
             text,
             return_tensors="np",
             padding=False,
+            max_length=src_length,
             return_attention_mask=False,
             return_token_type_ids=False,
         )
@@ -434,7 +436,7 @@ def dybatch_preprocess(
     bs = inputs["input_ids"].shape[0]
     max_len = max(map(len, input_ids))
 
-    position_ids = paddle.zeros(shape=[bs, max_length], dtype="int64")
+    position_ids = paddle.zeros(shape=[bs, max_length + src_length], dtype="int64")
 
     for i in range(bs):
         position_ids[i, pre_caches_length : pre_caches_length + seq_len[i]] = paddle.arange(seq_len[i])
@@ -490,7 +492,8 @@ def dybatch_preprocess(
             [
                 1
                 if not benchmark
-                else max_length,  # Note(Zhengzekang): When in benchmark mode, we need to set a fixed decode length.
+                else max_length
+                - pre_caches_length,  # Note(Zhengzekang): When in benchmark mode, we need to set a fixed decode length.
             ]
             * bs
         )
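
Taken together, these changes cap every prompt at src_length when tokenizing and size the position-id buffer for the prompt plus the decoded continuation. A minimal NumPy stand-in for the new sizing logic (illustrative values; the real code uses paddle.zeros):

    import numpy as np

    bs, src_length, max_length, pre_caches_length = 2, 300, 100, 0
    seq_len = [120, 300]  # per-sample prompt lengths, already capped at src_length

    # The buffer must hold src_length prompt positions plus max_length decode positions.
    position_ids = np.zeros((bs, max_length + src_length), dtype="int64")
    for i in range(bs):
        position_ids[i, pre_caches_length : pre_caches_length + seq_len[i]] = np.arange(seq_len[i])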

paddlenlp/experimental/transformers/fused_transformer_layers.py

Lines changed: 8 additions & 4 deletions
@@ -212,9 +212,13 @@ def __init__(
         self.quant_bits = quant_bits
         self.use_weight_only = False
         self.weight_dtype = self._dtype
+        self.create_params_type = self._dtype
 
         if self.quant_bits != -1:
             self.use_weight_only = True
+            self.create_params_type = (
+                "int8"  # For weight-only int4, params are stored as int8 and one dimension is halved.
+            )
             self.weight_dtype = "int" + str(self.quant_bits)
 
         self.ln_scales, self.ln_biases = [], []
@@ -292,7 +296,7 @@ def _add_parameter(param):
         qkv_weight = self.create_parameter(
             shape=qkv_weight_shape,
             attr=qkv_weight_attr,
-            dtype=self.weight_dtype,
+            dtype=self.create_params_type,
             is_bias=False,
         )

@@ -321,7 +325,7 @@ def _add_parameter(param):
         linear_weight = self.create_parameter(
             shape=linear_weight_shape,
             attr=linear_weight_attr,
-            dtype=self.weight_dtype,
+            dtype=self.create_params_type,
             is_bias=False,
         )

@@ -371,7 +375,7 @@ def _add_parameter(param):
         ffn1_weight = self.create_parameter(
             shape=ffn1_weight_shape,
             attr=ffn1_weight_attr,
-            dtype=self.weight_dtype,
+            dtype=self.create_params_type,
             is_bias=False,
         )

@@ -401,7 +405,7 @@ def _add_parameter(param):
         ffn2_weight = self.create_parameter(
             shape=ffn2_weight_shape,
             attr=ffn2_weight_attr,
-            dtype=self.weight_dtype,
+            dtype=self.create_params_type,
             is_bias=False,
         )
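
This is the "fix wint4 convert" half of the commit: weight-only layers now create parameters with a storage dtype (create_params_type = "int8") separate from the logical weight dtype ("int4" or "int8"), since two int4 values pack into one int8 byte and one tensor dimension is halved. A hedged NumPy illustration of such packing (the exact layout is whatever PaddlePaddle's weight-only kernels expect):

    import numpy as np

    # Pack pairs of int4 values (range [-8, 7]) into single int8 bytes.
    w4 = np.array([[1, -2, 3, -4]], dtype=np.int8)
    lo = w4[:, 0::2] & 0x0F            # low nibbles
    hi = w4[:, 1::2] & 0x0F            # high nibbles
    packed = (lo | (hi << 4)).astype(np.int8)
    print(packed.shape, packed.dtype)  # (1, 2) int8: last dimension halved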
