 from abc import abstractmethod
 from dataclasses import dataclass, field
 from threading import Thread
-from typing import List, Optional

 import numpy as np
 import paddle
-import paddle.distributed.fleet.base.topology as tp
 import paddle.incubate.multiprocessing as mp
 from paddle.base.framework import in_cinn_mode, in_pir_executor_mode
 from paddle.distributed import fleet
-from utils.utils import (
-    dybatch_preprocess,
-    get_alibi_slopes,
-    get_infer_model_path,
-    get_model_max_position_embeddings,
-    get_prefix_tuning_params,
-    init_chat_template,
-    load_real_time_tokens,
-    read_res,
-)

 from paddlenlp.generation import GenerationConfig, TextIteratorStreamer
 from paddlenlp.peft import LoRAConfig, LoRAModel, PrefixConfig, PrefixModelForCausalLM
@@ -54,6 +42,7 @@
     PretrainedModel,
     PretrainedTokenizer,
 )
+from paddlenlp.utils import llm_utils
 from paddlenlp.utils.import_utils import import_module, is_paddlenlp_ops_available
 from paddlenlp.utils.log import logger

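Note: the upshot of this import hunk is that the predictor no longer depends on the script-local `utils.utils` module; the same helpers are now reached through the `paddlenlp.utils.llm_utils` namespace and are qualified at every call site below. A minimal sketch of the new style (assuming a PaddleNLP build in which `llm_utils` exports these helpers, as the diff indicates):

```python
from paddlenlp.utils import llm_utils

# Helpers that used to be imported as bare names are now namespace-qualified:
rank, degree = llm_utils.init_dist_env()
print(rank, degree)  # 0, 1 in a single-process run
```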
@@ -157,50 +146,6 @@ def batchfy_text(texts, batch_size):
     return batch_texts


-def init_dist_env():
-    tensor_parallel_degree = paddle.distributed.get_world_size()
-    tensor_parallel_rank = paddle.distributed.get_rank()
-
-    if tensor_parallel_degree > 1:
-        # refer to: https://github.com/PaddlePaddle/Paddle/blob/4abea956ee852ce52791a1e08fa92ed4d3be150d/python/paddle/distributed/fleet/fleet.py#L298C23-L298C45
-        hcg = tp._HYBRID_PARALLEL_GROUP
-        if hcg is None:
-            strategy = fleet.DistributedStrategy()
-            strategy.hybrid_configs = {
-                "dp_degree": 1,
-                "mp_degree": tensor_parallel_degree,
-                "pp_degree": 1,
-                "sharding_degree": 1,
-            }
-            fleet.init(is_collective=True, strategy=strategy)
-            hcg = fleet.get_hybrid_communicate_group()
-
-        tensor_parallel_rank = hcg.get_model_parallel_rank()
-    return tensor_parallel_rank, tensor_parallel_degree
-
-
-def get_eos_token_id(
-    tokenizer: PretrainedTokenizer, generation_config: Optional[GenerationConfig] = None
-) -> List[List[int]]:
-    """get eos_token_id from generation_config or tokenizer
-
-    Returns:
-        List[int]: eos_token_id to stop the generation
-    """
-    eos_token_ids = []
-    if tokenizer.eos_token_id is not None:
-        eos_token_ids.append(tokenizer.eos_token_id)
-
-    if generation_config is not None and generation_config.eos_token_id is not None:
-        if isinstance(generation_config.eos_token_id, int):
-            eos_token_ids.append(generation_config.eos_token_id)
-        else:
-            eos_token_ids.extend(generation_config.eos_token_id)
-
-    eos_token_ids_dict = {str(item): item for item in eos_token_ids}
-    return list(eos_token_ids_dict.values())
-
-
 class BasePredictor:
     def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None):
         self.model_config = AutoConfig.from_pretrained(config.model_name_or_path)
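Note: the two helpers deleted here are moved into `llm_utils`, not dropped; the call sites below switch to the qualified names. One detail worth keeping from `get_eos_token_id`: the closing dict comprehension dedupes the collected ids while preserving first-seen order. A standalone illustration of that trick (plain Python, no Paddle required):

```python
# Mirrors the tail of the removed get_eos_token_id: dict keys dedupe on
# str(item) while insertion order keeps the first occurrence of each id.
eos_token_ids = [2, 2, 128001]
eos_token_ids_dict = {str(item): item for item in eos_token_ids}
print(list(eos_token_ids_dict.values()))  # -> [2, 128001]
```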
@@ -211,8 +156,11 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = N
         self.tokenizer = tokenizer

         self.return_tensors = "pd"
-        self.tensor_parallel_rank, self.tensor_parallel_degree = init_dist_env()
-        self.model_config.tensor_parallel_rank, self.model_config.tensor_parallel_degree = init_dist_env()
+        self.tensor_parallel_rank, self.tensor_parallel_degree = llm_utils.init_dist_env()
+        self.model_config.tensor_parallel_rank, self.model_config.tensor_parallel_degree = (
+            self.tensor_parallel_rank,
+            self.tensor_parallel_degree,
+        )

         try:
             self.generation_config = GenerationConfig.from_pretrained(config.model_name_or_path)
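Note: besides the rename, this hunk fixes a small inefficiency: the old code invoked `init_dist_env()` twice, once for the predictor and again for the model config, whereas the new code queries the distributed environment once and fans the pair out. A toy sketch of the pattern (the stub and counter are illustrative, not part of the diff):

```python
calls = []

def init_dist_env_stub():
    # Hypothetical stand-in for llm_utils.init_dist_env(); each call would
    # touch the distributed runtime, so calling it once is preferable.
    calls.append(1)
    return 0, 1

rank, degree = init_dist_env_stub()
config_rank, config_degree = rank, degree  # reuse the values, no second query
assert len(calls) == 1
```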
@@ -293,7 +241,7 @@ def __init__(
             )
             self.model.merge()
         if config.prefix_path is not None:
-            prefix_tuning_params = get_prefix_tuning_params(self.model)
+            prefix_tuning_params = llm_utils.get_prefix_tuning_params(self.model)
             self.model = PrefixModelForCausalLM.from_pretrained(
                 model=self.model,
                 prefix_path=config.prefix_path,
@@ -307,7 +255,7 @@ def _infer(self, inputs: dict[str, paddle.Tensor]):
             **inputs,
             max_new_tokens=self.config.max_length,
             bos_token_id=self.tokenizer.bos_token_id,
-            eos_token_id=get_eos_token_id(self.tokenizer, self.generation_config),
+            eos_token_id=llm_utils.get_eos_token_id(self.tokenizer, self.generation_config),
             pad_token_id=self.tokenizer.pad_token_id,
             decode_strategy=self.config.decode_strategy,
             temperature=self.config.temperature,
@@ -326,7 +274,7 @@ def stream_predict(self, inputs: dict[str, paddle.Tensor]):
             streamer=text_streamer,
             max_new_tokens=self.config.max_length,
             bos_token_id=self.tokenizer.bos_token_id,
-            eos_token_id=get_eos_token_id(self.tokenizer, self.generation_config),
+            eos_token_id=llm_utils.get_eos_token_id(self.tokenizer, self.generation_config),
             pad_token_id=self.tokenizer.pad_token_id,
             decode_strategy=(
                 "greedy_search" if self.config.top_k == 1 and self.config.top_p == 1.0 else self.config.decode_strategy
@@ -465,7 +413,7 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):

     def _postprocess(self, predictions, return_tokens=False):
         if paddle.distributed.get_rank() == 0:
-            tokens: np.ndarray = load_real_time_tokens()
+            tokens: np.ndarray = llm_utils.load_real_time_tokens()
             decoded_predictions = self.tokenizer.batch_decode(
                 tokens.tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=False
             )
@@ -487,15 +435,15 @@ def _preprocess(self, source):
         source = [source] if isinstance(source, str) else source
         source = [self.tokenizer.apply_chat_template(sentence, tokenize=False) for sentence in source]

-        inputs = dybatch_preprocess(
+        inputs = llm_utils.dybatch_preprocess(
             self.tokenizer,
             source,
             self.config.src_length,
             self.config.max_length,
             self.architectures,
             top_p=self.config.top_p,
             temperature=self.config.temperature,
-            eos_token_id=get_eos_token_id(self.tokenizer, self.generation_config),
+            eos_token_id=llm_utils.get_eos_token_id(self.tokenizer, self.generation_config),
             benchmark=self.config.benchmark,
             pre_caches_length=pre_caches_length,
         )
@@ -546,7 +494,7 @@ def _preprocess(self, source):

         inputs["tgt_pos"] = inputs["tgt_pos"] + pre_caches_length
         # alibi encoder
-        alibi_slopes = get_alibi_slopes(self.model_config.n_head)
+        alibi_slopes = llm_utils.get_alibi_slopes(self.model_config.n_head)
         inputs["position_ids"] = paddle.to_tensor(alibi_slopes, dtype="float32")
         arange_tensor_encoder = paddle.arange(self.config.total_max_length, dtype=self.config.dtype)
         alibi = alibi_slopes[None, :, None, None] * arange_tensor_encoder
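Note: `get_alibi_slopes` (now under `llm_utils`) supplies the per-head slopes that the surrounding lines broadcast against a position ramp. For reference, the standard construction from the ALiBi paper gives head i of n (n a power of two) the slope 2^(-8(i+1)/n); whether the PaddleNLP helper matches this exactly for non-power-of-two head counts is an assumption. A NumPy sketch of the broadcast used in this hunk:

```python
import numpy as np

def alibi_slopes(num_heads: int) -> np.ndarray:
    # Standard ALiBi slopes for a power-of-two head count:
    # head i gets 2 ** (-8 * (i + 1) / num_heads).
    return np.array([2 ** (-8 * (i + 1) / num_heads) for i in range(num_heads)], dtype="float32")

slopes = alibi_slopes(8)                    # shape (8,)
positions = np.arange(16, dtype="float32")  # mirrors arange_tensor_encoder
alibi = slopes[None, :, None, None] * positions  # shape (1, 8, 1, 16), as in the diff
print(alibi.shape)
```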
@@ -667,7 +615,9 @@ def _create_predictor(self, predictor_args: PredictorArgument):
             import_module("paddlenlp_ops.transpose_remove_padding")
             import_module("paddlenlp_ops.write_cache_kv")

-        infer_model_path = get_infer_model_path(predictor_args.model_name_or_path, predictor_args.model_prefix)
+        infer_model_path = llm_utils.get_infer_model_path(
+            predictor_args.model_name_or_path, predictor_args.model_prefix
+        )

         config = paddle.inference.Config(infer_model_path + ".pdmodel", infer_model_path + ".pdiparams")

@@ -854,7 +804,7 @@ def init_model_inputs(self, config: PredictorArgument):
             shape=[config.batch_size, 1], fill_value=config.temperature, dtype="float32"
         )
         self.model_inputs["eos_token_id"] = paddle.to_tensor(
-            np.array(get_eos_token_id(self.tokenizer, self.generation_config)).reshape(-1, 1).astype("int64")
+            np.array(llm_utils.get_eos_token_id(self.tokenizer, self.generation_config)).reshape(-1, 1).astype("int64")
         )
         self.model_inputs["penalty_score"] = paddle.full(
             shape=[config.batch_size, 1], fill_value=config.repetition_penalty, dtype="float32"
@@ -871,7 +821,7 @@ def init_model_inputs(self, config: PredictorArgument):
         self.model_inputs["max_length"] = paddle.full(
             shape=[config.batch_size, 1], fill_value=config.max_length, dtype="int64"
         )
-        self.model_inputs["rope_emb"] = self._get_rotary_position_embedding(
+        self.model_inputs["rope_emb"] = llm_utils.get_rotary_position_embedding(
             paddle.arange(config.total_max_length).reshape((1, -1)), self.head_dim, self.rope_theta, self.rope_scaling
         )
         self.model_inputs["bad_tokens"] = paddle.to_tensor([-1], dtype="int64")
@@ -888,7 +838,7 @@ def init_model_inputs(self, config: PredictorArgument):
             shape=[config.batch_size, 1, 1, config.total_max_length], fill_value=1, dtype=self.dtype
         )
         arange_tensor_encoder = paddle.arange(config.total_max_length).astype(self.dtype)
-        alibi_slopes = get_alibi_slopes(self.num_attention_heads)
+        alibi_slopes = llm_utils.get_alibi_slopes(self.num_attention_heads)
         alibi = alibi_slopes[None, :, None, None] * arange_tensor_encoder
         alibi_encoder = alibi.tile([config.batch_size, 1, config.total_max_length, 1])
         alibi_decoder = alibi.tile(
@@ -907,57 +857,6 @@ def init_model_inputs(self, config: PredictorArgument):
             alibi_decoder + (1 - self.model_inputs["tgt_mask"]) * paddle.finfo(self.dtype).min
         ).cast(self.dtype)

-    def _get_rotary_position_embedding(self, position_ids, head_dim, rope_theta=10000.0, rope_scaling: dict = None):
-        """
-        Pre-calculate rotary position embedding for position_ids.
-
-        Args:
-            position_ids: [1, S]
-            head_dim: D
-
-        Returns:
-            rot_emb: [2, 1, S, 1, D], cos + sin
-        """
-        bsz, max_seq_len = position_ids.shape[:2]
-        rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, head_dim), dtype="float32")
-        inv_freq = rope_theta ** (-paddle.arange(0, head_dim, 2, dtype="float32") / head_dim)
-
-        if rope_scaling is not None:
-            rope_type = rope_scaling.get("rope_type", None)
-            if rope_type is not None and rope_type == "llama3":
-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
-                original_max_position_embeddings = rope_scaling.get("original_max_position_embeddings", 8192)
-
-                low_freq_wavelen = original_max_position_embeddings / low_freq_factor
-                high_freq_wavelen = original_max_position_embeddings / high_freq_factor
-                new_freqs = []
-                for freq in inv_freq:
-                    import math
-
-                    wavelen = 2 * math.pi / freq
-                    if wavelen < high_freq_wavelen:
-                        new_freqs.append(freq)
-                    elif wavelen > low_freq_wavelen:
-                        new_freqs.append(freq / factor)
-                    else:
-                        assert low_freq_wavelen != high_freq_wavelen
-                        smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
-                            high_freq_factor - low_freq_factor
-                        )
-                        new_freqs.append((1 - smooth) * freq / factor + smooth * freq)
-                inv_freq = paddle.to_tensor(new_freqs, dtype=inv_freq.dtype)
-
-        # shape: [B, S, D/2]
-        freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq)
-        # shape: [B, S, 1, D]
-        emb = paddle.concat([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, 1, head_dim))
-
-        rot_emb[0] = paddle.cos(emb)
-        rot_emb[1] = paddle.sin(emb)
-        return rot_emb
-
     def _preprocess(self, input_text: list[str]):
         if self.tokenizer.chat_template is not None:
             input_text = [input_text] if isinstance(input_text, str) else input_text
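Note: the removed method survives as `llm_utils.get_rotary_position_embedding` (see the `rope_emb` hunk above). Its llama3 branch rescales the inverse frequencies by wavelength: short wavelengths pass through, long ones are divided by `factor`, and the band between `low_freq_factor` and `high_freq_factor` is linearly interpolated. A self-contained NumPy sketch of that rule, using the defaults from the removed code (whether the relocated helper keeps exactly this form is an assumption):

```python
import math
import numpy as np

def llama3_scale_inv_freq(inv_freq, factor=8.0, low_freq_factor=1.0,
                          high_freq_factor=4.0, original_max_pos=8192):
    low_freq_wavelen = original_max_pos / low_freq_factor
    high_freq_wavelen = original_max_pos / high_freq_factor
    new_freqs = []
    for freq in inv_freq:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:   # high frequency: keep as-is
            new_freqs.append(freq)
        elif wavelen > low_freq_wavelen:  # low frequency: compress by factor
            new_freqs.append(freq / factor)
        else:                             # mid band: linear blend of the two regimes
            smooth = (original_max_pos / wavelen - low_freq_factor) / (
                high_freq_factor - low_freq_factor
            )
            new_freqs.append((1 - smooth) * freq / factor + smooth * freq)
    return np.array(new_freqs, dtype=inv_freq.dtype)

head_dim = 128
inv_freq = 10000.0 ** (-np.arange(0, head_dim, 2, dtype="float32") / head_dim)
print(llama3_scale_inv_freq(inv_freq)[:4])  # high-frequency terms pass through unchanged
```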
@@ -1053,7 +952,9 @@ def predict(self, input_texts: list[str], return_tokens=False):
         output_tensor = output_tensor.cpu()
         tensor_queue.put(output_tensor)

-        read_res_process = mp.Process(target=read_res, args=[self.model_name_or_path, tensor_queue, result_queue])
+        read_res_process = mp.Process(
+            target=llm_utils.read_res, args=[self.model_name_or_path, tensor_queue, result_queue]
+        )
         if self.tensor_parallel_rank == 0:
             read_res_process.start()

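Note: both `predict` variants hand decoding off to a separate consumer process fed through multiprocessing queues, started only on tensor-parallel rank 0. A generic, self-contained sketch of the same producer/consumer shape (standard-library `multiprocessing` stands in for `paddle.incubate.multiprocessing`, and this toy worker is not the real `llm_utils.read_res`):

```python
import multiprocessing as mp

def read_res_stub(model_name_or_path, tensor_queue, result_queue):
    # Toy consumer: drain token payloads until the producer sends None.
    while True:
        tokens = tensor_queue.get()
        if tokens is None:
            break
        result_queue.put((model_name_or_path, tokens))

if __name__ == "__main__":
    tensor_queue, result_queue = mp.Queue(), mp.Queue()
    proc = mp.Process(target=read_res_stub, args=["demo-model", tensor_queue, result_queue])
    proc.start()                # in the diff, guarded by tensor_parallel_rank == 0
    tensor_queue.put([1, 2, 3])
    tensor_queue.put(None)
    print(result_queue.get())   # ('demo-model', [1, 2, 3])
    proc.join()
```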
@@ -1119,7 +1020,9 @@ def _create_predictor(self, predictor_args: PredictorArgument):
                 "https://github.com/PaddlePaddle/PaddleNLP/blob/develop/csrc/README.md"
             )

-        infer_model_path = get_infer_model_path(predictor_args.model_name_or_path, predictor_args.model_prefix)
+        infer_model_path = llm_utils.get_infer_model_path(
+            predictor_args.model_name_or_path, predictor_args.model_prefix
+        )

         config = paddle.inference.Config(infer_model_path + ".pdmodel", infer_model_path + ".pdiparams")

@@ -1175,7 +1078,9 @@ def predict(self, input_texts: list[str], return_tokens=False):
         output_tensor = output_tensor.cpu()
         tensor_queue.put(output_tensor)

-        read_res_process = mp.Process(target=read_res, args=[self.model_name_or_path, tensor_queue, result_queue])
+        read_res_process = mp.Process(
+            target=llm_utils.read_res, args=[self.model_name_or_path, tensor_queue, result_queue]
+        )

         if self.tensor_parallel_rank == 0:
             read_res_process.start()
@@ -1220,15 +1125,15 @@ def create_predictor(
         predictor_args.model_name_or_path,
     )
     # init chat_template for tokenizer
-    init_chat_template(tokenizer, predictor_args.model_name_or_path, predictor_args.chat_template)
+    llm_utils.init_chat_template(tokenizer, predictor_args.model_name_or_path, predictor_args.chat_template)

     # TODO(wj-Mcat): fix llama tokenzier pad_token bug
     if (isinstance(tokenizer, (LlamaTokenizer, Llama3Tokenizer))) and not tokenizer.pad_token:
         tokenizer.pad_token = tokenizer.eos_token

     config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)

-    max_position_embeddings = get_model_max_position_embeddings(config)
+    max_position_embeddings = llm_utils.get_model_max_position_embeddings(config)
     if max_position_embeddings is None:
         max_position_embeddings = predictor_args.src_length + predictor_args.max_length
         logger.warning(
@@ -1247,7 +1152,7 @@ def create_predictor(
         predictor_args.top_p = 0.0
         predictor_args.temperature = 1.0

-    tensor_parallel_rank, tensor_parallel_degree = init_dist_env()
+    tensor_parallel_rank, tensor_parallel_degree = llm_utils.init_dist_env()
     if not predictor_args.inference_model:
         tokenizer.padding_side = "left"
         if predictor_args.mode == "dynamic":