@@ -19,7 +19,6 @@
import time
from abc import abstractmethod
from dataclasses import dataclass, field
-from distutils.command.config import config

import numpy as np
import paddle
@@ -139,6 +138,7 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = N
        self.tokenizer = tokenizer
        self.return_tensors = "pd"
        self.tensor_parallel_rank, self.tensor_parallel_degree = init_dist_env()
+       self.model_config.tensor_parallel_rank, self.model_config.tensor_parallel_degree = init_dist_env()

    def _preprocess(self, source):
        tokenized_source = self.tokenizer(
@@ -284,11 +284,11 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
            self.cache_kvs[0].shape[-3],
            self.cache_kvs[0].shape[-1],
        )
-       total_max_length = config.src_length + config.max_length
-       self.pre_ids = paddle.full([config.batch_size, total_max_length], -1, dtype="int64")
+       self.total_max_length = config.src_length + config.max_length
+       self.pre_ids = paddle.full([config.batch_size, self.total_max_length], -1, dtype="int64")
        if "chatglm" in self.architectures:
            self.attention_mask = paddle.ones(
-               shape=(config.batch_size, 1, total_max_length, total_max_length),
+               shape=(config.batch_size, 1, self.total_max_length, self.total_max_length),
                dtype=self.dtype,
            )
            self.tgt_pos = paddle.ones(
@@ -297,15 +297,17 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
            )
        else:
            self.attention_mask = paddle.zeros(
-               shape=(config.batch_size, 1, total_max_length, total_max_length),
+               shape=(config.batch_size, 1, self.total_max_length, self.total_max_length),
                dtype=self.dtype,
            )

        self.tgt_generation_mask = paddle.zeros(
-           shape=[config.batch_size, 1, 1, total_max_length],
+           shape=[config.batch_size, 1, 1, self.total_max_length],
            dtype=self.dtype,
        )
-       self.arange_tensor_encoder = paddle.zeros(shape=(config.batch_size, 1, total_max_length), dtype=self.dtype)
+       self.arange_tensor_encoder = paddle.zeros(
+           shape=(config.batch_size, 1, self.total_max_length), dtype=self.dtype
+       )

        if config.export_precache:
            if config.prefix_path:
@@ -342,6 +344,10 @@ def _postprocess(self, predictions):
        return None

    def _preprocess(self, source):
+       self.attention_mask[:] = 0
+       self.tgt_generation_mask[:] = 0
+       pre_caches_length = 0 if not self.config.export_precache else self.pre_caches[0].shape[-2]
+
        if "chatglm" in self.architectures:
            inputs = dybatch_preprocess(
                self.tokenizer,
@@ -370,12 +376,12 @@ def _preprocess(self, source):
            )
            for i in range(inputs["input_ids"].shape[0]):
                length = inputs["seq_len_encoder"][i][0]
-               self.attention_mask[i, 0, :length, :length] = paddle.tril(
+               self.attention_mask[i, :, :length, :length] = paddle.tril(
                    paddle.ones(shape=(length, length), dtype=self.config.dtype)
                )
-               self.arange_tensor_encoder[i, 0, :length] = paddle.arange(length).astype(self.config.dtype)
+               self.arange_tensor_encoder[i, :, :length] = paddle.arange(length).astype(self.config.dtype)

-               self.tgt_generation_mask[i, 0, 0, :length] = paddle.ones(shape=[1, length], dtype=self.config.dtype)
+               self.tgt_generation_mask[i, :, 0, :length] = paddle.ones(shape=[1, length], dtype=self.config.dtype)
            # alibi encoder
            alibi_slopes = get_alibi_slopes(self.model_config.n_head)
            inputs["position_ids"] = paddle.to_tensor(alibi_slopes, dtype="float32")
@@ -402,16 +408,16 @@ def _preprocess(self, source):
                [
                    inputs["input_ids"].shape[0],
                    self.model_config.n_head // self.model_config.tensor_parallel_degree,
-                   self.config.max_length,
-                   self.config.max_length,
+                   self.total_max_length,
+                   self.total_max_length,
                ]
            )
            alibi_decoder = alibi.expand(
                [
                    inputs["input_ids"].shape[0],
                    self.model_config.n_head // self.model_config.tensor_parallel_degree,
                    1,
-                   self.config.max_length,
+                   self.total_max_length,
                ]
            )
            self.attention_mask = (
@@ -422,7 +428,6 @@ def _preprocess(self, source):
            )

        else:
-           pre_caches_length = 0 if not self.config.export_precache else self.pre_caches[0].shape[-2]
            inputs = dybatch_preprocess(
                self.tokenizer,
                source,
@@ -655,7 +660,7 @@ def create_predictor(
            from paddlenlp.experimental.transformers import (
                LlamaForCausalLMInferenceModel as LlamaInferenceModel,
            )
-
+
            config.tensor_parallel_degree = tensor_parallel_degree
            config.tensor_parallel_rank = tensor_parallel_rank
            config.quant_bits = -1
@@ -679,6 +684,20 @@ def create_predictor(
                dtype=predictor_args.dtype,
            )
            model.eval()
+       elif "bloom" in config.architectures[0].lower():
+           from paddlenlp.experimental.transformers import (
+               BloomForCausalLMInferenceModel,
+           )
+
+           config.tensor_parallel_degree = tensor_parallel_degree
+           config.tensor_parallel_rank = tensor_parallel_rank
+           model = BloomForCausalLMInferenceModel.from_pretrained(
+               predictor_args.model_name_or_path,
+               config=config,
+               dtype=predictor_args.dtype,
+           )
+           cache_kvs_shape = BloomForCausalLMInferenceModel.get_cache_kvs_shape(config, predictor_args.batch_size)
+           model.eval()
        predictor = DygraphInferencePredictor(predictor_args, model=model, tokenizer=tokenizer)
    elif predictor_args.mode == "static":
        config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)
@@ -698,6 +717,15 @@ def create_predictor(
                config, predictor_args.batch_size
            )
            predictor = StaticInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer)
+       elif "bloom" in config.architectures[0].lower():
+           from paddlenlp.experimental.transformers import (
+               BloomForCausalLMInferenceModel,
+           )
+
+           cache_kvs_shape = BloomForCausalLMInferenceModel.get_cache_kvs_shape(config, predictor_args.batch_size)
+           predictor = StaticInferencePredictor(
+               predictor_args, cache_kvs_shape=cache_kvs_shape, tokenizer=tokenizer
+           )
        else:
            raise ValueError("the `mode` should be one of [dynamic, static]")
    return predictor
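
A minimal sketch of how the dynamic-mode Bloom path added above can be exercised on its own. It only repeats the calls this patch introduces; the checkpoint name, dtype, and batch size are placeholder assumptions, not values taken from the patch.

from paddlenlp.transformers import AutoConfig
from paddlenlp.experimental.transformers import BloomForCausalLMInferenceModel

# Placeholder checkpoint and settings, assumed for illustration only.
model_name_or_path = "bigscience/bloom-560m"
config = AutoConfig.from_pretrained(model_name_or_path)
config.tensor_parallel_degree = 1  # single-card run, mirroring the assignments in the patch
config.tensor_parallel_rank = 0

# The same calls the new "bloom" branch makes inside create_predictor.
model = BloomForCausalLMInferenceModel.from_pretrained(
    model_name_or_path, config=config, dtype="float16"
)
cache_kvs_shape = BloomForCausalLMInferenceModel.get_cache_kvs_shape(config, 1)
model.eval()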