Commit 6e12ef3
[LLM Inference] move llm.utils.utils.py to paddlenlp.utils.llm_utils.py (#8946)
* adjust some code
* update
* update
1 parent 6d538c7 commit 6e12ef3

File tree

7 files changed: +146 additions, -148 deletions
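
For downstream code, the change is purely an import-path move: helpers that used to live in llm/utils/utils.py now ship inside the installed paddlenlp package as paddlenlp.utils.llm_utils. A minimal before/after sketch (the arguments are illustrative, not taken from this commit):

    # Before: helpers imported from the local llm/utils/utils.py module
    # from utils.utils import generate_rank_mapping, get_infer_model_path

    # After: the same helpers are namespaced under paddlenlp.utils.llm_utils
    from paddlenlp.utils import llm_utils

    infer_path = llm_utils.get_infer_model_path("./output", "model")  # illustrative args
    llm_utils.generate_rank_mapping("./output/rank_mapping.csv")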

legacy/examples/RLHF/infer_utils.py

Lines changed: 3 additions & 5 deletions
@@ -53,11 +53,9 @@ def __init__(self, config, model: PretrainedModel = None, tokenizer: PretrainedT
 
     @staticmethod
     def create_predictor(trainer):
-        from predictor import (
-            PdArgumentParser,
-            PredictorArgument,
-            get_model_max_position_embeddings,
-        )
+        from predictor import PdArgumentParser, PredictorArgument
+
+        from paddlenlp.utils.llm_utils import get_model_max_position_embeddings
 
         # create infer model
         # NOTE: infer model use static name param_attr to create and cannot be

llm/alignment/ppo/infer_utils.py

Lines changed: 3 additions & 5 deletions
@@ -53,11 +53,9 @@ def __init__(self, config, model: PretrainedModel = None, tokenizer: PretrainedT
 
     @staticmethod
     def create_predictor(trainer):
-        from predictor import (
-            PdArgumentParser,
-            PredictorArgument,
-            get_model_max_position_embeddings,
-        )
+        from predictor import PdArgumentParser, PredictorArgument
+
+        from paddlenlp.utils.llm_utils import get_model_max_position_embeddings
 
         # create infer model
         # NOTE: infer model use static name param_attr to create and cannot be

llm/predict/export_model.py

Lines changed: 3 additions & 3 deletions
@@ -19,9 +19,9 @@
 import paddle
 from paddle.distributed import fleet
 from predict.predictor import ModelArgument, PredictorArgument, create_predictor
-from utils.utils import generate_rank_mapping, get_infer_model_path
 
 from paddlenlp.trainer import PdArgumentParser
+from paddlenlp.utils import llm_utils
 
 
 @dataclass
@@ -53,7 +53,7 @@ def main():
     predictor.model.eval()
 
     predictor.model.to_static(
-        get_infer_model_path(export_args.output_path, predictor_args.model_prefix),
+        llm_utils.get_infer_model_path(export_args.output_path, predictor_args.model_prefix),
         {
             "dtype": predictor_args.dtype,
             "export_precache": predictor_args.export_precache,
@@ -67,7 +67,7 @@ def main():
     predictor.model.generation_config.save_pretrained(export_args.output_path)
 
     predictor.tokenizer.save_pretrained(export_args.output_path)
-    generate_rank_mapping(os.path.join(export_args.output_path, "rank_mapping.csv"))
+    llm_utils.generate_rank_mapping(os.path.join(export_args.output_path, "rank_mapping.csv"))
 
     if tensor_parallel_degree > 1:
         export_args.output_path = os.path.join(export_args.output_path, f"rank_{tensor_parallel_rank}")

llm/predict/predictor.py

Lines changed: 31 additions & 126 deletions
@@ -20,24 +20,12 @@
 from abc import abstractmethod
 from dataclasses import dataclass, field
 from threading import Thread
-from typing import List, Optional
 
 import numpy as np
 import paddle
-import paddle.distributed.fleet.base.topology as tp
 import paddle.incubate.multiprocessing as mp
 from paddle.base.framework import in_cinn_mode, in_pir_executor_mode
 from paddle.distributed import fleet
-from utils.utils import (
-    dybatch_preprocess,
-    get_alibi_slopes,
-    get_infer_model_path,
-    get_model_max_position_embeddings,
-    get_prefix_tuning_params,
-    init_chat_template,
-    load_real_time_tokens,
-    read_res,
-)
 
 from paddlenlp.generation import GenerationConfig, TextIteratorStreamer
 from paddlenlp.peft import LoRAConfig, LoRAModel, PrefixConfig, PrefixModelForCausalLM
@@ -54,6 +42,7 @@
     PretrainedModel,
     PretrainedTokenizer,
 )
+from paddlenlp.utils import llm_utils
 from paddlenlp.utils.import_utils import import_module, is_paddlenlp_ops_available
 from paddlenlp.utils.log import logger
 
@@ -157,50 +146,6 @@ def batchfy_text(texts, batch_size):
     return batch_texts
 
 
-def init_dist_env():
-    tensor_parallel_degree = paddle.distributed.get_world_size()
-    tensor_parallel_rank = paddle.distributed.get_rank()
-
-    if tensor_parallel_degree > 1:
-        # refer to: https://github.com/PaddlePaddle/Paddle/blob/4abea956ee852ce52791a1e08fa92ed4d3be150d/python/paddle/distributed/fleet/fleet.py#L298C23-L298C45
-        hcg = tp._HYBRID_PARALLEL_GROUP
-        if hcg is None:
-            strategy = fleet.DistributedStrategy()
-            strategy.hybrid_configs = {
-                "dp_degree": 1,
-                "mp_degree": tensor_parallel_degree,
-                "pp_degree": 1,
-                "sharding_degree": 1,
-            }
-            fleet.init(is_collective=True, strategy=strategy)
-            hcg = fleet.get_hybrid_communicate_group()
-
-        tensor_parallel_rank = hcg.get_model_parallel_rank()
-    return tensor_parallel_rank, tensor_parallel_degree
-
-
-def get_eos_token_id(
-    tokenizer: PretrainedTokenizer, generation_config: Optional[GenerationConfig] = None
-) -> List[List[int]]:
-    """get eos_token_id from generation_config or tokenizer
-
-    Returns:
-        List[int]: eos_token_id to stop the generation
-    """
-    eos_token_ids = []
-    if tokenizer.eos_token_id is not None:
-        eos_token_ids.append(tokenizer.eos_token_id)
-
-    if generation_config is not None and generation_config.eos_token_id is not None:
-        if isinstance(generation_config.eos_token_id, int):
-            eos_token_ids.append(generation_config.eos_token_id)
-        else:
-            eos_token_ids.extend(generation_config.eos_token_id)
-
-    eos_token_ids_dict = {str(item): item for item in eos_token_ids}
-    return list(eos_token_ids_dict.values())
-
-
 class BasePredictor:
     def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None):
         self.model_config = AutoConfig.from_pretrained(config.model_name_or_path)
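
The two helpers deleted above were moved, not dropped: they are now called as llm_utils.init_dist_env() and llm_utils.get_eos_token_id() in the hunks below. For reference, a standalone sketch of the de-duplication behavior of get_eos_token_id, condensed from the removed body (the function name and arguments here are illustrative):

    # Merge eos ids from the tokenizer and generation config; duplicates are
    # dropped via a str-keyed dict, which preserves first-seen order.
    def merge_eos_ids(tokenizer_eos_id, config_eos_ids):
        eos_token_ids = []
        if tokenizer_eos_id is not None:
            eos_token_ids.append(tokenizer_eos_id)
        if config_eos_ids is not None:
            if isinstance(config_eos_ids, int):
                eos_token_ids.append(config_eos_ids)
            else:
                eos_token_ids.extend(config_eos_ids)
        return list({str(i): i for i in eos_token_ids}.values())

    assert merge_eos_ids(2, [2, 151643]) == [2, 151643]  # duplicate 2 kept once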
@@ -211,8 +156,11 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = N
         self.tokenizer = tokenizer
 
         self.return_tensors = "pd"
-        self.tensor_parallel_rank, self.tensor_parallel_degree = init_dist_env()
-        self.model_config.tensor_parallel_rank, self.model_config.tensor_parallel_degree = init_dist_env()
+        self.tensor_parallel_rank, self.tensor_parallel_degree = llm_utils.init_dist_env()
+        self.model_config.tensor_parallel_rank, self.model_config.tensor_parallel_degree = (
+            self.tensor_parallel_rank,
+            self.tensor_parallel_degree,
+        )
 
         try:
             self.generation_config = GenerationConfig.from_pretrained(config.model_name_or_path)
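
Note that this hunk does more than rename: the old code called init_dist_env() twice, once for the predictor and once for the model config, while the new code calls llm_utils.init_dist_env() once and reuses the resulting rank and degree for both.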
@@ -293,7 +241,7 @@ def __init__(
             )
             self.model.merge()
         if config.prefix_path is not None:
-            prefix_tuning_params = get_prefix_tuning_params(self.model)
+            prefix_tuning_params = llm_utils.get_prefix_tuning_params(self.model)
             self.model = PrefixModelForCausalLM.from_pretrained(
                 model=self.model,
                 prefix_path=config.prefix_path,
@@ -307,7 +255,7 @@ def _infer(self, inputs: dict[str, paddle.Tensor]):
             **inputs,
             max_new_tokens=self.config.max_length,
             bos_token_id=self.tokenizer.bos_token_id,
-            eos_token_id=get_eos_token_id(self.tokenizer, self.generation_config),
+            eos_token_id=llm_utils.get_eos_token_id(self.tokenizer, self.generation_config),
             pad_token_id=self.tokenizer.pad_token_id,
             decode_strategy=self.config.decode_strategy,
             temperature=self.config.temperature,
@@ -326,7 +274,7 @@ def stream_predict(self, inputs: dict[str, paddle.Tensor]):
             streamer=text_streamer,
             max_new_tokens=self.config.max_length,
             bos_token_id=self.tokenizer.bos_token_id,
-            eos_token_id=get_eos_token_id(self.tokenizer, self.generation_config),
+            eos_token_id=llm_utils.get_eos_token_id(self.tokenizer, self.generation_config),
             pad_token_id=self.tokenizer.pad_token_id,
             decode_strategy=(
                 "greedy_search" if self.config.top_k == 1 and self.config.top_p == 1.0 else self.config.decode_strategy
@@ -465,7 +413,7 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
 
     def _postprocess(self, predictions, return_tokens=False):
         if paddle.distributed.get_rank() == 0:
-            tokens: np.ndarray = load_real_time_tokens()
+            tokens: np.ndarray = llm_utils.load_real_time_tokens()
             decoded_predictions = self.tokenizer.batch_decode(
                 tokens.tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=False
             )
@@ -487,15 +435,15 @@ def _preprocess(self, source):
         source = [source] if isinstance(source, str) else source
         source = [self.tokenizer.apply_chat_template(sentence, tokenize=False) for sentence in source]
 
-        inputs = dybatch_preprocess(
+        inputs = llm_utils.dybatch_preprocess(
             self.tokenizer,
             source,
             self.config.src_length,
             self.config.max_length,
             self.architectures,
             top_p=self.config.top_p,
             temperature=self.config.temperature,
-            eos_token_id=get_eos_token_id(self.tokenizer, self.generation_config),
+            eos_token_id=llm_utils.get_eos_token_id(self.tokenizer, self.generation_config),
             benchmark=self.config.benchmark,
             pre_caches_length=pre_caches_length,
         )
@@ -546,7 +494,7 @@ def _preprocess(self, source):
 
         inputs["tgt_pos"] = inputs["tgt_pos"] + pre_caches_length
         # alibi encoder
-        alibi_slopes = get_alibi_slopes(self.model_config.n_head)
+        alibi_slopes = llm_utils.get_alibi_slopes(self.model_config.n_head)
         inputs["position_ids"] = paddle.to_tensor(alibi_slopes, dtype="float32")
         arange_tensor_encoder = paddle.arange(self.config.total_max_length, dtype=self.config.dtype)
         alibi = alibi_slopes[None, :, None, None] * arange_tensor_encoder
@@ -667,7 +615,9 @@ def _create_predictor(self, predictor_args: PredictorArgument):
         import_module("paddlenlp_ops.transpose_remove_padding")
         import_module("paddlenlp_ops.write_cache_kv")
 
-        infer_model_path = get_infer_model_path(predictor_args.model_name_or_path, predictor_args.model_prefix)
+        infer_model_path = llm_utils.get_infer_model_path(
+            predictor_args.model_name_or_path, predictor_args.model_prefix
+        )
 
         config = paddle.inference.Config(infer_model_path + ".pdmodel", infer_model_path + ".pdiparams")
 
@@ -854,7 +804,7 @@ def init_model_inputs(self, config: PredictorArgument):
             shape=[config.batch_size, 1], fill_value=config.temperature, dtype="float32"
         )
         self.model_inputs["eos_token_id"] = paddle.to_tensor(
-            np.array(get_eos_token_id(self.tokenizer, self.generation_config)).reshape(-1, 1).astype("int64")
+            np.array(llm_utils.get_eos_token_id(self.tokenizer, self.generation_config)).reshape(-1, 1).astype("int64")
         )
         self.model_inputs["penalty_score"] = paddle.full(
             shape=[config.batch_size, 1], fill_value=config.repetition_penalty, dtype="float32"
@@ -871,7 +821,7 @@ def init_model_inputs(self, config: PredictorArgument):
         self.model_inputs["max_length"] = paddle.full(
             shape=[config.batch_size, 1], fill_value=config.max_length, dtype="int64"
         )
-        self.model_inputs["rope_emb"] = self._get_rotary_position_embedding(
+        self.model_inputs["rope_emb"] = llm_utils.get_rotary_position_embedding(
             paddle.arange(config.total_max_length).reshape((1, -1)), self.head_dim, self.rope_theta, self.rope_scaling
         )
         self.model_inputs["bad_tokens"] = paddle.to_tensor([-1], dtype="int64")
@@ -888,7 +838,7 @@ def init_model_inputs(self, config: PredictorArgument):
             shape=[config.batch_size, 1, 1, config.total_max_length], fill_value=1, dtype=self.dtype
         )
         arange_tensor_encoder = paddle.arange(config.total_max_length).astype(self.dtype)
-        alibi_slopes = get_alibi_slopes(self.num_attention_heads)
+        alibi_slopes = llm_utils.get_alibi_slopes(self.num_attention_heads)
         alibi = alibi_slopes[None, :, None, None] * arange_tensor_encoder
         alibi_encoder = alibi.tile([config.batch_size, 1, config.total_max_length, 1])
         alibi_decoder = alibi.tile(
@@ -907,57 +857,6 @@ def init_model_inputs(self, config: PredictorArgument):
             alibi_decoder + (1 - self.model_inputs["tgt_mask"]) * paddle.finfo(self.dtype).min
         ).cast(self.dtype)
 
-    def _get_rotary_position_embedding(self, position_ids, head_dim, rope_theta=10000.0, rope_scaling: dict = None):
-        """
-        Pre-calculate rotary position embedding for position_ids.
-
-        Args:
-            position_ids: [1, S]
-            head_dim: D
-
-        Returns:
-            rot_emb: [2, 1, S, 1, D], cos + sin
-        """
-        bsz, max_seq_len = position_ids.shape[:2]
-        rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, head_dim), dtype="float32")
-        inv_freq = rope_theta ** (-paddle.arange(0, head_dim, 2, dtype="float32") / head_dim)
-
-        if rope_scaling is not None:
-            rope_type = rope_scaling.get("rope_type", None)
-            if rope_type is not None and rope_type == "llama3":
-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
-                original_max_position_embeddings = rope_scaling.get("original_max_position_embeddings", 8192)
-
-                low_freq_wavelen = original_max_position_embeddings / low_freq_factor
-                high_freq_wavelen = original_max_position_embeddings / high_freq_factor
-                new_freqs = []
-                for freq in inv_freq:
-                    import math
-
-                    wavelen = 2 * math.pi / freq
-                    if wavelen < high_freq_wavelen:
-                        new_freqs.append(freq)
-                    elif wavelen > low_freq_wavelen:
-                        new_freqs.append(freq / factor)
-                    else:
-                        assert low_freq_wavelen != high_freq_wavelen
-                        smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
-                            high_freq_factor - low_freq_factor
-                        )
-                        new_freqs.append((1 - smooth) * freq / factor + smooth * freq)
-                inv_freq = paddle.to_tensor(new_freqs, dtype=inv_freq.dtype)
-
-        # shape: [B, S, D/2]
-        freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq)
-        # shape: [B, S, 1, D]
-        emb = paddle.concat([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, 1, head_dim))
-
-        rot_emb[0] = paddle.cos(emb)
-        rot_emb[1] = paddle.sin(emb)
-        return rot_emb
-
     def _preprocess(self, input_text: list[str]):
         if self.tokenizer.chat_template is not None:
             input_text = [input_text] if isinstance(input_text, str) else input_text
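
The removed method resurfaces as the module-level function llm_utils.get_rotary_position_embedding (note the call site in the earlier hunk: it is no longer a method on the predictor). Its llama3 rope_scaling branch rescales each inverse frequency by wavelength band; a condensed sketch of that rule, following the removed body above:

    import math

    # Piecewise llama3-style frequency rescaling, condensed from the removed code.
    def scale_inv_freq_llama3(freq, factor, low_freq_factor, high_freq_factor, orig_max_pos):
        wavelen = 2 * math.pi / freq
        if wavelen < orig_max_pos / high_freq_factor:
            return freq  # high-frequency band: keep unchanged
        if wavelen > orig_max_pos / low_freq_factor:
            return freq / factor  # low-frequency band: scale down by factor
        # middle band: interpolate smoothly between the two regimes
        smooth = (orig_max_pos / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
        return (1 - smooth) * freq / factor + smooth * freq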
@@ -1053,7 +952,9 @@ def predict(self, input_texts: list[str], return_tokens=False):
             output_tensor = output_tensor.cpu()
         tensor_queue.put(output_tensor)
 
-        read_res_process = mp.Process(target=read_res, args=[self.model_name_or_path, tensor_queue, result_queue])
+        read_res_process = mp.Process(
+            target=llm_utils.read_res, args=[self.model_name_or_path, tensor_queue, result_queue]
+        )
         if self.tensor_parallel_rank == 0:
             read_res_process.start()
 
@@ -1119,7 +1020,9 @@ def _create_predictor(self, predictor_args: PredictorArgument):
             "https://github.com/PaddlePaddle/PaddleNLP/blob/develop/csrc/README.md"
         )
 
-        infer_model_path = get_infer_model_path(predictor_args.model_name_or_path, predictor_args.model_prefix)
+        infer_model_path = llm_utils.get_infer_model_path(
+            predictor_args.model_name_or_path, predictor_args.model_prefix
+        )
 
         config = paddle.inference.Config(infer_model_path + ".pdmodel", infer_model_path + ".pdiparams")
 
@@ -1175,7 +1078,9 @@ def predict(self, input_texts: list[str], return_tokens=False):
             output_tensor = output_tensor.cpu()
         tensor_queue.put(output_tensor)
 
-        read_res_process = mp.Process(target=read_res, args=[self.model_name_or_path, tensor_queue, result_queue])
+        read_res_process = mp.Process(
+            target=llm_utils.read_res, args=[self.model_name_or_path, tensor_queue, result_queue]
+        )
 
         if self.tensor_parallel_rank == 0:
             read_res_process.start()
@@ -1220,15 +1125,15 @@ def create_predictor(
         predictor_args.model_name_or_path,
     )
     # init chat_template for tokenizer
-    init_chat_template(tokenizer, predictor_args.model_name_or_path, predictor_args.chat_template)
+    llm_utils.init_chat_template(tokenizer, predictor_args.model_name_or_path, predictor_args.chat_template)
 
     # TODO(wj-Mcat): fix llama tokenzier pad_token bug
     if (isinstance(tokenizer, (LlamaTokenizer, Llama3Tokenizer))) and not tokenizer.pad_token:
         tokenizer.pad_token = tokenizer.eos_token
 
     config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)
 
-    max_position_embeddings = get_model_max_position_embeddings(config)
+    max_position_embeddings = llm_utils.get_model_max_position_embeddings(config)
     if max_position_embeddings is None:
         max_position_embeddings = predictor_args.src_length + predictor_args.max_length
         logger.warning(
@@ -1247,7 +1152,7 @@ def create_predictor(
     predictor_args.top_p = 0.0
     predictor_args.temperature = 1.0
 
-    tensor_parallel_rank, tensor_parallel_degree = init_dist_env()
+    tensor_parallel_rank, tensor_parallel_degree = llm_utils.init_dist_env()
     if not predictor_args.inference_model:
         tokenizer.padding_side = "left"
         if predictor_args.mode == "dynamic":
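
Taken together, the hunks above outline the surface of the relocated module. A sketch of the helpers this commit moves to paddlenlp.utils.llm_utils, as evidenced by the call sites in the diffs (the module may expose more than is visible here):

    from paddlenlp.utils.llm_utils import (
        dybatch_preprocess,
        generate_rank_mapping,
        get_alibi_slopes,
        get_eos_token_id,
        get_infer_model_path,
        get_model_max_position_embeddings,
        get_prefix_tuning_params,
        get_rotary_position_embedding,
        init_chat_template,
        init_dist_env,
        load_real_time_tokens,
        read_res,
    )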
