
Commit 9571c45

enhance eos_tokens (#3274)
* enhance eos_tokens
* update
* update
1 parent 21caa63 commit 9571c45

11 files changed: +71 −89 lines changed

docs/usage/environment_variables.md

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to use HuggingFace tokenizer (0 or 1)
     "FD_USE_HF_TOKENIZER":
-    lambda: os.getenv("FD_USE_HF_TOKENIZER", 0),
+    lambda: bool(int(os.getenv("FD_USE_HF_TOKENIZER", 0))),
 
     # ZMQ send high-water mark (HWM) during initialization
     "FD_ZMQ_SNDHWM":

docs/zh/usage/environment_variables.md

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 # FastDeploy 环境变量说明
+
 FastDeploy 的环境变量保存在了代码库根目录下 fastdeploy/envs.py 文件中,以下是其对应的中文版说明:
 
 ```python
@@ -37,7 +38,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # 是否使用 HuggingFace 分词器
     "FD_USE_HF_TOKENIZER":
-    lambda: os.getenv("FD_USE_HF_TOKENIZER", 0),
+    lambda: bool(int(os.getenv("FD_USE_HF_TOKENIZER", 0))),
 
     # 设置 ZMQ 初始化期间接收数据的高水位标记(HWM)
     "FD_ZMQ_SNDHWM":

fastdeploy/config.py

Lines changed: 2 additions & 4 deletions
@@ -125,6 +125,8 @@ def __init__(
         self.redundant_experts_num = 0
         self.seed = 0
         self.quantization = None
+        self.pad_token_id: int = -1
+        self.eos_tokens_lens: int = 2
         for key, value in args.items():
             if hasattr(self, key):
                 setattr(self, key, value)

@@ -258,10 +260,6 @@ def __init__(
         self.engine_pid: Optional[int] = None
         # Do profile or not
         self.do_profile: bool = False
-        #
-        self.pad_token_id: int = -1
-        #
-        self.eos_tokens_lens: int = 2
 
         self.max_num_batched_tokens: int = 2048
         # splitwise role
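Reviewer note: with this move, `pad_token_id` and `eos_tokens_lens` become plain ModelConfig defaults that the usual `args` loop can override, so the model runners below read them from `model_config` rather than `parallel_config`. A minimal sketch of that override pattern, assuming a hypothetical `ModelConfigSketch` stand-in (not the real ModelConfig):

```python
class ModelConfigSketch:
    def __init__(self, args: dict):
        self.pad_token_id: int = -1      # default when the checkpoint defines none
        self.eos_tokens_lens: int = 2    # how many EOS ids the runner buffers expect
        # Same hasattr/setattr loop as in the diff: known keys override the defaults.
        for key, value in args.items():
            if hasattr(self, key):
                setattr(self, key, value)

cfg = ModelConfigSketch({"eos_tokens_lens": 3})
print(cfg.pad_token_id, cfg.eos_tokens_lens)  # -1 3
```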

fastdeploy/envs.py

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@
     # splited by comma, such as 0,1,2.
     "CUDA_VISIBLE_DEVICES": lambda: os.getenv("CUDA_VISIBLE_DEVICES", None),
     # Whether to use HuggingFace tokenizer.
-    "FD_USE_HF_TOKENIZER": lambda: os.getenv("FD_USE_HF_TOKENIZER", 0),
+    "FD_USE_HF_TOKENIZER": lambda: bool(int(os.getenv("FD_USE_HF_TOKENIZER", "0"))),
     # Set the high watermark (HWM) for receiving data during ZMQ initialization
     "FD_ZMQ_SNDHWM": lambda: os.getenv("FD_ZMQ_SNDHWM", 10000),
     # cache kv quant params directory

@@ -61,7 +61,7 @@
     # Whether transition from standalone PD decoupling to centralized inference
     "FD_PD_CHANGEABLE": lambda: os.getenv("FD_PD_CHANGEABLE", "0"),
     # Whether to use fastsafetensor load weight (0 or 1)
-    "FD_USE_FASTSAFETENSOR": lambda: os.getenv("FD_USE_FASTSAFETENSOR", "0"),
+    "FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
     # Whether to use DeepGemm for FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
     # Whether to use aggregate send.
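Reviewer note: the point of the `bool(int(...))` wrapper is that `os.getenv` returns a string (or the raw default), and any non-empty string, including "0", is truthy, so `FD_USE_HF_TOKENIZER=0` would still look enabled if the raw value were used in a condition. A minimal standalone sketch of the difference (plain Python, not FastDeploy code):

```python
import os

os.environ["FD_USE_HF_TOKENIZER"] = "0"

# Raw getenv returns the string "0", which is truthy.
raw = os.getenv("FD_USE_HF_TOKENIZER", 0)
print(bool(raw))  # True -- the feature would wrongly look enabled

# Casting through int() first yields a real boolean flag.
flag = bool(int(os.getenv("FD_USE_HF_TOKENIZER", "0")))
print(flag)  # False -- matches the user's intent
```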

fastdeploy/input/ernie_processor.py

Lines changed: 13 additions & 16 deletions
@@ -19,7 +19,6 @@
 import numpy as np
 from paddleformers.generation import GenerationConfig
 
-from fastdeploy import envs
 from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
 from fastdeploy.input.text_processor import BaseDataProcessor
 from fastdeploy.utils import data_processor_logger

@@ -47,7 +46,16 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None):
 
         self.model_name_or_path = model_name_or_path
         data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
-        self._init_config()
+
+        # Generation config
+        try:
+            self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path)
+        except Exception as e:
+            data_processor_logger.warning(
+                f"Can't find generation config, so it will not use "
+                f"generation_config field in the model config, details={e}"
+            )
+            self.generation_config = None
 
         self.decode_status = dict()
         self.thinking_parser_dict = dict()

@@ -57,26 +65,15 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None):
             {self.tokenizer.bos_token_id}, \
             eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
         )
-        self.eos_token_ids = [self.tokenizer.eos_token_id]
+        from paddleformers.trl.llm_utils import get_eos_token_id
+
+        self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config)
         self.eos_token_id_len = len(self.eos_token_ids)
         self.pad_token_id = self.get_pad_id()
         self.reasoning_parser = None
         if reasoning_parser_obj:
             self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
 
-    def _init_config(self):
-        self.use_hf_tokenizer = int(envs.FD_USE_HF_TOKENIZER) == 1
-
-        # Generation config
-        try:
-            self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path)
-        except Exception as e:
-            data_processor_logger.warning(
-                f"Can't find generation config, so it will not use "
-                f"generation_config field in the model config, details={e}"
-            )
-            self.generation_config = None
-
     def process_request(self, request, max_model_len=None, **kwargs):
         """
         Preprocess the request
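Reviewer note: `get_eos_token_id` is imported from `paddleformers.trl.llm_utils` and called with the tokenizer and the (possibly `None`) generation config; the exact merge logic lives in paddleformers. A hedged, illustrative stand-in for the behaviour the processors now rely on, namely getting a list of EOS ids instead of the single `tokenizer.eos_token_id`, assuming the helper unions the generation config's ids with the tokenizer's:

```python
# Illustrative stand-in only: the real helper is
# paddleformers.trl.llm_utils.get_eos_token_id. This sketch just shows the
# kind of result the processors now expect (a list of EOS ids).
def get_eos_token_id_sketch(tokenizer, generation_config):
    eos_ids = []
    if generation_config is not None and generation_config.eos_token_id is not None:
        cfg_eos = generation_config.eos_token_id
        eos_ids.extend(cfg_eos if isinstance(cfg_eos, (list, tuple)) else [cfg_eos])
    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in eos_ids:
        eos_ids.append(tokenizer.eos_token_id)
    return eos_ids

# The processor then stores the whole list rather than a single id:
#   self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config)
#   self.eos_token_id_len = len(self.eos_token_ids)
```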

fastdeploy/input/ernie_vl_processor.py

Lines changed: 11 additions & 13 deletions
@@ -14,8 +14,6 @@
 # limitations under the License.
 """
 
-import os
-
 import numpy as np
 from paddleformers.generation import GenerationConfig
 

@@ -35,10 +33,6 @@ def __init__(
         mm_processor_kwargs=None,
         reasoning_parser_obj=None,
     ):
-        self.use_hf_tokenizer = False
-
-        if "merge_llm_model" in model_name_or_path:
-            model_name_or_path = os.path.dirname(model_name_or_path)
         data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
         tokenizer_path = model_name_or_path
         preprocessor_path = model_name_or_path

@@ -55,13 +49,6 @@ def __init__(
 
         self.decode_status = dict()
         self._load_tokenizer()
-        self.eos_token_ids = [self.tokenizer.eos_token_id]
-        self.eos_token_id_len = len(self.eos_token_ids)
-        self.pad_token_id = self.get_pad_id()
-        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
-        self.reasoning_parser = None
-        if reasoning_parser_obj:
-            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
 
         # Generation config
         try:

@@ -72,6 +59,17 @@ def __init__(
             )
             self.generation_config = None
 
+        # self.eos_token_ids = [self.tokenizer.eos_token_id]
+        from paddleformers.trl.llm_utils import get_eos_token_id
+
+        self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config)
+        self.eos_token_id_len = len(self.eos_token_ids)
+        self.pad_token_id = self.get_pad_id()
+        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
+        self.reasoning_parser = None
+        if reasoning_parser_obj:
+            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
+
     def get_pad_id(self):
         """get pad id"""
         return self.tokenizer.pad_token_id

fastdeploy/input/text_processor.py

Lines changed: 12 additions & 29 deletions

@@ -165,7 +165,14 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None):
 
         self.model_name_or_path = model_name_or_path
 
-        self._init_config()
+        # Generation config
+        try:
+            self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path)
+        except Exception as e:
+            data_processor_logger.warning(
+                f"Can't find generation config: {e}, so it will not use generation_config field in the model config"
+            )
+            self.generation_config = None
 
         self.decode_status = dict()
         self.tokenizer = self._load_tokenizer()

@@ -184,30 +191,6 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None):
             self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
         self.tokenizer.pad_token_id = self.pad_token_id
 
-    def _init_config(self):
-        """
-        初始化配置,包括模型名称、使用Hugging Face Tokenizer等。
-
-        Args:
-            无参数,但是会从环境变量中获取一些配置信息。
-
-        Returns:
-            无返回值,直接修改了类的属性。
-
-        Raises:
-            无异常抛出。
-        """
-        self.use_hf_tokenizer = int(envs.FD_USE_HF_TOKENIZER) == 1
-
-        # Generation config
-        try:
-            self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path)
-        except Exception as e:
-            data_processor_logger.warning(
-                f"Can't find generation config: {e}, so it will not use generation_config field in the model config"
-            )
-            self.generation_config = None
-
     def process_request(self, request, max_model_len=None, **kwargs):
         """
         Preprocess the request

@@ -433,7 +416,7 @@ def text2ids(self, text, max_model_len):
         Returns:
             List[int]: token ids list
         """
-        if self.use_hf_tokenizer:
+        if envs.FD_USE_HF_TOKENIZER:
             tokens = self.tokenizer(
                 text,
                 return_tensors="np",

@@ -491,7 +474,7 @@ def ids2tokens(self, token_id, task_id):
         Returns:
             List[str]: strings
         """
-        if self.use_hf_tokenizer:
+        if envs.FD_USE_HF_TOKENIZER:
             if task_id not in self.decode_status:
                 # history token ids & history token strings & befer decode str
                 self.decode_status[task_id] = [[], [], ""]

@@ -536,7 +519,7 @@ def _load_tokenizer(self):
         Returns:
             tokenizer (AutoTokenizer)
         """
-        if self.use_hf_tokenizer:
+        if envs.FD_USE_HF_TOKENIZER:
             from transformers import AutoTokenizer
 
             return AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=False)

@@ -557,7 +540,7 @@ def clear_request_status(self, task_id):
         """
         results_all = ""
         if task_id in self.decode_status:
-            if self.use_hf_tokenizer:
+            if envs.FD_USE_HF_TOKENIZER:
                 results_all = self.decode_status[task_id][2]
             else:
                 results_all = "".join(self.decode_status[task_id][3])

fastdeploy/model_executor/pre_and_post_process.py

Lines changed: 2 additions & 1 deletion
@@ -181,7 +181,8 @@ def post_process_normal(
     )
 
     stop_wo_think = (
-        (sampler_output.sampled_token_ids == model_output.eos_token_id) | (model_output.reasoning_index == 0)
+        (sampler_output.sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True)
+        | (model_output.reasoning_index == 0)
     ) & (model_output.need_think_end > 0)
     sampler_output.sampled_token_ids = paddle.where(
         stop_wo_think,
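Reviewer note: with several EOS ids, `model_output.eos_token_id` is a column of shape `[num_eos, 1]`, so a plain `==` against the sampled token no longer produces a per-request flag; the new code transposes it and reduces with `any` over the EOS axis. A small NumPy sketch of the same shape logic (NumPy spells the keyword `keepdims`, Paddle uses `keepdim`):

```python
import numpy as np

sampled_token_ids = np.array([[2], [7], [100]])  # [batch, 1]
eos_token_id = np.array([[2], [100]])            # [num_eos, 1]

# Compare every sampled token against every EOS id, then reduce over the
# EOS axis so the result is again [batch, 1].
is_eos = (sampled_token_ids == eos_token_id.T).any(axis=1, keepdims=True)
print(is_eos.ravel())  # [ True False  True]
```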

fastdeploy/worker/gcu_model_runner.py

Lines changed: 6 additions & 6 deletions
@@ -236,8 +236,6 @@ def get_attr_from_request(request, attr, default_value=None):
             self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length
             self.share_inputs["prompt_lens"][idx : idx + 1] = length
 
-            if len(request.eos_token_ids) < self.parallel_config.eos_tokens_lens:
-                request.eos_token_ids.append(request.eos_token_ids[0])
             self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
             self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7)
             self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)

@@ -315,7 +313,9 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod
             idx = i
             self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
             self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
-            self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1)
+            self.share_inputs["eos_token_id"][:] = np.array(
+                [2] * self.model_config.eos_tokens_lens, dtype="int64"
+            ).reshape(-1, 1)
             self.seq_lens_this_time_buffer[idx : idx + 1] = input_length
             self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length
             self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length

@@ -350,15 +350,15 @@ def _init_share_inputs(self, max_num_seqs: int):
         )
         self.share_inputs["input_ids"] = paddle.full(
             [max_num_seqs, self.parallel_config.max_model_len],
-            self.parallel_config.pad_token_id,
+            self.model_config.pad_token_id,
             dtype="int64",
         )
         self.share_inputs["prompt_ids"] = paddle.full(
             [max_num_seqs, self.parallel_config.max_model_len],
-            self.parallel_config.pad_token_id,
+            self.model_config.pad_token_id,
             dtype="int64",
         )
-        self.share_inputs["eos_token_id"] = paddle.full([self.parallel_config.eos_tokens_lens, 1], 0, dtype="int64")
+        self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
         self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
         self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
         self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32")

fastdeploy/worker/gpu_model_runner.py

Lines changed: 13 additions & 9 deletions
@@ -265,7 +265,11 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
                 )
 
                 input_ids = request.prompt_token_ids + request.output_token_ids
-                logger.debug(f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}")
+                logger.debug(
+                    f"Handle prefill request {request} at idx {idx}, "
+                    f"{prefill_start_index=}, {prefill_end_index=}, "
+                    f"need_prefilled_token_num={len(input_ids)}"
+                )
                 self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
                     input_ids[prefill_start_index:prefill_end_index]
                 )

@@ -307,8 +311,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
                     self.share_inputs["is_block_step"][idx : idx + 1] = False
                     continue
 
-            if len(request.eos_token_ids) < self.parallel_config.eos_tokens_lens:
-                request.eos_token_ids.append(request.eos_token_ids[0])
+            assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens
             self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
 
             self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7)

@@ -471,8 +474,7 @@ def get_attr_from_request(request, attr, default_value=None):
             else:
                 return default_value
 
-            if len(request.eos_token_ids) < self.parallel_config.eos_tokens_lens:
-                request.eos_token_ids.append(request.eos_token_ids[0])
+            assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens
             self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
             self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7)
             self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)

@@ -562,7 +564,9 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod
             idx = i
             self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
             self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
-            self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1)
+            self.share_inputs["eos_token_id"][:] = np.array(
+                [2] * self.model_config.eos_tokens_lens, dtype="int64"
+            ).reshape(-1, 1)
             self.seq_lens_this_time_buffer[idx : idx + 1] = input_length
             self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length
             self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length

@@ -597,15 +601,15 @@ def _init_share_inputs(self, max_num_seqs: int):
         )
         self.share_inputs["input_ids"] = paddle.full(
             [max_num_seqs, self.parallel_config.max_model_len],
-            self.parallel_config.pad_token_id,
+            self.model_config.pad_token_id,
             dtype="int64",
         )
         self.share_inputs["prompt_ids"] = paddle.full(
             [max_num_seqs, self.parallel_config.max_model_len],
-            self.parallel_config.pad_token_id,
+            self.model_config.pad_token_id,
             dtype="int64",
         )
-        self.share_inputs["eos_token_id"] = paddle.full([self.parallel_config.eos_tokens_lens, 1], 0, dtype="int64")
+        self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
         self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
         self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
         self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32")
