
Commit beeef52

Merge branch 'develop' into think
2 parents: dd30110 + 70633c6

4 files changed (+843, -2 lines)

fastdeploy/cache_manager/cache_messager.py

Lines changed: 2 additions & 2 deletions
@@ -152,8 +152,8 @@ def __init__(
         cache_v = []
         self.messager = {}
         for layer_idx in range(self.num_layers):
-            key_cache = self.gpu_cache_kvs[f"key_caches_{layer_idx}_rank{self.rank}.device{gpu_id}"]
-            val_cache = self.gpu_cache_kvs[f"value_caches_{layer_idx}_rank{self.rank}.device{gpu_id}"]
+            key_cache = self.gpu_cache_kvs[f"key_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"]
+            val_cache = self.gpu_cache_kvs[f"value_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"]
             cache_k.append(key_cache)
             cache_v.append(val_cache)
             cache_k_ptr_list.append(key_cache.data_ptr())
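
The only change in this file is the separator in the KV-cache dictionary keys, from `.device{gpu_id}` to `_device{gpu_id}`, presumably to match the names under which the caches are registered. A minimal sketch of the resulting key format (the index values are illustrative, not from the source):

# Illustrative only: the key layout used by cache_messager after this commit.
layer_idx, rank, gpu_id = 0, 0, 0
key_name = f"key_caches_{layer_idx}_rank{rank}_device{gpu_id}"
value_name = f"value_caches_{layer_idx}_rank{rank}_device{gpu_id}"
print(key_name, value_name)  # key_caches_0_rank0_device0 value_caches_0_rank0_device0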

fastdeploy/entrypoints/cli/main.py

Lines changed: 2 additions & 0 deletions
@@ -25,10 +25,12 @@ def main():
     import fastdeploy.entrypoints.cli.openai
     import fastdeploy.entrypoints.cli.run_batch
     import fastdeploy.entrypoints.cli.serve
+    import fastdeploy.entrypoints.cli.tokenizer
     from fastdeploy.utils import FlexibleArgumentParser

     CMD_MODULES = [
         fastdeploy.entrypoints.cli.run_batch,
+        fastdeploy.entrypoints.cli.tokenizer,
         fastdeploy.entrypoints.cli.openai,
         fastdeploy.entrypoints.cli.benchmark.main,
         fastdeploy.entrypoints.cli.serve,
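
Adding the module to CMD_MODULES is all that is needed to expose the new subcommand: each listed module provides a `cmd_init()` hook returning `CLISubcommand` objects, and the new tokenizer module below follows exactly that pattern. A rough, hypothetical sketch of how such a registry is typically turned into subparsers is shown here; the loop body is an assumption, not the actual body of FastDeploy's `main()`:

# Hypothetical wiring sketch; FastDeploy's real main() may differ in detail.
def build_parser(cmd_modules, parser):
    subparsers = parser.add_subparsers(dest="subcommand")
    for module in cmd_modules:
        for cmd in module.cmd_init():          # e.g. [TokenizerSubcommand()]
            subparser = cmd.subparser_init(subparsers)
            subparser.set_defaults(dispatch_function=cmd.cmd)
    return parser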
fastdeploy/entrypoints/cli/tokenizer.py (new file)

Lines changed: 249 additions & 0 deletions
@@ -0,0 +1,249 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from __future__ import annotations

import argparse
import json
import typing
from pathlib import Path

from fastdeploy.entrypoints.cli.types import CLISubcommand
from fastdeploy.input.preprocess import InputPreprocessor

if typing.TYPE_CHECKING:
    from fastdeploy.utils import FlexibleArgumentParser


class TokenizerSubcommand(CLISubcommand):
    """The `tokenizer` subcommand for the FastDeploy CLI."""

    name = "tokenizer"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        main(args)

    def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        tokenizer_parser = subparsers.add_parser(
            name=self.name,
            help="Start the FastDeploy Tokenizer Server.",
            description="Start the FastDeploy Tokenizer Server.",
            usage="fastdeploy tokenizer [--encode/-e TEXT] [--decode/-d TEXT]",
        )

        # Common arguments
        tokenizer_parser.add_argument(
            "--model_name_or_path",
            "--model",
            "-m",
            type=str,
            default="baidu/ERNIE-4.5-0.3B-PT",
            help="Path to model or model identifier",
        )
        tokenizer_parser.add_argument("--enable-mm", "-mm", action="store_true", help="Enable multi-modal support")
        tokenizer_parser.add_argument("--vocab-size", "-vs", action="store_true", help="Show vocabulary size")
        tokenizer_parser.add_argument("--info", "-i", action="store_true", help="Show tokenizer information")
        tokenizer_parser.add_argument(
            "--vocab-export", "-ve", type=str, metavar="FILE", help="Export vocabulary to file"
        )
        tokenizer_parser.add_argument("--encode", "-e", default=None, help="Encode text to tokens")
        tokenizer_parser.add_argument("--decode", "-d", default=None, help="Decode tokens to text")

        return tokenizer_parser


def cmd_init() -> list[CLISubcommand]:
    return [TokenizerSubcommand()]


def get_vocab_size(tokenizer) -> int:
    """Return the vocabulary size."""
    try:
        if hasattr(tokenizer, "vocab_size"):
            return tokenizer.vocab_size
        elif hasattr(tokenizer, "get_vocab_size"):
            return tokenizer.get_vocab_size()
        else:
            return 100295  # fixed vocabulary size of Ernie4_5Tokenizer
    except Exception:
        return 0


def get_tokenizer_info(tokenizer) -> dict:
    """Return the tokenizer's metadata."""
    info = {}

    try:
        # Basic attributes
        info["vocab_size"] = get_vocab_size(tokenizer)

        # Model name and path
        if hasattr(tokenizer, "name_or_path"):
            info["model_name"] = tokenizer.name_or_path

        # Tokenizer class
        info["tokenizer_type"] = type(tokenizer).__name__

        # Special tokens
        special_tokens = {}
        for attr in ["bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token"]:
            if hasattr(tokenizer, attr):
                token = getattr(tokenizer, attr)
                if token:
                    special_tokens[attr] = token
        info["special_tokens"] = special_tokens

        # Special token IDs
        special_token_ids = {}
        for attr in [
            "bos_token_id",
            "eos_token_id",
            "unk_token_id",
            "sep_token_id",
            "pad_token_id",
            "cls_token_id",
            "mask_token_id",
        ]:
            if hasattr(tokenizer, attr):
                token_id = getattr(tokenizer, attr)
                if token_id is not None:
                    special_token_ids[attr] = token_id
        info["special_token_ids"] = special_token_ids

        # Model maximum length
        if hasattr(tokenizer, "model_max_length"):
            info["model_max_length"] = tokenizer.model_max_length

    except Exception as e:
        info["error"] = f"Failed to get tokenizer info: {e}"

    return info


def get_vocab_dict(tokenizer) -> dict:
    """Return the vocabulary as a dict mapping token to id."""
    try:
        if hasattr(tokenizer, "vocab"):
            return tokenizer.vocab
        elif hasattr(tokenizer, "get_vocab"):
            return tokenizer.get_vocab()
        elif hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "vocab"):
            return tokenizer.tokenizer.vocab
        elif hasattr(tokenizer, "encoder"):
            return tokenizer.encoder
        else:
            return {}
    except Exception:
        return {}


def export_vocabulary(tokenizer, file_path: str) -> None:
    """Export the vocabulary to a file."""
    try:
        vocab = get_vocab_dict(tokenizer)
        if not vocab:
            print("Warning: Could not retrieve vocabulary from tokenizer")
            return

        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)

        # Pick the output format from the file extension
        if path.suffix.lower() == ".json":
            with open(path, "w", encoding="utf-8") as f:
                json.dump(vocab, f, ensure_ascii=False, indent=2)
        else:
            # Default format: one token per line
            with open(path, "w", encoding="utf-8") as f:
                for token, token_id in sorted(vocab.items(), key=lambda x: x[1]):
                    # Handle unprintable characters
                    try:
                        f.write(f"{token_id}\t{repr(token)}\n")
                    except Exception:
                        f.write(f"{token_id}\t<unprintable>\n")

        print(f"Vocabulary exported to: {file_path}")
        print(f"Total tokens: {len(vocab)}")

    except Exception as e:
        print(f"Error exporting vocabulary: {e}")


def main(args: argparse.Namespace) -> None:
    def print_separator(title=""):
        if title:
            print(f"\n{'='*50}")
            print(f" {title}")
            print(f"{'='*50}")
        else:
            print(f"\n{'='*50}")

    # Validate arguments
    if not any([args.encode, args.decode, args.vocab_size, args.info, args.vocab_export]):
        print("Please specify at least one of: --encode, --decode, --vocab-size, --info, --vocab-export")
        return

    # Initialize the tokenizer
    preprocessor = InputPreprocessor(model_name_or_path=args.model_name_or_path, enable_mm=args.enable_mm)
    tokenizer = preprocessor.create_processor().tokenizer

    # Run the requested operations
    operations_count = 0

    if args.encode:
        print_separator("ENCODING")
        print(f"Input text: {args.encode}")
        encoded_text = tokenizer.encode(args.encode)
        print(f"Encoded tokens: {encoded_text}")
        operations_count += 1

    if args.decode:
        print_separator("DECODING")
        print(f"Input tokens: {args.decode}")
        try:
            if isinstance(args.decode, str):
                if args.decode.startswith("[") and args.decode.endswith("]"):
                    tokens = json.loads(args.decode)  # parse a bracketed list without eval
                else:
                    tokens = [int(x.strip()) for x in args.decode.split(",")]
            else:
                tokens = args.decode

            decoded_text = tokenizer.decode(tokens)
            print(f"Decoded text: {decoded_text}")
        except Exception as e:
            print(f"Error decoding tokens: {e}")
        operations_count += 1

    if args.vocab_size:
        print_separator("VOCABULARY SIZE")
        print(f"Vocabulary size: {get_vocab_size(tokenizer)}")
        operations_count += 1

    if args.info:
        print_separator("TOKENIZER INFO")
        print(json.dumps(get_tokenizer_info(tokenizer), indent=2))
        operations_count += 1

    if args.vocab_export:
        print_separator("EXPORT VOCABULARY")
        export_vocabulary(tokenizer, args.vocab_export)
        operations_count += 1

    print_separator()
    print(f"Completed {operations_count} operation(s)")
