Commit a37d815

GeauxEric, EricDingNVD, and ywang96 authored
Make initialization of tokenizer and detokenizer optional (#3748)
Co-authored-by: Yun Ding <[email protected]>
Co-authored-by: Roger Wang <[email protected]>
1 parent 7f2593b commit a37d815

6 files changed, +68 -12 lines changed
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+import pytest
+
+from vllm.entrypoints.llm import LLM
+from vllm.sampling_params import SamplingParams
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+def test_skip_tokenizer_initialization(model: str):
+    # This test checks if the flag skip_tokenizer_init skips the initialization
+    # of tokenizer and detokenizer. The generated output is expected to contain
+    # token ids.
+    llm = LLM(model=model, skip_tokenizer_init=True)
+    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
+    with pytest.raises(ValueError) as err:
+        llm.generate("abc", sampling_params)
+    assert "prompts must be None if" in str(err.value)
+    outputs = llm.generate(prompt_token_ids=[[1, 2, 3]],
+                           sampling_params=sampling_params)
+    assert len(outputs) > 0
+    completions = outputs[0].outputs
+    assert len(completions) > 0
+    assert completions[0].text == ""
+    assert completions[0].token_ids
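Usage note (not part of the commit): a minimal sketch of the pattern this test exercises, assuming `facebook/opt-125m` can be loaded and the caller works purely in token ids.

```python
from vllm import LLM, SamplingParams

# No tokenizer or detokenizer is created; inputs must already be token ids.
llm = LLM(model="facebook/opt-125m", skip_tokenizer_init=True)

outputs = llm.generate(prompt_token_ids=[[1, 2, 3]],
                       sampling_params=SamplingParams(max_tokens=16))
for output in outputs:
    # With no detokenizer, the output text stays empty; read token_ids instead.
    print(output.outputs[0].token_ids)
```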

vllm/config.py

Lines changed: 6 additions & 1 deletion
@@ -66,6 +66,8 @@ class ModelConfig:
         max_context_len_to_capture: Maximum context len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
             to eager mode.
+        skip_tokenizer_init: If true, skip initialization of tokenizer and
+            detokenizer.
     """
 
     def __init__(
@@ -85,6 +87,7 @@ def __init__(
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
         max_logprobs: int = 5,
+        skip_tokenizer_init: bool = False,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -99,14 +102,16 @@ def __init__(
         self.enforce_eager = enforce_eager
         self.max_context_len_to_capture = max_context_len_to_capture
         self.max_logprobs = max_logprobs
+        self.skip_tokenizer_init = skip_tokenizer_init
 
         self.hf_config = get_config(self.model, trust_remote_code, revision,
                                     code_revision)
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
         self.max_model_len = _get_and_verify_max_len(self.hf_text_config,
                                                      max_model_len)
-        self._verify_tokenizer_mode()
+        if not self.skip_tokenizer_init:
+            self._verify_tokenizer_mode()
         self._verify_quantization()
         self._verify_cuda_graph()

vllm/engine/arg_utils.py

Lines changed: 6 additions & 1 deletion
@@ -16,6 +16,7 @@ class EngineArgs:
     """Arguments for vLLM engine."""
     model: str
     tokenizer: Optional[str] = None
+    skip_tokenizer_init: bool = False
     tokenizer_mode: str = 'auto'
     trust_remote_code: bool = False
     download_dir: Optional[str] = None
@@ -93,6 +94,10 @@ def add_cli_args(
             type=str,
             default=EngineArgs.tokenizer,
             help='Name or path of the huggingface tokenizer to use.')
+        parser.add_argument(
+            '--skip-tokenizer-init',
+            action='store_true',
+            help='Skip initialization of tokenizer and detokenizer')
         parser.add_argument(
             '--revision',
             type=str,
@@ -453,7 +458,7 @@ def create_engine_config(self, ) -> EngineConfig:
             self.code_revision, self.tokenizer_revision, self.max_model_len,
             self.quantization, self.quantization_param_path,
             self.enforce_eager, self.max_context_len_to_capture,
-            self.max_logprobs)
+            self.max_logprobs, self.skip_tokenizer_init)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
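Usage note (not part of the commit): a sketch of how the new flag reaches an engine, assuming the usual `from_engine_args` construction path; servers that build their CLI with `add_cli_args` expose it as `--skip-tokenizer-init`.

```python
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine

# The dataclass field flows through create_engine_config() into ModelConfig
# and from there into the engine.
engine_args = EngineArgs(model="facebook/opt-125m", skip_tokenizer_init=True)
engine = LLMEngine.from_engine_args(engine_args)
```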

vllm/engine/llm_engine.py

Lines changed: 21 additions & 8 deletions
@@ -100,6 +100,7 @@ def __init__(
             f"model={model_config.model!r}, "
             f"speculative_config={speculative_config!r}, "
             f"tokenizer={model_config.tokenizer!r}, "
+            f"skip_tokenizer_init={model_config.skip_tokenizer_init}, "
             f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"revision={model_config.revision}, "
             f"tokenizer_revision={model_config.tokenizer_revision}, "
@@ -132,8 +133,14 @@ def __init__(
         self.decoding_config = decoding_config or DecodingConfig()
         self.log_stats = log_stats
 
-        self._init_tokenizer()
-        self.detokenizer = Detokenizer(self.tokenizer)
+        if not self.model_config.skip_tokenizer_init:
+            self.tokenizer: BaseTokenizerGroup
+            self._init_tokenizer()
+            self.detokenizer = Detokenizer(self.tokenizer)
+        else:
+            self.detokenizer = None
+            self.tokenizer = None
+
         self.seq_counter = Counter()
         self.generation_config_fields = _load_generation_config_dict(
             model_config)
@@ -187,9 +194,10 @@ def __init__(
             parallel_config.disable_custom_all_reduce,
         })
 
-        # Ping the tokenizer to ensure liveness if it runs in a
-        # different process.
-        self.tokenizer.ping()
+        if self.tokenizer:
+            # Ping the tokenizer to ensure liveness if it runs in a
+            # different process.
+            self.tokenizer.ping()
 
         # Create the scheduler.
         # NOTE: the cache_config here have been updated with the numbers of
@@ -296,7 +304,7 @@ def _init_tokenizer(self, **tokenizer_init_kwargs):
             trust_remote_code=self.model_config.trust_remote_code,
             revision=self.model_config.tokenizer_revision)
         init_kwargs.update(tokenizer_init_kwargs)
-        self.tokenizer: BaseTokenizerGroup = get_tokenizer_group(
+        self.tokenizer = get_tokenizer_group(
             self.parallel_config.tokenizer_pool_config, **init_kwargs)
 
     def _verify_args(self) -> None:
@@ -393,8 +401,13 @@ def add_request(
         # Create the sequences.
         block_size = self.cache_config.block_size
         seq_id = next(self.seq_counter)
-        eos_token_id = self.tokenizer.get_lora_tokenizer(
-            lora_request).eos_token_id
+        eos_token_id = None
+        if self.tokenizer:
+            eos_token_id = self.tokenizer.get_lora_tokenizer(
+                lora_request).eos_token_id
+        else:
+            logger.warning("Use None for EOS token id because tokenizer is "
+                           "not initialized")
         seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
                        eos_token_id, lora_request)
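Usage note (not part of the commit): because `eos_token_id` falls back to `None` when the tokenizer is skipped, generation no longer stops on the model's EOS token by itself. A sketch of bounding it from the caller side; the stop id 2 is an assumed EOS for OPT-style models, not something this diff provides.

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", skip_tokenizer_init=True)

# The engine warns and uses eos_token_id=None, so limit generation explicitly
# with max_tokens and/or stop_token_ids known to the caller.
params = SamplingParams(max_tokens=32, stop_token_ids=[2])  # 2: assumed EOS id
outputs = llm.generate(prompt_token_ids=[[2, 100, 200]],
                       sampling_params=params)
print(outputs[0].outputs[0].token_ids)
```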

vllm/engine/output_processor/single_step.py

Lines changed: 3 additions & 2 deletions
@@ -59,7 +59,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
 
         # Process prompt logprobs
         prompt_logprobs = outputs.prompt_logprobs
-        if prompt_logprobs is not None and seq_group.sampling_params.detokenize:
+        if prompt_logprobs is not None and \
+                seq_group.sampling_params.detokenize and self.detokenizer:
             self.detokenizer.decode_prompt_logprobs_inplace(
                 seq_group, prompt_logprobs)
             seq_group.prompt_logprobs = prompt_logprobs
@@ -105,7 +106,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
             child_seqs.append((parent, parent))
 
         for seq, _ in child_seqs:
-            if seq_group.sampling_params.detokenize:
+            if seq_group.sampling_params.detokenize and self.detokenizer:
                 new_char_count = self.detokenizer.decode_sequence_inplace(
                     seq, seq_group.sampling_params)
             else:

vllm/entrypoints/llm.py

Lines changed: 9 additions & 0 deletions
@@ -32,6 +32,9 @@ class LLM:
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
         tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
             if available, and "slow" will always use the slow tokenizer.
+        skip_tokenizer_init: If true, skip initialization of tokenizer and
+            detokenizer. Expect valid prompt_token_ids and None for prompt
+            from the input.
         trust_remote_code: Trust remote code (e.g., from HuggingFace) when
             downloading the model and tokenizer.
         tensor_parallel_size: The number of GPUs to use for distributed
@@ -76,6 +79,7 @@ def __init__(
         model: str,
         tokenizer: Optional[str] = None,
         tokenizer_mode: str = "auto",
+        skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
@@ -96,6 +100,7 @@ def __init__(
             model=model,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
+            skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
@@ -160,6 +165,10 @@ def generate(
         if prompts is None and prompt_token_ids is None:
             raise ValueError("Either prompts or prompt_token_ids must be "
                              "provided.")
+        if self.llm_engine.model_config.skip_tokenizer_init \
+            and prompts is not None:
+            raise ValueError("prompts must be None if skip_tokenizer_init "
+                             "is True")
         if isinstance(prompts, str):
             # Convert a single prompt to a list.
             prompts = [prompts]
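Usage note (not part of the commit): an end-to-end sketch of the workflow the docstring describes — tokenize and detokenize outside vLLM and pass only ids across the boundary; the HF `AutoTokenizer` here is an external component chosen for illustration.

```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Tokenization happens outside vLLM (e.g. in a separate service or process).
tok = AutoTokenizer.from_pretrained("facebook/opt-125m")
prompt_ids = tok("Hello, my name is").input_ids

llm = LLM(model="facebook/opt-125m", skip_tokenizer_init=True)
outputs = llm.generate(prompt_token_ids=[prompt_ids],
                       sampling_params=SamplingParams(max_tokens=16))

# Output text is empty; decode the returned token ids externally as well.
print(tok.decode(outputs[0].outputs[0].token_ids))
```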
