@@ -100,6 +100,7 @@ def __init__(
             f"model={model_config.model!r}, "
             f"speculative_config={speculative_config!r}, "
             f"tokenizer={model_config.tokenizer!r}, "
+            f"skip_tokenizer_init={model_config.skip_tokenizer_init}, "
             f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"revision={model_config.revision}, "
             f"tokenizer_revision={model_config.tokenizer_revision}, "
@@ -132,8 +133,14 @@ def __init__(
         self.decoding_config = decoding_config or DecodingConfig()
         self.log_stats = log_stats
 
-        self._init_tokenizer()
-        self.detokenizer = Detokenizer(self.tokenizer)
+        if not self.model_config.skip_tokenizer_init:
+            self.tokenizer: BaseTokenizerGroup
+            self._init_tokenizer()
+            self.detokenizer = Detokenizer(self.tokenizer)
+        else:
+            self.detokenizer = None
+            self.tokenizer = None
+
         self.seq_counter = Counter()
         self.generation_config_fields = _load_generation_config_dict(
             model_config)
@@ -187,9 +194,10 @@ def __init__(
             parallel_config.disable_custom_all_reduce,
         })
 
-        # Ping the tokenizer to ensure liveness if it runs in a
-        # different process.
-        self.tokenizer.ping()
+        if self.tokenizer:
+            # Ping the tokenizer to ensure liveness if it runs in a
+            # different process.
+            self.tokenizer.ping()
 
         # Create the scheduler.
         # NOTE: the cache_config here have been updated with the numbers of
@@ -296,7 +304,7 @@ def _init_tokenizer(self, **tokenizer_init_kwargs):
             trust_remote_code=self.model_config.trust_remote_code,
             revision=self.model_config.tokenizer_revision)
         init_kwargs.update(tokenizer_init_kwargs)
-        self.tokenizer: BaseTokenizerGroup = get_tokenizer_group(
+        self.tokenizer = get_tokenizer_group(
             self.parallel_config.tokenizer_pool_config, **init_kwargs)
 
     def _verify_args(self) -> None:
@@ -393,8 +401,13 @@ def add_request(
         # Create the sequences.
         block_size = self.cache_config.block_size
         seq_id = next(self.seq_counter)
-        eos_token_id = self.tokenizer.get_lora_tokenizer(
-            lora_request).eos_token_id
+        eos_token_id = None
+        if self.tokenizer:
+            eos_token_id = self.tokenizer.get_lora_tokenizer(
+                lora_request).eos_token_id
+        else:
+            logger.warning("Use None for EOS token id because tokenizer is "
+                           "not initialized")
         seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
                        eos_token_id, lora_request)
 
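Taken together, the hunks above let the engine start without ever constructing a tokenizer group or detokenizer, which means callers must feed pre-tokenized prompts and consume token ids from the outputs. A minimal usage sketch, assuming the new skip_tokenizer_init flag is plumbed through EngineArgs into the LLM entry point (that wiring is not part of this diff) and using the pre-existing prompt_token_ids argument of LLM.generate:

# Sketch only: assumes skip_tokenizer_init is exposed via EngineArgs / LLM
# (not shown in this diff); the prompt token ids here are illustrative.
from vllm import LLM, SamplingParams

# No tokenizer or detokenizer is constructed inside the engine.
llm = LLM(model="facebook/opt-125m", skip_tokenizer_init=True)

# With self.tokenizer set to None, eos_token_id is None, so generation
# stops on max_tokens (or explicit stop_token_ids) rather than on EOS.
params = SamplingParams(max_tokens=16)

# Prompts must already be token ids; there is no tokenizer to encode text.
outputs = llm.generate(prompts=None,
                       sampling_params=params,
                       prompt_token_ids=[[2, 100, 7768, 34]])

for output in outputs:
    # The detokenizer is None as well, so read token ids, not text.
    print(output.outputs[0].token_ids)

Because the detokenizer is skipped along with the tokenizer, decoding the returned token ids back to text is the caller's responsibility.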