Commit c957c74

Enable safetensors loading for all models (#974)
1 parent c07ece5 commit c957c74

File tree: 18 files changed, +143 −83 lines


vllm/config.py

Lines changed: 23 additions & 7 deletions

@@ -24,9 +24,16 @@ class ModelConfig:
             downloading the model and tokenizer.
         download_dir: Directory to download and load the weights, default to the
             default cache directory of huggingface.
-        use_np_weights: Save a numpy copy of model weights for faster loading.
-            This can increase the disk usage by up to 2x.
-        use_dummy_weights: Use dummy values for model weights (for profiling).
+        load_format: The format of the model weights to load:
+            "auto" will try to load the weights in the safetensors format and
+                fall back to the pytorch bin format if safetensors format is
+                not available.
+            "pt" will load the weights in the pytorch bin format.
+            "safetensors" will load the weights in the safetensors format.
+            "npcache" will load the weights in pytorch format and store
+                a numpy cache to speed up the loading.
+            "dummy" will initialize the weights with random values, which is
+                mainly for profiling.
         dtype: Data type for model weights and activations. The "auto" option
             will use FP16 precision for FP32 and FP16 models, and BF16 precision
             for BF16 models.
@@ -40,8 +47,7 @@ def __init__(
         tokenizer_mode: str,
         trust_remote_code: bool,
         download_dir: Optional[str],
-        use_np_weights: bool,
-        use_dummy_weights: bool,
+        load_format: str,
         dtype: str,
         seed: int,
     ) -> None:
@@ -50,14 +56,24 @@ def __init__(
         self.tokenizer_mode = tokenizer_mode
         self.trust_remote_code = trust_remote_code
         self.download_dir = download_dir
-        self.use_np_weights = use_np_weights
-        self.use_dummy_weights = use_dummy_weights
+        self.load_format = load_format
         self.seed = seed

         self.hf_config = get_config(model, trust_remote_code)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+        self._verify_load_format()
         self._verify_tokenizer_mode()

+    def _verify_load_format(self) -> None:
+        load_format = self.load_format.lower()
+        if load_format not in [
+                "auto", "pt", "safetensors", "npcache", "dummy"
+        ]:
+            raise ValueError(
+                f"Unknown load format: {self.load_format}. Must be one of "
+                "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.")
+        self.load_format = load_format
+
     def _verify_tokenizer_mode(self) -> None:
         tokenizer_mode = self.tokenizer_mode.lower()
         if tokenizer_mode not in ["auto", "slow"]:
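Note: end to end, the new option flows from the CLI through EngineArgs into ModelConfig, where _verify_load_format() normalizes the value and rejects anything outside the five strings above. A minimal usage sketch, assuming the vllm.LLM entry point (which forwards keyword arguments to EngineArgs); the model name is an illustrative placeholder, not part of this diff:

from vllm import LLM  # assumed entry point; kwargs flow into EngineArgs

# "auto" (the default) prefers *.safetensors files and falls back to
# the pytorch *.bin files; any other value must be one of the five
# validated strings, or ModelConfig raises ValueError at construction.
llm = LLM(model="facebook/opt-125m", load_format="safetensors")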

vllm/engine/arg_utils.py

Lines changed: 18 additions & 13 deletions

@@ -15,8 +15,7 @@ class EngineArgs:
     tokenizer_mode: str = 'auto'
     trust_remote_code: bool = False
     download_dir: Optional[str] = None
-    use_np_weights: bool = False
-    use_dummy_weights: bool = False
+    load_format: str = 'auto'
     dtype: str = 'auto'
     seed: int = 0
     worker_use_ray: bool = False
@@ -65,14 +64,21 @@ def add_cli_args(
                             help='directory to download and load the weights, '
                             'default to the default cache dir of '
                             'huggingface')
-        parser.add_argument('--use-np-weights',
-                            action='store_true',
-                            help='save a numpy copy of model weights for '
-                            'faster loading. This can increase the disk '
-                            'usage by up to 2x.')
-        parser.add_argument('--use-dummy-weights',
-                            action='store_true',
-                            help='use dummy values for model weights')
+        parser.add_argument(
+            '--load-format',
+            type=str,
+            default=EngineArgs.load_format,
+            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
+            help='The format of the model weights to load. '
+            '"auto" will try to load the weights in the safetensors format '
+            'and fall back to the pytorch bin format if safetensors format '
+            'is not available. '
+            '"pt" will load the weights in the pytorch bin format. '
+            '"safetensors" will load the weights in the safetensors format. '
+            '"npcache" will load the weights in pytorch format and store '
+            'a numpy cache to speed up the loading. '
+            '"dummy" will initialize the weights with random values, '
+            'which is mainly for profiling.')
         # TODO(woosuk): Support FP32.
         parser.add_argument(
             '--dtype',
@@ -146,9 +152,8 @@ def create_engine_configs(
         # Initialize the configs.
         model_config = ModelConfig(self.model, self.tokenizer,
                                    self.tokenizer_mode, self.trust_remote_code,
-                                   self.download_dir, self.use_np_weights,
-                                   self.use_dummy_weights, self.dtype,
-                                   self.seed)
+                                   self.download_dir, self.load_format,
+                                   self.dtype, self.seed)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space)
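For the CLI path, a short sketch of exercising the new flag through the parser built by add_cli_args (the model name is an illustrative placeholder; this snippet is not part of the diff, and it assumes add_cli_args returns the parser as its signature in this file indicates):

import argparse

from vllm.engine.arg_utils import EngineArgs

parser = EngineArgs.add_cli_args(argparse.ArgumentParser())
# choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'] makes argparse
# reject a typo such as "--load-format pytorch" before any download starts.
args = parser.parse_args(
    ["--model", "facebook/opt-125m", "--load-format", "npcache"])
print(args.load_format)  # -> "npcache"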

vllm/engine/llm_engine.py

Lines changed: 1 addition & 2 deletions

@@ -76,9 +76,8 @@ def __init__(
             f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"trust_remote_code={model_config.trust_remote_code}, "
             f"dtype={model_config.dtype}, "
-            f"use_dummy_weights={model_config.use_dummy_weights}, "
             f"download_dir={model_config.download_dir!r}, "
-            f"use_np_weights={model_config.use_np_weights}, "
+            f"load_format={model_config.load_format}, "
             f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
             f"seed={model_config.seed})")
         # TODO(woosuk): Print more configs in debug mode.

vllm/model_executor/model_loader.py

Lines changed: 2 additions & 2 deletions

@@ -56,14 +56,14 @@ def get_model(model_config: ModelConfig) -> nn.Module:
     # Create a model instance.
     # The weights will be initialized as empty tensors.
     model = model_class(model_config.hf_config)
-    if model_config.use_dummy_weights:
+    if model_config.load_format == "dummy":
         model = model.cuda()
         # NOTE(woosuk): For accurate performance evaluation, we assign
         # random values to the weights.
         initialize_dummy_weights(model)
     else:
         # Load the weights from the cached or downloaded files.
         model.load_weights(model_config.model, model_config.download_dir,
-                           model_config.use_np_weights)
+                           model_config.load_format)
         model = model.cuda()
     return model.eval()
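The dispatch on load_format itself happens inside hf_model_weights_iterator in vllm/model_executor/weight_utils.py, which is among the 18 files changed but not shown in this excerpt. Roughly, the "auto" policy described in the config docstring amounts to something like the following hypothetical helper (names and structure are illustrative, not the real implementation):

import glob
import os

def _resolve_load_format(weights_dir: str, load_format: str) -> str:
    # Illustrative sketch only: prefer *.safetensors files when present,
    # otherwise fall back to the pytorch *.bin checkpoints.
    if load_format == "auto":
        if glob.glob(os.path.join(weights_dir, "*.safetensors")):
            return "safetensors"
        return "pt"
    return load_format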

vllm/model_executor/models/aquila.py

Lines changed: 2 additions & 2 deletions

@@ -288,7 +288,7 @@ def forward(
     def load_weights(self,
                      model_name_or_path: str,
                      cache_dir: Optional[str] = None,
-                     use_np_cache: bool = False):
+                     load_format: str = "auto"):
         tp_size = get_tensor_model_parallel_world_size()
         tensor_model_parallel_rank = get_tensor_model_parallel_rank()
         q_proj_shard_size = (self.config.hidden_size // tp_size)
@@ -305,7 +305,7 @@ def load_weights(self,
         state_dict = self.state_dict()

         for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, load_format):
             if "rotary_emb.inv_freq" in name:
                 continue

vllm/model_executor/models/baichuan.py

Lines changed: 6 additions & 4 deletions

@@ -35,8 +35,8 @@
     PagedAttentionWithALiBi)
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.weight_utils import (
-    hf_model_weights_iterator, load_padded_tensor_parallel_vocab,
-    load_tensor_parallel_weights)
+    convert_pyslice_to_tensor, hf_model_weights_iterator,
+    load_padded_tensor_parallel_vocab, load_tensor_parallel_weights)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from vllm.model_executor.parallel_utils.tensor_parallel import (
@@ -303,16 +303,18 @@ def forward(
     def load_weights(self,
                      model_name_or_path: str,
                      cache_dir: Optional[str] = None,
-                     use_np_cache: bool = False):
+                     load_format: str = "auto"):
         tp_world_size = get_tensor_model_parallel_world_size()
         tp_rank = get_tensor_model_parallel_rank()
         state_dict = self.state_dict()

         for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, load_format):
             if "rotary_emb.inv_freq" in name:
                 continue

+            loaded_weight = convert_pyslice_to_tensor(loaded_weight)
+
             if "W_pack" in name:
                 total_num_heads = self.config.num_attention_heads
                 hidden_size = self.config.hidden_size
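baichuan is the first file in this view to call the newly imported convert_pyslice_to_tensor. The reason: when loading from safetensors, hf_model_weights_iterator can yield lazy slice objects instead of tensors, and code that reshapes or splits a weight (such as the W_pack handling above) needs a real tensor. A sketch of the helper's contract (the implementation lives in weight_utils.py, outside this excerpt):

import torch

def convert_pyslice_to_tensor(x):
    # Sketch, assuming safetensors slice objects support full-range
    # indexing: x[:] materializes a torch.Tensor, which then supports
    # .view(), .t(), torch.split(), and similar operations.
    if not isinstance(x, torch.Tensor):
        x = x[:]
    return x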

vllm/model_executor/models/bloom.py

Lines changed: 2 additions & 2 deletions

@@ -279,11 +279,11 @@ def forward(
     def load_weights(self,
                      model_name_or_path: str,
                      cache_dir: Optional[str] = None,
-                     use_np_cache: bool = False):
+                     load_format: str = "auto"):
         tp_rank = get_tensor_model_parallel_rank()
         state_dict = self.state_dict()
         for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, load_format):
             if name == "lm_head.weight":
                 # Since hidden_states are parallelized, we need to
                 # load lm_head.weight in parallel.

vllm/model_executor/models/falcon.py

Lines changed: 5 additions & 3 deletions

@@ -31,7 +31,8 @@
                                                   PagedAttentionWithALiBi,
                                                   PagedAttentionWithRoPE)
 from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+from vllm.model_executor.weight_utils import (convert_pyslice_to_tensor,
+                                              hf_model_weights_iterator,
                                               load_tensor_parallel_weights)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
@@ -419,7 +420,7 @@ def forward(
     def load_weights(self,
                      model_name_or_path: str,
                      cache_dir: Optional[str] = None,
-                     use_np_cache: bool = False):
+                     load_format: str = "auto"):
         tp_size = (get_tensor_model_parallel_world_size())
         tp_rank = get_tensor_model_parallel_rank()

@@ -451,8 +452,9 @@ def load_weights(self,
         state_dict = self.state_dict()

         for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, load_format):
             if "query_key_value" in name:
+                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
                 loaded_weight_size = loaded_weight.size()
                 loaded_weight = loaded_weight.view(
                     total_num_kv_heads, num_query_heads_per_kv_head + 2,

vllm/model_executor/models/gpt2.py

Lines changed: 6 additions & 4 deletions

@@ -32,8 +32,8 @@
 from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.weight_utils import (
-    hf_model_weights_iterator, load_padded_tensor_parallel_vocab,
-    load_tensor_parallel_weights)
+    convert_pyslice_to_tensor, hf_model_weights_iterator,
+    load_padded_tensor_parallel_vocab, load_tensor_parallel_weights)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from vllm.model_executor.parallel_utils.tensor_parallel import (
@@ -231,14 +231,14 @@ def forward(
     def load_weights(self,
                      model_name_or_path: str,
                      cache_dir: Optional[str] = None,
-                     use_np_cache: bool = False):
+                     load_format: str = "auto"):
         tensor_model_parallel_world_size = (
             get_tensor_model_parallel_world_size())
         tensor_model_parallel_rank = get_tensor_model_parallel_rank()
         state_dict = self.state_dict()

         for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, load_format):
             if "lm_head.weight" in name:
                 # GPT-2 ties the weights of the embedding layer and the final
                 # linear layer.
@@ -251,6 +251,8 @@ def load_weights(self,
             if not name.startswith("transformer."):
                 name = "transformer." + name

+            loaded_weight = convert_pyslice_to_tensor(loaded_weight)
+
             # The HF's GPT-2 implementation uses Conv1D instead of Linear.
             # Because of this, we need to transpose the weights.
             for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:

vllm/model_executor/models/gpt_bigcode.py

Lines changed: 5 additions & 4 deletions

@@ -33,8 +33,8 @@
 from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.weight_utils import (
-    hf_model_weights_iterator, load_padded_tensor_parallel_vocab,
-    load_tensor_parallel_weights)
+    convert_pyslice_to_tensor, hf_model_weights_iterator,
+    load_padded_tensor_parallel_vocab, load_tensor_parallel_weights)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from vllm.model_executor.parallel_utils.tensor_parallel import (
@@ -259,14 +259,14 @@ def forward(
     def load_weights(self,
                      model_name_or_path: str,
                      cache_dir: Optional[str] = None,
-                     use_np_cache: bool = False):
+                     load_format: str = "auto"):
         tensor_model_parallel_world_size = (
             get_tensor_model_parallel_world_size())
         tensor_model_parallel_rank = get_tensor_model_parallel_rank()
         state_dict = self.state_dict()

         for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, load_format):
             if "lm_head.weight" in name:
                 # GPT-2 ties the weights of the embedding layer and the final
                 # linear layer.
@@ -295,6 +295,7 @@ def load_weights(self,
             head_start = tensor_model_parallel_rank * num_heads
             head_end = (tensor_model_parallel_rank + 1) * num_heads

+            loaded_weight = convert_pyslice_to_tensor(loaded_weight)
             wq, wk, wv = torch.split(
                 loaded_weight, [hidden_size, total_kv_size, total_kv_size],
                 dim=0)
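A closing note on why the conversion is applied selectively rather than unconditionally: a lazy slice can be indexed to read just one shard from disk, so the loaders keep weights lazy until an operation such as torch.split or .view forces materialization. A self-contained demo of that behavior using the safetensors library (illustrative, not code from this commit):

import torch
from safetensors.torch import save_file, safe_open

save_file({"w": torch.randn(8, 4)}, "/tmp/demo.safetensors")

with safe_open("/tmp/demo.safetensors", framework="pt") as f:
    lazy = f.get_slice("w")   # lazy handle: nothing is read yet
    shard = lazy[0:4]         # reads only rows 0-3 from disk
    full = lazy[:]            # materializes the whole tensor
    a, b = torch.split(full, [2, 2], dim=1)  # split needs a real tensor
    print(shard.shape, a.shape, b.shape)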
