@@ -8,25 +8,28 @@
import openai
import pytest
import torch
+from huggingface_hub import snapshot_download
from tensorizer import EncryptionParams

from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
+# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                         TensorSerializer,
                                                         is_vllm_tensorized,
                                                         load_with_tensorizer,
                                                         open_stream,
                                                         serialize_vllm_model,
                                                         tensorize_vllm_model)
+# yapf: enable
+from vllm.utils import import_from_path

from ..conftest import VllmRunner
-from ..utils import RemoteOpenAIServer
+from ..utils import VLLM_PATH, RemoteOpenAIServer
from .conftest import retry_until_skip

-# yapf conflicts with isort for this docstring
-
+EXAMPLES_PATH = VLLM_PATH / "examples"

prompts = [
    "Hello, my name is",
@@ -94,8 +97,8 @@ def test_can_deserialize_s3(vllm_runner):
                         num_readers=1,
                         s3_endpoint="object.ord1.coreweave.com",
                     )) as loaded_hf_model:
-        deserialized_outputs = loaded_hf_model.generate(prompts,
-                                                        sampling_params)
+        deserialized_outputs = loaded_hf_model.generate(
+            prompts, sampling_params)
        # noqa: E501

        assert deserialized_outputs
@@ -111,23 +114,21 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(

        outputs = vllm_model.generate(prompts, sampling_params)

-        config_for_serializing = TensorizerConfig(
-            tensorizer_uri=model_path,
-            encryption_keyfile=key_path
-        )
+        config_for_serializing = TensorizerConfig(tensorizer_uri=model_path,
+                                                  encryption_keyfile=key_path)

        serialize_vllm_model(get_torch_model(vllm_model),
                             config_for_serializing)

    config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                encryption_keyfile=key_path)

-    with vllm_runner(
-        model_ref,
-        load_format="tensorizer",
-        model_loader_extra_config=config_for_deserializing) as loaded_vllm_model:  # noqa: E501
+    with vllm_runner(model_ref,
+                     load_format="tensorizer",
+                     model_loader_extra_config=config_for_deserializing
+                     ) as loaded_vllm_model:  # noqa: E501

-        deserialized_outputs = loaded_vllm_model.generate(prompts,
-                                                          sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(
+            prompts, sampling_params)
        # noqa: E501

        assert outputs == deserialized_outputs
@@ -156,14 +157,14 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,


def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
-    from huggingface_hub import snapshot_download
-
-    from examples.multilora_inference import (create_test_prompts,
-                                              process_requests)
+    multilora_inference = import_from_path(
+        "examples.multilora_inference",
+        EXAMPLES_PATH / "multilora_inference.py",
+    )

    model_ref = "meta-llama/Llama-2-7b-hf"
    lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
-    test_prompts = create_test_prompts(lora_path)
+    test_prompts = multilora_inference.create_test_prompts(lora_path)

    # Serialize model before deserializing and binding LoRA adapters
    with vllm_runner(model_ref, ) as vllm_model:
@@ -186,7 +187,8 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
            max_num_seqs=50,
            max_model_len=1000,
    ) as loaded_vllm_model:
-        process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
+        multilora_inference.process_requests(
+            loaded_vllm_model.model.llm_engine, test_prompts)

        assert loaded_vllm_model

@@ -217,8 +219,11 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):

    ## Start OpenAI API server
    openai_args = [
-        "--dtype", "float16", "--load-format",
-        "tensorizer", "--model-loader-extra-config",
+        "--dtype",
+        "float16",
+        "--load-format",
+        "tensorizer",
+        "--model-loader-extra-config",
        json.dumps(model_loader_extra_config),
    ]

@@ -251,8 +256,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
    torch.cuda.empty_cache()


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Requires 2 GPUs")
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
def test_tensorizer_with_tp_path_without_template(vllm_runner):
    with pytest.raises(ValueError):
        model_ref = "EleutherAI/pythia-1.4b"
@@ -271,10 +275,9 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner):
    )


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Requires 2 GPUs")
-def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
-                                                                    tmp_path):
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
+def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
+        vllm_runner, tmp_path):
    model_ref = "EleutherAI/pythia-1.4b"
    # record outputs from un-sharded un-tensorized model
    with vllm_runner(
@@ -313,13 +316,12 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
            disable_custom_all_reduce=True,
            enforce_eager=True,
            model_loader_extra_config=tensorizer_config) as loaded_vllm_model:
-        deserialized_outputs = loaded_vllm_model.generate(prompts,
-                                                          sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(
+            prompts, sampling_params)

    assert outputs == deserialized_outputs


-
@retry_until_skip(3)
def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
    gc.collect()
@@ -337,8 +339,8 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
    with vllm_runner(model_ref,
                     load_format="tensorizer",
                     model_loader_extra_config=config) as loaded_vllm_model:
-        deserialized_outputs = loaded_vllm_model.generate(prompts,
-                                                          sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(
+            prompts, sampling_params)
        # noqa: E501

        assert outputs == deserialized_outputs
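
The switch from `from examples.multilora_inference import ...` to `import_from_path` lets the tests load an example script by file path instead of requiring `examples/` to be an importable package on `sys.path`. For reference, a minimal standalone sketch of such a helper built on the standard library's `importlib` machinery (illustrative only; the actual `vllm.utils.import_from_path` implementation may differ):

# Hypothetical re-implementation for illustration; vllm.utils ships its own
# import_from_path, which this sketch may not match exactly.
import importlib.util
import sys
from pathlib import Path


def import_from_path(module_name: str, file_path: Path):
    """Load a Python file as a module without adding its directory to sys.path."""
    spec = importlib.util.spec_from_file_location(module_name, str(file_path))
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so imports inside the module resolve correctly.
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module


# Usage mirroring the test above:
# multilora_inference = import_from_path(
#     "examples.multilora_inference", EXAMPLES_PATH / "multilora_inference.py")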