@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # imports for guided decoding tests
 import json
+import os
 import shutil
 from tempfile import TemporaryDirectory
 from typing import Optional
@@ -26,10 +27,6 @@
 # technically these adapters use a different base model,
 # but we're not testing generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
-PA_NAME = "swapnilbp/llama_tweet_ptune"
-# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
-# need to change to match the prompt adapter
-PA_NUM_VIRTUAL_TOKENS = 8
 
 GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 
@@ -56,13 +53,7 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files):
 
 
 @pytest.fixture(scope="module")
-def zephyr_pa_files():
-    return snapshot_download(repo_id=PA_NAME)
-
-
-@pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
-                        zephyr_pa_files):
+def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -81,15 +72,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
         "64",
         "--max-cpu-loras",
         "2",
-        # pa config
-        "--enable-prompt-adapter",
-        "--prompt-adapters",
-        f"zephyr-pa={zephyr_pa_files}",
-        f"zephyr-pa2={zephyr_pa_files}",
-        "--max-prompt-adapters",
-        "2",
-        "--max-prompt-adapter-token",
-        "128",
     ]
 
 
@@ -98,8 +80,19 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
 def server(default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
-        yield remote_server
+
+    original_value = os.environ.get('VLLM_USE_V1')
+    os.environ['VLLM_USE_V1'] = '0'
+    try:
+        with RemoteOpenAIServer(MODEL_NAME,
+                                default_server_args) as remote_server:
+            yield remote_server
+    finally:
+        # Restore original env value
+        if original_value is None:
+            os.environ.pop('VLLM_USE_V1', None)
+        else:
+            os.environ['VLLM_USE_V1'] = original_value
 
 
 @pytest_asyncio.fixture
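For reference, the same save-and-restore of VLLM_USE_V1 can be expressed with pytest's MonkeyPatch context manager (available in pytest >= 6.2), which reverts the variable automatically when the block exits. A minimal sketch, reusing the fixture arguments and the module-level RemoteOpenAIServer and MODEL_NAME from this file (illustrative only, not what the patch does):

    def server(default_server_args, request):
        if request.param:
            default_server_args.append(request.param)
        # MonkeyPatch.context() undoes setenv() on exit, so no manual
        # try/finally bookkeeping of the previous value is needed.
        with pytest.MonkeyPatch.context() as mp:
            mp.setenv("VLLM_USE_V1", "0")
            with RemoteOpenAIServer(MODEL_NAME,
                                    default_server_args) as remote_server:
                yield remote_server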
@@ -110,14 +103,11 @@ async def client(server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras, then test prompt adapters
-    "model_name,num_virtual_tokens",
-    [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
-     ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
-     ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
-async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
-                                 num_virtual_tokens: int):
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     completion = await client.completions.create(model=model_name,
                                                   prompt="Hello, my name is",
                                                   max_tokens=5,
@@ -130,9 +120,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
     assert len(choice.text) >= 5
     assert choice.finish_reason == "length"
     assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5,
-        prompt_tokens=6 + num_virtual_tokens,
-        total_tokens=11 + num_virtual_tokens)
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
 
     # test using token IDs
     completion = await client.completions.create(
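The usage assertion above is plain token accounting once the prompt-adapter virtual tokens are gone; a short illustration (counts are taken from the assertion itself, not from re-tokenizing):

    prompt_tokens = 6        # "Hello, my name is" counts as 6 prompt tokens here
    completion_tokens = 5    # max_tokens=5 and finish_reason == "length"
    num_virtual_tokens = 0   # no prompt adapter prepends virtual tokens anymore
    total_tokens = prompt_tokens + completion_tokens + num_virtual_tokens  # == 11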
@@ -175,9 +163,9 @@ async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras, then test prompt adapters
+    # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
 async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -194,9 +182,9 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # just test 1 lora and 1 pa hereafter
+    # just test 1 lora
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -217,7 +205,7 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -238,7 +226,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                             model_name: str):
@@ -314,7 +302,7 @@ async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_completion_streaming(client: openai.AsyncOpenAI,
                                     model_name: str):
@@ -348,7 +336,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
     """Streaming for parallel sampling.
@@ -382,7 +370,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                          model_name: str):
@@ -519,7 +507,7 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
     # test both text and token IDs
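Each parametrized model_name is simply an identifier the OpenAI-compatible server exposes: the base MODEL_NAME plus the LoRA adapter names served by the fixtures ("zephyr-lora", "zephyr-lora2"). A hypothetical standalone request mirroring what these tests send (base_url is a placeholder for the running server's address, and the api_key is an arbitrary string on the assumption that the local server was started without key checking):

    import openai

    async def one_completion(base_url: str, model_name: str) -> None:
        client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")
        completion = await client.completions.create(model=model_name,
                                                      prompt="Hello, my name is",
                                                      max_tokens=5,
                                                      temperature=0.0)
        # With max_tokens=5 the generation stops on length, as the tests assert.
        assert completion.choices[0].finish_reason == "length"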