
Commit 82ec66f

[V0 Deprecation] Remove Prompt Adapters (#20588)
Signed-off-by: mgoin <[email protected]>
1 parent 78c13e3 · commit 82ec66f

Showing 60 changed files with 126 additions and 1,727 deletions. Large commits have some content hidden by default, so only part of the diff appears below.

docs/api/README.md

Lines changed: 0 additions & 1 deletion
@@ -14,7 +14,6 @@ API documentation for vLLM's configuration classes.
 - [vllm.config.DeviceConfig][]
 - [vllm.config.SpeculativeConfig][]
 - [vllm.config.LoRAConfig][]
-- [vllm.config.PromptAdapterConfig][]
 - [vllm.config.MultiModalConfig][]
 - [vllm.config.PoolerConfig][]
 - [vllm.config.DecodingConfig][]

docs/features/compatibility_matrix.md

Lines changed: 16 additions & 18 deletions
@@ -34,23 +34,22 @@ th:not(:first-child) {
 }
 </style>
 
-| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD](spec_decode.md) | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
-|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
+| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
+|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
 | [CP][chunked-prefill] || | | | | | | | | | | | | | |
 | [APC](automatic_prefix_caching.md) ||| | | | | | | | | | | | | |
 | [LoRA](lora.md) |||| | | | | | | | | | | | |
-| <abbr title="Prompt Adapter">prmpt adptr</abbr> ||||| | | | | | | | | | | |
-| [SD](spec_decode.md) |||||| | | | | | | | | | |
-| CUDA graph ||||||| | | | | | | | | |
-| <abbr title="Pooling Models">pooling</abbr> |||||||| | | | | | | | |
-| <abbr title="Encoder-Decoder Models">enc-dec</abbr> || [](gh-issue:7366) ||| [](gh-issue:7366) |||| | | | | | | |
-| <abbr title="Logprobs">logP</abbr> |||||||||| | | | | | |
-| <abbr title="Prompt Logprobs">prmpt logP</abbr> ||||||||||| | | | | |
-| <abbr title="Async Output Processing">async output</abbr> |||||||||||| | | | |
-| multi-step ||||||||||||| | | |
-| <abbr title="Multimodal Inputs">mm</abbr> || [🟠](gh-pr:8348) | [🟠](gh-pr:4194) ||||||||||| | |
-| best-of ||||| [](gh-issue:6137) ||||||| [](gh-issue:7968) ||| |
-| beam-search ||||| [](gh-issue:6137) ||||||| [](gh-issue:7968) ||||
+| [SD](spec_decode.md) ||||| | | | | | | | | | |
+| CUDA graph |||||| | | | | | | | | |
+| <abbr title="Pooling Models">pooling</abbr> ||||||| | | | | | | | |
+| <abbr title="Encoder-Decoder Models">enc-dec</abbr> || [](gh-issue:7366) || [](gh-issue:7366) |||| | | | | | | |
+| <abbr title="Logprobs">logP</abbr> ||||||||| | | | | | |
+| <abbr title="Prompt Logprobs">prmpt logP</abbr> |||||||||| | | | | |
+| <abbr title="Async Output Processing">async output</abbr> ||||||||||| | | | |
+| multi-step |||||||||||| | | |
+| <abbr title="Multimodal Inputs">mm</abbr> || [🟠](gh-pr:8348) | [🟠](gh-pr:4194) |||||||||| | |
+| best-of |||| [](gh-issue:6137) ||||||| [](gh-issue:7968) ||| |
+| beam-search |||| [](gh-issue:6137) ||||||| [](gh-issue:7968) ||||
 
 [](){ #feature-x-hardware }
 
@@ -59,10 +58,9 @@ th:not(:first-child) {
 | Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU |
 |-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
 | [CP][chunked-prefill] | [](gh-issue:2729) ||||||||
-| [APC](automatic_prefix_caching.md) | [](gh-issue:3687) ||||||||
-| [LoRA](lora.md) |||||||||
-| <abbr title="Prompt Adapter">prmpt adptr</abbr> |||||| [](gh-issue:8475) |||
-| [SD](spec_decode.md) |||||||||
+| [APC](automatic_prefix_caching.md) | [](gh-issue:3687) ||||||||
+| [LoRA](lora.md) |||||||||
+| [SD](spec_decode.md) |||||||||
 | CUDA graph |||||||||
 | <abbr title="Pooling Models">pooling</abbr> |||||||||
 | <abbr title="Encoder-Decoder Models">enc-dec</abbr> |||||||||

pyproject.toml

Lines changed: 0 additions & 1 deletion
@@ -72,7 +72,6 @@ line-length = 80
 "vllm/core/**/*.py" = ["UP006", "UP035"]
 "vllm/engine/**/*.py" = ["UP006", "UP035"]
 "vllm/executor/**/*.py" = ["UP006", "UP035"]
-"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
 "vllm/worker/**/*.py" = ["UP006", "UP035"]
 # Python 3.8 typing - skip utils for ROCm
 "vllm/utils/__init__.py" = ["UP006", "UP035"]

tests/entrypoints/openai/test_completion.py

Lines changed: 30 additions & 42 deletions
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # imports for guided decoding tests
 import json
+import os
 import shutil
 from tempfile import TemporaryDirectory
 from typing import Optional
@@ -26,10 +27,6 @@
 # technically these adapters use a different base model,
 # but we're not testing generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
-PA_NAME = "swapnilbp/llama_tweet_ptune"
-# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
-# need to change to match the prompt adapter
-PA_NUM_VIRTUAL_TOKENS = 8
 
 GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 
@@ -56,13 +53,7 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files):
 
 
 @pytest.fixture(scope="module")
-def zephyr_pa_files():
-    return snapshot_download(repo_id=PA_NAME)
-
-
-@pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
-                        zephyr_pa_files):
+def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -81,15 +72,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
         "64",
         "--max-cpu-loras",
         "2",
-        # pa config
-        "--enable-prompt-adapter",
-        "--prompt-adapters",
-        f"zephyr-pa={zephyr_pa_files}",
-        f"zephyr-pa2={zephyr_pa_files}",
-        "--max-prompt-adapters",
-        "2",
-        "--max-prompt-adapter-token",
-        "128",
     ]
 
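The arguments deleted above registered two prompt adapters (`zephyr-pa`, `zephyr-pa2`) with the OpenAI-compatible server through the now-removed `--enable-prompt-adapter`, `--prompt-adapters`, `--max-prompt-adapters`, and `--max-prompt-adapter-token` flags. For context, a client selected an adapter the same way these tests select a LoRA: by passing the registered name as the `model`. A minimal sketch of that removed request path, assuming a locally running server launched with the old flags (the URL and API key are placeholders):

```python
# Sketch of the pre-removal usage; it no longer works after this commit
# because V0 prompt-adapter support is gone.
import asyncio

import openai


async def main():
    # Placeholder endpoint; in these tests RemoteOpenAIServer wires this up.
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    completion = await client.completions.create(
        model="zephyr-pa",  # adapter name registered via --prompt-adapters
        prompt="Hello, my name is",
        max_tokens=5,
    )
    print(completion.choices[0].text)


asyncio.run(main())
```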

@@ -98,8 +80,19 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
 def server(default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
-        yield remote_server
+
+    original_value = os.environ.get('VLLM_USE_V1')
+    os.environ['VLLM_USE_V1'] = '0'
+    try:
+        with RemoteOpenAIServer(MODEL_NAME,
+                                default_server_args) as remote_server:
+            yield remote_server
+    finally:
+        # Restore original env value
+        if original_value is None:
+            os.environ.pop('VLLM_USE_V1', None)
+        else:
+            os.environ['VLLM_USE_V1'] = original_value
 
 
 @pytest_asyncio.fixture
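The rewritten `server` fixture pins these tests to the V0 engine by exporting `VLLM_USE_V1=0` for the lifetime of the module and restoring the previous value in the `finally` block. A roughly equivalent formulation could use `pytest.MonkeyPatch.context()`, which undoes the change automatically and is usable from module-scoped fixtures where the function-scoped `monkeypatch` fixture is unavailable. This is only an alternative sketch (reusing `MODEL_NAME` and `RemoteOpenAIServer` from this test module); the commit itself uses the explicit `os.environ` handling shown above.

```python
# Alternative sketch only; not what this commit does.
import pytest


@pytest.fixture(scope="module")
def server(default_server_args, request):
    if request.param:
        default_server_args.append(request.param)

    with pytest.MonkeyPatch.context() as mp:
        # Force the V0 engine; reverted automatically when the context exits.
        mp.setenv("VLLM_USE_V1", "0")
        with RemoteOpenAIServer(MODEL_NAME,
                                default_server_args) as remote_server:
            yield remote_server
```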
@@ -110,14 +103,11 @@ async def client(server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras, then test prompt adapters
-    "model_name,num_virtual_tokens",
-    [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
-     ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
-     ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
+    # first test base model, then test loras
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
-async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
-                                 num_virtual_tokens: int):
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     completion = await client.completions.create(model=model_name,
                                                  prompt="Hello, my name is",
                                                  max_tokens=5,
@@ -130,9 +120,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
     assert len(choice.text) >= 5
     assert choice.finish_reason == "length"
     assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5,
-        prompt_tokens=6 + num_virtual_tokens,
-        total_tokens=11 + num_virtual_tokens)
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
 
     # test using token IDs
     completion = await client.completions.create(
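The usage assertion collapses to fixed numbers because the removed adapters prepended virtual tokens to every prompt: with `PA_NUM_VIRTUAL_TOKENS = 8`, the old `zephyr-pa` cases expected `prompt_tokens = 6 + 8 = 14` and `total_tokens = 11 + 8 = 19`, while the base-model and LoRA cases were already 6 and 11. With the adapter parametrizations gone, the constant `prompt_tokens=6, total_tokens=11` holds for every remaining `model_name`.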
@@ -175,9 +163,9 @@ async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras, then test prompt adapters
+    # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
 async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -194,9 +182,9 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # just test 1 lora and 1 pa hereafter
+    # just test 1 lora
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -217,7 +205,7 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -238,7 +226,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                             model_name: str):
@@ -314,7 +302,7 @@ async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_completion_streaming(client: openai.AsyncOpenAI,
                                     model_name: str):
@@ -348,7 +336,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
     """Streaming for parallel sampling.
@@ -382,7 +370,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                          model_name: str):
@@ -519,7 +507,7 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
     # test both text and token IDs

tests/entrypoints/openai/test_return_tokens_as_ids.py

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@
 from .test_completion import default_server_args  # noqa: F401
 from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
 from .test_completion import zephyr_lora_files  # noqa: F401
-from .test_completion import zephyr_pa_files  # noqa: F401
 from .test_completion import MODEL_NAME
 
 
tests/entrypoints/openai/test_serving_models.py

Lines changed: 1 addition & 2 deletions
@@ -32,8 +32,7 @@ async def _async_serving_models_init() -> OpenAIServingModels:
     serving_models = OpenAIServingModels(engine_client=mock_engine_client,
                                          base_model_paths=BASE_MODEL_PATHS,
                                          model_config=mock_model_config,
-                                         lora_modules=None,
-                                         prompt_adapters=None)
+                                         lora_modules=None)
     await serving_models.init_static_loras()
 
     return serving_models

tests/prompt_adapter/test_bloom.py

Lines changed: 0 additions & 48 deletions
This file was deleted.

tests/prompt_adapter/test_multi_adapter_inference.py

Lines changed: 0 additions & 56 deletions
This file was deleted.
