Commit 2554b27

[V0 Deprecation] Remove pooling model support in V0 (vllm-project#23434)
Signed-off-by: Woosuk Kwon <[email protected]>
Signed-off-by: Max de Bayser <[email protected]>
Co-authored-by: Woosuk Kwon <[email protected]>
1 parent 934bebf commit 2554b27

38 files changed: +99 −808 lines changed
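
Most of the hunks below make the same mechanical change: they delete a per-module autouse v1 fixture whose only job was to run each pooling test under both engines via a shared run_with_both_engines fixture. The following is a rough sketch of the pattern being removed; the run_with_both_engines helper is assumed to live in a conftest.py and to switch engines, e.g. by toggling the VLLM_USE_V1 environment variable, and its exact implementation is not part of this diff.

import pytest


# Assumed conftest.py helper (not shown in this commit): parametrizes the
# engine major version, e.g. by toggling VLLM_USE_V1 for each test run.
@pytest.fixture(params=["0", "1"])
def run_with_both_engines(request, monkeypatch):
    monkeypatch.setenv("VLLM_USE_V1", request.param)
    yield


# The per-module wrapper that this commit deletes: an autouse fixture whose
# only purpose is to pull run_with_both_engines into every test in the module.
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    pass

With V0 pooling support removed, the wrapper has nothing left to toggle, so each pooling test module simply drops it.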

tests/distributed/test_pipeline_parallel.py

Lines changed: 6 additions & 2 deletions
@@ -118,6 +118,8 @@ def fast(
         multi_node_only: bool = False,
         load_format: Optional[str] = None,
     ):
+        vllm_major_versions = ["1"] if runner == "pooling" else ["0"]
+
         return PPTestSettings(
             parallel_setups=[
                 ParallelSetup(tp_size=tp_base,
@@ -126,7 +128,7 @@ def fast(
                               chunked_prefill=False),
             ],
             distributed_backends=["mp"],
-            vllm_major_versions=["0"],
+            vllm_major_versions=vllm_major_versions,
             runner=runner,
             test_options=PPTestOptions(multi_node_only=multi_node_only,
                                        load_format=load_format),
@@ -213,7 +215,9 @@ def iter_params(self, model_id: str):
 EMBEDDING_MODELS = { # type: ignore[var-annotated]
     # [Text-only]
     "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
+    # TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883
+    # is fixed
+    #"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
     "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
         load_format="dummy", runner="pooling"
     ),
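
Net effect of the first two hunks above: pipeline-parallel settings built with runner="pooling" are now pinned to the V1 engine, while other runners keep V0 in this test matrix. A minimal sketch of the selection follows; the helper name is illustrative, since in the real code the expression sits inline in PPTestSettings.fast.

def select_vllm_major_versions(runner: str) -> list[str]:
    # Pooling tests must run on V1 now that V0 pooling support is removed;
    # every other runner still defaults to V0 in this test matrix.
    return ["1"] if runner == "pooling" else ["0"]


assert select_vllm_major_versions("pooling") == ["1"]
assert select_vllm_major_versions("generate") == ["0"]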

tests/entrypoints/llm/test_classify.py

Lines changed: 0 additions & 8 deletions
@@ -16,14 +16,6 @@
 prompts = ["The chef prepared a delicious meal."]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to

tests/entrypoints/llm/test_encode.py

Lines changed: 0 additions & 8 deletions
@@ -27,14 +27,6 @@
 ]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to

tests/entrypoints/llm/test_reward.py

Lines changed: 0 additions & 8 deletions
@@ -16,14 +16,6 @@
 prompts = ["The chef prepared a delicious meal."]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to

tests/entrypoints/llm/test_score.py

Lines changed: 0 additions & 8 deletions
@@ -14,14 +14,6 @@
 MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to

tests/entrypoints/offline_mode/test_offline_mode.py

Lines changed: 10 additions & 9 deletions
@@ -32,15 +32,16 @@
         "tensor_parallel_size": 1,
         "tokenizer_mode": "mistral",
     },
-    {
-        "model": "sentence-transformers/all-MiniLM-L12-v2",
-        "enforce_eager": True,
-        "gpu_memory_utilization": 0.20,
-        "max_model_len": 64,
-        "max_num_batched_tokens": 64,
-        "max_num_seqs": 64,
-        "tensor_parallel_size": 1,
-    },
+    # TODO: re-enable once these tests are run with V1
+    # {
+    #     "model": "sentence-transformers/all-MiniLM-L12-v2",
+    #     "enforce_eager": True,
+    #     "gpu_memory_utilization": 0.20,
+    #     "max_model_len": 64,
+    #     "max_num_batched_tokens": 64,
+    #     "max_num_seqs": 64,
+    #     "tensor_parallel_size": 1,
+    # },
 ]


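Each entry in the list above is a set of keyword arguments for constructing an LLM; the commented-out sentence-transformers entry would presumably be consumed roughly as follows once the offline-mode tests run under V1 again. This is assumed usage only, since the test body itself is not shown in this diff.

from vllm import LLM

# The entry commented out above, kept here purely as an illustration of what
# the offline-mode test would construct once it is re-enabled under V1.
config = {
    "model": "sentence-transformers/all-MiniLM-L12-v2",
    "enforce_eager": True,
    "gpu_memory_utilization": 0.20,
    "max_model_len": 64,
    "max_num_batched_tokens": 64,
    "max_num_seqs": 64,
    "tensor_parallel_size": 1,
}
llm = LLM(**config)  # the offline-mode tests run with HF hub access disabled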
tests/entrypoints/openai/test_embedding.py

Lines changed: 0 additions & 8 deletions
@@ -24,14 +24,6 @@
 DTYPE = "bfloat16"


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def server():
     args = [

tests/entrypoints/openai/test_rerank.py

Lines changed: 0 additions & 8 deletions
@@ -14,14 +14,6 @@
 DTYPE = "bfloat16"


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def server():
     args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

tests/entrypoints/openai/test_score.py

Lines changed: 0 additions & 9 deletions
@@ -12,15 +12,6 @@

 from ...utils import RemoteOpenAIServer

-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 MODELS = [
     {
         "name": "BAAI/bge-reranker-v2-m3",

tests/models/language/pooling/test_embedding.py

Lines changed: 3 additions & 17 deletions
@@ -10,14 +10,6 @@
 from ...utils import check_embeddings_close, check_transformers_version


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.mark.parametrize(
     "model",
     [
@@ -32,21 +24,15 @@ def v1(run_with_both_engines):
            "intfloat/e5-mistral-7b-instruct",
            # CPU v1 doesn't support sliding window
            marks=[pytest.mark.core_model]),
-        # the qwen models interfere with each other (see PR
-        # https://github.com/vllm-project/vllm/pull/18720).
-        # To avoid this problem, for now we skip v0 since it will be
-        # deprecated anyway.
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
-                     marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
+                     marks=[pytest.mark.cpu_model]),
         # [Encoder-only]
         pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-                     marks=[pytest.mark.skip_v1]),
+        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         # [Cross-Encoder]
-        pytest.param("sentence-transformers/stsb-roberta-base-v2",
-                     marks=[pytest.mark.skip_v1]),
+        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
 )
 def test_models(
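
Since the pooling tests now run only on V1, the skip_v0 and skip_v1 marks removed above no longer serve a purpose for these parametrizations. For reference, engine-specific skip marks like these are typically honoured by a collection hook along the following lines; this is assumed wiring shown only as a sketch, and the project's actual conftest.py is not part of this diff.

import os

import pytest


def pytest_collection_modifyitems(config, items):
    # Skip tests marked for the engine version that is not being exercised.
    running_v1 = os.environ.get("VLLM_USE_V1", "1") == "1"
    for item in items:
        if running_v1 and item.get_closest_marker("skip_v1"):
            item.add_marker(pytest.mark.skip(reason="not run on V1"))
        if not running_v1 and item.get_closest_marker("skip_v0"):
            item.add_marker(pytest.mark.skip(reason="not run on V0"))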
