Commit 2554b27

[V0 Deprecation] Remove pooling model support in V0 (vllm-project#23434)
Signed-off-by: Woosuk Kwon <[email protected]>
Signed-off-by: Max de Bayser <[email protected]>
Co-authored-by: Woosuk Kwon <[email protected]>
1 parent 934bebf commit 2554b27

38 files changed: +99 −808 lines changed
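
Most of the hunks below make the same mechanical change: they delete a per-module autouse v1 fixture whose only job was to run each pooling test under both engines via a shared run_with_both_engines fixture. The following is a rough sketch of the pattern being removed; the run_with_both_engines helper is assumed to live in a conftest.py and to switch engines, e.g. by toggling the VLLM_USE_V1 environment variable, and its exact implementation is not part of this diff.

import pytest


# Assumed conftest.py helper (not shown in this commit): parametrizes the
# engine major version, e.g. by toggling VLLM_USE_V1 for each test run.
@pytest.fixture(params=["0", "1"])
def run_with_both_engines(request, monkeypatch):
    monkeypatch.setenv("VLLM_USE_V1", request.param)
    yield


# The per-module wrapper that this commit deletes: an autouse fixture whose
# only purpose is to pull run_with_both_engines into every test in the module.
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    pass

With V0 pooling support removed, the wrapper has nothing left to toggle, so each pooling test module simply drops it.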

tests/distributed/test_pipeline_parallel.py

Lines changed: 6 additions & 2 deletions
@@ -118,6 +118,8 @@ def fast(
         multi_node_only: bool = False,
         load_format: Optional[str] = None,
     ):
+        vllm_major_versions = ["1"] if runner == "pooling" else ["0"]
+
         return PPTestSettings(
             parallel_setups=[
                 ParallelSetup(tp_size=tp_base,
@@ -126,7 +128,7 @@ def fast(
                               chunked_prefill=False),
             ],
             distributed_backends=["mp"],
-            vllm_major_versions=["0"],
+            vllm_major_versions=vllm_major_versions,
             runner=runner,
             test_options=PPTestOptions(multi_node_only=multi_node_only,
                                        load_format=load_format),
@@ -213,7 +215,9 @@ def iter_params(self, model_id: str):
 EMBEDDING_MODELS = { # type: ignore[var-annotated]
     # [Text-only]
     "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
+    # TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883
+    # is fixed
+    #"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
     "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
         load_format="dummy", runner="pooling"
     ),
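
Net effect of the first two hunks above: pipeline-parallel settings built with runner="pooling" are now pinned to the V1 engine, while other runners keep V0 in this test matrix. A minimal sketch of the selection follows; the helper name is illustrative, since in the real code the expression sits inline in PPTestSettings.fast.

def select_vllm_major_versions(runner: str) -> list[str]:
    # Pooling tests must run on V1 now that V0 pooling support is removed;
    # every other runner still defaults to V0 in this test matrix.
    return ["1"] if runner == "pooling" else ["0"]


assert select_vllm_major_versions("pooling") == ["1"]
assert select_vllm_major_versions("generate") == ["0"]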

tests/entrypoints/llm/test_classify.py

Lines changed: 0 additions & 8 deletions
@@ -16,14 +16,6 @@
 prompts = ["The chef prepared a delicious meal."]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to

tests/entrypoints/llm/test_encode.py

Lines changed: 0 additions & 8 deletions
@@ -27,14 +27,6 @@
 ]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to

tests/entrypoints/llm/test_reward.py

Lines changed: 0 additions & 8 deletions
@@ -16,14 +16,6 @@
 prompts = ["The chef prepared a delicious meal."]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to

tests/entrypoints/llm/test_score.py

Lines changed: 0 additions & 8 deletions
@@ -14,14 +14,6 @@
 MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to

tests/entrypoints/offline_mode/test_offline_mode.py

Lines changed: 10 additions & 9 deletions
@@ -32,15 +32,16 @@
         "tensor_parallel_size": 1,
         "tokenizer_mode": "mistral",
     },
-    {
-        "model": "sentence-transformers/all-MiniLM-L12-v2",
-        "enforce_eager": True,
-        "gpu_memory_utilization": 0.20,
-        "max_model_len": 64,
-        "max_num_batched_tokens": 64,
-        "max_num_seqs": 64,
-        "tensor_parallel_size": 1,
-    },
+    # TODO: re-enable once these tests are run with V1
+    # {
+    #     "model": "sentence-transformers/all-MiniLM-L12-v2",
+    #     "enforce_eager": True,
+    #     "gpu_memory_utilization": 0.20,
+    #     "max_model_len": 64,
+    #     "max_num_batched_tokens": 64,
+    #     "max_num_seqs": 64,
+    #     "tensor_parallel_size": 1,
+    # },
 ]


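Each entry in the list above is a set of keyword arguments for constructing an LLM; the commented-out sentence-transformers entry would presumably be consumed roughly as follows once the offline-mode tests run under V1 again. This is assumed usage only, since the test body itself is not shown in this diff.

from vllm import LLM

# The entry commented out above, kept here purely as an illustration of what
# the offline-mode test would construct once it is re-enabled under V1.
config = {
    "model": "sentence-transformers/all-MiniLM-L12-v2",
    "enforce_eager": True,
    "gpu_memory_utilization": 0.20,
    "max_model_len": 64,
    "max_num_batched_tokens": 64,
    "max_num_seqs": 64,
    "tensor_parallel_size": 1,
}
llm = LLM(**config)  # the offline-mode tests run with HF hub access disabled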
tests/entrypoints/openai/test_embedding.py

Lines changed: 0 additions & 8 deletions
@@ -24,14 +24,6 @@
 DTYPE = "bfloat16"


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def server():
     args = [

tests/entrypoints/openai/test_rerank.py

Lines changed: 0 additions & 8 deletions
@@ -14,14 +14,6 @@
 DTYPE = "bfloat16"


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def server():
     args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

tests/entrypoints/openai/test_score.py

Lines changed: 0 additions & 9 deletions
@@ -12,15 +12,6 @@

 from ...utils import RemoteOpenAIServer

-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 MODELS = [
     {
         "name": "BAAI/bge-reranker-v2-m3",

tests/models/language/pooling/test_embedding.py

Lines changed: 3 additions & 17 deletions
@@ -10,14 +10,6 @@
 from ...utils import check_embeddings_close, check_transformers_version


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.mark.parametrize(
     "model",
     [
@@ -32,21 +24,15 @@ def v1(run_with_both_engines):
            "intfloat/e5-mistral-7b-instruct",
            # CPU v1 doesn't support sliding window
            marks=[pytest.mark.core_model]),
-        # the qwen models interfere with each other (see PR
-        # https://github.com/vllm-project/vllm/pull/18720).
-        # To avoid this problem, for now we skip v0 since it will be
-        # deprecated anyway.
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
-                     marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
+                     marks=[pytest.mark.cpu_model]),
         # [Encoder-only]
         pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-                     marks=[pytest.mark.skip_v1]),
+        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         # [Cross-Encoder]
-        pytest.param("sentence-transformers/stsb-roberta-base-v2",
-                     marks=[pytest.mark.skip_v1]),
+        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
 )
 def test_models(
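
Since the pooling tests now run only on V1, the skip_v0 and skip_v1 marks removed above no longer serve a purpose for these parametrizations. For reference, engine-specific skip marks like these are typically honoured by a collection hook along the following lines; this is assumed wiring shown only as a sketch, and the project's actual conftest.py is not part of this diff.

import os

import pytest


def pytest_collection_modifyitems(config, items):
    # Skip tests marked for the engine version that is not being exercised.
    running_v1 = os.environ.get("VLLM_USE_V1", "1") == "1"
    for item in items:
        if running_v1 and item.get_closest_marker("skip_v1"):
            item.add_marker(pytest.mark.skip(reason="not run on V1"))
        if not running_v1 and item.get_closest_marker("skip_v0"):
            item.add_marker(pytest.mark.skip(reason="not run on V0"))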
