
Commit 84cf78a

[Model] Pooling models default to using chunked prefill & prefix caching if supported. (#20930)
Signed-off-by: wang.yuqi <[email protected]>
1 parent 16fb668 commit 84cf78a
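
What this means in practice: a pooling model picks up prefix caching (and chunked prefill, where the pooling type allows it) without any cache flags. A minimal sketch, assuming a build that includes this commit; the model name and the llm_engine.cache_config access are borrowed from the tests below, and the embed() output access is illustrative.

from vllm import LLM

# Pooling models now default to prefix caching when the architecture
# supports it; no explicit enable_prefix_caching=True is needed.
llm = LLM(model="Qwen/Qwen3-Embedding-0.6B", runner="pooling")
print(llm.llm_engine.cache_config.enable_prefix_caching)

# Requests that share a prompt prefix can now reuse cached KV blocks.
outputs = llm.embed(["shared prefix: query one", "shared prefix: query two"])
print(len(outputs[0].outputs.embedding))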

31 files changed (+452, -261 lines)

tests/entrypoints/llm/test_classify.py

Lines changed: 6 additions & 0 deletions
@@ -65,3 +65,9 @@ def get_outputs(activation):
     assert torch.allclose(
         softmax(wo_activation), w_activation, atol=1e-2
     ), "w_activation should be close to activation(wo_activation)."
+
+
+def test_encode_api(llm: LLM):
+    err_msg = "pooling_task must be one of.+"
+    with pytest.raises(ValueError, match=err_msg):
+        llm.encode(prompts, use_tqdm=False)
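
The new test pins down the encode() validation path: on a model whose pooler does not support the requested task, the call raises instead of silently falling back. A standalone sketch of the same check, with the fixture's llm and prompts replaced by local stand-ins; the classifier checkpoint is the one used elsewhere in this PR's tests, and the classify() output access is an assumption about the current LLM API.

import pytest
from vllm import LLM

# Hypothetical stand-ins for the test fixtures.
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")
prompts = ["vLLM is a high-throughput inference engine."]

# classify() maps to a pooling task this model supports.
print(llm.classify(prompts)[0].outputs.probs)

# A bare encode() has no valid pooling task here and is rejected.
with pytest.raises(ValueError, match="pooling_task must be one of.+"):
    llm.encode(prompts, use_tqdm=False)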

tests/entrypoints/openai/test_classification.py

Lines changed: 15 additions & 0 deletions
@@ -211,3 +211,18 @@ async def get_outputs(activation):
     assert torch.allclose(
         F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2
     ), "w_activation should be close to activation(wo_activation)."
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_pooling(server: RemoteOpenAIServer, model_name: str):
+    # pooling api uses ALL pooling, which does not support chunked prefill.
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": "test",
+            "encoding_format": "float"
+        },
+    )
+    assert response.json()["error"]["type"] == "BadRequestError"

tests/models/language/pooling/mteb_utils.py

Lines changed: 10 additions & 2 deletions
@@ -177,9 +177,12 @@ def mteb_test_embed_models(hf_runner,
                      max_model_len=None,
                      **vllm_extra_kwargs) as vllm_model:
 
+        model_config = vllm_model.llm.llm_engine.model_config
+
         if model_info.architecture:
-            assert (model_info.architecture
-                    in vllm_model.llm.llm_engine.model_config.architectures)
+            assert model_info.architecture in model_config.architectures
+        assert (model_config._model_info.default_pooling_type ==
+                model_info.default_pooling_type)
 
         vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                               MTEB_EMBED_TASKS)
@@ -286,7 +289,12 @@ def mteb_test_rerank_models(hf_runner,
                      **vllm_extra_kwargs) as vllm_model:
 
         model_config = vllm_model.llm.llm_engine.model_config
+
+        if model_info.architecture:
+            assert (model_info.architecture in model_config.architectures)
         assert model_config.hf_config.num_labels == 1
+        assert (model_config._model_info.default_pooling_type ==
+                model_info.default_pooling_type)
 
         vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
                                           tasks=MTEB_RERANK_TASKS,
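
Both MTEB helpers now also pin the pooling type that vLLM resolved for the checkpoint. The same introspection can be done outside the harness; a short sketch, noting that _model_info is a private attribute, used here only because the updated assertions rely on it.

from vllm import LLM

llm = LLM(model="BAAI/bge-base-en", runner="pooling", max_model_len=512)
model_config = llm.llm_engine.model_config

# The registry records the model's default pooling type, e.g. "CLS"
# for BERT-style embedders and "LAST" for decoder-style ones.
print(model_config._model_info.default_pooling_type)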
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModelForSequenceClassification
+
+from tests.models.language.pooling.embed_utils import (
+    run_embedding_correctness_test)
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["jason9693/Qwen2.5-1.5B-apeach"],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_classify_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+
+    example_prompts = example_prompts * 2
+
+    with vllm_runner(model,
+                     max_model_len=512,
+                     dtype=dtype,
+                     enable_prefix_caching=True) as vllm_model:
+        cache_config = vllm_model.llm.llm_engine.cache_config
+        assert cache_config.enable_prefix_caching
+        vllm_outputs = vllm_model.classify(example_prompts)
+
+    with hf_runner(model,
+                   dtype=dtype,
+                   auto_cls=AutoModelForSequenceClassification) as hf_model:
+        hf_outputs = hf_model.classify(example_prompts)
+
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        hf_output = torch.tensor(hf_output)
+        vllm_output = torch.tensor(vllm_output)
+
+        assert torch.allclose(hf_output, vllm_output,
+                              1e-3 if dtype == "float" else 1e-2)
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["Qwen/Qwen3-Embedding-0.6B"],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_embed_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+):
+    example_prompts = [str(s).strip() for s in example_prompts] * 2
+
+    with vllm_runner(
+            model,
+            runner="pooling",
+            max_model_len=None,
+            enable_prefix_caching=True,
+    ) as vllm_model:
+        cache_config = vllm_model.llm.llm_engine.cache_config
+        assert cache_config.enable_prefix_caching
+        vllm_outputs = vllm_model.embed(example_prompts)
+
+    with hf_runner(
+            model,
+            is_sentence_transformer=True,
+    ) as hf_model:
+        run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "intfloat/e5-small",
+        "Alibaba-NLP/gte-Qwen2-1.5B-instruct",  # is_causal == False
+        "papluca/xlm-roberta-base-language-detection",
+    ])
+@pytest.mark.parametrize("dtype", ["half"])
+def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str,
+                           dtype: str) -> None:
+    with vllm_runner(model,
+                     max_model_len=512,
+                     dtype=dtype,
+                     enable_prefix_caching=True) as vllm_model:
+        cache_config = vllm_model.llm.llm_engine.cache_config
+        assert not cache_config.enable_prefix_caching
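
The last test documents the fallback rather than the feature: a bidirectional (non-causal) encoder cannot reuse causal KV blocks, so even an explicit enable_prefix_caching=True is overridden. A minimal sketch of that behavior, assuming a build with this commit.

from vllm import LLM

# e5-small is a BERT-style bidirectional encoder; prefix caching is
# requested here but the engine is expected to disable it.
llm = LLM(model="intfloat/e5-small",
          max_model_len=512,
          enable_prefix_caching=True)
assert not llm.llm_engine.cache_config.enable_prefix_caching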

tests/models/language/pooling/test_baai.py

Lines changed: 61 additions & 56 deletions
@@ -2,73 +2,78 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from ...utils import EmbedModelInfo, RerankModelInfo
+from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
+                      EmbedModelInfo, LASTPoolingEmbedModelInfo,
+                      RerankModelInfo)
 from .embed_utils import correctness_test_embed_models
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
-    EmbedModelInfo("BAAI/bge-base-en",
-                   architecture="BertModel",
-                   enable_test=True),
-    EmbedModelInfo("BAAI/bge-base-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-en",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-en",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-zh",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-zh-noinstruct",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-base-en-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-base-zh-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-en-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-small-zh-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-en-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
-    EmbedModelInfo("BAAI/bge-large-zh-v1.5",
-                   architecture="BertModel",
-                   enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
+                             architecture="BertModel",
+                             enable_test=True),
+    CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-small-en",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-small-zh",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-large-en",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-large-zh",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-noinstruct",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-base-en-v1.5",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-base-zh-v1.5",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-small-en-v1.5",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-small-zh-v1.5",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-large-en-v1.5",
+                             architecture="BertModel",
+                             enable_test=False),
+    CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-v1.5",
+                             architecture="BertModel",
+                             enable_test=False),
     ########## XLMRobertaModel
-    EmbedModelInfo("BAAI/bge-m3",
-                   architecture="XLMRobertaModel",
-                   enable_test=True),
+    CLSPoolingEmbedModelInfo("BAAI/bge-m3",
+                             architecture="XLMRobertaModel",
+                             enable_test=True),
     ########## Qwen2Model
-    EmbedModelInfo("BAAI/bge-code-v1",
-                   architecture="Qwen2Model",
-                   dtype="float32",
-                   enable_test=True),
+    LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
+                              architecture="Qwen2Model",
+                              dtype="float32",
+                              enable_test=True),
 ]
 
 RERANK_MODELS = [
     ########## XLMRobertaForSequenceClassification
-    RerankModelInfo("BAAI/bge-reranker-base",
-                    architecture="XLMRobertaForSequenceClassification",
-                    enable_test=True),
-    RerankModelInfo("BAAI/bge-reranker-large",
-                    architecture="XLMRobertaForSequenceClassification",
-                    enable_test=False),
-    RerankModelInfo("BAAI/bge-reranker-v2-m3",
-                    architecture="XLMRobertaForSequenceClassification",
-                    enable_test=False)
+    CLSPoolingRerankModelInfo(
+        "BAAI/bge-reranker-base",
+        architecture="XLMRobertaForSequenceClassification",
+        enable_test=True),
+    CLSPoolingRerankModelInfo(
+        "BAAI/bge-reranker-large",
+        architecture="XLMRobertaForSequenceClassification",
+        enable_test=False),
+    CLSPoolingRerankModelInfo(
+        "BAAI/bge-reranker-v2-m3",
+        architecture="XLMRobertaForSequenceClassification",
+        enable_test=False)
 ]
 
 
tests/models/language/pooling/test_bge_reranker_v2_gemma.py

Lines changed: 4 additions & 4 deletions
@@ -8,12 +8,12 @@
 
 from tests.conftest import HfRunner
 
-from .mteb_utils import (RerankModelInfo, VllmMtebEncoder,
-                         mteb_test_rerank_models)
+from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models
 
 RERANK_MODELS = [
-    RerankModelInfo("BAAI/bge-reranker-v2-gemma",
-                    architecture="GemmaForSequenceClassification"),
+    LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
+                               architecture="GemmaForSequenceClassification"),
 ]
 
 PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."  # noqa: E501

tests/models/language/pooling/test_cross_encoder.py

Lines changed: 7 additions & 5 deletions
@@ -2,13 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
+from ...utils import (CLSPoolingRerankModelInfo, LASTPoolingRerankModelInfo,
+                      RerankModelInfo)
+from .mteb_utils import mteb_test_rerank_models
 
 RERANK_MODELS = [
-    RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
-                    architecture="BertForSequenceClassification"),
-    RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
-                    architecture="Qwen3ForSequenceClassification")
+    CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
+                              architecture="BertForSequenceClassification"),
+    LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
+                               architecture="Qwen3ForSequenceClassification")
 ]
 
 
