Commit 51383bd

[CI] Accelerate mteb test by setting SentenceTransformers mteb score to a constant (vllm-project#24088)
Signed-off-by: wang.yuqi <[email protected]>
1 parent 9c99e48 commit 51383bd

17 files changed: +83, -52 lines

tests/entrypoints/openai/correctness/test_mteb_embed.py

Lines changed: 3 additions & 1 deletion

@@ -37,4 +37,6 @@ def test_mteb_embed(server):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_EMBED_TOL
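
Why one-sided: a minimal sketch (hypothetical scores, not from an actual run) of the difference between the old two-sided pytest.approx check and the new one-sided check. The one-sided form no longer fails when vLLM happens to score better than the SentenceTransformers reference.

# Hypothetical values for illustration only.
MTEB_EMBED_TOL = 1e-4
st_main_score = 0.7793    # reference (SentenceTransformers) score
vllm_main_score = 0.7795  # vLLM score, slightly *better* than the reference

# Old, two-sided check: |st - vllm| = 2e-4 > MTEB_EMBED_TOL, so it would
# fail even though vLLM is the better of the two:
#     assert st_main_score == pytest.approx(vllm_main_score,
#                                           abs=MTEB_EMBED_TOL)

# New, one-sided check: passes, since vLLM is not worse by >= MTEB_EMBED_TOL.
assert st_main_score - vllm_main_score < MTEB_EMBED_TOL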

tests/entrypoints/openai/correctness/test_mteb_score.py

Lines changed: 15 additions & 16 deletions

@@ -6,16 +6,19 @@
 
 # yapf conflicts with isort for this block
 # yapf: disable
-from tests.models.language.pooling.mteb_utils import (
-    MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
-    RerankClientMtebEncoder, ScoreClientMtebEncoder,
-    mteb_test_rerank_models_hf, run_mteb_rerank)
+from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
+                                                      MTEB_RERANK_TASKS,
+                                                      MTEB_RERANK_TOL,
+                                                      RerankClientMtebEncoder,
+                                                      ScoreClientMtebEncoder,
+                                                      run_mteb_rerank)
 # yapf: enable
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 
 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+st_main_score = 0.33457
 
 
 @pytest.fixture(scope="module")
@@ -29,15 +32,7 @@ def server():
     yield remote_server
 
 
-@pytest.fixture(scope="module")
-def st_main_score(hf_runner):
-    # The main score related to the version of the dependency.
-    # So we need to recalculate every time.
-    main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
-    return main_score
-
-
-def test_mteb_score(server, st_main_score):
+def test_mteb_score(server):
     url = server.url_for("score")
     encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@@ -47,10 +42,12 @@ def test_mteb_score(server, st_main_score):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
 
 
-def test_mteb_rerank(server, st_main_score):
+def test_mteb_rerank(server):
     url = server.url_for("rerank")
     encoder = RerankClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@@ -60,4 +57,6 @@ def test_mteb_rerank(server, st_main_score):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
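
The fixture-to-constant move above is the core of the speedup: the module-scoped st_main_score fixture recomputed the SentenceTransformers baseline on every CI run, while the new module-level constant records it once. A minimal sketch with simplified names and hypothetical values (the tolerance below is illustrative, not the real MTEB_RERANK_TOL):

st_main_score = 0.33457  # recorded baseline; no per-run recomputation

def test_mteb_score_sketch():
    # Previously a module-scoped pytest fixture recomputed st_main_score
    # via mteb_test_rerank_models_hf(hf_runner, MODEL_NAME) on every run.
    vllm_main_score = 0.335  # hypothetical score returned by the server
    MTEB_RERANK_TOL = 1e-4   # illustrative tolerance
    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL

test_mteb_score_sketch()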

tests/models/language/pooling/embed_utils.py

Lines changed: 1 addition & 4 deletions

@@ -35,10 +35,7 @@ def correctness_test_embed_models(hf_runner,
                                   example_prompts,
                                   vllm_extra_kwargs=None,
                                   hf_model_callback=None):
-    if not model_info.enable_test:
-        # A model family has many models with the same architecture,
-        # and we don't need to test each one.
-        pytest.skip("Skipping test.")
+    pytest.skip("Debug only, ci prefers to use mteb test.")
 
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"

tests/models/language/pooling/mteb_utils.py

Lines changed: 24 additions & 12 deletions

@@ -18,7 +18,7 @@
 # - Different model results in differences more than 1e-3
 # 1e-4 is a good tolerance threshold
 MTEB_EMBED_TASKS = ["STS12"]
-MTEB_EMBED_TOL = 0.02
+MTEB_EMBED_TOL = 1e-4
 
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
@@ -192,22 +192,28 @@ def mteb_test_embed_models(hf_runner,
                                               MTEB_EMBED_TASKS)
         vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
 
-    with hf_runner(model_info.name,
-                   is_sentence_transformer=True,
-                   dtype="float32") as hf_model:
+    if model_info.mteb_score is None:
+        with hf_runner(model_info.name,
+                       is_sentence_transformer=True,
+                       dtype="float32") as hf_model:
 
-        if hf_model_callback is not None:
-            hf_model_callback(hf_model)
+            if hf_model_callback is not None:
+                hf_model_callback(hf_model)
 
-        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
-        st_dtype = next(hf_model.model.parameters()).dtype
+            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+            st_dtype = next(hf_model.model.parameters()).dtype
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
 
     print("Model:", model_info.name)
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol
 
 
 def run_mteb_rerank(cross_encoder, tasks, languages):
@@ -310,12 +316,18 @@ def mteb_test_rerank_models(hf_runner,
                                       languages=MTEB_RERANK_LANGS)
         vllm_dtype = model_config.dtype
 
-    st_main_score, st_dtype = mteb_test_rerank_models_hf(
-        hf_runner, model_info.name, hf_model_callback)
+    if model_info.mteb_score is None:
+        st_main_score, st_dtype = mteb_test_rerank_models_hf(
+            hf_runner, model_info.name, hf_model_callback)
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
 
     print("Model:", model_info.name)
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol
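
The branching introduced here is the generic pattern: when a model entry carries a precomputed mteb_score, the slow SentenceTransformers baseline run is skipped entirely. A self-contained sketch of the idea, with a simplified stand-in for the real model-info class and a stubbed slow path:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ModelInfoSketch:  # simplified stand-in for EmbedModelInfo
    name: str
    mteb_score: Optional[float] = None  # recorded baseline, if any

def reference_score(info: ModelInfoSketch):
    if info.mteb_score is None:
        # Slow path: would load the model and run the MTEB task
        # (run_mteb_embed_task in the real helper); stubbed out here.
        raise NotImplementedError("baseline recomputation stubbed out")
    # Fast path taken in CI: compare against the recorded constant.
    return info.mteb_score, "Constant"

score, dtype = reference_score(ModelInfoSketch("BAAI/bge-base-en", 0.779336792))
assert (score, dtype) == (0.779336792, "Constant")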

tests/models/language/pooling/test_baai.py

Lines changed: 4 additions & 0 deletions

@@ -12,6 +12,7 @@
     ########## BertModel
     CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
                              architecture="BertModel",
+                             mteb_score=0.779336792,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
                              architecture="BertModel",
@@ -52,10 +53,12 @@
     ########## XLMRobertaModel
     CLSPoolingEmbedModelInfo("BAAI/bge-m3",
                              architecture="XLMRobertaModel",
+                             mteb_score=0.787343078,
                              enable_test=True),
     ########## Qwen2Model
     LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
                               architecture="Qwen2Model",
+                              mteb_score=0.75724465,
                               dtype="float32",
                               enable_test=True),
 ]
@@ -65,6 +68,7 @@
     CLSPoolingRerankModelInfo(
         "BAAI/bge-reranker-base",
         architecture="XLMRobertaForSequenceClassification",
+        mteb_score=0.32398,
        enable_test=True),
     CLSPoolingRerankModelInfo(
         "BAAI/bge-reranker-large",

tests/models/language/pooling/test_bge_reranker_v2_gemma.py

Lines changed: 1 addition & 2 deletions

@@ -104,7 +104,6 @@ class GemmaMtebEncoder(VllmMtebEncoder):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.prompt = PROMPT
         self.query_template = "A: {query}\n"
         self.document_template = "B: {doc}\n{prompt}"
 
@@ -119,7 +118,7 @@ def predict(
         _sentences = []
         for query, corpus, prompt in sentences:
             query = self.query_template.format(query=query)
-            corpus = self.document_template.format(doc=corpus, prompt=prompt)
+            corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
             _sentences.append((query, corpus, prompt))
 
         return super().predict(_sentences, *args, **kwargs)
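
A small, self-contained illustration of the template fix (the PROMPT text below is a placeholder, not necessarily the constant defined in this file): after the change, every document is rendered with the module-level PROMPT instead of the per-sample prompt that MTEB passes along, so all pairs share the same instruction.

# Placeholder instruction; the real PROMPT constant lives in this test file.
PROMPT = "Determine whether passage B answers query A: 'Yes' or 'No'."

query_template = "A: {query}\n"
document_template = "B: {doc}\n{prompt}"

query, corpus, per_sample_prompt = ("what is MTEB?",
                                    "MTEB is an embedding benchmark.",
                                    "some task-specific prompt")

formatted_query = query_template.format(query=query)
# After the fix: always format with the module-level PROMPT,
# not per_sample_prompt.
formatted_corpus = document_template.format(doc=corpus, prompt=PROMPT)
print(formatted_query + formatted_corpus)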

tests/models/language/pooling/test_cross_encoder.py

Lines changed: 2 additions & 0 deletions

@@ -8,8 +8,10 @@
 
 RERANK_MODELS = [
     CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
+                              mteb_score=0.32898,
                               architecture="BertForSequenceClassification"),
     LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
+                               mteb_score=0.25736,
                                architecture="Qwen3ForSequenceClassification")
 ]
 

tests/models/language/pooling/test_embedding.py

Lines changed: 1 addition & 4 deletions

@@ -7,7 +7,7 @@
 from vllm.config import PoolerConfig
 from vllm.platforms import current_platform
 
-from ...utils import check_embeddings_close, check_transformers_version
+from ...utils import check_embeddings_close
 
 
 @pytest.mark.parametrize(
@@ -30,7 +30,6 @@
         pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         # [Cross-Encoder]
         pytest.param("sentence-transformers/stsb-roberta-base-v2"),
     ],
@@ -42,8 +41,6 @@ def test_models(
     model,
     monkeypatch,
 ) -> None:
-    if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        check_transformers_version(model, max_transformers_version="4.53.2")
 
     if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention

tests/models/language/pooling/test_gte.py

Lines changed: 15 additions & 11 deletions

@@ -5,13 +5,14 @@
 
 from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
                       EmbedModelInfo, LASTPoolingEmbedModelInfo,
-                      RerankModelInfo, check_transformers_version)
+                      RerankModelInfo)
 from .embed_utils import correctness_test_embed_models
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
     CLSPoolingEmbedModelInfo("thenlper/gte-large",
+                             mteb_score=0.76807651,
                              architecture="BertModel",
                              enable_test=True),
     CLSPoolingEmbedModelInfo("thenlper/gte-base",
@@ -30,28 +31,37 @@
                              architecture="BertModel",
                              enable_test=False),
     ########### NewModel
+    # These three architectures are almost the same, but not exactly the same.
+    # For example,
+    # - whether to use token_type_embeddings
+    # - whether to use context expansion
+    # So only test one (the most widely used) model
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
                              architecture="GteNewModel",
+                             mteb_score=0.775074696,
                              hf_overrides={"architectures": ["GteNewModel"]},
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
                              architecture="GteNewModel",
                              hf_overrides={"architectures": ["GteNewModel"]},
-                             enable_test=True),
+                             enable_test=False),
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
                              architecture="GteNewModel",
                              hf_overrides={"architectures": ["GteNewModel"]},
-                             enable_test=True),
+                             enable_test=False),
     ########### Qwen2ForCausalLM
     LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+                              mteb_score=0.758473459018872,
                               architecture="Qwen2ForCausalLM",
                               enable_test=True),
     ########## ModernBertModel
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
+                             mteb_score=0.748193353,
                              architecture="ModernBertModel",
                              enable_test=True),
     ########## Qwen3ForCausalLM
     LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
+                              mteb_score=0.771163695,
                               architecture="Qwen3ForCausalLM",
                               dtype="float32",
                               enable_test=True),
@@ -65,10 +75,12 @@
     CLSPoolingRerankModelInfo(
         # classifier_pooling: mean
         "Alibaba-NLP/gte-reranker-modernbert-base",
+        mteb_score=0.33386,
         architecture="ModernBertForSequenceClassification",
         enable_test=True),
     CLSPoolingRerankModelInfo(
         "Alibaba-NLP/gte-multilingual-reranker-base",
+        mteb_score=0.33062,
         architecture="GteNewForSequenceClassification",
         hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
         enable_test=True),
@@ -78,21 +90,13 @@
 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        check_transformers_version(model_info.name,
-                                   max_transformers_version="4.53.2")
-
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_correctness(hf_runner, vllm_runner,
                                   model_info: EmbedModelInfo,
                                   example_prompts) -> None:
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        check_transformers_version(model_info.name,
-                                   max_transformers_version="4.53.2")
-
     correctness_test_embed_models(hf_runner, vllm_runner, model_info,
                                   example_prompts)
 
tests/models/language/pooling/test_intfloat.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
########## BertModel
1111
CLSPoolingEmbedModelInfo("intfloat/e5-small",
1212
architecture="BertModel",
13+
mteb_score=0.742285423,
1314
enable_test=True),
1415
CLSPoolingEmbedModelInfo("intfloat/e5-base",
1516
architecture="BertModel",
@@ -23,6 +24,7 @@
2324
########## XLMRobertaModel
2425
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
2526
architecture="XLMRobertaModel",
27+
mteb_score=0.779325955,
2628
enable_test=True),
2729
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
2830
architecture="XLMRobertaModel",
@@ -36,7 +38,7 @@
3638
@pytest.mark.parametrize("model_info", MODELS)
3739
def test_embed_models_mteb(hf_runner, vllm_runner,
3840
model_info: EmbedModelInfo) -> None:
39-
mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
41+
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
4042

4143

4244
@pytest.mark.parametrize("model_info", MODELS)
