
Commit 0428a14

[Bugfix] Fix Dense module loading for sentence-transformers embedding models v3
Signed-off-by: FFFfff1FFFfff <[email protected]>
1 parent 61244cf commit 0428a14

File tree

6 files changed: +680 -228 lines changed

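For context on what the new tests exercise: sentence-transformers checkpoints such as TencentBAC/Conan-embedding-v1 ship an extra Dense module (declared in modules.json and stored in a directory such as 2_Dense/) that projects the pooled hidden state to a wider output dimension (1024 -> 1792 in the tests below) before normalization. The sketch that follows is illustrative only, not the vLLM loader touched by this commit; the file layout and config keys follow the usual sentence-transformers convention, and the helper name load_st_dense_projector is hypothetical.

# Illustrative sketch (not vLLM code): build the torch projector described by
# a sentence-transformers Dense module. Assumes the common ST layout of
# <repo>/2_Dense/config.json with in_features / out_features / bias and an
# activation_function given as a dotted class path (e.g. "...GELU").
# Loading the Dense weights (model.safetensors / pytorch_model.bin) into the
# Linear layer is omitted here.
import json
from pathlib import Path

import torch


def load_st_dense_projector(dense_dir: str) -> torch.nn.Sequential:
    cfg = json.loads((Path(dense_dir) / "config.json").read_text())
    linear = torch.nn.Linear(cfg["in_features"],
                             cfg["out_features"],
                             bias=cfg.get("bias", True))
    # Resolve the activation class from its dotted path; fall back to Identity.
    act_name = cfg.get("activation_function", "torch.nn.Identity")
    activation = getattr(torch.nn, act_name.rsplit(".", 1)[-1], torch.nn.Identity)()
    return torch.nn.Sequential(linear, activation)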
Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch
from unittest.mock import Mock, patch

from vllm.model_executor.layers.pooler import EmbeddingPoolerHead


def test_embedding_pooler_head_with_projector():
    """Test EmbeddingPoolerHead with projector."""
    # Create a mock projector
    projector = torch.nn.Sequential(
        torch.nn.Linear(1024, 1792),
        torch.nn.GELU()
    )

    # Create EmbeddingPoolerHead with projector
    head = EmbeddingPoolerHead(projector=projector)

    # Test with single tensor
    test_input = torch.randn(2, 1024)

    # Mock the pooling parameters
    class MockPoolingParam:
        def __init__(self):
            self.dimensions = None
            self.normalize = True

    # Mock metadata
    class MockMetadata:
        def __init__(self):
            pass

    mock_metadata = MockMetadata()

    # Mock get_pooling_params function
    with patch('vllm.model_executor.layers.pooler.get_pooling_params') as mock_get_params:
        mock_get_params.return_value = [MockPoolingParam()]

        # Call forward - this will test the projector path
        output = head(test_input, mock_metadata)

    # Should apply projector first (1024 -> 1792), then normalize
    assert output.shape == (2, 1792), f"Expected (2, 1792), got {output.shape}"

    # Check that output is normalized (L2 norm should be ~1)
    norms = torch.norm(output, p=2, dim=-1)
    assert torch.allclose(norms, torch.ones_like(norms), atol=1e-5), \
        "Output should be normalized"


def test_embedding_pooler_head_without_projector():
    """Test EmbeddingPoolerHead without projector (regression test)."""
    # Create EmbeddingPoolerHead without projector
    head = EmbeddingPoolerHead(projector=None)

    # Test with single tensor
    test_input = torch.randn(2, 1024)

    # Mock the pooling parameters
    class MockPoolingParam:
        def __init__(self):
            self.dimensions = None
            self.normalize = True

    # Mock metadata
    class MockMetadata:
        def __init__(self):
            pass

    mock_metadata = MockMetadata()

    # Mock get_pooling_params function
    with patch('vllm.model_executor.layers.pooler.get_pooling_params') as mock_get_params:
        mock_get_params.return_value = [MockPoolingParam()]

        # Call forward - this should just normalize without projection
        output = head(test_input, mock_metadata)

    # Should maintain the input dimension
    assert output.shape == (2, 1024), f"Expected (2, 1024), got {output.shape}"

    # Check that output is normalized
    norms = torch.norm(output, p=2, dim=-1)
    assert torch.allclose(norms, torch.ones_like(norms), atol=1e-5), \
        "Output should be normalized"


def test_embedding_pooler_head_dimension_mismatch():
    """Test that dimension mismatch raises an error."""
    # Create a projector expecting 512-dim input
    projector = torch.nn.Sequential(
        torch.nn.Linear(512, 1792),  # Expects 512, not 1024
        torch.nn.GELU()
    )

    head = EmbeddingPoolerHead(projector=projector)

    # Test with wrong dimension (1024 instead of 512)
    test_input = torch.randn(2, 1024)

    class MockPoolingParam:
        def __init__(self):
            self.dimensions = None
            self.normalize = True

    class MockMetadata:
        def __init__(self):
            pass

    mock_metadata = MockMetadata()

    with patch('vllm.model_executor.layers.pooler.get_pooling_params') as mock_get_params:
        mock_get_params.return_value = [MockPoolingParam()]

        # Should raise ValueError due to dimension mismatch
        with pytest.raises(ValueError, match="Dimension mismatch"):
            head(test_input, mock_metadata)


def test_embedding_pooler_head_device_handling():
    """Test that projector moves to correct device."""
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    # Create projector on CPU
    projector = torch.nn.Sequential(
        torch.nn.Linear(1024, 1792),
        torch.nn.GELU()
    )

    head = EmbeddingPoolerHead(projector=projector)

    # Test input on CUDA
    test_input = torch.randn(2, 1024).cuda()

    class MockPoolingParam:
        def __init__(self):
            self.dimensions = None
            self.normalize = True

    class MockMetadata:
        def __init__(self):
            pass

    mock_metadata = MockMetadata()

    with patch('vllm.model_executor.layers.pooler.get_pooling_params') as mock_get_params:
        mock_get_params.return_value = [MockPoolingParam()]

        output = head(test_input, mock_metadata)

    # Output should be on CUDA
    assert output.is_cuda, "Output should be on CUDA"
    assert output.shape == (2, 1792)
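The unit tests above pin down the contract this fix relies on: when EmbeddingPoolerHead is given a projector, the pooled embedding is projected first and the projected vector is then L2-normalized, on whatever device the input lives on. Below is a minimal standalone sketch of that contract in plain torch; it illustrates the expected behaviour, not the actual pooler implementation, which also honours per-request pooling params.

# Minimal sketch of the projector-then-normalize behaviour asserted above;
# per-request handling of PoolingParams (dimensions, normalize) is omitted.
from typing import Optional

import torch
import torch.nn.functional as F


def project_and_normalize(pooled: torch.Tensor,
                          projector: Optional[torch.nn.Module]) -> torch.Tensor:
    if projector is not None:
        # Move the projector onto the embedding's device before applying it,
        # mirroring what test_embedding_pooler_head_device_handling expects.
        projector = projector.to(pooled.device)
        pooled = projector(pooled)            # e.g. 1024 -> 1792
    return F.normalize(pooled, p=2, dim=-1)   # unit L2 norm, as asserted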
Lines changed: 218 additions & 0 deletions
@@ -0,0 +1,218 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any

import pytest
import numpy as np
from scipy.spatial.distance import cosine

from ...utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models


def _get_vllm_embeddings(vllm_runner, model_info: EmbedModelInfo,
                         test_texts: list[str]):
    """Helper function to get vLLM embeddings."""
    vllm_extra_kwargs: dict[str, Any] = {}
    if model_info.architecture == "GteNewModel":
        vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}

    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
                     trust_remote_code=True,
                     **vllm_extra_kwargs) as vllm_model:
        embeddings = vllm_model.encode(test_texts)

    # Extract embedding data
    data = []
    for emb in embeddings:
        if hasattr(emb, 'outputs'):
            data.append(emb.outputs.data.cpu().numpy())
        else:
            data.append(emb.cpu().numpy() if hasattr(emb, 'cpu') else emb)
    return np.array(data)


def _get_hf_embeddings(hf_runner, model_info: EmbedModelInfo,
                       test_texts: list[str]):
    """Helper function to get HuggingFace embeddings."""
    with hf_runner(model_info.name,
                   is_sentence_transformer=True,
                   dtype="float32") as hf_model:
        embeddings = hf_model.encode(test_texts)
    if hasattr(embeddings, 'cpu'):
        return embeddings.cpu().numpy()
    else:
        return np.array(embeddings)


# Test models with ST projectors (Dense layers)
ST_PROJECTOR_MODELS = [
    EmbedModelInfo("TencentBAC/Conan-embedding-v1",
                   architecture="BertModel",
                   enable_test=True),
    # Add more ST models with projectors as they become available
]

# Test models without ST projectors (for regression testing)
NON_PROJECTOR_MODELS = [
    EmbedModelInfo("thenlper/gte-large",
                   architecture="BertModel",
                   enable_test=True),
    EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
                   architecture="GteNewModel",
                   enable_test=True),
    EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
                   architecture="Qwen3ForCausalLM",
                   dtype="float32",
                   enable_test=True),
]


@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
def test_st_projector_loading(vllm_runner, model_info: EmbedModelInfo) -> None:
    """Test that ST projector models load correctly with their projectors."""
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    test_texts = ["This is a test sentence."]
    embeddings_data = _get_vllm_embeddings(vllm_runner, model_info, test_texts)

    # Check if dimensions match expected projector output
    actual_dim = embeddings_data.shape[-1]
    expected_dim = 1792
    assert actual_dim == expected_dim, f"Expected {expected_dim}, got {actual_dim}"


@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
def test_compare_with_hf_dimensions(hf_runner, vllm_runner,
                                    model_info: EmbedModelInfo) -> None:
    """Compare embedding dimensions between vLLM and HuggingFace."""
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    test_texts = ["This is a test sentence for dimension comparison."]

    # Get embeddings from both implementations
    vllm_data = _get_vllm_embeddings(vllm_runner, model_info, test_texts)
    hf_data = _get_hf_embeddings(hf_runner, model_info, test_texts)

    # Compare dimensions
    vllm_dim = vllm_data.shape[-1]
    hf_dim = hf_data.shape[-1]

    assert vllm_dim == hf_dim, \
        f"Embedding dimension mismatch: vLLM {vllm_dim} vs HF {hf_dim}"
    print(f"✓ Embedding dimensions match: {vllm_dim}")


@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
def test_embedding_numerical_similarity(hf_runner, vllm_runner,
                                        model_info: EmbedModelInfo) -> None:
    """Test numerical similarity between vLLM and HuggingFace embeddings."""
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    test_texts = [
        "This is a test sentence for numerical comparison.",
        "Another sentence to verify embedding quality.",
        "机器学习是人工智能的一个重要分支。",  # Chinese test
    ]

    # Get embeddings from both implementations
    vllm_data = _get_vllm_embeddings(vllm_runner, model_info, test_texts)
    hf_data = _get_hf_embeddings(hf_runner, model_info, test_texts)

    # Verify shape matching
    assert vllm_data.shape == hf_data.shape, \
        f"Shape mismatch: vLLM {vllm_data.shape} vs HF {hf_data.shape}"

    print(f"Embedding shape: {vllm_data.shape}")
    print(f"Embedding dimension: {vllm_data.shape[-1]}")

    # Compute similarities for each test text
    similarities = []
    for i, text in enumerate(test_texts):
        vllm_emb = vllm_data[i]
        hf_emb = hf_data[i]

        # Compute cosine similarity
        similarity = 1 - cosine(vllm_emb, hf_emb)
        similarities.append(similarity)

        print(f"Text {i+1}: '{text[:50]}{'...' if len(text) > 50 else ''}'")
        print(f"  Cosine similarity: {similarity:.6f}")

        # Verify similarity threshold
        min_similarity = 0.95
        assert similarity > min_similarity, (
            f"Text {i+1} similarity too low: {similarity:.6f} < {min_similarity}\n"
            f"vLLM norm: {np.linalg.norm(vllm_emb):.6f}, "
            f"HF norm: {np.linalg.norm(hf_emb):.6f}")

    # Verify average similarity
    avg_similarity = np.mean(similarities)
    print(f"\nAverage cosine similarity: {avg_similarity:.6f}")

    assert avg_similarity > 0.98, (
        f"Average similarity too low: {avg_similarity:.6f} < 0.98")

    print("✓ All numerical similarity tests passed!")


@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
def test_embedding_quality_checks(vllm_runner, model_info: EmbedModelInfo) -> None:
    """Test embedding quality: non-zero, non-constant, and distinct vectors."""
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    test_texts = [
        "First test sentence.",
        "Second different sentence.",
        "Completely different content here."
    ]

    embeddings_data = _get_vllm_embeddings(vllm_runner, model_info, test_texts)

    print(f"Embeddings shape: {embeddings_data.shape}")

    # Verify non-zero vectors
    for i, emb in enumerate(embeddings_data):
        norm = np.linalg.norm(emb)
        print(f"Embedding {i+1} L2 norm: {norm:.6f}")
        assert norm > 1e-6, \
            f"Embedding {i+1} is too close to zero vector: norm={norm}"

        # Verify non-constant vectors
        std = np.std(emb)
        print(f"Embedding {i+1} std: {std:.6f}")
        assert std > 1e-6, \
            f"Embedding {i+1} is too close to constant vector: std={std}"

    # Verify different texts produce different embeddings
    for i in range(len(embeddings_data)):
        for j in range(i + 1, len(embeddings_data)):
            similarity = 1 - cosine(embeddings_data[i], embeddings_data[j])
            print(f"Similarity between text {i+1} and {j+1}: {similarity:.6f}")
            # Different texts should not be too similar
            assert similarity < 0.99, \
                f"Embeddings {i+1} and {j+1} are too similar: {similarity:.6f}"

    print("✓ All embedding quality checks passed!")


# MTEB tests (currently skipped for projector models due to batch processing
# optimization pending)
@pytest.mark.parametrize("model_info", ST_PROJECTOR_MODELS)
@pytest.mark.skip(reason="Projector loading and single-sentence inference verified. MTEB batch processing optimization pending.")
def test_st_projector_models_mteb(hf_runner, vllm_runner,
                                  model_info: EmbedModelInfo) -> None:
    """Test ST models with projectors using MTEB."""
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    vllm_extra_kwargs: dict[str, Any] = {}
    if model_info.architecture == "GteNewModel":
        vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}

    mteb_test_embed_models(hf_runner, vllm_runner, model_info,
                           vllm_extra_kwargs)


@pytest.mark.parametrize("model_info", NON_PROJECTOR_MODELS)
def test_non_projector_models_mteb(hf_runner, vllm_runner,
                                   model_info: EmbedModelInfo) -> None:
    """Test models without projectors to ensure no regression."""
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    vllm_extra_kwargs: dict[str, Any] = {}
    if model_info.architecture == "GteNewModel":
        vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}

    mteb_test_embed_models(hf_runner, vllm_runner, model_info,
                           vllm_extra_kwargs)
