Commit 586f286

[Model] Pooling model activation supports per request control by PoolingParams (#20538)
Signed-off-by: wang.yuqi <[email protected]>
1 parent 811ac13 commit 586f286

21 files changed: +948 −173 lines changed
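
What this change enables: classification, embedding, scoring, reranking, and reward requests can now turn the final activation/normalization on or off per request instead of per deployment. A minimal offline sketch of the new switch, using the classification model and API exercised by the tests below (the output handling is illustrative, not part of this commit):

from vllm import LLM, PoolingParams

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach")
prompts = ["The chef prepared a delicious meal."]

# Default behaviour: the pooler applies its activation, same as activation=None.
with_activation = llm.classify(prompts)

# Per-request override: skip the activation and get raw scores for this call only.
without_activation = llm.classify(
    prompts, pooling_params=PoolingParams(activation=False))

print(with_activation[0].outputs.probs)
print(without_activation[0].outputs.probs)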

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

from ...models.utils import softmax

MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"

prompts = ["The chef prepared a delicious meal."]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

        del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(activation):
        outputs = llm.classify(
            prompts,
            pooling_params=PoolingParams(activation=activation),
            use_tqdm=False)
        return torch.tensor([x.outputs.probs for x in outputs])

    default = get_outputs(activation=None)
    w_activation = get_outputs(activation=True)
    wo_activation = get_outputs(activation=False)

    assert torch.allclose(default, w_activation,
                          atol=1e-2), "Default should use activation."
    assert not torch.allclose(
        w_activation, wo_activation,
        atol=1e-2), "wo_activation should not use activation."
    assert torch.allclose(
        softmax(wo_activation), w_activation, atol=1e-2
    ), "w_activation should be close to activation(wo_activation)."

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch
import torch.nn.functional as F

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

MODEL_NAME = "intfloat/multilingual-e5-small"

prompts = ["The chef prepared a delicious meal."]


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

        del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(normalize):
        outputs = llm.embed(prompts,
                            pooling_params=PoolingParams(normalize=normalize),
                            use_tqdm=False)
        return torch.tensor([x.outputs.embedding for x in outputs])

    default = get_outputs(normalize=None)
    w_normal = get_outputs(normalize=True)
    wo_normal = get_outputs(normalize=False)

    assert torch.allclose(default, w_normal,
                          atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal,
                              atol=1e-2), "wo_normal should not use normal."
    assert torch.allclose(
        w_normal, F.normalize(wo_normal, p=2, dim=-1),
        atol=1e-2), "w_normal should be close to normal(wo_normal)."

tests/entrypoints/llm/test_reward.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

from ...models.utils import softmax

MODEL_NAME = "internlm/internlm2-1_8b-reward"

prompts = ["The chef prepared a delicious meal."]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              trust_remote_code=True,
              seed=0)

    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

        del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(softmax):
        outputs = llm.reward(prompts,
                             pooling_params=PoolingParams(softmax=softmax),
                             use_tqdm=False)
        return torch.cat([x.outputs.data for x in outputs])

    default = get_outputs(softmax=None)
    w_softmax = get_outputs(softmax=True)
    wo_softmax = get_outputs(softmax=False)

    assert torch.allclose(default, w_softmax,
                          atol=1e-2), "Default should use softmax."
    assert not torch.allclose(w_softmax, wo_softmax,
                              atol=1e-2), "wo_softmax should not use softmax."
    assert torch.allclose(
        softmax(wo_softmax), w_softmax,
        atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."

tests/entrypoints/llm/test_score.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import weakref

import pytest
import torch

from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

from ...models.utils import softmax

MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.fixture(scope="module")
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,
              enforce_eager=True,
              seed=0)

    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

        del llm

    cleanup_dist_env_and_memory()


@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):

    def get_outputs(activation):
        text_1 = "What is the capital of France?"
        text_2 = "The capital of France is Paris."

        outputs = llm.score(
            text_1,
            text_2,
            pooling_params=PoolingParams(activation=activation),
            use_tqdm=False)
        return torch.tensor([x.outputs.score for x in outputs])

    default = get_outputs(activation=None)
    w_activation = get_outputs(activation=True)
    wo_activation = get_outputs(activation=False)

    assert torch.allclose(default, w_activation,
                          atol=1e-2), "Default should use activation."
    assert not torch.allclose(
        w_activation, wo_activation,
        atol=1e-2), "wo_activation should not use activation."
    assert torch.allclose(
        softmax(wo_activation), w_activation, atol=1e-2
    ), "w_activation should be close to activation(wo_activation)."

tests/entrypoints/openai/test_classification.py

Lines changed: 31 additions & 0 deletions
@@ -3,6 +3,8 @@
 
 import pytest
 import requests
+import torch
+import torch.nn.functional as F
 
 from vllm.entrypoints.openai.protocol import ClassificationResponse
 
@@ -181,3 +183,32 @@ async def test_invocations(server: RemoteOpenAIServer):
     assert classification_data.keys() == invocation_data.keys()
     assert classification_data["probs"] == pytest.approx(
         invocation_data["probs"], rel=0.01)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_activation(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["This product was excellent and exceeded my expectations"]
+
+    async def get_outputs(activation):
+        response = requests.post(server.url_for("classify"),
+                                 json={
+                                     "model": model_name,
+                                     "input": input_text,
+                                     "activation": activation
+                                 })
+        outputs = response.json()
+        return torch.tensor([x['probs'] for x in outputs["data"]])
+
+    default = await get_outputs(activation=None)
+    w_activation = await get_outputs(activation=True)
+    wo_activation = await get_outputs(activation=False)
+
+    assert torch.allclose(default, w_activation,
+                          atol=1e-2), "Default should use activation."
+    assert not torch.allclose(
+        w_activation, wo_activation,
+        atol=1e-2), "wo_activation should not use activation."
+    assert torch.allclose(
+        F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2
+    ), "w_activation should be close to activation(wo_activation)."

tests/entrypoints/openai/test_embedding.py

Lines changed: 34 additions & 0 deletions
@@ -8,6 +8,8 @@
 import pytest
 import pytest_asyncio
 import requests
+import torch
+import torch.nn.functional as F
 
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -369,3 +371,35 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
         embeddings_1_lst=[invocation_data["embedding"]],
         name_0="chat",
         name_1="invocation")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_normalize(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["The chef prepared a delicious meal."]
+
+    async def get_outputs(normalize):
+        request_args = {
+            "model": MODEL_NAME,
+            "input": input_text,
+            "encoding_format": "float",
+            "normalize": normalize
+        }
+
+        response = requests.post(server.url_for("v1/embeddings"),
+                                 json=request_args)
+        outputs = response.json()
+
+        return torch.tensor([x['embedding'] for x in outputs["data"]])
+
+    default = await get_outputs(normalize=None)
+    w_normal = await get_outputs(normalize=True)
+    wo_normal = await get_outputs(normalize=False)
+
+    assert torch.allclose(default, w_normal,
+                          atol=1e-2), "Default should use normal."
+    assert not torch.allclose(w_normal, wo_normal,
+                              atol=1e-2), "wo_normal should not use normal."
+    assert torch.allclose(
+        w_normal, F.normalize(wo_normal, p=2, dim=-1),
+        atol=1e-2), "w_normal should be close to normal(wo_normal)."

tests/entrypoints/openai/test_rerank.py

Lines changed: 38 additions & 0 deletions
@@ -3,6 +3,8 @@
 
 import pytest
 import requests
+import torch
+import torch.nn.functional as F
 
 from vllm.entrypoints.openai.protocol import RerankResponse
 
@@ -125,3 +127,39 @@ def test_invocations(server: RemoteOpenAIServer):
         assert rerank_result.keys() == invocations_result.keys()
         assert rerank_result["relevance_score"] == pytest.approx(
             invocations_result["relevance_score"], rel=0.01)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_activation(server: RemoteOpenAIServer, model_name: str):
+
+    async def get_outputs(activation):
+        query = "What is the capital of France?"
+        documents = [
+            "The capital of Brazil is Brasilia.",
+            "The capital of France is Paris."
+        ]
+
+        response = requests.post(server.url_for("rerank"),
+                                 json={
+                                     "model": model_name,
+                                     "query": query,
+                                     "documents": documents,
+                                     "activation": activation
+                                 })
+        outputs = response.json()
+
+        return torch.tensor([x['relevance_score'] for x in outputs["results"]])
+
+    default = await get_outputs(activation=None)
+    w_activation = await get_outputs(activation=True)
+    wo_activation = await get_outputs(activation=False)

+    assert torch.allclose(default, w_activation,
+                          atol=1e-2), "Default should use activation."
+    assert not torch.allclose(
+        w_activation, wo_activation,
+        atol=1e-2), "wo_activation should not use activation."
+    assert torch.allclose(
+        F.sigmoid(wo_activation), w_activation, atol=1e-2
+    ), "w_activation should be close to activation(wo_activation)."
