Commit bc22a3b

Add functional tests for deployment of HF models via vllm with Ray (#326)
Signed-off-by: Abhishree <abhishreetm@gmail.com>
1 parent c9e0a22 commit bc22a3b

File tree

4 files changed: +474 -3 lines changed

nemo_deploy/nlp/hf_deployable.py

Lines changed: 1 addition & 1 deletion

@@ -294,7 +294,7 @@ def triton_infer_fn(self, **inputs: np.ndarray):
         prompts = str_ndarray2list(inputs.pop("prompts"))
         temperature = inputs.pop("temperature")[0][0] if "temperature" in inputs else 1.0
         top_k = int(inputs.pop("top_k")[0][0] if "top_k" in inputs else 1)
-        top_p = inputs.pop("top_p")[0][0] if "top_k" in inputs else 0.0
+        top_p = inputs.pop("top_p")[0][0] if "top_p" in inputs else 0
         num_tokens_to_generate = inputs.pop("max_length")[0][0] if "max_length" in inputs else 256
         output_logits = inputs.pop("output_logits")[0][0] if "output_logits" in inputs else False
         output_scores = inputs.pop("output_scores")[0][0] if "output_scores" in inputs else False
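
The one-line change above fixes a copy-paste bug: top_p was gated on "top_k" in inputs, so a caller-supplied top_p was silently replaced by the default whenever top_k was missing, and the value was never popped from inputs. It also switches the fallback from 0.0 to 0 to match the integer default used elsewhere. A minimal sketch of the before/after behaviour, using a hypothetical inputs dict in place of the real Triton request arrays:

import numpy as np

# Hypothetical Triton-style inputs: each value arrives as a 2-D array and is read via [0][0].
inputs = {"top_p": np.array([[0.9]])}

# Old condition checked the wrong key: with "top_k" absent the ternary short-circuits,
# so the supplied top_p is ignored and never popped from inputs.
top_p_old = inputs.pop("top_p")[0][0] if "top_k" in inputs else 0.0  # -> 0.0

# Fixed condition reads the value the caller actually sent.
top_p_new = inputs.pop("top_p")[0][0] if "top_p" in inputs else 0  # -> 0.9

print(top_p_old, top_p_new)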

nemo_deploy/nlp/hf_deployable_ray.py

Lines changed: 2 additions & 2 deletions

@@ -174,7 +174,7 @@ async def completions(self, request: Dict[Any, Any]):
             "max_tokens": request.get("max_tokens", 256),
             "temperature": request.get("temperature", 0.0),
             "top_k": request.get("top_k", 0),
-            "top_p": request.get("top_p", 0.0),
+            "top_p": request.get("top_p", 0),
             "output_logits": request.get("output_logits", False),
             "output_scores": request.get("output_scores", False),
         }
@@ -279,7 +279,7 @@ async def chat_completions(self, request: Dict[Any, Any]):
             "max_tokens": request.get("max_tokens", 256),
             "temperature": request.get("temperature", 1.0),
             "top_k": request.get("top_k", 0),
-            "top_p": request.get("top_p", 0.0),
+            "top_p": request.get("top_p", 0),
             "output_logits": request.get("output_logits", False),
             "output_scores": request.get("output_scores", False),
         }
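
The two hunks above make the same change in both endpoints: the default for top_p becomes 0 instead of 0.0, matching the integer default used for top_k and the fix in hf_deployable.py. Each sampling field is still read from the request body with request.get and a fallback, so clients only send the fields they want to override. A minimal sketch with a hypothetical client payload ("prompt" and the params variable name are illustrative; the sampling keys and defaults are those shown in the diff):

# Hypothetical request body for the completions endpoint; omitted fields fall back to the defaults.
request = {
    "prompt": "What is the color of a banana?",
    "max_tokens": 32,
    "temperature": 0.7,
}

# Mirrors the request.get(...) calls in the completions handler above.
params = {
    "max_tokens": request.get("max_tokens", 256),
    "temperature": request.get("temperature", 0.0),
    "top_k": request.get("top_k", 0),
    "top_p": request.get("top_p", 0),
    "output_logits": request.get("output_logits", False),
    "output_scores": request.get("output_scores", False),
}
print(params)  # {'max_tokens': 32, 'temperature': 0.7, 'top_k': 0, 'top_p': 0, 'output_logits': False, 'output_scores': False}
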
Lines changed: 200 additions & 0 deletions

@@ -0,0 +1,200 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import subprocess
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from tests.functional_tests.utils.ray_test_utils import (
    query_ray_deployment,
    terminate_deployment_process,
    wait_for_deployment_ready,
)


class TestDeployRayHFVLLM:
    def setup_method(self):
        """Setup for each test method."""
        self.deploy_proc = None

    def teardown_method(self):
        """Cleanup after each test method."""
        if self.deploy_proc is not None:
            terminate_deployment_process(self.deploy_proc)
            # Avoid double termination in case test used finally to clean up
            self.deploy_proc = None

    def test_deploy_ray_hf_vllm_backend(self):
        """Test deploying HuggingFace model with vLLM backend using Ray."""
        hf_model_path = "meta-llama/Llama-3.2-1B"

        try:
            # Run Ray deployment for HF model with vLLM backend
            self.deploy_proc = subprocess.Popen(
                [
                    "coverage",
                    "run",
                    "--data-file=/workspace/.coverage",
                    "--source=/workspace/",
                    "--parallel-mode",
                    "scripts/deploy/nlp/deploy_ray_hf.py",
                    "--model_path",
                    hf_model_path,
                    "--task",
                    "text-generation",
                    "--model_id",
                    "hf-llm-vllm",
                    "--num_gpus",
                    str(1),
                    "--host",
                    "0.0.0.0",
                    "--port",
                    str(8002),
                    "--trust_remote_code",
                    "--cuda_visible_devices",
                    "0",
                    "--use_vllm_backend",
                ]
            )
            print("HF Deployment with vLLM backend started. Waiting for it to be ready...")

            # Wait for deployment to be ready
            if not wait_for_deployment_ready(host="0.0.0.0", port=8002, max_wait_time=300):
                assert False, "Deployment failed to become ready within timeout"

            time.sleep(20)

            # Test basic completion endpoint
            output = query_ray_deployment(
                host="0.0.0.0",
                port=8002,
                model_id="hf-llm-vllm",
                prompt="What is the color of a banana?",
                max_tokens=20,
            )

            print(f"Basic completion response: {output}")

            # Check if deployment was successful
            assert output != "", "First prediction is empty"

            # Test chat completion endpoint
            output_chat = query_ray_deployment(
                host="0.0.0.0",
                port=8002,
                model_id="hf-llm-vllm",
                prompt=[{"role": "user", "content": "Hello, how are you?"}],
                max_tokens=20,
                use_chat=True,
            )
            print(f"Chat completion response: {output_chat}")

            # Check if deployment was successful
            assert output_chat != "", "Second prediction (chat) is empty"

            # Test with different temperature
            output_temp = query_ray_deployment(
                host="0.0.0.0",
                port=8002,
                model_id="hf-llm-vllm",
                prompt="Tell me a short story about a cat.",
                max_tokens=30,
                temperature=0.9,
            )
            print(f"High temperature response: {output_temp}")

            # Check if deployment was successful
            assert output_temp != "", "High temperature prediction is empty"

        finally:
            # Ensure the deployment is terminated as soon as queries complete or on failure
            if self.deploy_proc is not None:
                terminate_deployment_process(self.deploy_proc)
                self.deploy_proc = None

    def test_deploy_ray_hf_vllm_backend_with_parameters(self):
        """Test deploying HuggingFace model with vLLM backend and custom parameters."""
        hf_model_path = "meta-llama/Llama-3.2-1B"

        try:
            # Run Ray deployment for HF model with vLLM backend and custom parameters
            self.deploy_proc = subprocess.Popen(
                [
                    "coverage",
                    "run",
                    "--data-file=/workspace/.coverage",
                    "--source=/workspace/",
                    "--parallel-mode",
                    "scripts/deploy/nlp/deploy_ray_hf.py",
                    "--model_path",
                    hf_model_path,
                    "--task",
                    "text-generation",
                    "--model_id",
                    "hf-llm-vllm-params",
                    "--num_gpus",
                    str(1),
                    "--host",
                    "0.0.0.0",
                    "--port",
                    str(8003),
                    "--trust_remote_code",
                    "--cuda_visible_devices",
                    "0",
                    "--use_vllm_backend",
                    "--num_replicas",
                    str(1),
                    "--num_gpus_per_replica",
                    str(1),
                    "--num_cpus_per_replica",
                    str(4),
                    "--max_ongoing_requests",
                    str(5),
                ]
            )
            print("HF Deployment with vLLM backend and custom parameters started. Waiting for it to be ready...")

            # Wait for deployment to be ready
            if not wait_for_deployment_ready(host="0.0.0.0", port=8003, max_wait_time=300):
                assert False, "Deployment failed to become ready within timeout"

            time.sleep(20)

            # Test multiple requests to verify the deployment handles them correctly
            prompts = [
                "What is 2+2?",
                "Name a fruit that is red.",
                "What is the capital of France?",
            ]

            for i, prompt in enumerate(prompts):
                output = query_ray_deployment(
                    host="0.0.0.0",
                    port=8003,
                    model_id="hf-llm-vllm-params",
                    prompt=prompt,
                    max_tokens=15,
                    temperature=0.7,
                )
                print(f"Request {i + 1} response: {output}")
                assert output != "", f"Prediction {i + 1} is empty"

        finally:
            # Ensure the deployment is terminated as soon as queries complete or on failure
            if self.deploy_proc is not None:
                terminate_deployment_process(self.deploy_proc)
                self.deploy_proc = None
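
The tests lean on three helpers from tests/functional_tests/utils/ray_test_utils: wait_for_deployment_ready polls until the Ray Serve endpoint answers, query_ray_deployment sends a completion (or chat) request and returns the text, and terminate_deployment_process shuts the deployment down. For orientation only, a minimal sketch of what readiness polling could look like, given the (host, port, max_wait_time) -> bool signature used above; the real helper may probe a different route:

import time

import requests


def wait_for_deployment_ready_sketch(host: str, port: int, max_wait_time: int = 300) -> bool:
    """Illustrative only: poll until the Ray Serve HTTP proxy responds, or give up."""
    deadline = time.time() + max_wait_time
    url = f"http://{host}:{port}/"  # assumed base URL; the actual helper may check a specific route
    while time.time() < deadline:
        try:
            requests.get(url, timeout=5)  # any HTTP response means the proxy is up and routing
            return True
        except requests.exceptions.RequestException:
            time.sleep(5)
    return False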
