
Commit 2129a93

New script path (#487)
Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
Parent: 5af89b5

21 files changed: +4,544 −2 lines

nemo_deploy/llm/query_llm.py

Lines changed: 5 additions & 0 deletions
@@ -101,6 +101,11 @@ def query_llm(
             apply_chat_template (bool): applies chat template if it is a chat model. Default: False
             init_timeout (float): timeout for the connection.
         """
+        if top_k is not None and top_p is not None and top_k > 0 and top_p > 0.0:
+            raise ValueError(
+                "Cannot have top_p and top_k both greater than zero. Set top_k as zero in order to set top_p > 0.0."
+            )
+
         prompts = str_list2numpy(prompts)
         inputs = {
             "prompts": prompts,
Lines changed: 252 additions & 0 deletions
@@ -0,0 +1,252 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import multiprocessing

from nemo_deploy.deploy_ray import DeployRay

LOGGER = logging.getLogger("NeMo")


def get_available_cpus():
    """Get the total number of available CPUs in the system."""
    return multiprocessing.cpu_count()


def parse_args():
    """Parse command-line arguments for the Ray deployment script."""
    parser = argparse.ArgumentParser(description="Deploy a Megatron model using Ray")
    parser.add_argument(
        "--nemo_checkpoint",
        type=str,
        default=None,
        help="Path to the .nemo checkpoint file",
    )
    parser.add_argument(
        "--num_gpus",
        type=int,
        default=1,
        help="Number of GPUs to use per node",
    )
    parser.add_argument(
        "--tensor_model_parallel_size",
        type=int,
        default=1,
        help="Size of the tensor model parallelism",
    )
    parser.add_argument(
        "--pipeline_model_parallel_size",
        type=int,
        default=1,
        help="Size of the pipeline model parallelism",
    )
    parser.add_argument(
        "-nlfps",
        "--num_layers_in_first_pipeline_stage",
        default=None,
        type=int,
        help="Number of layers in the first pipeline stage",
    )
    parser.add_argument(
        "-nllps",
        "--num_layers_in_last_pipeline_stage",
        default=None,
        type=int,
        help="Number of layers in the last pipeline stage",
    )
    parser.add_argument(
        "--expert_model_parallel_size",
        type=int,
        default=1,
        help="Size of the expert model parallelism",
    )
    parser.add_argument(
        "--context_parallel_size",
        type=int,
        default=1,
        help="Size of the context parallelism",
    )
    parser.add_argument(
        "-eps",
        "--account_for_embedding_in_pipeline_split",
        default=False,
        action="store_true",
        help="Account for embedding in the pipeline split",
    )
    parser.add_argument(
        "-lps",
        "--account_for_loss_in_pipeline_split",
        default=False,
        action="store_true",
        help="Account for loss in the pipeline split",
    )
    parser.add_argument(
        "--model_id",
        type=str,
        default="nemo-model",
        help="Identifier for the model in the API responses",
    )
    parser.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host address to bind the Ray Serve server to",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=1024,
        help="Port number to use for the Ray Serve server",
    )
    parser.add_argument(
        "--num_cpus",
        type=int,
        default=None,
        help="Number of CPUs to allocate for the Ray cluster. If None, will use all available CPUs.",
    )
    parser.add_argument(
        "--num_cpus_per_replica",
        type=float,
        default=8,
        help="Number of CPUs per model replica",
    )
    parser.add_argument(
        "--include_dashboard",
        action="store_true",
        help="Whether to include the Ray dashboard",
    )
    parser.add_argument(
        "--cuda_visible_devices",
        type=str,
        default=None,
        help="Comma-separated list of CUDA visible devices",
    )
    parser.add_argument(
        "--enable_cuda_graphs",
        action="store_true",
        help="Whether to enable CUDA graphs for faster inference",
    )
    parser.add_argument(
        "--enable_flash_decode",
        action="store_true",
        help="Whether to enable Flash Attention decode",
    )
    parser.add_argument(
        "--num_replicas",
        type=int,
        default=1,
        help="Number of replicas for the deployment",
    )
    parser.add_argument(
        "--legacy_ckpt",
        action="store_true",
        help="Whether to use legacy checkpoint format",
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=32,
        help="Maximum batch size for inference",
    )
    parser.add_argument(
        "--random_seed",
        type=int,
        default=None,
        help="Random seed for reproducible inference",
    )
    parser.add_argument(
        "--megatron_checkpoint",
        type=str,
        default=None,
        help="Path to the Megatron checkpoint file",
    )
    parser.add_argument(
        "--model_type",
        type=str,
        default="gpt",
        help="Type of model to load",
    )
    parser.add_argument(
        "--micro_batch_size",
        type=int,
        default=None,
        help="Micro batch size for model execution",
    )
    return parser.parse_args()


def main():
    """Main function to deploy a Megatron model using Ray."""
    args = parse_args()
    # Initialize Ray deployment with updated DeployRay class
    runtime_env = {}
    if args.cuda_visible_devices is not None:
        runtime_env["env_vars"] = {
            "CUDA_VISIBLE_DEVICES": args.cuda_visible_devices,
        }

    ray_deployer = DeployRay(
        num_cpus=args.num_cpus,
        num_gpus=args.num_gpus,
        include_dashboard=args.include_dashboard,
        host=args.host,
        port=args.port,
        runtime_env=runtime_env,
    )
    if args.nemo_checkpoint:
        model_format = "nemo"
    elif args.megatron_checkpoint:
        model_format = "megatron"
    else:
        raise ValueError("Either --nemo_checkpoint or --megatron_checkpoint must be provided")

    model_config_kwargs = {
        "account_for_embedding_in_pipeline_split": args.account_for_embedding_in_pipeline_split,
        "account_for_loss_in_pipeline_split": args.account_for_loss_in_pipeline_split,
    }

    if args.num_layers_in_first_pipeline_stage is not None:
        model_config_kwargs["num_layers_in_first_pipeline_stage"] = args.num_layers_in_first_pipeline_stage

    if args.num_layers_in_last_pipeline_stage is not None:
        model_config_kwargs["num_layers_in_last_pipeline_stage"] = args.num_layers_in_last_pipeline_stage

    # Deploy the inframework model using the updated API
    ray_deployer.deploy_inframework_model(
        nemo_checkpoint=args.nemo_checkpoint,
        num_gpus=args.num_gpus,
        tensor_model_parallel_size=args.tensor_model_parallel_size,
        pipeline_model_parallel_size=args.pipeline_model_parallel_size,
        expert_model_parallel_size=args.expert_model_parallel_size,
        context_parallel_size=args.context_parallel_size,
        model_id=args.model_id,
        num_cpus_per_replica=args.num_cpus_per_replica,
        num_replicas=args.num_replicas,
        enable_cuda_graphs=args.enable_cuda_graphs,
        enable_flash_decode=args.enable_flash_decode,
        legacy_ckpt=args.legacy_ckpt,
        max_batch_size=args.max_batch_size,
        random_seed=args.random_seed,
        megatron_checkpoint_filepath=args.megatron_checkpoint,
        model_type=args.model_type,
        model_format=model_format,
        micro_batch_size=args.micro_batch_size,
        **model_config_kwargs,
    )


if __name__ == "__main__":
    main()

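Usage note: the commit title refers to a new script path, but the file's path is not shown in this extract, so the script name below is illustrative only. Under that assumption, the following sketch exercises the argument parser defined above without starting Ray, to show the flags and their defaults; it assumes parse_args from the script is in scope (for example, run inside the same module).

# Illustrative only: exercise the parser without launching a deployment.
# The script name in sys.argv[0] is hypothetical; the flags are the ones
# defined in parse_args() above.
import sys

sys.argv = [
    "deploy_ray_script.py",  # hypothetical name; real path not shown in this diff
    "--nemo_checkpoint", "/path/to/model_checkpoint",
    "--num_gpus", "2",
    "--tensor_model_parallel_size", "2",
]
args = parse_args()
print(args.model_id, args.host, args.port)      # defaults: nemo-model 0.0.0.0 1024
print(args.num_replicas, args.max_batch_size)   # defaults: 1 32

From a shell, the equivalent call would pass the same flags to the script itself. Note that main() requires --nemo_checkpoint or --megatron_checkpoint to be provided and raises a ValueError when neither is given; if both are given, the NeMo checkpoint takes precedence.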