
Commit 3ef3a15

Adding kwarg for the in-framework deployment (#375)
Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
1 parent 8e8c263 commit 3ef3a15

File tree

6 files changed: +97 -0 lines changed

nemo_deploy/deploy_ray.py

Lines changed: 2 additions & 0 deletions
@@ -190,6 +190,7 @@ def deploy_inframework_model(
     model_type: str = "gpt",
     model_format: str = "nemo",
     micro_batch_size: Optional[int] = None,
+    **model_config_kwargs,
 ):
     """Deploy an inframework NeMo/Megatron model using Ray Serve.

@@ -274,6 +275,7 @@ def deploy_inframework_model(
         model_type=model_type,
         model_format=model_format,
         micro_batch_size=micro_batch_size,
+        **model_config_kwargs,
     )

     # Deploy the model

nemo_deploy/nlp/inference/inference_base.py

Lines changed: 7 additions & 0 deletions
@@ -279,6 +279,7 @@ def setup_model_and_tokenizer_for_inference(
     enable_flash_decode: bool = False,
     enable_cuda_graphs: bool = False,
     legacy_ckpt: bool = False,
+    **model_config_kwargs,
 ) -> Tuple[List[MegatronModule], MCoreTokenizerWrappper]:
     """Initialize a Megatron-Core model and tokenizer for inference from a NeMo-2.0 checkpoint.

@@ -311,6 +312,10 @@ def setup_model_and_tokenizer_for_inference(
 
     model_config = model_context.config
 
+    for name, value in model_config_kwargs.items():
+        if hasattr(model_config, name):
+            setattr(model_config, name, value)
+
     # Disable gradient_accumulation_fusion since its not required for inference
     # and only available with Apex. We don't support Apex for community cuda-based
     # installs.

@@ -437,6 +442,7 @@ def create_mcore_engine(
     model_type: str = "gpt",
     model_format: str = "nemo",
     micro_batch_size: Optional[int] = None,
+    **model_config_kwargs,
 ) -> Tuple[MCoreEngineWithCleanup, GPTInferenceWrapper, Union[MCoreTokenizerWrappper, MegatronTokenizer]]:
     """Set up the model, tokenizer and MCoreEngine for inference.

@@ -501,6 +507,7 @@ def create_mcore_engine(
             enable_flash_decode=enable_flash_decode,
             enable_cuda_graphs=enable_cuda_graphs,
             legacy_ckpt=legacy_ckpt,
+            **model_config_kwargs,
         )
     elif model_format == "megatron":
         modelList, tokenizer, mlm_args = setup_megatron_model_and_tokenizer_for_inference(
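
The hunk at line 312 above is the heart of the change: any extra keyword argument that reaches setup_model_and_tokenizer_for_inference is copied onto the loaded model config, but only if the config already defines an attribute of that name; unrecognized keys are silently dropped. A minimal, self-contained sketch of that pattern (DemoConfig is a hypothetical stand-in for the checkpoint's config, not NeMo's actual class):

from dataclasses import dataclass
from typing import Optional


@dataclass
class DemoConfig:
    # Hypothetical stand-in for the model config loaded from a checkpoint.
    num_layers_in_first_pipeline_stage: Optional[int] = None
    account_for_embedding_in_pipeline_split: bool = False


def apply_overrides(config, **model_config_kwargs):
    # Same filter as the commit: only touch attributes the config defines.
    for name, value in model_config_kwargs.items():
        if hasattr(config, name):
            setattr(config, name, value)
    return config


cfg = apply_overrides(
    DemoConfig(),
    num_layers_in_first_pipeline_stage=2,
    not_a_real_field=123,  # ignored: DemoConfig has no such attribute
)
print(cfg.num_layers_in_first_pipeline_stage)  # 2

Because misspelled names fall through the hasattr filter, a typo in an override is ignored rather than raised, which is worth remembering when an option appears to have no effect.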

nemo_deploy/nlp/megatronllm_deployable.py

Lines changed: 2 additions & 0 deletions
@@ -163,6 +163,7 @@ def __init__(
         model_type: str = "gpt",
         model_format: str = "nemo",
         micro_batch_size: Optional[int] = None,
+        **model_config_kwargs,
     ):
         if not HAVE_TRITON:
             raise UnavailableError(MISSING_TRITON_MSG)

@@ -195,6 +196,7 @@ def __init__(
             model_type=model_type,
             model_format=model_format,
             micro_batch_size=micro_batch_size,
+            **model_config_kwargs,
         )
         self.enable_cuda_graphs = enable_cuda_graphs
         self.max_batch_size = max_batch_size

nemo_deploy/nlp/megatronllm_deployable_ray.py

Lines changed: 5 additions & 0 deletions
@@ -60,6 +60,7 @@ def __init__(
         model_type: str = "gpt",
         model_format: str = "nemo",
         micro_batch_size: Optional[int] = None,
+        **model_config_kwargs,
     ):
         # Use replica-specific environment variables to avoid conflicts
         os.environ["MASTER_PORT"] = master_port

@@ -95,6 +96,7 @@ def __init__(
             model_type=model_type,
             model_format=model_format,
             micro_batch_size=micro_batch_size,
+            **model_config_kwargs,
         )
         if rank != 0:
             self.model.generate_other_ranks()

@@ -138,6 +140,7 @@ def __init__(
         model_type: str = "gpt",
         model_format: str = "nemo",
         micro_batch_size: Optional[int] = None,
+        **model_config_kwargs,
     ):
         """Initialize the distributed Megatron LLM model deployment.

@@ -202,6 +205,7 @@ def __init__(
                 model_type=model_type,
                 model_format=model_format,
                 micro_batch_size=micro_batch_size,
+                **model_config_kwargs,
             )
             worker_futures.append(rank_0_worker)

@@ -230,6 +234,7 @@ def __init__(
                 model_type=model_type,
                 model_format=model_format,
                 micro_batch_size=micro_batch_size,
+                **model_config_kwargs,
             )
             worker_futures.append(worker)
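
Nothing in this file inspects the new kwargs; each __init__ forwards the splat one level down until the engine setup consumes it. A toy sketch of that pass-through layering (all class and function names here are illustrative, not the real ones):

from typing import Optional


def setup_engine(micro_batch_size: Optional[int] = None, **model_config_kwargs):
    # Innermost layer: the only place the extra kwargs are consumed.
    return {"micro_batch_size": micro_batch_size, **model_config_kwargs}


class Worker:
    def __init__(self, **model_config_kwargs):
        # Middle layer: forwards the splat without inspecting it.
        self.engine = setup_engine(**model_config_kwargs)


class Deployment:
    def __init__(self, replicas: int = 1, **model_config_kwargs):
        # Outer layer: every replica receives the same overrides.
        self.workers = [Worker(**model_config_kwargs) for _ in range(replicas)]


dep = Deployment(replicas=2, account_for_loss_in_pipeline_split=True)
print(dep.workers[0].engine)
# {'micro_batch_size': None, 'account_for_loss_in_pipeline_split': True}

The cost of this convenience is that the intermediate signatures no longer document which options they accept; only the explicitly named parameters stay self-describing.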

scripts/deploy/nlp/deploy_inframework_triton.py

Lines changed: 40 additions & 0 deletions
@@ -98,6 +98,20 @@ def get_args(argv):
         type=int,
         help="Pipeline parallelism size",
     )
+    parser.add_argument(
+        "-nlfps",
+        "--num_layers_in_first_pipeline_stage",
+        default=None,
+        type=int,
+        help="Number of layers in the first pipeline stage",
+    )
+    parser.add_argument(
+        "-nllps",
+        "--num_layers_in_last_pipeline_stage",
+        default=None,
+        type=int,
+        help="Number of layers in the last pipeline stage",
+    )
     parser.add_argument(
         "-cps",
         "--context_parallel_size",

@@ -112,6 +126,20 @@ def get_args(argv):
         type=int,
         help="Distributes MoE Experts across sub data parallel dimension.",
     )
+    parser.add_argument(
+        "-eps",
+        "--account_for_embedding_in_pipeline_split",
+        default=False,
+        action="store_true",
+        help="Account for embedding in the pipeline split",
+    )
+    parser.add_argument(
+        "-lps",
+        "--account_for_loss_in_pipeline_split",
+        default=False,
+        action="store_true",
+        help="Account for loss in the pipeline split",
+    )
     parser.add_argument(
         "-mbs",
         "--max_batch_size",

@@ -203,6 +231,17 @@ def nemo_deploy(argv):
     if args.nemo_checkpoint is None:
         raise ValueError("In-Framework deployment requires a checkpoint folder.")
 
+    model_config_kwargs = {
+        "account_for_embedding_in_pipeline_split": args.account_for_embedding_in_pipeline_split,
+        "account_for_loss_in_pipeline_split": args.account_for_loss_in_pipeline_split,
+    }
+
+    if args.num_layers_in_first_pipeline_stage is not None:
+        model_config_kwargs["num_layers_in_first_pipeline_stage"] = args.num_layers_in_first_pipeline_stage
+
+    if args.num_layers_in_last_pipeline_stage is not None:
+        model_config_kwargs["num_layers_in_last_pipeline_stage"] = args.num_layers_in_last_pipeline_stage
+
     model = MegatronLLMDeployableNemo2(
         num_devices=args.num_gpus,
         num_nodes=args.num_nodes,

@@ -219,6 +258,7 @@ def nemo_deploy(argv):
         model_type=args.model_type,
         model_format=args.model_format,
         micro_batch_size=args.micro_batch_size,
+        **model_config_kwargs,
     )
 
     if torch.distributed.is_initialized():
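
The nemo_deploy hunk above builds model_config_kwargs from the four new flags, and that translation is easy to exercise in isolation. A trimmed, runnable sketch (flag names and short options copied from the diff; the sample argv is invented):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-nlfps", "--num_layers_in_first_pipeline_stage", default=None, type=int)
parser.add_argument("-nllps", "--num_layers_in_last_pipeline_stage", default=None, type=int)
parser.add_argument("-eps", "--account_for_embedding_in_pipeline_split", default=False, action="store_true")
parser.add_argument("-lps", "--account_for_loss_in_pipeline_split", default=False, action="store_true")

# Invented argv: two layers in the first stage, count the embedding.
args = parser.parse_args(["-nlfps", "2", "-eps"])

model_config_kwargs = {
    "account_for_embedding_in_pipeline_split": args.account_for_embedding_in_pipeline_split,
    "account_for_loss_in_pipeline_split": args.account_for_loss_in_pipeline_split,
}
if args.num_layers_in_first_pipeline_stage is not None:
    model_config_kwargs["num_layers_in_first_pipeline_stage"] = args.num_layers_in_first_pipeline_stage
if args.num_layers_in_last_pipeline_stage is not None:
    model_config_kwargs["num_layers_in_last_pipeline_stage"] = args.num_layers_in_last_pipeline_stage

print(model_config_kwargs)
# {'account_for_embedding_in_pipeline_split': True,
#  'account_for_loss_in_pipeline_split': False,
#  'num_layers_in_first_pipeline_stage': 2}

Note the asymmetry: the booleans always land in the dict (defaulting to False), while the layer counts are included only when given, so unset counts never clobber the checkpoint's config.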

scripts/deploy/nlp/deploy_ray_inframework.py

Lines changed: 41 additions & 0 deletions
@@ -53,6 +53,20 @@ def parse_args():
         default=1,
         help="Size of the pipeline model parallelism",
     )
+    parser.add_argument(
+        "-nlfps",
+        "--num_layers_in_first_pipeline_stage",
+        default=None,
+        type=int,
+        help="Number of layers in the first pipeline stage",
+    )
+    parser.add_argument(
+        "-nllps",
+        "--num_layers_in_last_pipeline_stage",
+        default=None,
+        type=int,
+        help="Number of layers in the last pipeline stage",
+    )
     parser.add_argument(
         "--expert_model_parallel_size",
         type=int,

@@ -65,6 +79,20 @@ def parse_args():
         default=1,
         help="Size of the context parallelism",
     )
+    parser.add_argument(
+        "-eps",
+        "--account_for_embedding_in_pipeline_split",
+        default=False,
+        action="store_true",
+        help="Account for embedding in the pipeline split",
+    )
+    parser.add_argument(
+        "-lps",
+        "--account_for_loss_in_pipeline_split",
+        default=False,
+        action="store_true",
+        help="Account for loss in the pipeline split",
+    )
     parser.add_argument(
         "--model_id",
         type=str,

@@ -184,6 +212,18 @@ def main():
         model_format = "megatron"
     else:
         raise ValueError("Either --nemo_checkpoint or --megatron_checkpoint must be provided")
+
+    model_config_kwargs = {
+        "account_for_embedding_in_pipeline_split": args.account_for_embedding_in_pipeline_split,
+        "account_for_loss_in_pipeline_split": args.account_for_loss_in_pipeline_split,
+    }
+
+    if args.num_layers_in_first_pipeline_stage is not None:
+        model_config_kwargs["num_layers_in_first_pipeline_stage"] = args.num_layers_in_first_pipeline_stage
+
+    if args.num_layers_in_last_pipeline_stage is not None:
+        model_config_kwargs["num_layers_in_last_pipeline_stage"] = args.num_layers_in_last_pipeline_stage
+
     # Deploy the inframework model using the updated API
     ray_deployer.deploy_inframework_model(
         nemo_checkpoint=args.nemo_checkpoint,

@@ -204,6 +244,7 @@ def main():
         model_type=args.model_type,
         model_format=model_format,
         micro_batch_size=args.micro_batch_size,
+        **model_config_kwargs,
     )
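
End to end, the new options can now be passed straight through the Ray entry point. A hedged sketch of such a call: the keyword names are the ones visible in this diff, while the import path, constructor, and checkpoint path are assumptions about the surrounding script:

# Assumed entry point; the diff only shows that the script calls
# ray_deployer.deploy_inframework_model(...).
from nemo_deploy.deploy_ray import DeployRay  # hypothetical import

ray_deployer = DeployRay()  # constructor arguments elided

ray_deployer.deploy_inframework_model(
    nemo_checkpoint="/path/to/checkpoint",  # placeholder path
    model_type="gpt",
    model_format="nemo",
    micro_batch_size=None,
    # New in this commit: forwarded verbatim onto the model config.
    num_layers_in_first_pipeline_stage=2,
    num_layers_in_last_pipeline_stage=2,
    account_for_embedding_in_pipeline_split=True,
    account_for_loss_in_pipeline_split=True,
)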
