
Commit 3ef3a15

Adding kwarg for the in-framework deployment (#375)
Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
1 parent 8e8c263 commit 3ef3a15

File tree

6 files changed: +97 -0 lines changed

nemo_deploy/deploy_ray.py

Lines changed: 2 additions & 0 deletions
@@ -190,6 +190,7 @@ def deploy_inframework_model(
     model_type: str = "gpt",
     model_format: str = "nemo",
     micro_batch_size: Optional[int] = None,
+    **model_config_kwargs,
 ):
     """Deploy an inframework NeMo/Megatron model using Ray Serve.

@@ -274,6 +275,7 @@ def deploy_inframework_model(
         model_type=model_type,
         model_format=model_format,
         micro_batch_size=micro_batch_size,
+        **model_config_kwargs,
     )

     # Deploy the model

nemo_deploy/nlp/inference/inference_base.py

Lines changed: 7 additions & 0 deletions
@@ -279,6 +279,7 @@ def setup_model_and_tokenizer_for_inference(
     enable_flash_decode: bool = False,
     enable_cuda_graphs: bool = False,
     legacy_ckpt: bool = False,
+    **model_config_kwargs,
 ) -> Tuple[List[MegatronModule], MCoreTokenizerWrappper]:
     """Initialize a Megatron-Core model and tokenizer for inference from a NeMo-2.0 checkpoint.

@@ -311,6 +312,10 @@ def setup_model_and_tokenizer_for_inference(
 
     model_config = model_context.config
 
+    for name, value in model_config_kwargs.items():
+        if hasattr(model_config, name):
+            setattr(model_config, name, value)
+
     # Disable gradient_accumulation_fusion since its not required for inference
     # and only available with Apex. We don't support Apex for community cuda-based
     # installs.

@@ -437,6 +442,7 @@ def create_mcore_engine(
     model_type: str = "gpt",
     model_format: str = "nemo",
     micro_batch_size: Optional[int] = None,
+    **model_config_kwargs,
 ) -> Tuple[MCoreEngineWithCleanup, GPTInferenceWrapper, Union[MCoreTokenizerWrappper, MegatronTokenizer]]:
     """Set up the model, tokenizer and MCoreEngine for inference.

@@ -501,6 +507,7 @@ def create_mcore_engine(
             enable_flash_decode=enable_flash_decode,
             enable_cuda_graphs=enable_cuda_graphs,
             legacy_ckpt=legacy_ckpt,
+            **model_config_kwargs,
         )
     elif model_format == "megatron":
         modelList, tokenizer, mlm_args = setup_megatron_model_and_tokenizer_for_inference(
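
The hunk at line 312 above is the heart of the change: any extra keyword argument that reaches setup_model_and_tokenizer_for_inference is copied onto the loaded model config, but only if the config already defines an attribute of that name; unrecognized keys are silently dropped. A minimal, self-contained sketch of that pattern (DemoConfig is a hypothetical stand-in for the checkpoint's config, not NeMo's actual class):

from dataclasses import dataclass
from typing import Optional


@dataclass
class DemoConfig:
    # Hypothetical stand-in for the model config loaded from a checkpoint.
    num_layers_in_first_pipeline_stage: Optional[int] = None
    account_for_embedding_in_pipeline_split: bool = False


def apply_overrides(config, **model_config_kwargs):
    # Same filter as the commit: only touch attributes the config defines.
    for name, value in model_config_kwargs.items():
        if hasattr(config, name):
            setattr(config, name, value)
    return config


cfg = apply_overrides(
    DemoConfig(),
    num_layers_in_first_pipeline_stage=2,
    not_a_real_field=123,  # ignored: DemoConfig has no such attribute
)
print(cfg.num_layers_in_first_pipeline_stage)  # 2

Because misspelled names fall through the hasattr filter, a typo in an override is ignored rather than raised, which is worth remembering when an option appears to have no effect.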

nemo_deploy/nlp/megatronllm_deployable.py

Lines changed: 2 additions & 0 deletions
@@ -163,6 +163,7 @@ def __init__(
         model_type: str = "gpt",
         model_format: str = "nemo",
         micro_batch_size: Optional[int] = None,
+        **model_config_kwargs,
     ):
         if not HAVE_TRITON:
             raise UnavailableError(MISSING_TRITON_MSG)

@@ -195,6 +196,7 @@ def __init__(
             model_type=model_type,
             model_format=model_format,
             micro_batch_size=micro_batch_size,
+            **model_config_kwargs,
         )
         self.enable_cuda_graphs = enable_cuda_graphs
         self.max_batch_size = max_batch_size

nemo_deploy/nlp/megatronllm_deployable_ray.py

Lines changed: 5 additions & 0 deletions
@@ -60,6 +60,7 @@ def __init__(
         model_type: str = "gpt",
         model_format: str = "nemo",
         micro_batch_size: Optional[int] = None,
+        **model_config_kwargs,
     ):
         # Use replica-specific environment variables to avoid conflicts
         os.environ["MASTER_PORT"] = master_port

@@ -95,6 +96,7 @@ def __init__(
             model_type=model_type,
             model_format=model_format,
             micro_batch_size=micro_batch_size,
+            **model_config_kwargs,
         )
         if rank != 0:
             self.model.generate_other_ranks()

@@ -138,6 +140,7 @@ def __init__(
         model_type: str = "gpt",
         model_format: str = "nemo",
         micro_batch_size: Optional[int] = None,
+        **model_config_kwargs,
     ):
         """Initialize the distributed Megatron LLM model deployment.

@@ -202,6 +205,7 @@ def __init__(
                 model_type=model_type,
                 model_format=model_format,
                 micro_batch_size=micro_batch_size,
+                **model_config_kwargs,
             )
             worker_futures.append(rank_0_worker)

@@ -230,6 +234,7 @@ def __init__(
                 model_type=model_type,
                 model_format=model_format,
                 micro_batch_size=micro_batch_size,
+                **model_config_kwargs,
             )
             worker_futures.append(worker)
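
Nothing in this file inspects the new kwargs; each __init__ forwards the splat one level down until the engine setup consumes it. A toy sketch of that pass-through layering (all class and function names here are illustrative, not the real ones):

from typing import Optional


def setup_engine(micro_batch_size: Optional[int] = None, **model_config_kwargs):
    # Innermost layer: the only place the extra kwargs are consumed.
    return {"micro_batch_size": micro_batch_size, **model_config_kwargs}


class Worker:
    def __init__(self, **model_config_kwargs):
        # Middle layer: forwards the splat without inspecting it.
        self.engine = setup_engine(**model_config_kwargs)


class Deployment:
    def __init__(self, replicas: int = 1, **model_config_kwargs):
        # Outer layer: every replica receives the same overrides.
        self.workers = [Worker(**model_config_kwargs) for _ in range(replicas)]


dep = Deployment(replicas=2, account_for_loss_in_pipeline_split=True)
print(dep.workers[0].engine)
# {'micro_batch_size': None, 'account_for_loss_in_pipeline_split': True}

The cost of this convenience is that the intermediate signatures no longer document which options they accept; only the explicitly named parameters stay self-describing.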

scripts/deploy/nlp/deploy_inframework_triton.py

Lines changed: 40 additions & 0 deletions
@@ -98,6 +98,20 @@ def get_args(argv):
         type=int,
         help="Pipeline parallelism size",
     )
+    parser.add_argument(
+        "-nlfps",
+        "--num_layers_in_first_pipeline_stage",
+        default=None,
+        type=int,
+        help="Number of layers in the first pipeline stage",
+    )
+    parser.add_argument(
+        "-nllps",
+        "--num_layers_in_last_pipeline_stage",
+        default=None,
+        type=int,
+        help="Number of layers in the last pipeline stage",
+    )
     parser.add_argument(
         "-cps",
         "--context_parallel_size",

@@ -112,6 +126,20 @@ def get_args(argv):
         type=int,
         help="Distributes MoE Experts across sub data parallel dimension.",
     )
+    parser.add_argument(
+        "-eps",
+        "--account_for_embedding_in_pipeline_split",
+        default=False,
+        action="store_true",
+        help="Account for embedding in the pipeline split",
+    )
+    parser.add_argument(
+        "-lps",
+        "--account_for_loss_in_pipeline_split",
+        default=False,
+        action="store_true",
+        help="Account for loss in the pipeline split",
+    )
     parser.add_argument(
         "-mbs",
         "--max_batch_size",

@@ -203,6 +231,17 @@ def nemo_deploy(argv):
     if args.nemo_checkpoint is None:
         raise ValueError("In-Framework deployment requires a checkpoint folder.")
 
+    model_config_kwargs = {
+        "account_for_embedding_in_pipeline_split": args.account_for_embedding_in_pipeline_split,
+        "account_for_loss_in_pipeline_split": args.account_for_loss_in_pipeline_split,
+    }
+
+    if args.num_layers_in_first_pipeline_stage is not None:
+        model_config_kwargs["num_layers_in_first_pipeline_stage"] = args.num_layers_in_first_pipeline_stage
+
+    if args.num_layers_in_last_pipeline_stage is not None:
+        model_config_kwargs["num_layers_in_last_pipeline_stage"] = args.num_layers_in_last_pipeline_stage
+
     model = MegatronLLMDeployableNemo2(
         num_devices=args.num_gpus,
         num_nodes=args.num_nodes,

@@ -219,6 +258,7 @@ def nemo_deploy(argv):
         model_type=args.model_type,
         model_format=args.model_format,
         micro_batch_size=args.micro_batch_size,
+        **model_config_kwargs,
     )
 
     if torch.distributed.is_initialized():
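
The nemo_deploy hunk above builds model_config_kwargs from the four new flags, and that translation is easy to exercise in isolation. A trimmed, runnable sketch (flag names and short options copied from the diff; the sample argv is invented):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-nlfps", "--num_layers_in_first_pipeline_stage", default=None, type=int)
parser.add_argument("-nllps", "--num_layers_in_last_pipeline_stage", default=None, type=int)
parser.add_argument("-eps", "--account_for_embedding_in_pipeline_split", default=False, action="store_true")
parser.add_argument("-lps", "--account_for_loss_in_pipeline_split", default=False, action="store_true")

# Invented argv: two layers in the first stage, count the embedding.
args = parser.parse_args(["-nlfps", "2", "-eps"])

model_config_kwargs = {
    "account_for_embedding_in_pipeline_split": args.account_for_embedding_in_pipeline_split,
    "account_for_loss_in_pipeline_split": args.account_for_loss_in_pipeline_split,
}
if args.num_layers_in_first_pipeline_stage is not None:
    model_config_kwargs["num_layers_in_first_pipeline_stage"] = args.num_layers_in_first_pipeline_stage
if args.num_layers_in_last_pipeline_stage is not None:
    model_config_kwargs["num_layers_in_last_pipeline_stage"] = args.num_layers_in_last_pipeline_stage

print(model_config_kwargs)
# {'account_for_embedding_in_pipeline_split': True,
#  'account_for_loss_in_pipeline_split': False,
#  'num_layers_in_first_pipeline_stage': 2}

Note the asymmetry: the booleans always land in the dict (defaulting to False), while the layer counts are included only when given, so unset counts never clobber the checkpoint's config.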

scripts/deploy/nlp/deploy_ray_inframework.py

Lines changed: 41 additions & 0 deletions
@@ -53,6 +53,20 @@ def parse_args():
         default=1,
         help="Size of the pipeline model parallelism",
     )
+    parser.add_argument(
+        "-nlfps",
+        "--num_layers_in_first_pipeline_stage",
+        default=None,
+        type=int,
+        help="Number of layers in the first pipeline stage",
+    )
+    parser.add_argument(
+        "-nllps",
+        "--num_layers_in_last_pipeline_stage",
+        default=None,
+        type=int,
+        help="Number of layers in the last pipeline stage",
+    )
     parser.add_argument(
         "--expert_model_parallel_size",
         type=int,

@@ -65,6 +79,20 @@ def parse_args():
         default=1,
         help="Size of the context parallelism",
     )
+    parser.add_argument(
+        "-eps",
+        "--account_for_embedding_in_pipeline_split",
+        default=False,
+        action="store_true",
+        help="Account for embedding in the pipeline split",
+    )
+    parser.add_argument(
+        "-lps",
+        "--account_for_loss_in_pipeline_split",
+        default=False,
+        action="store_true",
+        help="Account for loss in the pipeline split",
+    )
     parser.add_argument(
         "--model_id",
         type=str,

@@ -184,6 +212,18 @@ def main():
         model_format = "megatron"
     else:
         raise ValueError("Either --nemo_checkpoint or --megatron_checkpoint must be provided")
+
+    model_config_kwargs = {
+        "account_for_embedding_in_pipeline_split": args.account_for_embedding_in_pipeline_split,
+        "account_for_loss_in_pipeline_split": args.account_for_loss_in_pipeline_split,
+    }
+
+    if args.num_layers_in_first_pipeline_stage is not None:
+        model_config_kwargs["num_layers_in_first_pipeline_stage"] = args.num_layers_in_first_pipeline_stage
+
+    if args.num_layers_in_last_pipeline_stage is not None:
+        model_config_kwargs["num_layers_in_last_pipeline_stage"] = args.num_layers_in_last_pipeline_stage
+
     # Deploy the inframework model using the updated API
     ray_deployer.deploy_inframework_model(
         nemo_checkpoint=args.nemo_checkpoint,

@@ -204,6 +244,7 @@ def main():
         model_type=args.model_type,
         model_format=model_format,
         micro_batch_size=args.micro_batch_size,
+        **model_config_kwargs,
     )
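
End to end, the new options can now be passed straight through the Ray entry point. A hedged sketch of such a call: the keyword names are the ones visible in this diff, while the import path, constructor, and checkpoint path are assumptions about the surrounding script:

# Assumed entry point; the diff only shows that the script calls
# ray_deployer.deploy_inframework_model(...).
from nemo_deploy.deploy_ray import DeployRay  # hypothetical import

ray_deployer = DeployRay()  # constructor arguments elided

ray_deployer.deploy_inframework_model(
    nemo_checkpoint="/path/to/checkpoint",  # placeholder path
    model_type="gpt",
    model_format="nemo",
    micro_batch_size=None,
    # New in this commit: forwarded verbatim onto the model config.
    num_layers_in_first_pipeline_stage=2,
    num_layers_in_last_pipeline_stage=2,
    account_for_embedding_in_pipeline_split=True,
    account_for_loss_in_pipeline_split=True,
)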
