
Commit 62485cc

Creating standardized APIs for in-framework and HF deployment (#302)
Signed-off-by: Pranav Prashant Thombre <pthombre@nvidia.com>
1 parent 31b71c0 commit 62485cc

16 files changed: +1151 -922 lines

nemo_deploy/deploy_ray.py

Lines changed: 355 additions & 23 deletions
Large diffs are not rendered by default.

nemo_deploy/nlp/hf_deployable_ray.py

Lines changed: 18 additions & 29 deletions
@@ -15,7 +15,7 @@
 
 import logging
 import time
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import numpy as np
 import torch
@@ -62,8 +62,8 @@ def __init__(
         task: str = "text-generation",
         trust_remote_code: bool = True,
         model_id: str = "nemo-model",
-        device_map: str = "auto",
-        max_memory: str = None,
+        device_map: Optional[str] = None,
+        max_memory: Optional[str] = None,
     ):
         """Initialize the HuggingFace model deployment.
 
@@ -81,7 +81,7 @@ def __init__(
         """
         try:
             max_memory_dict = None
-            self._setup_unique_distributed_parameters(device_map)
+            self._setup_unique_distributed_parameters()
             if device_map == "balanced":
                 if not max_memory:
                     raise ValueError("max_memory must be provided when device_map is 'balanced'")
@@ -102,29 +102,25 @@ def __init__(
             LOGGER.error(f"Error initializing HuggingFaceLLMServe replica: {str(e)}")
             raise
 
-    def _setup_unique_distributed_parameters(self, device_map):
+    def _setup_unique_distributed_parameters(self):
         """Configure unique distributed communication parameters for each model replica.
 
         This function sets up unique MASTER_PORT environment variables for each Ray Serve
         replica to ensure they can initialize their own torch.distributed process groups
-        without port conflicts. Only runs for 'balanced' or 'auto' device maps.
-
-        Args:
-            device_map (str): The device mapping strategy ('auto', 'balanced', etc.)
+        without port conflicts.
         """
-        if device_map == "balanced" or device_map == "auto":
-            import os
+        import os
 
-            import torch.distributed as dist
+        import torch.distributed as dist
 
-            # Check if torch.distributed is already initialized
-            if not dist.is_initialized():
-                # Get a unique port based on current process ID to avoid conflicts
+        # Check if torch.distributed is already initialized
+        if not dist.is_initialized():
+            # Get a unique port based on current process ID to avoid conflicts
 
-                unique_port = find_available_port(29500, "127.0.0.1")
-                # Set environment variables for torch.distributed
-                os.environ["MASTER_ADDR"] = "127.0.0.1"
-                os.environ["MASTER_PORT"] = str(unique_port)
+            unique_port = find_available_port(29500, "127.0.0.1")
+            # Set environment variables for torch.distributed
+            os.environ["MASTER_ADDR"] = "127.0.0.1"
+            os.environ["MASTER_PORT"] = str(unique_port)
 
     @app.post("/v1/completions/")
     async def completions(self, request: Dict[Any, Any]):
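find_available_port is imported from elsewhere in the package; its implementation is not part of this diff. As a hedged sketch of what such a helper commonly does (an assumption, not the repository's actual code), it scans upward from a starting port until a socket can be bound, which is what lets every Ray Serve replica claim its own MASTER_PORT:

# Sketch only: not the repository's find_available_port implementation.
import socket


def find_available_port(start_port: int, host: str) -> int:
    """Return the first port >= start_port that can be bound on host."""
    port = start_port
    while True:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind((host, port))
                return port
            except OSError:
                port += 1


print(find_available_port(29500, "127.0.0.1"))  # e.g. 29500 if it is free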
@@ -267,16 +263,9 @@ async def chat_completions(self, request: Dict[Any, Any]):
             prompt = "\n".join([f"{msg.get('role', 'user')}: {msg.get('content', '')}" for msg in messages])
             prompt += "\nassistant:"
 
-            # Create a modified request with the prompt
-            chat_request = request.copy()
-            chat_request["prompt"] = prompt
-
-            # Extract parameters from the request dictionary
-            messages = request.get("messages", [])
-
-            # Prepare inference parameters
+            # Prepare inference parameters using the formatted prompt
             inference_inputs = {
-                "prompts": [messages],  # Wrap messages in a list so apply_chat_template gets the full conversation
+                "prompts": [prompt],  # Use formatted prompt string instead of raw messages
                 "max_length": request.get("max_tokens", 256),
                 "temperature": request.get("temperature", 1.0),
                 "top_k": request.get("top_k", 0),
@@ -330,7 +319,7 @@ async def chat_completions(self, request: Dict[Any, Any]):
                 ),
                 "finish_reason": (
                     "length"
-                    if generated_texts and len(generated_texts[0]) >= inference_inputs["max_length"]
+                    if generated_texts and len(generated_texts[0]) >= request.get("max_tokens", 256)
                     else "stop"
                 ),
             }
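Once a replica is running, these handlers are plain JSON-over-HTTP endpoints. A minimal client call might look like the sketch below; the host, port, and the exact chat route are assumptions (only the /v1/completions/ route is visible in this diff), while the field names (messages, max_tokens, temperature, top_k) come from the handler above.

# Assumed URL; adjust host, port, and route to match your deployment.
import requests

payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 64,
    "temperature": 1.0,
    "top_k": 0,
}
resp = requests.post("http://0.0.0.0:1024/v1/chat/completions/", json=payload, timeout=120)
print(resp.json())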

nemo_deploy/nlp/megatronllm_deployable_ray.py

Lines changed: 20 additions & 12 deletions
@@ -16,7 +16,7 @@
 import logging
 import os
 import time
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import numpy as np
 import ray
@@ -53,6 +53,8 @@ def __init__(
         enable_cuda_graphs: bool = False,
         enable_flash_decode: bool = False,
         legacy_ckpt: bool = False,
+        max_batch_size: int = 32,
+        random_seed: Optional[int] = None,
     ):
         # Use replica-specific environment variables to avoid conflicts
         os.environ["MASTER_PORT"] = master_port
@@ -82,6 +84,8 @@ def __init__(
             enable_cuda_graphs=enable_cuda_graphs,
             enable_flash_decode=enable_flash_decode,
             legacy_ckpt=legacy_ckpt,
+            max_batch_size=max_batch_size,
+            random_seed=random_seed,
         )
         if rank != 0:
             self.model.generate_other_ranks()
@@ -111,7 +115,6 @@ def __init__(
         self,
         nemo_checkpoint_filepath: str,
         num_gpus: int = 1,
-        num_nodes: int = 1,
         tensor_model_parallel_size: int = 1,
         pipeline_model_parallel_size: int = 1,
         context_parallel_size: int = 1,
@@ -120,13 +123,14 @@ def __init__(
         enable_cuda_graphs: bool = False,
         enable_flash_decode: bool = False,
         legacy_ckpt: bool = False,
+        max_batch_size: int = 32,
+        random_seed: Optional[int] = None,
     ):
         """Initialize the distributed Megatron LLM model deployment.
 
         Args:
             nemo_checkpoint_filepath (str): Path to the .nemo checkpoint file.
-            num_gpus (int): Number of GPUs to use per replica.
-            num_nodes (int): Number of nodes to use for deployment.
+            num_gpus (int): Number of GPUs to use for the deployment
             tensor_model_parallel_size (int): Size of tensor model parallelism.
             pipeline_model_parallel_size (int): Size of pipeline model parallelism.
             context_parallel_size (int): Size of context parallelism.
@@ -136,16 +140,16 @@ def __init__(
             max_batch_size (int): Maximum batch size for request batching.
             batch_wait_timeout_s (float): Maximum time to wait for batching requests.
             legacy_ckpt (bool): Whether to use legacy checkpoint format. Defaults to False.
+            random_seed (int): Random seed for model initialization.
         """
         try:
             self.model_id = model_id
-            world_size = num_gpus * num_nodes
 
             # Validate parallelism configuration
             total_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
-            if total_parallel_size != world_size:
+            if total_parallel_size != num_gpus:
                 raise ValueError(
-                    f"Total parallelism size ({total_parallel_size}) must equal total GPUs per replica ({world_size})"
+                    f"Total parallelism size ({total_parallel_size}) must equal total GPUs per replica ({num_gpus})"
                 )
 
             # Generate a unique replica ID based on the actor handle
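The validation above is plain arithmetic: with num_nodes gone, one replica's world size is exactly num_gpus, so the product of the parallelism sizes has to cover it. A quick check with illustrative values:

# Example values only; the check mirrors the hunk above.
num_gpus = 8
tensor_model_parallel_size = 4
pipeline_model_parallel_size = 2
context_parallel_size = 1

total_parallel_size = (
    tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
)
if total_parallel_size != num_gpus:
    raise ValueError(
        f"Total parallelism size ({total_parallel_size}) must equal total GPUs per replica ({num_gpus})"
    )
print("parallelism layout is valid")  # 4 * 2 * 1 == 8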
@@ -165,7 +169,7 @@ def __init__(
             rank_0_worker = ModelWorker.remote(
                 nemo_checkpoint_filepath=nemo_checkpoint_filepath,
                 rank=0,
-                world_size=world_size,
+                world_size=num_gpus,
                 tensor_model_parallel_size=tensor_model_parallel_size,
                 pipeline_model_parallel_size=pipeline_model_parallel_size,
                 context_parallel_size=context_parallel_size,
@@ -175,6 +179,8 @@ def __init__(
                 enable_cuda_graphs=enable_cuda_graphs,
                 enable_flash_decode=enable_flash_decode,
                 legacy_ckpt=legacy_ckpt,
+                max_batch_size=max_batch_size,
+                random_seed=random_seed,
             )
             worker_futures.append(rank_0_worker)
 
@@ -184,11 +190,11 @@ def __init__(
             time.sleep(1)  # Give rank 0 time to start the distributed backend
 
             # Create remaining workers in parallel
-            for rank in range(1, world_size):
+            for rank in range(1, num_gpus):
                 worker = ModelWorker.remote(
                     nemo_checkpoint_filepath=nemo_checkpoint_filepath,
                     rank=rank,
-                    world_size=world_size,
+                    world_size=num_gpus,
                     tensor_model_parallel_size=tensor_model_parallel_size,
                     pipeline_model_parallel_size=pipeline_model_parallel_size,
                     context_parallel_size=context_parallel_size,
@@ -197,17 +203,19 @@ def __init__(
                     replica_id=replica_id,
                     enable_cuda_graphs=enable_cuda_graphs,
                     enable_flash_decode=enable_flash_decode,
+                    max_batch_size=max_batch_size,
+                    random_seed=random_seed,
                 )
                 worker_futures.append(worker)
 
             # Wait for all workers to be created and store them
             self.workers = worker_futures
-            LOGGER.info(f"Replica {replica_id} - All {world_size} workers created successfully")
+            LOGGER.info(f"Replica {replica_id} - All {num_gpus} workers created successfully")
 
             # Primary worker for coordinating inference
             self.primary_worker = self.workers[0]
 
-            LOGGER.info(f"Replica {replica_id} - Initialized {world_size} model workers across {num_nodes} nodes")
+            LOGGER.info(f"Replica {replica_id} - Initialized {num_gpus} model workers")
 
         except Exception as e:
             LOGGER.error(f"Error initializing distributed model deployment: {str(e)}")

scripts/deploy/nlp/deploy_ray_hf.py

Lines changed: 26 additions & 64 deletions
@@ -14,21 +14,12 @@
 
 import argparse
 import logging
-import multiprocessing
-import signal
-import sys
 
 from nemo_deploy.deploy_ray import DeployRay
-from nemo_deploy.nlp.hf_deployable_ray import HFRayDeployable
 
 LOGGER = logging.getLogger("NeMo")
 
 
-def get_available_cpus():
-    """Get the total number of available CPUs in the system."""
-    return multiprocessing.cpu_count()
-
-
 def parse_args():
     """Parse command line arguments."""
     parser = argparse.ArgumentParser(description="Deploy a HuggingFace model using Ray")
@@ -52,7 +43,7 @@ def parse_args():
     parser.add_argument(
         "--device_map",
         type=str,
-        default="auto",
+        default=None,
         help="Device mapping strategy for model placement",
     )
     parser.add_argument(
@@ -77,7 +68,7 @@ def parse_args():
         "--port",
         type=int,
         default=1024,
-        help="Port number to use for the Ray Serve server",
+        help="Port number to use for the Ray Serve server. If None, an available port will be found automatically.",
     )
     parser.add_argument(
         "--num_cpus",
@@ -114,83 +105,54 @@ def parse_args():
         default=8,
         help="Number of CPUs per model replica",
     )
+    parser.add_argument(
+        "--max_ongoing_requests",
+        type=int,
+        default=10,
+        help="Maximum number of ongoing requests per replica",
+    )
     parser.add_argument(
         "--cuda_visible_devices",
         type=str,
-        default="0,1",
+        default="0",
         help="Comma-separated list of CUDA visible devices",
     )
     return parser.parse_args()
 
 
-def signal_handler(signum, frame, deployer):
-    """Handle interrupt signals."""
-    LOGGER.info("Received interrupt signal. Shutting down gracefully...")
-    deployer.stop()
-    sys.exit(0)
-
-
 def main():
+    """Main function to deploy HuggingFace model using the updated DeployRay API."""
     args = parse_args()
 
-    # If num_cpus is not specified, use all available CPUs
-    if args.num_cpus is None:
-        args.num_cpus = get_available_cpus()
-        LOGGER.error(f"Using all available CPUs: {args.num_cpus}")
-
-    # Initialize Ray deployment
+    # Initialize Ray deployment with host, port, and runtime environment
    ray_deployer = DeployRay(
        num_cpus=args.num_cpus,
        num_gpus=args.num_gpus,
        include_dashboard=args.include_dashboard,
+        host=args.host,
+        port=args.port,
        runtime_env={
            "env_vars": {
                "CUDA_VISIBLE_DEVICES": args.cuda_visible_devices,
            }
        },
    )
 
-    # Set up signal handlers
-    signal.signal(signal.SIGINT, lambda signum, frame: signal_handler(signum, frame, ray_deployer))
-    signal.signal(
-        signal.SIGTERM,
-        lambda signum, frame: signal_handler(signum, frame, ray_deployer),
+    # Deploy the HuggingFace model using the new API
+    # This method handles the complete deployment lifecycle internally
+    ray_deployer.deploy_huggingface_model(
+        hf_model_id_path=args.model_path,
+        task=args.task,
+        trust_remote_code=args.trust_remote_code,
+        device_map=args.device_map,
+        max_memory=args.max_memory,
+        model_id=args.model_id,
+        num_replicas=args.num_replicas,
+        num_cpus_per_replica=args.num_cpus_per_replica,
+        num_gpus_per_replica=args.num_gpus_per_replica,
+        max_ongoing_requests=args.max_ongoing_requests,
    )
 
-    try:
-        # Start Ray Serve
-        ray_deployer.start(host=args.host, port=args.port)
-
-        # Create the HuggingFace model deployment
-        app = HFRayDeployable.options(
-            num_replicas=args.num_replicas,
-            ray_actor_options={
-                "num_gpus": args.num_gpus_per_replica,
-                "num_cpus": args.num_cpus_per_replica,
-            },
-        ).bind(
-            hf_model_id_path=args.model_path,
-            task=args.task,
-            trust_remote_code=args.trust_remote_code,
-            model_id=args.model_id,
-            device_map=args.device_map,
-            max_memory=args.max_memory,
-        )
-
-        # Deploy the model
-        ray_deployer.run(app, args.model_id)
-
-        LOGGER.info(f"Model deployed successfully at {args.host}:{args.port}")
-        LOGGER.info("Press Ctrl+C to stop the deployment")
-
-        # Keep the script running
-        while True:
-            signal.pause()
-    except Exception as e:
-        LOGGER.error(f"Error during deployment: {str(e)}")
-        ray_deployer.stop()
-        sys.exit(1)
-
 
 if __name__ == "__main__":
     main()
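Taken together, the script now reduces to a DeployRay constructor plus one deploy_huggingface_model call. A condensed, programmatic equivalent of main() above, with illustrative hard-coded arguments (the model path is a placeholder, and the host and dashboard defaults are assumptions since their argparse defaults fall outside the rendered hunks):

# Condensed sketch of main() above with example argument values.
from nemo_deploy.deploy_ray import DeployRay

ray_deployer = DeployRay(
    num_cpus=8,
    num_gpus=1,
    include_dashboard=False,     # assumed default
    host="0.0.0.0",              # assumed default
    port=1024,
    runtime_env={"env_vars": {"CUDA_VISIBLE_DEVICES": "0"}},
)

# Handles Serve startup, replica creation, and the deployment lifecycle internally.
ray_deployer.deploy_huggingface_model(
    hf_model_id_path="/path/to/hf_model_or_hub_id",  # placeholder
    task="text-generation",
    trust_remote_code=True,
    device_map=None,
    max_memory=None,
    model_id="nemo-model",
    num_replicas=1,
    num_cpus_per_replica=8,
    num_gpus_per_replica=1,
    max_ongoing_requests=10,
)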
