Commit 0d8d86b

Add torch_dtype and default values (#466)
Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
1 parent: dcf5d18

File tree: 5 files changed (+51, −12 lines)


nemo_deploy/deploy_ray.py

Lines changed: 3 additions & 1 deletion

@@ -283,7 +283,8 @@ def deploy_huggingface_model(
     hf_model_id_path: str,
     task: str = "text-generation",
     trust_remote_code: bool = True,
-    device_map: Optional[str] = None,
+    device_map: Optional[str] = "auto",
+    torch_dtype: Optional[str] = "auto",
     max_memory: Optional[str] = None,
     model_id: str = "hf-model",
     num_replicas: int = 1,

@@ -347,6 +348,7 @@ def deploy_huggingface_model(
         task=task,
         trust_remote_code=trust_remote_code,
         device_map=device_map,
+        torch_dtype=torch_dtype,
         max_memory=max_memory,
         model_id=model_id,
         use_vllm_backend=use_vllm_backend,
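
Taken together, the two hunks thread torch_dtype through the Ray deployment path and flip device_map on by default. A minimal usage sketch, not part of the commit (the import path is assumed from the file name and the checkpoint id is illustrative):

from nemo_deploy.deploy_ray import deploy_huggingface_model  # import path assumed

# "auto" defers to the dtype recorded in the checkpoint config; passing
# "bfloat16"/"float16"/"float32" would force a specific precision instead.
deploy_huggingface_model(
    hf_model_id_path="gpt2",  # illustrative checkpoint
    torch_dtype="auto",
    device_map="auto",
)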

nemo_deploy/llm/hf_deployable.py

Lines changed: 15 additions & 4 deletions

@@ -78,6 +78,8 @@ def __init__(
         tokenizer_truncation=True,
         tokenizer_padding_side="left",
         task: Optional[str] = "text-generation",
+        torch_dtype: Optional[torch.dtype] = "auto",
+        device_map: Optional[str] = "auto",
         **hf_kwargs,
     ):
         if not HAVE_TRITON:

@@ -107,22 +109,31 @@ def __init__(
         self.tokenizer_id_path = tokenizer_id_path

         if model is None:
-            self._load(**hf_kwargs)
+            self._load(torch_dtype=torch_dtype, device_map=device_map, **hf_kwargs)

-    def _load(self, **hf_kwargs) -> None:
+    def _load(
+        self, torch_dtype: Optional[torch.dtype] = "auto", device_map: Optional[str] = "auto", **hf_kwargs
+    ) -> None:
         """Load the HuggingFace pipeline with the specified model and task.

         This method initializes the HuggingFace AutoModel classes using the provided model
         configuration and task type. It handles the model and tokenizer loading
         process.

+        Args:
+            torch_dtype (torch.dtype): Data type for the model. Defaults to "auto".
+            device_map (str): Device map for the model. Defaults to "auto".
+            **hf_kwargs: Additional keyword arguments to pass to the HuggingFace model loading.
+
         Raises:
             AssertionError: If task is not specified.
         """
         assert self.task is not None, "A task has to be given for the generation task."

         if self.task == "text-generation":
-            self.model = AutoModelForCausalLM.from_pretrained(self.hf_model_id_path, **hf_kwargs)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.hf_model_id_path, torch_dtype=torch_dtype, device_map=device_map, **hf_kwargs
+            )

             if self.hf_peft_model_id_path is not None:
                 self.model = PeftModel.from_pretrained(self.model, self.hf_peft_model_id_path)

@@ -131,7 +142,7 @@ def _load(self, **hf_kwargs) -> None:
         num_gpus = torch.cuda.device_count()
         # If there is only one GPU, move the model to GPU. If you are using device_map as "auto" or "balanced",
         # the model will be moved to GPU automatically.
-        if num_gpus == 1:
+        if device_map == None and num_gpus >= 1 and self.model.device.type != "cuda":
             self.model.cuda()
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.tokenizer_id_path,
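
Note the revised guard in the last hunk: the model is only moved to CUDA by hand when no device_map was supplied and it is not already on a GPU, since "auto"/"balanced" maps place it automatically. For reference, a standalone sketch of what the new defaults resolve to at the transformers layer (the checkpoint id is illustrative; device_map="auto" additionally requires the accelerate package):

from transformers import AutoModelForCausalLM

# torch_dtype="auto" loads weights in the dtype recorded in the checkpoint's
# config.json rather than upcasting to float32; device_map="auto" places
# (and, if necessary, shards) the model across the visible devices.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    torch_dtype="auto",
    device_map="auto",
)
print(model.dtype, model.device)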

nemo_deploy/llm/hf_deployable_ray.py

Lines changed: 9 additions & 4 deletions

@@ -63,9 +63,11 @@ def __init__(
         task: str = "text-generation",
         trust_remote_code: bool = True,
         model_id: str = "nemo-model",
-        device_map: Optional[str] = None,
         max_memory: Optional[str] = None,
         use_vllm_backend: bool = False,
+        torch_dtype: Optional[torch.dtype] = "auto",
+        device_map: Optional[str] = "auto",
+        **kwargs,
     ):
         """Initialize the HuggingFace model deployment.

@@ -78,7 +80,8 @@ def __init__(
             max_memory (str): Maximum memory allocation when using balanced device map.
             use_vllm_backend (bool, optional): Whether to use vLLM backend for deployment. If True, exports the HF ckpt
                 to vLLM format and uses vLLM backend for inference. Defaults to False.
-
+            torch_dtype (torch.dtype): Data type for the model. Defaults to "auto".
+            **kwargs: Additional keyword arguments to pass to the HuggingFace model deployment.
         Raises:
             ImportError: If Ray is not installed.
             Exception: If model initialization fails.

@@ -97,15 +100,17 @@ def __init__(
             from nemo_export.vllm_exporter import vLLMExporter

             vllm_exporter = vLLMExporter()
-            vllm_exporter.export(model_path_id=hf_model_id_path)
+            vllm_exporter.export(model_path_id=hf_model_id_path, **kwargs)
             self.model = vllm_exporter
         else:
             self.model = HuggingFaceLLMDeploy(
                 hf_model_id_path=hf_model_id_path,
                 task=task,
                 trust_remote_code=trust_remote_code,
-                device_map=device_map,
                 max_memory=max_memory_dict,
+                torch_dtype=torch_dtype,
+                device_map=device_map,
+                **kwargs,
             )
         self.model_id = model_id
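
With **kwargs now forwarded, loader-specific options reach both backends. A hedged sketch of the non-vLLM path this class wraps (module path assumed from the file tree; the checkpoint and extra kwarg are illustrative):

import torch
from nemo_deploy.llm.hf_deployable import HuggingFaceLLMDeploy  # path assumed

model = HuggingFaceLLMDeploy(
    hf_model_id_path="gpt2",        # illustrative checkpoint
    task="text-generation",
    torch_dtype=torch.bfloat16,     # forwarded to from_pretrained by _load
    device_map="auto",
    attn_implementation="sdpa",     # example hf_kwarg swept up by **hf_kwargs
)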

scripts/deploy/nlp/deploy_inframework_hf_triton.py

Lines changed: 11 additions & 1 deletion

@@ -80,10 +80,19 @@ def get_args(argv):
         "--device_map",
         nargs="?",
         choices=["auto", "balanced", "balanced_low_0", "sequential"],
-        default=None,
+        default="auto",
         type=str,
         help="Device mapping strategy for model placement (e.g. 'auto', 'sequential', etc)",
     )
+    parser.add_argument(
+        "-td",
+        "--torch_dtype",
+        nargs="?",
+        choices=["auto", "bfloat16", "float16", "float32"],
+        default="auto",
+        type=str,
+        help="Torch dtype for the model",
+    )
     parser.add_argument(
         "-tpp",
         "--tp_plan",

@@ -196,6 +205,7 @@ def hf_deploy(argv):
         task=args.task,
         trust_remote_code=args.trust_remote_code,
         device_map=args.device_map,
+        torch_dtype=args.torch_dtype,
         tp_plan=args.tp_plan,
     )
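
A hypothetical invocation exercising the new flag; only -td/--torch_dtype and --device_map are confirmed by this diff, and the model-path argument name is an assumption:

python scripts/deploy/nlp/deploy_inframework_hf_triton.py \
    --hf_model_id_path gpt2 \
    --torch_dtype bfloat16 \
    --device_map auto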

scripts/deploy/nlp/deploy_ray_hf.py

Lines changed: 13 additions & 2 deletions

@@ -40,11 +40,21 @@ def parse_args():
         action="store_true",
         help="Whether to trust remote code when loading the model",
     )
+    parser.add_argument(
+        "--torch_dtype",
+        nargs="?",
+        choices=["auto", "bfloat16", "float16", "float32"],
+        default="auto",
+        type=str,
+        help="Torch dtype for the model",
+    )
     parser.add_argument(
         "--device_map",
+        nargs="?",
+        choices=["auto", "balanced", "balanced_low_0", "sequential"],
+        default="auto",
         type=str,
-        default=None,
-        help="Device mapping strategy for model placement",
+        help="Device mapping strategy for model placement (e.g. 'auto', 'sequential', etc)",
     )
     parser.add_argument(
         "--max_memory",

@@ -149,6 +159,7 @@ def main():
         hf_model_id_path=args.model_path,
         task=args.task,
         trust_remote_code=args.trust_remote_code,
+        torch_dtype=args.torch_dtype,
         device_map=args.device_map,
         max_memory=args.max_memory,
         model_id=args.model_id,
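
An end-to-end invocation sketch built from flags visible in this diff (--model_path is inferred from args.model_path in main(); the checkpoint id is illustrative):

python scripts/deploy/nlp/deploy_ray_hf.py \
    --model_path gpt2 \
    --trust_remote_code \
    --torch_dtype auto \
    --device_map auto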
