Commit 0d8d86b

Add torch_dtype and default values (#466)
Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
1 parent: dcf5d18

File tree: 5 files changed (+51, −12 lines)


nemo_deploy/deploy_ray.py

Lines changed: 3 additions & 1 deletion

@@ -283,7 +283,8 @@ def deploy_huggingface_model(
     hf_model_id_path: str,
     task: str = "text-generation",
     trust_remote_code: bool = True,
-    device_map: Optional[str] = None,
+    device_map: Optional[str] = "auto",
+    torch_dtype: Optional[str] = "auto",
     max_memory: Optional[str] = None,
     model_id: str = "hf-model",
     num_replicas: int = 1,

@@ -347,6 +348,7 @@ def deploy_huggingface_model(
         task=task,
         trust_remote_code=trust_remote_code,
         device_map=device_map,
+        torch_dtype=torch_dtype,
         max_memory=max_memory,
         model_id=model_id,
         use_vllm_backend=use_vllm_backend,
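
Taken together, the two hunks thread torch_dtype through the Ray deployment path and flip device_map on by default. A minimal usage sketch, not part of the commit (the import path is assumed from the file name and the checkpoint id is illustrative):

from nemo_deploy.deploy_ray import deploy_huggingface_model  # import path assumed

# "auto" defers to the dtype recorded in the checkpoint config; passing
# "bfloat16"/"float16"/"float32" would force a specific precision instead.
deploy_huggingface_model(
    hf_model_id_path="gpt2",  # illustrative checkpoint
    torch_dtype="auto",
    device_map="auto",
)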

nemo_deploy/llm/hf_deployable.py

Lines changed: 15 additions & 4 deletions

@@ -78,6 +78,8 @@ def __init__(
         tokenizer_truncation=True,
         tokenizer_padding_side="left",
         task: Optional[str] = "text-generation",
+        torch_dtype: Optional[torch.dtype] = "auto",
+        device_map: Optional[str] = "auto",
         **hf_kwargs,
     ):
         if not HAVE_TRITON:

@@ -107,22 +109,31 @@ def __init__(
         self.tokenizer_id_path = tokenizer_id_path

         if model is None:
-            self._load(**hf_kwargs)
+            self._load(torch_dtype=torch_dtype, device_map=device_map, **hf_kwargs)

-    def _load(self, **hf_kwargs) -> None:
+    def _load(
+        self, torch_dtype: Optional[torch.dtype] = "auto", device_map: Optional[str] = "auto", **hf_kwargs
+    ) -> None:
         """Load the HuggingFace pipeline with the specified model and task.

         This method initializes the HuggingFace AutoModel classes using the provided model
         configuration and task type. It handles the model and tokenizer loading
         process.

+        Args:
+            torch_dtype (torch.dtype): Data type for the model. Defaults to "auto".
+            device_map (str): Device map for the model. Defaults to "auto".
+            **hf_kwargs: Additional keyword arguments to pass to the HuggingFace model loading.
+
         Raises:
             AssertionError: If task is not specified.
         """
         assert self.task is not None, "A task has to be given for the generation task."

         if self.task == "text-generation":
-            self.model = AutoModelForCausalLM.from_pretrained(self.hf_model_id_path, **hf_kwargs)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.hf_model_id_path, torch_dtype=torch_dtype, device_map=device_map, **hf_kwargs
+            )

             if self.hf_peft_model_id_path is not None:
                 self.model = PeftModel.from_pretrained(self.model, self.hf_peft_model_id_path)

@@ -131,7 +142,7 @@ def _load(self, **hf_kwargs) -> None:
         num_gpus = torch.cuda.device_count()
         # If there is only one GPU, move the model to GPU. If you are using device_map as "auto" or "balanced",
         # the model will be moved to GPU automatically.
-        if num_gpus == 1:
+        if device_map == None and num_gpus >= 1 and self.model.device.type != "cuda":
             self.model.cuda()
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.tokenizer_id_path,
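
Note the revised guard in the last hunk: the model is only moved to CUDA by hand when no device_map was supplied and it is not already on a GPU, since "auto"/"balanced" maps place it automatically. For reference, a standalone sketch of what the new defaults resolve to at the transformers layer (the checkpoint id is illustrative; device_map="auto" additionally requires the accelerate package):

from transformers import AutoModelForCausalLM

# torch_dtype="auto" loads weights in the dtype recorded in the checkpoint's
# config.json rather than upcasting to float32; device_map="auto" places
# (and, if necessary, shards) the model across the visible devices.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    torch_dtype="auto",
    device_map="auto",
)
print(model.dtype, model.device)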

nemo_deploy/llm/hf_deployable_ray.py

Lines changed: 9 additions & 4 deletions

@@ -63,9 +63,11 @@ def __init__(
         task: str = "text-generation",
         trust_remote_code: bool = True,
         model_id: str = "nemo-model",
-        device_map: Optional[str] = None,
         max_memory: Optional[str] = None,
         use_vllm_backend: bool = False,
+        torch_dtype: Optional[torch.dtype] = "auto",
+        device_map: Optional[str] = "auto",
+        **kwargs,
     ):
         """Initialize the HuggingFace model deployment.

@@ -78,7 +80,8 @@ def __init__(
             max_memory (str): Maximum memory allocation when using balanced device map.
             use_vllm_backend (bool, optional): Whether to use vLLM backend for deployment. If True, exports the HF ckpt
                 to vLLM format and uses vLLM backend for inference. Defaults to False.
-
+            torch_dtype (torch.dtype): Data type for the model. Defaults to "auto".
+            **kwargs: Additional keyword arguments to pass to the HuggingFace model deployment.
         Raises:
             ImportError: If Ray is not installed.
             Exception: If model initialization fails.

@@ -97,15 +100,17 @@ def __init__(
             from nemo_export.vllm_exporter import vLLMExporter

             vllm_exporter = vLLMExporter()
-            vllm_exporter.export(model_path_id=hf_model_id_path)
+            vllm_exporter.export(model_path_id=hf_model_id_path, **kwargs)
             self.model = vllm_exporter
         else:
             self.model = HuggingFaceLLMDeploy(
                 hf_model_id_path=hf_model_id_path,
                 task=task,
                 trust_remote_code=trust_remote_code,
-                device_map=device_map,
                 max_memory=max_memory_dict,
+                torch_dtype=torch_dtype,
+                device_map=device_map,
+                **kwargs,
             )
         self.model_id = model_id
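
With **kwargs now forwarded, loader-specific options reach both backends. A hedged sketch of the non-vLLM path this class wraps (module path assumed from the file tree; the checkpoint and extra kwarg are illustrative):

import torch
from nemo_deploy.llm.hf_deployable import HuggingFaceLLMDeploy  # path assumed

model = HuggingFaceLLMDeploy(
    hf_model_id_path="gpt2",        # illustrative checkpoint
    task="text-generation",
    torch_dtype=torch.bfloat16,     # forwarded to from_pretrained by _load
    device_map="auto",
    attn_implementation="sdpa",     # example hf_kwarg swept up by **hf_kwargs
)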

scripts/deploy/nlp/deploy_inframework_hf_triton.py

Lines changed: 11 additions & 1 deletion

@@ -80,10 +80,19 @@ def get_args(argv):
         "--device_map",
         nargs="?",
         choices=["auto", "balanced", "balanced_low_0", "sequential"],
-        default=None,
+        default="auto",
         type=str,
         help="Device mapping strategy for model placement (e.g. 'auto', 'sequential', etc)",
     )
+    parser.add_argument(
+        "-td",
+        "--torch_dtype",
+        nargs="?",
+        choices=["auto", "bfloat16", "float16", "float32"],
+        default="auto",
+        type=str,
+        help="Torch dtype for the model",
+    )
     parser.add_argument(
         "-tpp",
         "--tp_plan",

@@ -196,6 +205,7 @@ def hf_deploy(argv):
         task=args.task,
         trust_remote_code=args.trust_remote_code,
         device_map=args.device_map,
+        torch_dtype=args.torch_dtype,
         tp_plan=args.tp_plan,
     )
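
A hypothetical invocation exercising the new flag; only -td/--torch_dtype and --device_map are confirmed by this diff, and the model-path argument name is an assumption:

python scripts/deploy/nlp/deploy_inframework_hf_triton.py \
    --hf_model_id_path gpt2 \
    --torch_dtype bfloat16 \
    --device_map auto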

scripts/deploy/nlp/deploy_ray_hf.py

Lines changed: 13 additions & 2 deletions

@@ -40,11 +40,21 @@ def parse_args():
         action="store_true",
         help="Whether to trust remote code when loading the model",
     )
+    parser.add_argument(
+        "--torch_dtype",
+        nargs="?",
+        choices=["auto", "bfloat16", "float16", "float32"],
+        default="auto",
+        type=str,
+        help="Torch dtype for the model",
+    )
     parser.add_argument(
         "--device_map",
+        nargs="?",
+        choices=["auto", "balanced", "balanced_low_0", "sequential"],
+        default="auto",
         type=str,
-        default=None,
-        help="Device mapping strategy for model placement",
+        help="Device mapping strategy for model placement (e.g. 'auto', 'sequential', etc)",
     )
     parser.add_argument(
         "--max_memory",

@@ -149,6 +159,7 @@ def main():
         hf_model_id_path=args.model_path,
         task=args.task,
         trust_remote_code=args.trust_remote_code,
+        torch_dtype=args.torch_dtype,
         device_map=args.device_map,
         max_memory=args.max_memory,
         model_id=args.model_id,
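
An end-to-end invocation sketch built from flags visible in this diff (--model_path is inferred from args.model_path in main(); the checkpoint id is illustrative):

python scripts/deploy/nlp/deploy_ray_hf.py \
    --model_path gpt2 \
    --trust_remote_code \
    --torch_dtype auto \
    --device_map auto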
