
Commit e31235f

Merge branch 'master' into fix-data-quality-schedule

2 parents: cca737a + a7baead

File tree: 19 files changed (+1383, −65 lines)

CHANGELOG.md

Lines changed: 12 additions & 0 deletions

@@ -1,5 +1,17 @@
 # Changelog
 
+## v2.239.0 (2025-02-01)
+
+### Features
+
+* Add support for deepseek recipes
+
+### Bug Fixes and Other Changes
+
+* mpirun protocol - distributed training with @remote decorator
+* Allow telemetry only in supported regions
+* Fix ssh host policy
+
 ## v2.238.0 (2025-01-29)
 
 ### Features

VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-2.238.1.dev0
+2.239.1.dev0

src/sagemaker/modules/train/sm_recipes/utils.py

Lines changed: 25 additions & 12 deletions

@@ -125,6 +125,27 @@ def _register_custom_resolvers():
     OmegaConf.register_new_resolver("add", lambda *numbers: sum(numbers))
 
 
+def _get_trainining_recipe_gpu_model_name_and_script(model_type: str):
+    """Get the model base name and script for the training recipe."""
+
+    model_type_to_script = {
+        "llama_v3": ("llama", "llama_pretrain.py"),
+        "mistral": ("mistral", "mistral_pretrain.py"),
+        "mixtral": ("mixtral", "mixtral_pretrain.py"),
+        "deepseek": ("deepseek", "deepseek_pretrain.py"),
+    }
+
+    for key in model_type_to_script:
+        if model_type.startswith(key):
+            model_type = key
+            break
+
+    if model_type not in model_type_to_script:
+        raise ValueError(f"Model type {model_type} not supported")
+
+    return model_type_to_script[model_type][0], model_type_to_script[model_type][1]
+
+
 def _configure_gpu_args(
     training_recipes_cfg: Dict[str, Any],
     region_name: str,
@@ -140,24 +161,16 @@ def _configure_gpu_args(
     )
     _run_clone_command_silent(adapter_repo, recipe_train_dir.name)
 
-    model_type_to_entry = {
-        "llama_v3": ("llama", "llama_pretrain.py"),
-        "mistral": ("mistral", "mistral_pretrain.py"),
-        "mixtral": ("mixtral", "mixtral_pretrain.py"),
-    }
-
     if "model" not in recipe:
         raise ValueError("Supplied recipe does not contain required field model.")
     if "model_type" not in recipe["model"]:
         raise ValueError("Supplied recipe does not contain required field model_type.")
     model_type = recipe["model"]["model_type"]
-    if model_type not in model_type_to_entry:
-        raise ValueError(f"Model type {model_type} not supported")
 
-    source_code.source_dir = os.path.join(
-        recipe_train_dir.name, "examples", model_type_to_entry[model_type][0]
-    )
-    source_code.entry_script = model_type_to_entry[model_type][1]
+    model_base_name, script = _get_trainining_recipe_gpu_model_name_and_script(model_type)
+
+    source_code.source_dir = os.path.join(recipe_train_dir.name, "examples", model_base_name)
+    source_code.entry_script = script
 
     gpu_image_cfg = training_recipes_cfg.get("gpu_image")
     if isinstance(gpu_image_cfg, str):
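The new helper matches model_type by prefix, so recipe variants resolve to their base family without needing their own table entries. A minimal sketch of calling it, assuming the module path shown in this diff; the "deepseek_r1" value is invented for illustration:

    from sagemaker.modules.train.sm_recipes.utils import (
        _get_trainining_recipe_gpu_model_name_and_script,
    )

    # Any model_type that starts with a supported key collapses onto that key,
    # so variant suffixes still resolve ("deepseek_r1" is hypothetical here).
    base, script = _get_trainining_recipe_gpu_model_name_and_script("deepseek_r1")
    assert (base, script) == ("deepseek", "deepseek_pretrain.py")

    # Unknown families still fail fast:
    # ValueError: Model type gpt_neo not supported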

src/sagemaker/pytorch/estimator.py

Lines changed: 7 additions & 0 deletions

@@ -95,13 +95,20 @@ def _get_training_recipe_gpu_script(code_dir, recipe, source_dir):
         "llama_v3": ("llama", "llama_pretrain.py"),
         "mistral": ("mistral", "mistral_pretrain.py"),
         "mixtral": ("mixtral", "mixtral_pretrain.py"),
+        "deepseek": ("deepseek", "deepseek_pretrain.py"),
     }
 
     if "model" not in recipe:
         raise ValueError("Supplied recipe does not contain required field model.")
     if "model_type" not in recipe["model"]:
         raise ValueError("Supplied recipe does not contain required field model_type.")
     model_type = recipe["model"]["model_type"]
+
+    for key in model_type_to_script:
+        if model_type.startswith(key):
+            model_type = key
+            break
+
     if model_type not in model_type_to_script:
         raise ValueError(f"Model type {model_type} not supported")
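_get_training_recipe_gpu_script now applies the same prefix normalization before its membership check, so a recipe's model_type only needs to start with a supported family name. A standalone sketch of just that step against a stub recipe dict; the "deepseek_coder" value is made up for illustration:

    # Stub recipe carrying only the field the validation reads.
    recipe = {"model": {"model_type": "deepseek_coder"}}

    model_type = recipe["model"]["model_type"]
    for key in ("llama_v3", "mistral", "mixtral", "deepseek"):
        if model_type.startswith(key):
            model_type = key  # collapse the variant onto its base family
            break

    assert model_type == "deepseek"  # now a valid model_type_to_script key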

src/sagemaker/remote_function/client.py

Lines changed: 24 additions & 10 deletions

@@ -90,7 +90,8 @@ def remote(
     spark_config: SparkConfig = None,
     use_spot_instances=False,
     max_wait_time_in_seconds=None,
-    use_torchrun=False,
+    use_torchrun: bool = False,
+    use_mpirun: bool = False,
     nproc_per_node: Optional[int] = None,
 ):
     """Decorator for running the annotated function as a SageMaker training job.
@@ -207,7 +208,8 @@ def remote(
             files are accepted and uploaded to S3.
 
         instance_count (int): The number of instances to use. Defaults to 1.
-            NOTE: Remote function does not support instance_count > 1 for non Spark jobs.
+            NOTE: Remote function supports instance_count > 1 for Spark jobs, torchrun and
+            mpirun utilities
 
         instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run
             the SageMaker job. e.g. ml.c4.xlarge. If not provided, a ValueError is thrown.
@@ -284,6 +286,9 @@ def remote(
         use_torchrun (bool): Specifies whether to use torchrun for distributed training.
             Defaults to ``False``.
 
+        use_mpirun (bool): Specifies whether to use mpirun for distributed training.
+            Defaults to ``False``.
+
         nproc_per_node (Optional int): Specifies the number of processes per node for
             distributed training. Defaults to ``None``.
             This is defined automatically configured on the instance type.
@@ -320,19 +325,21 @@ def _remote(func):
            use_spot_instances=use_spot_instances,
            max_wait_time_in_seconds=max_wait_time_in_seconds,
            use_torchrun=use_torchrun,
+           use_mpirun=use_mpirun,
            nproc_per_node=nproc_per_node,
        )
 
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
 
            if instance_count > 1 and not (
-               (spark_config is not None and not use_torchrun)
-               or (spark_config is None and use_torchrun)
+               (spark_config is not None and not use_torchrun and not use_mpirun)
+               or (spark_config is None and use_torchrun and not use_mpirun)
+               or (spark_config is None and not use_torchrun and use_mpirun)
            ):
                raise ValueError(
                    "Remote function do not support training on multi instances "
-                   + "without spark_config or use_torchrun. "
+                   + "without spark_config or use_torchrun or use_mpirun. "
                    + "Please provide instance_count = 1"
                )
 
@@ -536,7 +543,8 @@ def __init__(
        spark_config: SparkConfig = None,
        use_spot_instances=False,
        max_wait_time_in_seconds=None,
-       use_torchrun=False,
+       use_torchrun: bool = False,
+       use_mpirun: bool = False,
        nproc_per_node: Optional[int] = None,
    ):
        """Constructor for RemoteExecutor
@@ -650,7 +658,8 @@ def __init__(
            files are accepted and uploaded to S3.
 
        instance_count (int): The number of instances to use. Defaults to 1.
-           NOTE: Remote function does not support instance_count > 1 for non Spark jobs.
+           NOTE: Remote function supports instance_count > 1 for Spark jobs, torchrun and
+           mpirun utilities
 
        instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run
            the SageMaker job. e.g. ml.c4.xlarge. If not provided, a ValueError is thrown.
@@ -730,6 +739,9 @@ def __init__(
        use_torchrun (bool): Specifies whether to use torchrun for distributed training.
            Defaults to ``False``.
 
+       use_mpirun (bool): Specifies whether to use mpirun for distributed training.
+           Defaults to ``False``.
+
        nproc_per_node (Optional int): Specifies the number of processes per node for
            distributed training. Defaults to ``None``.
            This is defined automatically configured on the instance type.
@@ -740,12 +752,13 @@ def __init__(
            raise ValueError("max_parallel_jobs must be greater than 0.")
 
        if instance_count > 1 and not (
-           (spark_config is not None and not use_torchrun)
-           or (spark_config is None and use_torchrun)
+           (spark_config is not None and not use_torchrun and not use_mpirun)
+           or (spark_config is None and use_torchrun and not use_mpirun)
+           or (spark_config is None and not use_torchrun and use_mpirun)
        ):
            raise ValueError(
                "Remote function do not support training on multi instances "
-               + "without spark_config or use_torchrun. "
+               + "without spark_config or use_torchrun or use_mpirun. "
                + "Please provide instance_count = 1"
            )
 
@@ -778,6 +791,7 @@ def __init__(
            use_spot_instances=use_spot_instances,
            max_wait_time_in_seconds=max_wait_time_in_seconds,
            use_torchrun=use_torchrun,
+           use_mpirun=use_mpirun,
            nproc_per_node=nproc_per_node,
        )
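With use_mpirun, multi-instance remote functions no longer require Spark or torchrun. A hedged usage sketch; the instance type, count, and nproc_per_node values are illustrative, and the usual remote-function prerequisites (execution role, image, S3 settings) are assumed to come from SageMaker defaults or the config file:

    from sagemaker.remote_function import remote

    @remote(
        instance_type="ml.p4d.24xlarge",  # illustrative
        instance_count=2,                 # >1 is permitted because use_mpirun=True
        use_mpirun=True,
        nproc_per_node=8,                 # optional; otherwise derived from the instance type
    )
    def train():
        ...  # training code runs under mpirun across both instances

    train()

Per the validation above, at most one of spark_config, use_torchrun, and use_mpirun may accompany instance_count > 1; any other combination raises the ValueError shown in the diff.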
