
Commit e31235f

Merge branch 'master' into fix-data-quality-schedule

2 parents: cca737a + a7baead

File tree: 19 files changed (+1383, −65 lines)

CHANGELOG.md

Lines changed: 12 additions & 0 deletions

@@ -1,5 +1,17 @@
 # Changelog
 
+## v2.239.0 (2025-02-01)
+
+### Features
+
+* Add support for deepseek recipes
+
+### Bug Fixes and Other Changes
+
+* mpirun protocol - distributed training with @remote decorator
+* Allow telemetry only in supported regions
+* Fix ssh host policy
+
 ## v2.238.0 (2025-01-29)
 
 ### Features

VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-2.238.1.dev0
+2.239.1.dev0

src/sagemaker/modules/train/sm_recipes/utils.py

Lines changed: 25 additions & 12 deletions

@@ -125,6 +125,27 @@ def _register_custom_resolvers():
     OmegaConf.register_new_resolver("add", lambda *numbers: sum(numbers))
 
 
+def _get_trainining_recipe_gpu_model_name_and_script(model_type: str):
+    """Get the model base name and script for the training recipe."""
+
+    model_type_to_script = {
+        "llama_v3": ("llama", "llama_pretrain.py"),
+        "mistral": ("mistral", "mistral_pretrain.py"),
+        "mixtral": ("mixtral", "mixtral_pretrain.py"),
+        "deepseek": ("deepseek", "deepseek_pretrain.py"),
+    }
+
+    for key in model_type_to_script:
+        if model_type.startswith(key):
+            model_type = key
+            break
+
+    if model_type not in model_type_to_script:
+        raise ValueError(f"Model type {model_type} not supported")
+
+    return model_type_to_script[model_type][0], model_type_to_script[model_type][1]
+
+
 def _configure_gpu_args(
     training_recipes_cfg: Dict[str, Any],
     region_name: str,
@@ -140,24 +161,16 @@ def _configure_gpu_args(
     )
     _run_clone_command_silent(adapter_repo, recipe_train_dir.name)
 
-    model_type_to_entry = {
-        "llama_v3": ("llama", "llama_pretrain.py"),
-        "mistral": ("mistral", "mistral_pretrain.py"),
-        "mixtral": ("mixtral", "mixtral_pretrain.py"),
-    }
-
     if "model" not in recipe:
         raise ValueError("Supplied recipe does not contain required field model.")
     if "model_type" not in recipe["model"]:
         raise ValueError("Supplied recipe does not contain required field model_type.")
     model_type = recipe["model"]["model_type"]
-    if model_type not in model_type_to_entry:
-        raise ValueError(f"Model type {model_type} not supported")
 
-    source_code.source_dir = os.path.join(
-        recipe_train_dir.name, "examples", model_type_to_entry[model_type][0]
-    )
-    source_code.entry_script = model_type_to_entry[model_type][1]
+    model_base_name, script = _get_trainining_recipe_gpu_model_name_and_script(model_type)
+
+    source_code.source_dir = os.path.join(recipe_train_dir.name, "examples", model_base_name)
+    source_code.entry_script = script
 
     gpu_image_cfg = training_recipes_cfg.get("gpu_image")
     if isinstance(gpu_image_cfg, str):
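The new helper matches model_type by prefix, so recipe variants resolve to their base family without needing their own table entries. A minimal sketch of calling it, assuming the module path shown in this diff; the "deepseek_r1" value is invented for illustration:

    from sagemaker.modules.train.sm_recipes.utils import (
        _get_trainining_recipe_gpu_model_name_and_script,
    )

    # Any model_type that starts with a supported key collapses onto that key,
    # so variant suffixes still resolve ("deepseek_r1" is hypothetical here).
    base, script = _get_trainining_recipe_gpu_model_name_and_script("deepseek_r1")
    assert (base, script) == ("deepseek", "deepseek_pretrain.py")

    # Unknown families still fail fast:
    # ValueError: Model type gpt_neo not supported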

src/sagemaker/pytorch/estimator.py

Lines changed: 7 additions & 0 deletions

@@ -95,13 +95,20 @@ def _get_training_recipe_gpu_script(code_dir, recipe, source_dir):
         "llama_v3": ("llama", "llama_pretrain.py"),
         "mistral": ("mistral", "mistral_pretrain.py"),
         "mixtral": ("mixtral", "mixtral_pretrain.py"),
+        "deepseek": ("deepseek", "deepseek_pretrain.py"),
     }
 
     if "model" not in recipe:
         raise ValueError("Supplied recipe does not contain required field model.")
     if "model_type" not in recipe["model"]:
         raise ValueError("Supplied recipe does not contain required field model_type.")
     model_type = recipe["model"]["model_type"]
+
+    for key in model_type_to_script:
+        if model_type.startswith(key):
+            model_type = key
+            break
+
     if model_type not in model_type_to_script:
         raise ValueError(f"Model type {model_type} not supported")
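_get_training_recipe_gpu_script now applies the same prefix normalization before its membership check, so a recipe's model_type only needs to start with a supported family name. A standalone sketch of just that step against a stub recipe dict; the "deepseek_coder" value is made up for illustration:

    # Stub recipe carrying only the field the validation reads.
    recipe = {"model": {"model_type": "deepseek_coder"}}

    model_type = recipe["model"]["model_type"]
    for key in ("llama_v3", "mistral", "mixtral", "deepseek"):
        if model_type.startswith(key):
            model_type = key  # collapse the variant onto its base family
            break

    assert model_type == "deepseek"  # now a valid model_type_to_script key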

src/sagemaker/remote_function/client.py

Lines changed: 24 additions & 10 deletions

@@ -90,7 +90,8 @@ def remote(
     spark_config: SparkConfig = None,
     use_spot_instances=False,
     max_wait_time_in_seconds=None,
-    use_torchrun=False,
+    use_torchrun: bool = False,
+    use_mpirun: bool = False,
     nproc_per_node: Optional[int] = None,
 ):
     """Decorator for running the annotated function as a SageMaker training job.
@@ -207,7 +208,8 @@ def remote(
             files are accepted and uploaded to S3.
 
         instance_count (int): The number of instances to use. Defaults to 1.
-            NOTE: Remote function does not support instance_count > 1 for non Spark jobs.
+            NOTE: Remote function supports instance_count > 1 for Spark jobs, torchrun and
+            mpirun utilities
 
         instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run
             the SageMaker job. e.g. ml.c4.xlarge. If not provided, a ValueError is thrown.
@@ -284,6 +286,9 @@ def remote(
         use_torchrun (bool): Specifies whether to use torchrun for distributed training.
             Defaults to ``False``.
 
+        use_mpirun (bool): Specifies whether to use mpirun for distributed training.
+            Defaults to ``False``.
+
         nproc_per_node (Optional int): Specifies the number of processes per node for
             distributed training. Defaults to ``None``.
             This is defined automatically configured on the instance type.
@@ -320,19 +325,21 @@ def _remote(func):
            use_spot_instances=use_spot_instances,
            max_wait_time_in_seconds=max_wait_time_in_seconds,
            use_torchrun=use_torchrun,
+           use_mpirun=use_mpirun,
            nproc_per_node=nproc_per_node,
        )
 
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
 
            if instance_count > 1 and not (
-               (spark_config is not None and not use_torchrun)
-               or (spark_config is None and use_torchrun)
+               (spark_config is not None and not use_torchrun and not use_mpirun)
+               or (spark_config is None and use_torchrun and not use_mpirun)
+               or (spark_config is None and not use_torchrun and use_mpirun)
            ):
                raise ValueError(
                    "Remote function do not support training on multi instances "
-                   + "without spark_config or use_torchrun. "
+                   + "without spark_config or use_torchrun or use_mpirun. "
                    + "Please provide instance_count = 1"
                )
 
@@ -536,7 +543,8 @@ def __init__(
        spark_config: SparkConfig = None,
        use_spot_instances=False,
        max_wait_time_in_seconds=None,
-       use_torchrun=False,
+       use_torchrun: bool = False,
+       use_mpirun: bool = False,
        nproc_per_node: Optional[int] = None,
    ):
        """Constructor for RemoteExecutor
@@ -650,7 +658,8 @@ def __init__(
            files are accepted and uploaded to S3.
 
        instance_count (int): The number of instances to use. Defaults to 1.
-           NOTE: Remote function does not support instance_count > 1 for non Spark jobs.
+           NOTE: Remote function supports instance_count > 1 for Spark jobs, torchrun and
+           mpirun utilities
 
        instance_type (str): The Amazon Elastic Compute Cloud (EC2) instance type to use to run
            the SageMaker job. e.g. ml.c4.xlarge. If not provided, a ValueError is thrown.
@@ -730,6 +739,9 @@ def __init__(
        use_torchrun (bool): Specifies whether to use torchrun for distributed training.
            Defaults to ``False``.
 
+       use_mpirun (bool): Specifies whether to use mpirun for distributed training.
+           Defaults to ``False``.
+
        nproc_per_node (Optional int): Specifies the number of processes per node for
            distributed training. Defaults to ``None``.
            This is defined automatically configured on the instance type.
@@ -740,12 +752,13 @@ def __init__(
            raise ValueError("max_parallel_jobs must be greater than 0.")
 
        if instance_count > 1 and not (
-           (spark_config is not None and not use_torchrun)
-           or (spark_config is None and use_torchrun)
+           (spark_config is not None and not use_torchrun and not use_mpirun)
+           or (spark_config is None and use_torchrun and not use_mpirun)
+           or (spark_config is None and not use_torchrun and use_mpirun)
        ):
            raise ValueError(
                "Remote function do not support training on multi instances "
-               + "without spark_config or use_torchrun. "
+               + "without spark_config or use_torchrun or use_mpirun. "
                + "Please provide instance_count = 1"
            )
 
@@ -778,6 +791,7 @@ def __init__(
            use_spot_instances=use_spot_instances,
            max_wait_time_in_seconds=max_wait_time_in_seconds,
            use_torchrun=use_torchrun,
+           use_mpirun=use_mpirun,
            nproc_per_node=nproc_per_node,
        )
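With use_mpirun, multi-instance remote functions no longer require Spark or torchrun. A hedged usage sketch; the instance type, count, and nproc_per_node values are illustrative, and the usual remote-function prerequisites (execution role, image, S3 settings) are assumed to come from SageMaker defaults or the config file:

    from sagemaker.remote_function import remote

    @remote(
        instance_type="ml.p4d.24xlarge",  # illustrative
        instance_count=2,                 # >1 is permitted because use_mpirun=True
        use_mpirun=True,
        nproc_per_node=8,                 # optional; otherwise derived from the instance type
    )
    def train():
        ...  # training code runs under mpirun across both instances

    train()

Per the validation above, at most one of spark_config, use_torchrun, and use_mpirun may accompany instance_count > 1; any other combination raises the ValueError shown in the diff.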
