
Commit 841af92

implemented multi-node distribution with @Remote function
1 parent 2102bb7 commit 841af92
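
For orientation, a minimal sketch (not part of this commit) of how the multi-node path added here is driven from user code, assuming a configured SageMaker session and role and a job image with torch available:

    from sagemaker.remote_function import remote

    # With this commit, instance_count > 1 is accepted when use_torchrun=True;
    # previously multi-instance jobs required spark_config. Each node runs the
    # function under torchrun, so torch.distributed can be initialized inside it.
    @remote(instance_count=2, use_torchrun=True)
    def train():
        import torch.distributed as dist

        dist.init_process_group(backend="gloo")
        return dist.get_rank()

    train()  # launches a two-node SageMaker training job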

File tree

8 files changed (+371, −102 lines)


src/sagemaker/remote_function/client.py

Lines changed: 12 additions & 14 deletions
@@ -91,7 +91,6 @@ def remote(
     use_spot_instances=False,
     max_wait_time_in_seconds=None,
     use_torchrun=False,
-    nproc_per_node=1,
 ):
     """Decorator for running the annotated function as a SageMaker training job.

@@ -283,9 +282,6 @@ def remote(

         use_torchrun (bool): Specifies whether to use torchrun for distributed training.
             Defaults to ``False``.
-
-        nproc_per_node (int): Specifies the number of processes per node for distributed training.
-            Defaults to ``1``.
     """

     def _remote(func):
@@ -319,15 +315,18 @@ def _remote(func):
             use_spot_instances=use_spot_instances,
             max_wait_time_in_seconds=max_wait_time_in_seconds,
             use_torchrun=use_torchrun,
-            nproc_per_node=nproc_per_node,
         )

         @functools.wraps(func)
         def wrapper(*args, **kwargs):

-            if instance_count > 1 and not spark_config:
+            if instance_count > 1 and not (
+                (spark_config is not None and not use_torchrun)
+                or (spark_config is None and use_torchrun)
+            ):
                 raise ValueError(
-                    "Remote function do not support training on multi instances. "
+                    "Remote function do not support training on multi instances "
+                    + "without spark_config or use_torchrun. "
                     + "Please provide instance_count = 1"
                 )

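The reworked guard above accepts instance_count > 1 exactly when one of spark_config or use_torchrun is set, but not both. A stand-alone restatement of the condition (hypothetical helper, for illustration only):

    def multi_node_allowed(spark_config, use_torchrun):
        # Mirrors the condition introduced in this commit: exactly one of the
        # two distribution mechanisms must be selected for multi-instance jobs.
        return (spark_config is not None and not use_torchrun) or (
            spark_config is None and use_torchrun
        )

    assert multi_node_allowed(object(), False)      # Spark path
    assert multi_node_allowed(None, True)           # torchrun path
    assert not multi_node_allowed(None, False)      # no mechanism -> ValueError
    assert not multi_node_allowed(object(), True)   # both -> ValueError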
@@ -532,7 +531,6 @@ def __init__(
         use_spot_instances=False,
         max_wait_time_in_seconds=None,
         use_torchrun=False,
-        nproc_per_node=1,
     ):
         """Constructor for RemoteExecutor

@@ -724,18 +722,19 @@ def __init__(

             use_torchrun (bool): Specifies whether to use torchrun for distributed training.
                 Defaults to ``False``.
-
-            nproc_per_node (int): Specifies the number of processes per node.
-                Defaults to ``1``.
         """
         self.max_parallel_jobs = max_parallel_jobs

         if self.max_parallel_jobs <= 0:
             raise ValueError("max_parallel_jobs must be greater than 0.")

-        if instance_count > 1 and not spark_config:
+        if instance_count > 1 and not (
+            (spark_config is not None and not use_torchrun)
+            or (spark_config is None and use_torchrun)
+        ):
             raise ValueError(
-                "Remote function do not support training on multi instances. "
+                "Remote function do not support training on multi instances "
+                + "without spark_config or use_torchrun. "
                 + "Please provide instance_count = 1"
             )

@@ -768,7 +767,6 @@ def __init__(
             use_spot_instances=use_spot_instances,
             max_wait_time_in_seconds=max_wait_time_in_seconds,
             use_torchrun=use_torchrun,
-            nproc_per_node=nproc_per_node,
         )

         self._state_condition = threading.Condition()

src/sagemaker/remote_function/core/stored_function.py

Lines changed: 0 additions & 6 deletions
@@ -55,8 +55,6 @@ def __init__(
         hmac_key: str,
         s3_kms_key: str = None,
         context: Context = Context(),
-        use_torchrun: bool = False,
-        nproc_per_node: int = 1,
     ):
         """Construct a StoredFunction object.

@@ -67,16 +65,12 @@
             s3_kms_key: KMS key used to encrypt artifacts uploaded to S3.
             hmac_key: Key used to encrypt serialized and deserialized function and arguments.
             context: Build or run context of a pipeline step.
-            use_torchrun: Whether to use torchrun for distributed training.
-            nproc_per_node: Number of processes per node for distributed training.
         """
         self.sagemaker_session = sagemaker_session
         self.s3_base_uri = s3_base_uri
         self.s3_kms_key = s3_kms_key
         self.hmac_key = hmac_key
         self.context = context
-        self.use_torchrun = use_torchrun
-        self.nproc_per_node = nproc_per_node

         self.func_upload_path = s3_path_join(
             s3_base_uri, context.step_name, context.func_step_s3_dir

src/sagemaker/remote_function/job.py

Lines changed: 53 additions & 13 deletions
@@ -130,9 +130,12 @@
 export PIP_CACHE_DIR=${{PERSISTENT_CACHE_DIR}}/sm_remotefunction_user_dependencies_cache/pip
 printf "INFO: PIP_CACHE_DIR is set to '$PIP_CACHE_DIR'\\n"

+printf "INFO: /opt/ml/input/config/resourceconfig.json:\\n"
+cat /opt/ml/input/config/resourceconfig.json

 printf "INFO: Bootstraping runtime environment.\\n"
 python /opt/ml/input/data/{RUNTIME_SCRIPTS_CHANNEL_NAME}/{BOOTSTRAP_SCRIPT_NAME} "$@"
+source /opt/ml/input/sm_training.env

 if [ -d {JOB_REMOTE_FUNCTION_WORKSPACE} ]
 then

@@ -155,9 +158,13 @@
 fi

 printf "INFO: Invoking remote function inside conda environment: $conda_env.\\n"
+printf "INFO: $conda_exe run -n $conda_env python -m sagemaker.remote_function.invoke_function \\n"
+
 $conda_exe run -n $conda_env python -m sagemaker.remote_function.invoke_function "$@"
 else
 printf "INFO: No conda env provided. Invoking remote function\\n"
+printf "INFO: python -m sagemaker.remote_function.invoke_function \\n"
+
 python -m sagemaker.remote_function.invoke_function "$@"
 fi
 """
@@ -175,9 +182,12 @@
 export PIP_CACHE_DIR=${{PERSISTENT_CACHE_DIR}}/sm_remotefunction_user_dependencies_cache/pip
 printf "INFO: PIP_CACHE_DIR is set to '$PIP_CACHE_DIR'\\n"

+printf "INFO: /opt/ml/input/config/resourceconfig.json:\\n"
+cat /opt/ml/input/config/resourceconfig.json

 printf "INFO: Bootstraping runtime environment.\\n"
 python /opt/ml/input/data/{RUNTIME_SCRIPTS_CHANNEL_NAME}/{BOOTSTRAP_SCRIPT_NAME} "$@"
+source /opt/ml/input/sm_training.env

 if [ -d {JOB_REMOTE_FUNCTION_WORKSPACE} ]
 then

@@ -200,11 +210,20 @@
 fi

 printf "INFO: Invoking remote function with torchrun inside conda environment: $conda_env.\\n"
-$conda_exe run -n $conda_env torchrun --nproc_per_node $NPROC_PER_NODE \
+printf "INFO: $conda_exe run -n $conda_env torchrun --nnodes $SM_HOST_COUNT --nproc_per_node $SM_NPROC_PER_NODE \
+--master_addr $SM_MASTER_ADDR --master_port $SM_MASTER_PORT --node_rank $SM_CURRENT_HOST_RANK \
+-m sagemaker.remote_function.invoke_function \\n"
+
+$conda_exe run -n $conda_env torchrun --nnodes $SM_HOST_COUNT --nproc_per_node $SM_NPROC_PER_NODE \
+--master_addr $SM_MASTER_ADDR --master_port $SM_MASTER_PORT --node_rank $SM_CURRENT_HOST_RANK \
 -m sagemaker.remote_function.invoke_function "$@"
 else
 printf "INFO: No conda env provided. Invoking remote function with torchrun\\n"
-torchrun --nproc_per_node $NPROC_PER_NODE -m sagemaker.remote_function.invoke_function "$@"
+printf "INFO: torchrun --nnodes $SM_HOST_COUNT --nproc_per_node $SM_NPROC_PER_NODE --master_addr $SM_MASTER_ADDR \
+--master_port $SM_MASTER_PORT --node_rank $SM_CURRENT_HOST_RANK -m sagemaker.remote_function.invoke_function \\n"
+
+torchrun --nnodes $SM_HOST_COUNT --nproc_per_node $SM_NPROC_PER_NODE --master_addr $SM_MASTER_ADDR \
+--master_port $SM_MASTER_PORT --node_rank $SM_CURRENT_HOST_RANK -m sagemaker.remote_function.invoke_function "$@"
 fi
 """

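Both entrypoint scripts now dump resourceconfig.json and source /opt/ml/input/sm_training.env, which is expected to define the SM_* variables consumed by the torchrun command; the env file itself is produced during bootstrap in a part of this commit not shown in this excerpt. A hedged sketch of how those variables could be derived from the resource config SageMaker mounts on every node (the port value and the env-file production are assumptions, not taken from this diff):

    import json

    # Read the cluster layout that SageMaker provides on each training node.
    with open("/opt/ml/input/config/resourceconfig.json") as f:
        cfg = json.load(f)

    hosts = sorted(cfg["hosts"])  # e.g. ["algo-1", "algo-2"]
    env = {
        "SM_HOST_COUNT": str(len(hosts)),                               # --nnodes
        "SM_CURRENT_HOST_RANK": str(hosts.index(cfg["current_host"])),  # --node_rank
        "SM_MASTER_ADDR": hosts[0],  # first host acts as rendezvous master
        "SM_MASTER_PORT": "7777",    # assumed fixed port
        # SM_NPROC_PER_NODE would typically come from the detected GPU/CPU count.
    }

    # Emit KEY=VALUE lines so the entrypoint can `source` the file.
    with open("/opt/ml/input/sm_training.env", "w") as f:
        f.writelines(f"{k}={v}\n" for k, v in env.items())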
@@ -263,7 +282,6 @@ def __init__(
         use_spot_instances=False,
         max_wait_time_in_seconds=None,
         use_torchrun=False,
-        nproc_per_node=1,
     ):
         """Initialize a _JobSettings instance which configures the remote job.

@@ -604,7 +622,6 @@ def __init__(
         self.tags = self.sagemaker_session._append_sagemaker_config_tags(tags, REMOTE_FUNCTION_TAGS)

         self.use_torchrun = use_torchrun
-        self.nproc_per_node = nproc_per_node

     @staticmethod
     def _get_default_image(session):

@@ -732,6 +749,8 @@ def start(job_settings: _JobSettings, func, func_args, func_kwargs, run_info=None):
         )

         logger.info("Creating job: %s", job_name)
+        logger.info("Environment variables: %s", training_job_request["Environment"])
+
         job_settings.sagemaker_session.sagemaker_client.create_training_job(**training_job_request)

         return _Job(
@@ -776,8 +795,6 @@ def compile(
                 s3_base_uri=s3_base_uri,
                 hmac_key=hmac_key,
                 s3_kms_key=job_settings.s3_kms_key,
-                use_torchrun=job_settings.use_torchrun,
-                nproc_per_node=job_settings.nproc_per_node,
             )
             stored_function.save(func, *func_args, **func_kwargs)
         else:

@@ -790,8 +807,6 @@
                     step_name=step_compilation_context.step_name,
                     func_step_s3_dir=step_compilation_context.pipeline_build_time,
                 ),
-                use_torchrun=job_settings.use_torchrun,
-                nproc_per_node=job_settings.nproc_per_node,
             )

             stored_function.save_pipeline_step_function(serialized_data)

@@ -931,6 +946,7 @@ def compile(
             request_dict["Environment"].update({"REMOTE_FUNCTION_SECRET_KEY": hmac_key})

         extended_request = _extend_spark_config_to_request(request_dict, job_settings, s3_base_uri)
+        extended_request = _extend_torchrun_to_request(extended_request, job_settings)

         return extended_request

@@ -1011,7 +1027,6 @@ def _prepare_and_upload_runtime_scripts(
     s3_kms_key: str,
     sagemaker_session: Session,
     use_torchrun: bool = False,
-    nproc_per_node: int = 1,
 ):
     """Copy runtime scripts to a folder and upload to S3.

@@ -1029,8 +1044,6 @@ def _prepare_and_upload_runtime_scripts(
         sagemaker_session (str): SageMaker boto client session.

         use_torchrun (bool): Whether to use torchrun or not.
-
-        nproc_per_node (int): Number of processes per node.
     """

     from sagemaker.workflow.utilities import load_step_compilation_context

@@ -1054,7 +1067,6 @@

     if use_torchrun:
         entry_point_script = ENTRYPOINT_TORCHRUN_SCRIPT
-        entry_point_script = entry_point_script.replace("$NPROC_PER_NODE", str(nproc_per_node))

     with open(entrypoint_script_path, "w", newline="\n") as file:
         file.writelines(entry_point_script)

@@ -1094,7 +1106,6 @@ def _generate_input_data_config(job_settings: _JobSettings, s3_base_uri: str):
         s3_kms_key=job_settings.s3_kms_key,
         sagemaker_session=job_settings.sagemaker_session,
         use_torchrun=job_settings.use_torchrun,
-        nproc_per_node=job_settings.nproc_per_node,
     )

     input_data_config = [
@@ -1435,6 +1446,35 @@ def _upload_serialized_spark_configuration(
     return config_file_s3_uri


+def _extend_torchrun_to_request(
+    request_dict: Dict,
+    job_settings: _JobSettings,
+) -> Dict:
+    """Extend the create training job request with torchrun configuration.
+
+    Args:
+        request_dict (Dict): create training job request dict.
+        job_settings (_JobSettings): the job settings.
+    """
+    use_torchrun = job_settings.use_torchrun
+    instance_count = job_settings.instance_count
+
+    if not use_torchrun:
+        return request_dict
+
+    if instance_count == 1:
+        return request_dict
+
+    extended_request = request_dict.copy()
+
+    for input_channel in extended_request["InputDataConfig"]:
+        s3_data_source = input_channel["DataSource"].get("S3DataSource", None)
+        if s3_data_source:
+            s3_data_source["S3DataDistributionType"] = "FullyReplicated"
+
+    return extended_request
+
+
 def _extend_spark_config_to_request(
     request_dict: Dict,
     job_settings: _JobSettings,
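
A short sketch of what the new helper does to a pared-down request (channel field names follow the CreateTrainingJob API; the stand-in settings object is hypothetical):

    from sagemaker.remote_function.job import _extend_torchrun_to_request

    class _FakeSettings:  # stand-in for _JobSettings, illustration only
        use_torchrun = True
        instance_count = 2

    request = {
        "InputDataConfig": [
            {
                "ChannelName": "code",
                "DataSource": {"S3DataSource": {"S3Uri": "s3://bucket/code"}},
            }
        ]
    }

    extended = _extend_torchrun_to_request(request, _FakeSettings())
    # Every S3 channel is forced to FullyReplicated so each node downloads the
    # same inputs, which the per-node torchrun bootstrap relies on.
    assert (
        extended["InputDataConfig"][0]["DataSource"]["S3DataSource"][
            "S3DataDistributionType"
        ]
        == "FullyReplicated"
    )

Note that request_dict.copy() is shallow, so the nested channel dicts of the original request are mutated in place as well.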
