diff --git a/tests/unit/sagemaker/remote_function/test_job.py b/tests/unit/sagemaker/remote_function/test_job.py index 5be84fe5ba..f153b5b2ca 100644 --- a/tests/unit/sagemaker/remote_function/test_job.py +++ b/tests/unit/sagemaker/remote_function/test_job.py @@ -15,6 +15,7 @@ import os import sys +import tempfile import pytest from mock import patch, Mock, ANY, mock_open from mock.mock import MagicMock @@ -256,8 +257,6 @@ "OutputDataConfig": {"S3OutputPath": "s3://sagemaker-123/image_uri/output"}, } -OUTPUT_FILE = os.path.join(os.path.dirname(__file__), "sm_training.env") - TEST_JOB_NAME = "my-job-name" TEST_PIPELINE_NAME = "my-pipeline" TEST_EXP_NAME = "my-exp-name" @@ -2115,37 +2114,36 @@ def test_set_env_single_node_cpu( mock_safe_serialize, mock_num_cpus, mock_num_gpus, mock_num_neurons ): with patch.dict(os.environ, {"TRAINING_JOB_NAME": "test-job"}): - set_env( - resource_config=dict( - current_host="algo-1", - hosts=["algo-1"], - current_group_name="homogeneousCluster", - current_instance_type="ml.t3.xlarge", - instance_groups=[ - dict( - instance_group_name="homogeneousCluster", - instance_type="ml.t3.xlarge", - hosts=["algo-1"], - ) - ], - network_interface_name="eth0", - ), - distribution=None, - output_file=OUTPUT_FILE, - ) + with tempfile.NamedTemporaryFile() as f: + set_env( + resource_config=dict( + current_host="algo-1", + hosts=["algo-1"], + current_group_name="homogeneousCluster", + current_instance_type="ml.t3.xlarge", + instance_groups=[ + dict( + instance_group_name="homogeneousCluster", + instance_type="ml.t3.xlarge", + hosts=["algo-1"], + ) + ], + network_interface_name="eth0", + ), + distribution=None, + output_file=f.name, + ) - mock_num_cpus.assert_called_once() - mock_num_gpus.assert_called_once() - mock_num_neurons.assert_called_once() + mock_num_cpus.assert_called_once() + mock_num_gpus.assert_called_once() + mock_num_neurons.assert_called_once() - with open(OUTPUT_FILE, "r") as f: - env_file = f.read().strip() - expected_env = _remove_extra_lines(EXPECTED_ENV_SINGLE_NODE_CPU) - env_file = _remove_extra_lines(env_file) + with open(f.name, "r") as f: + env_file = f.read().strip() + expected_env = _remove_extra_lines(EXPECTED_ENV_SINGLE_NODE_CPU) + env_file = _remove_extra_lines(env_file) - assert env_file == expected_env - os.remove(OUTPUT_FILE) - assert not os.path.exists(OUTPUT_FILE) + assert env_file == expected_env @patch( @@ -2168,37 +2166,36 @@ def test_set_env_single_node_multi_gpu( mock_safe_serialize, mock_num_cpus, mock_num_gpus, mock_num_neurons ): with patch.dict(os.environ, {"TRAINING_JOB_NAME": "test-job"}): - set_env( - resource_config=dict( - current_host="algo-1", - hosts=["algo-1"], - current_group_name="homogeneousCluster", - current_instance_type="ml.g5.12xlarge", - instance_groups=[ - dict( - instance_group_name="homogeneousCluster", - instance_type="ml.g5.12xlarge", - hosts=["algo-1"], - ) - ], - network_interface_name="eth0", - ), - distribution="torchrun", - output_file=OUTPUT_FILE, - ) + with tempfile.NamedTemporaryFile() as f: + set_env( + resource_config=dict( + current_host="algo-1", + hosts=["algo-1"], + current_group_name="homogeneousCluster", + current_instance_type="ml.g5.12xlarge", + instance_groups=[ + dict( + instance_group_name="homogeneousCluster", + instance_type="ml.g5.12xlarge", + hosts=["algo-1"], + ) + ], + network_interface_name="eth0", + ), + distribution="torchrun", + output_file=f.name, + ) - mock_num_cpus.assert_called_once() - mock_num_gpus.assert_called_once() - mock_num_neurons.assert_called_once() + mock_num_cpus.assert_called_once() + mock_num_gpus.assert_called_once() + mock_num_neurons.assert_called_once() - with open(OUTPUT_FILE, "r") as f: - env_file = f.read().strip() - expected_env = _remove_extra_lines(EXPECTED_ENV_SINGLE_NODE_MULTI_GPUS) - env_file = _remove_extra_lines(env_file) + with open(f.name, "r") as f: + env_file = f.read().strip() + expected_env = _remove_extra_lines(EXPECTED_ENV_SINGLE_NODE_MULTI_GPUS) + env_file = _remove_extra_lines(env_file) - assert env_file == expected_env - os.remove(OUTPUT_FILE) - assert not os.path.exists(OUTPUT_FILE) + assert env_file == expected_env @patch( @@ -2221,37 +2218,36 @@ def test_set_env_multi_node_multi_gpu( mock_safe_serialize, mock_num_cpus, mock_num_gpus, mock_num_neurons ): with patch.dict(os.environ, {"TRAINING_JOB_NAME": "test-job"}): - set_env( - resource_config=dict( - current_host="algo-1", - hosts=["algo-1", "algo-2", "algo-3", "algo-4"], - current_group_name="homogeneousCluster", - current_instance_type="ml.g5.2xlarge", - instance_groups=[ - dict( - instance_group_name="homogeneousCluster", - instance_type="ml.g5.2xlarge", - hosts=["algo-4", "algo-2", "algo-1", "algo-3"], - ) - ], - network_interface_name="eth0", - ), - distribution="torchrun", - output_file=OUTPUT_FILE, - ) + with tempfile.NamedTemporaryFile() as f: + set_env( + resource_config=dict( + current_host="algo-1", + hosts=["algo-1", "algo-2", "algo-3", "algo-4"], + current_group_name="homogeneousCluster", + current_instance_type="ml.g5.2xlarge", + instance_groups=[ + dict( + instance_group_name="homogeneousCluster", + instance_type="ml.g5.2xlarge", + hosts=["algo-4", "algo-2", "algo-1", "algo-3"], + ) + ], + network_interface_name="eth0", + ), + distribution="torchrun", + output_file=f.name, + ) - mock_num_cpus.assert_called_once() - mock_num_gpus.assert_called_once() - mock_num_neurons.assert_called_once() + mock_num_cpus.assert_called_once() + mock_num_gpus.assert_called_once() + mock_num_neurons.assert_called_once() - with open(OUTPUT_FILE, "r") as f: - env_file = f.read().strip() - expected_env = _remove_extra_lines(EXPECTED_ENV_MULTI_NODE_MULTI_GPUS) - env_file = _remove_extra_lines(env_file) + with open(f.name, "r") as f: + env_file = f.read().strip() + expected_env = _remove_extra_lines(EXPECTED_ENV_MULTI_NODE_MULTI_GPUS) + env_file = _remove_extra_lines(env_file) - assert env_file == expected_env - os.remove(OUTPUT_FILE) - assert not os.path.exists(OUTPUT_FILE) + assert env_file == expected_env @patch( @@ -2274,37 +2270,36 @@ def test_set_env_single_node_multi_gpu_mpirun( mock_safe_serialize, mock_num_cpus, mock_num_gpus, mock_num_neurons ): with patch.dict(os.environ, {"TRAINING_JOB_NAME": "test-job"}): - set_env( - resource_config=dict( - current_host="algo-1", - hosts=["algo-1"], - current_group_name="homogeneousCluster", - current_instance_type="ml.g5.12xlarge", - instance_groups=[ - dict( - instance_group_name="homogeneousCluster", - instance_type="ml.g5.12xlarge", - hosts=["algo-1"], - ) - ], - network_interface_name="eth0", - ), - distribution="mpirun", - output_file=OUTPUT_FILE, - ) + with tempfile.NamedTemporaryFile() as f: + set_env( + resource_config=dict( + current_host="algo-1", + hosts=["algo-1"], + current_group_name="homogeneousCluster", + current_instance_type="ml.g5.12xlarge", + instance_groups=[ + dict( + instance_group_name="homogeneousCluster", + instance_type="ml.g5.12xlarge", + hosts=["algo-1"], + ) + ], + network_interface_name="eth0", + ), + distribution="mpirun", + output_file=f.name, + ) - mock_num_cpus.assert_called_once() - mock_num_gpus.assert_called_once() - mock_num_neurons.assert_called_once() + mock_num_cpus.assert_called_once() + mock_num_gpus.assert_called_once() + mock_num_neurons.assert_called_once() - with open(OUTPUT_FILE, "r") as f: - env_file = f.read().strip() - expected_env = _remove_extra_lines(EXPECTED_ENV_SINGLE_NODE_MULTI_GPUS_MPIRUN) - env_file = _remove_extra_lines(env_file) + with open(f.name, "r") as f: + env_file = f.read().strip() + expected_env = _remove_extra_lines(EXPECTED_ENV_SINGLE_NODE_MULTI_GPUS_MPIRUN) + env_file = _remove_extra_lines(env_file) - assert env_file == expected_env - os.remove(OUTPUT_FILE) - assert not os.path.exists(OUTPUT_FILE) + assert env_file == expected_env @patch( @@ -2327,37 +2322,36 @@ def test_set_env_multi_node_multi_gpu_mpirun( mock_safe_serialize, mock_num_cpus, mock_num_gpus, mock_num_neurons ): with patch.dict(os.environ, {"TRAINING_JOB_NAME": "test-job"}): - set_env( - resource_config=dict( - current_host="algo-1", - hosts=["algo-1", "algo-2", "algo-3", "algo-4"], - current_group_name="homogeneousCluster", - current_instance_type="ml.g5.2xlarge", - instance_groups=[ - dict( - instance_group_name="homogeneousCluster", - instance_type="ml.g5.2xlarge", - hosts=["algo-4", "algo-2", "algo-1", "algo-3"], - ) - ], - network_interface_name="eth0", - ), - distribution="mpirun", - output_file=OUTPUT_FILE, - ) + with tempfile.NamedTemporaryFile() as f: + set_env( + resource_config=dict( + current_host="algo-1", + hosts=["algo-1", "algo-2", "algo-3", "algo-4"], + current_group_name="homogeneousCluster", + current_instance_type="ml.g5.2xlarge", + instance_groups=[ + dict( + instance_group_name="homogeneousCluster", + instance_type="ml.g5.2xlarge", + hosts=["algo-4", "algo-2", "algo-1", "algo-3"], + ) + ], + network_interface_name="eth0", + ), + distribution="mpirun", + output_file=f.name, + ) - mock_num_cpus.assert_called_once() - mock_num_gpus.assert_called_once() - mock_num_neurons.assert_called_once() + mock_num_cpus.assert_called_once() + mock_num_gpus.assert_called_once() + mock_num_neurons.assert_called_once() - with open(OUTPUT_FILE, "r") as f: - env_file = f.read().strip() - expected_env = _remove_extra_lines(EXPECTED_ENV_MULTI_NODE_MULTI_GPUS_MPIRUN) - env_file = _remove_extra_lines(env_file) + with open(f.name, "r") as f: + env_file = f.read().strip() + expected_env = _remove_extra_lines(EXPECTED_ENV_MULTI_NODE_MULTI_GPUS_MPIRUN) + env_file = _remove_extra_lines(env_file) - assert env_file == expected_env - os.remove(OUTPUT_FILE) - assert not os.path.exists(OUTPUT_FILE) + assert env_file == expected_env @patch("sagemaker.experiments._run_context._RunContext.get_current_run", new=mock_get_current_run) @@ -2644,40 +2638,39 @@ def test_set_env_single_node_multi_gpu_mpirun_with_nproc_per_node( mock_safe_serialize, mock_num_cpus, mock_num_gpus, mock_num_neurons ): with patch.dict(os.environ, {"TRAINING_JOB_NAME": "test-job"}): - set_env( - resource_config=dict( - current_host="algo-1", - hosts=["algo-1"], - current_group_name="homogeneousCluster", - current_instance_type="ml.g5.12xlarge", - instance_groups=[ - dict( - instance_group_name="homogeneousCluster", - instance_type="ml.g5.12xlarge", - hosts=["algo-1"], - ) - ], - network_interface_name="eth0", - ), - distribution="mpirun", - user_nproc_per_node=2, - output_file=OUTPUT_FILE, - ) + with tempfile.NamedTemporaryFile() as f: + set_env( + resource_config=dict( + current_host="algo-1", + hosts=["algo-1"], + current_group_name="homogeneousCluster", + current_instance_type="ml.g5.12xlarge", + instance_groups=[ + dict( + instance_group_name="homogeneousCluster", + instance_type="ml.g5.12xlarge", + hosts=["algo-1"], + ) + ], + network_interface_name="eth0", + ), + distribution="mpirun", + user_nproc_per_node=2, + output_file=f.name, + ) - mock_num_cpus.assert_called_once() - mock_num_gpus.assert_called_once() - mock_num_neurons.assert_called_once() + mock_num_cpus.assert_called_once() + mock_num_gpus.assert_called_once() + mock_num_neurons.assert_called_once() - with open(OUTPUT_FILE, "r") as f: - env_file = f.read().strip() - expected_env = _remove_extra_lines( - EXPECTED_ENV_SINGLE_NODE_MULTI_GPUS_MPIRUN_WITH_NPROC_PER_NODE - ) - env_file = _remove_extra_lines(env_file) + with open(f.name, "r") as f: + env_file = f.read().strip() + expected_env = _remove_extra_lines( + EXPECTED_ENV_SINGLE_NODE_MULTI_GPUS_MPIRUN_WITH_NPROC_PER_NODE + ) + env_file = _remove_extra_lines(env_file) - assert env_file == expected_env - os.remove(OUTPUT_FILE) - assert not os.path.exists(OUTPUT_FILE) + assert env_file == expected_env def _remove_extra_lines(string):