diff --git a/CHANGELOG.md b/CHANGELOG.md index 2349827551..c43a7c91db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,23 @@ # Changelog +## v2.247.1 (2025-06-23) + +### Bug Fixes and Other Changes + + * update image_uri_configs 06-19-2025 07:18:34 PST + +## v2.247.0 (2025-06-13) + +### Features + + * Add support for MetricDefinitions in ModelTrainer + +### Bug Fixes and Other Changes + + * update jumpstart region_config, update image_uri_configs 06-12-2025 07:18:12 PST + * Add ignore_patterns in ModelTrainer to ignore specific files/folders + * Allow import failure for internal _hashlib module + ## v2.246.0 (2025-06-04) ### Features diff --git a/VERSION b/VERSION index 657c15330d..cdbe343ddb 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.246.1.dev0 +2.247.2.dev0 diff --git a/pyproject.toml b/pyproject.toml index 918e874b57..87bc0a4d3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ dependencies = [ "pandas", "pathos", "platformdirs", - "protobuf>=3.12,<6.0", + "protobuf>=3.12,<6.32", "psutil", "PyYAML>=6.0.1", "requests", diff --git a/src/sagemaker/image_uri_config/pytorch.json b/src/sagemaker/image_uri_config/pytorch.json index dbff976442..58b1fdfff7 100644 --- a/src/sagemaker/image_uri_config/pytorch.json +++ b/src/sagemaker/image_uri_config/pytorch.json @@ -1705,7 +1705,8 @@ "2.3": "2.3.0", "2.4": "2.4.0", "2.5": "2.5.1", - "2.6": "2.6.0" + "2.6": "2.6.0", + "2.7": "2.7.1" }, "versions": { "0.4.0": { @@ -2946,6 +2947,51 @@ "us-west-2": "763104351884" }, "repository": "pytorch-training" + }, + "2.7.1": { + "py_versions": [ + "py312" + ], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-east-2": "975050140332", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + 
"ap-southeast-4": "457447274322", + "ap-southeast-5": "550225433462", + "ap-southeast-7": "590183813437", + "ca-central-1": "763104351884", + "ca-west-1": "204538143572", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "me-central-1": "914824155844", + "me-south-1": "217643126080", + "mx-central-1": "637423239942", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "repository": "pytorch-training" } } } diff --git a/src/sagemaker/image_uri_config/spark.json b/src/sagemaker/image_uri_config/spark.json index bbb8c9b123..48c43fca15 100644 --- a/src/sagemaker/image_uri_config/spark.json +++ b/src/sagemaker/image_uri_config/spark.json @@ -11,6 +11,7 @@ "registries": { "af-south-1": "309385258863", "ap-east-1": "732049463269", + "ap-east-2": "533267296287", "ap-northeast-1": "411782140378", "ap-northeast-2": "860869212795", "ap-northeast-3": "102471314380", @@ -55,6 +56,7 @@ "registries": { "af-south-1": "309385258863", "ap-east-1": "732049463269", + "ap-east-2": "533267296287", "ap-northeast-1": "411782140378", "ap-northeast-2": "860869212795", "ap-northeast-3": "102471314380", @@ -99,6 +101,7 @@ "registries": { "af-south-1": "309385258863", "ap-east-1": "732049463269", + "ap-east-2": "533267296287", "ap-northeast-1": "411782140378", "ap-northeast-2": "860869212795", "ap-northeast-3": "102471314380", @@ -143,6 +146,7 @@ "registries": { "af-south-1": "309385258863", "ap-east-1": "732049463269", + "ap-east-2": "533267296287", "ap-northeast-1": "411782140378", "ap-northeast-2": 
"860869212795", "ap-northeast-3": "102471314380", @@ -187,6 +191,7 @@ "registries": { "af-south-1": "309385258863", "ap-east-1": "732049463269", + "ap-east-2": "533267296287", "ap-northeast-1": "411782140378", "ap-northeast-2": "860869212795", "ap-northeast-3": "102471314380", diff --git a/src/sagemaker/jumpstart/region_config.json b/src/sagemaker/jumpstart/region_config.json index 30bea6ee70..136bf8256c 100644 --- a/src/sagemaker/jumpstart/region_config.json +++ b/src/sagemaker/jumpstart/region_config.json @@ -7,6 +7,10 @@ "content_bucket": "jumpstart-cache-prod-ap-east-1", "gated_content_bucket": "jumpstart-private-cache-prod-ap-east-1" }, + "ap-east-2": { + "content_bucket": "jumpstart-cache-prod-ap-east-2", + "gated_content_bucket": "jumpstart-private-cache-prod-ap-east-2" + }, "ap-northeast-1": { "content_bucket": "jumpstart-cache-prod-ap-northeast-1", "gated_content_bucket": "jumpstart-private-cache-prod-ap-northeast-1", diff --git a/src/sagemaker/modules/configs.py b/src/sagemaker/modules/configs.py index 1ada10dff3..8fdf88e735 100644 --- a/src/sagemaker/modules/configs.py +++ b/src/sagemaker/modules/configs.py @@ -42,6 +42,7 @@ RemoteDebugConfig, SessionChainingConfig, InstanceGroup, + MetricDefinition, ) from sagemaker.modules.utils import convert_unassigned_to_none @@ -68,6 +69,7 @@ "Compute", "Networking", "InputData", + "MetricDefinition", ] diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 7d83766c9f..eaabe5972a 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -66,6 +66,7 @@ RemoteDebugConfig, SessionChainingConfig, InputData, + MetricDefinition, ) from sagemaker.modules.local_core.local_container import _LocalContainer @@ -239,6 +240,7 @@ class ModelTrainer(BaseModel): _infra_check_config: Optional[InfraCheckConfig] = PrivateAttr(default=None) _session_chaining_config: Optional[SessionChainingConfig] = PrivateAttr(default=None) 
_remote_debug_config: Optional[RemoteDebugConfig] = PrivateAttr(default=None) + _metric_definitions: Optional[List[MetricDefinition]] = PrivateAttr(default=None) _temp_recipe_train_dir: Optional[TemporaryDirectory] = PrivateAttr(default=None) @@ -696,6 +698,7 @@ def train( training_image_config=self.training_image_config, container_entrypoint=container_entrypoint, container_arguments=container_arguments, + metric_definitions=self._metric_definitions, ) resource_config = self.compute._to_resource_config() @@ -1290,3 +1293,33 @@ def with_checkpoint_config( """ self.checkpoint_config = checkpoint_config or configs.CheckpointConfig() return self + + def with_metric_definitions( + self, metric_definitions: List[MetricDefinition] + ) -> "ModelTrainer": # noqa: D412 + """Set the metric definitions for the training job. + + Example: + + .. code:: python + + from sagemaker.modules.train import ModelTrainer + from sagemaker.modules.configs import MetricDefinition + + metric_definitions = [ + MetricDefinition( + name="loss", + regex="Loss: (.*?);", + ) + ] + + model_trainer = ModelTrainer( + ... + ).with_metric_definitions(metric_definitions) + + Args: + metric_definitions (List[MetricDefinition]): + The metric definitions for the training job. 
+ """ + self._metric_definitions = metric_definitions + return self diff --git a/tests/unit/sagemaker/modules/train/test_model_trainer.py b/tests/unit/sagemaker/modules/train/test_model_trainer.py index cf38f26334..23ea167ecf 100644 --- a/tests/unit/sagemaker/modules/train/test_model_trainer.py +++ b/tests/unit/sagemaker/modules/train/test_model_trainer.py @@ -64,6 +64,7 @@ FileSystemDataSource, Channel, DataSource, + MetricDefinition, ) from sagemaker.modules.distributed import Torchrun, SMP, MPI from sagemaker.modules.train.sm_recipes.utils import _load_recipes_cfg @@ -705,6 +706,32 @@ def test_remote_debug_config(mock_training_job, modules_session): ) +@patch("sagemaker.modules.train.model_trainer.TrainingJob") +def test_metric_definitions(mock_training_job, modules_session): + image_uri = DEFAULT_IMAGE + role = DEFAULT_ROLE + metric_definitions = [ + MetricDefinition( + name="loss", + regex="Loss: (.*?);", + ) + ] + + model_trainer = ModelTrainer( + training_image=image_uri, sagemaker_session=modules_session, role=role + ).with_metric_definitions(metric_definitions) + + with patch("sagemaker.modules.train.model_trainer.Session.upload_data") as mock_upload_data: + mock_upload_data.return_value = "s3://dummy-bucket/dummy-prefix" + model_trainer.train() + + mock_training_job.create.assert_called_once() + assert ( + mock_training_job.create.call_args.kwargs["algorithm_specification"].metric_definitions + == metric_definitions + ) + + @patch("sagemaker.modules.train.model_trainer._get_unique_name") @patch("sagemaker.modules.train.model_trainer.TrainingJob") def test_model_trainer_full_init(mock_training_job, mock_unique_name, modules_session): @@ -822,6 +849,7 @@ def mock_upload_data(path, bucket, key_prefix): training_input_mode=training_input_mode, training_image=training_image, algorithm_name=None, + metric_definitions=None, container_entrypoint=DEFAULT_ENTRYPOINT, container_arguments=DEFAULT_ARGUMENTS, training_image_config=training_image_config,