Skip to content

Commit 2289f64

Browse files
authored
Merge branch 'master' into amtviz
2 parents f6ab044 + 31f34dd commit 2289f64

File tree

10 files changed

+141
-3
lines changed

10 files changed

+141
-3
lines changed

CHANGELOG.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,23 @@
11
# Changelog
22

3+
## v2.247.1 (2025-06-23)
4+
5+
### Bug Fixes and Other Changes
6+
7+
* update image_uri_configs 06-19-2025 07:18:34 PST
8+
9+
## v2.247.0 (2025-06-13)
10+
11+
### Features
12+
13+
* Add support for MetricDefinitions in ModelTrainer
14+
15+
### Bug Fixes and Other Changes
16+
17+
* update jumpstart region_config, update image_uri_configs 06-12-2025 07:18:12 PST
18+
* Add ignore_patterns in ModelTrainer to ignore specific files/folders
19+
* Allow import failure for internal _hashlib module
20+
321
## v2.246.0 (2025-06-04)
422

523
### Features

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.246.1.dev0
1+
2.247.2.dev0

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ dependencies = [
4545
"pandas",
4646
"pathos",
4747
"platformdirs",
48-
"protobuf>=3.12,<6.0",
48+
"protobuf>=3.12,<6.32",
4949
"psutil",
5050
"PyYAML>=6.0.1",
5151
"requests",

src/sagemaker/image_uri_config/pytorch.json

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1705,7 +1705,8 @@
17051705
"2.3": "2.3.0",
17061706
"2.4": "2.4.0",
17071707
"2.5": "2.5.1",
1708-
"2.6": "2.6.0"
1708+
"2.6": "2.6.0",
1709+
"2.7": "2.7.1"
17091710
},
17101711
"versions": {
17111712
"0.4.0": {
@@ -2946,6 +2947,51 @@
29462947
"us-west-2": "763104351884"
29472948
},
29482949
"repository": "pytorch-training"
2950+
},
2951+
"2.7.1": {
2952+
"py_versions": [
2953+
"py312"
2954+
],
2955+
"registries": {
2956+
"af-south-1": "626614931356",
2957+
"ap-east-1": "871362719292",
2958+
"ap-east-2": "975050140332",
2959+
"ap-northeast-1": "763104351884",
2960+
"ap-northeast-2": "763104351884",
2961+
"ap-northeast-3": "364406365360",
2962+
"ap-south-1": "763104351884",
2963+
"ap-south-2": "772153158452",
2964+
"ap-southeast-1": "763104351884",
2965+
"ap-southeast-2": "763104351884",
2966+
"ap-southeast-3": "907027046896",
2967+
"ap-southeast-4": "457447274322",
2968+
"ap-southeast-5": "550225433462",
2969+
"ap-southeast-7": "590183813437",
2970+
"ca-central-1": "763104351884",
2971+
"ca-west-1": "204538143572",
2972+
"cn-north-1": "727897471807",
2973+
"cn-northwest-1": "727897471807",
2974+
"eu-central-1": "763104351884",
2975+
"eu-central-2": "380420809688",
2976+
"eu-north-1": "763104351884",
2977+
"eu-south-1": "692866216735",
2978+
"eu-south-2": "503227376785",
2979+
"eu-west-1": "763104351884",
2980+
"eu-west-2": "763104351884",
2981+
"eu-west-3": "763104351884",
2982+
"il-central-1": "780543022126",
2983+
"me-central-1": "914824155844",
2984+
"me-south-1": "217643126080",
2985+
"mx-central-1": "637423239942",
2986+
"sa-east-1": "763104351884",
2987+
"us-east-1": "763104351884",
2988+
"us-east-2": "763104351884",
2989+
"us-gov-east-1": "446045086412",
2990+
"us-gov-west-1": "442386744353",
2991+
"us-west-1": "763104351884",
2992+
"us-west-2": "763104351884"
2993+
},
2994+
"repository": "pytorch-training"
29492995
}
29502996
}
29512997
}

src/sagemaker/image_uri_config/sagemaker-base-python.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"registries": {
55
"af-south-1": "559312083959",
66
"ap-east-1": "493642496378",
7+
"ap-east-2": "938034419563",
78
"ap-northeast-1": "102112518831",
89
"ap-northeast-2": "806072073708",
910
"ap-northeast-3": "792733760839",
@@ -14,6 +15,7 @@
1415
"ap-southeast-5": "148761635175",
1516
"ap-southeast-7": "528757812139",
1617
"ca-central-1": "310906938811",
18+
"ca-west-1": "623308166672",
1719
"cn-north-1": "390048526115",
1820
"cn-northwest-1": "390780980154",
1921
"eu-central-1": "936697816551",

src/sagemaker/image_uri_config/spark.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
"registries": {
1212
"af-south-1": "309385258863",
1313
"ap-east-1": "732049463269",
14+
"ap-east-2": "533267296287",
1415
"ap-northeast-1": "411782140378",
1516
"ap-northeast-2": "860869212795",
1617
"ap-northeast-3": "102471314380",
@@ -55,6 +56,7 @@
5556
"registries": {
5657
"af-south-1": "309385258863",
5758
"ap-east-1": "732049463269",
59+
"ap-east-2": "533267296287",
5860
"ap-northeast-1": "411782140378",
5961
"ap-northeast-2": "860869212795",
6062
"ap-northeast-3": "102471314380",
@@ -99,6 +101,7 @@
99101
"registries": {
100102
"af-south-1": "309385258863",
101103
"ap-east-1": "732049463269",
104+
"ap-east-2": "533267296287",
102105
"ap-northeast-1": "411782140378",
103106
"ap-northeast-2": "860869212795",
104107
"ap-northeast-3": "102471314380",
@@ -143,6 +146,7 @@
143146
"registries": {
144147
"af-south-1": "309385258863",
145148
"ap-east-1": "732049463269",
149+
"ap-east-2": "533267296287",
146150
"ap-northeast-1": "411782140378",
147151
"ap-northeast-2": "860869212795",
148152
"ap-northeast-3": "102471314380",
@@ -187,6 +191,7 @@
187191
"registries": {
188192
"af-south-1": "309385258863",
189193
"ap-east-1": "732049463269",
194+
"ap-east-2": "533267296287",
190195
"ap-northeast-1": "411782140378",
191196
"ap-northeast-2": "860869212795",
192197
"ap-northeast-3": "102471314380",

src/sagemaker/jumpstart/region_config.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
"content_bucket": "jumpstart-cache-prod-ap-east-1",
88
"gated_content_bucket": "jumpstart-private-cache-prod-ap-east-1"
99
},
10+
"ap-east-2": {
11+
"content_bucket": "jumpstart-cache-prod-ap-east-2",
12+
"gated_content_bucket": "jumpstart-private-cache-prod-ap-east-2"
13+
},
1014
"ap-northeast-1": {
1115
"content_bucket": "jumpstart-cache-prod-ap-northeast-1",
1216
"gated_content_bucket": "jumpstart-private-cache-prod-ap-northeast-1",

src/sagemaker/modules/configs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
RemoteDebugConfig,
4343
SessionChainingConfig,
4444
InstanceGroup,
45+
MetricDefinition,
4546
)
4647

4748
from sagemaker.modules.utils import convert_unassigned_to_none
@@ -68,6 +69,7 @@
6869
"Compute",
6970
"Networking",
7071
"InputData",
72+
"MetricDefinition",
7173
]
7274

7375

src/sagemaker/modules/train/model_trainer.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
RemoteDebugConfig,
6767
SessionChainingConfig,
6868
InputData,
69+
MetricDefinition,
6970
)
7071

7172
from sagemaker.modules.local_core.local_container import _LocalContainer
@@ -239,6 +240,7 @@ class ModelTrainer(BaseModel):
239240
_infra_check_config: Optional[InfraCheckConfig] = PrivateAttr(default=None)
240241
_session_chaining_config: Optional[SessionChainingConfig] = PrivateAttr(default=None)
241242
_remote_debug_config: Optional[RemoteDebugConfig] = PrivateAttr(default=None)
243+
_metric_definitions: Optional[List[MetricDefinition]] = PrivateAttr(default=None)
242244

243245
_temp_recipe_train_dir: Optional[TemporaryDirectory] = PrivateAttr(default=None)
244246

@@ -696,6 +698,7 @@ def train(
696698
training_image_config=self.training_image_config,
697699
container_entrypoint=container_entrypoint,
698700
container_arguments=container_arguments,
701+
metric_definitions=self._metric_definitions,
699702
)
700703

701704
resource_config = self.compute._to_resource_config()
@@ -1290,3 +1293,33 @@ def with_checkpoint_config(
12901293
"""
12911294
self.checkpoint_config = checkpoint_config or configs.CheckpointConfig()
12921295
return self
1296+
1297+
def with_metric_definitions(
1298+
self, metric_definitions: List[MetricDefinition]
1299+
) -> "ModelTrainer": # noqa: D412
1300+
"""Set the metric definitions for the training job.
1301+
1302+
Example:
1303+
1304+
.. code:: python
1305+
1306+
from sagemaker.modules.train import ModelTrainer
1307+
from sagemaker.modules.configs import MetricDefinition
1308+
1309+
metric_definitions = [
1310+
MetricDefinition(
1311+
name="loss",
1312+
regex="Loss: (.*?);",
1313+
)
1314+
]
1315+
1316+
model_trainer = ModelTrainer(
1317+
...
1318+
).with_metric_definitions(metric_definitions)
1319+
1320+
Args:
1321+
metric_definitions (List[MetricDefinition]):
1322+
The metric definitions for the training job.
1323+
"""
1324+
self._metric_definitions = metric_definitions
1325+
return self

tests/unit/sagemaker/modules/train/test_model_trainer.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
FileSystemDataSource,
6565
Channel,
6666
DataSource,
67+
MetricDefinition,
6768
)
6869
from sagemaker.modules.distributed import Torchrun, SMP, MPI
6970
from sagemaker.modules.train.sm_recipes.utils import _load_recipes_cfg
@@ -705,6 +706,32 @@ def test_remote_debug_config(mock_training_job, modules_session):
705706
)
706707

707708

709+
@patch("sagemaker.modules.train.model_trainer.TrainingJob")
710+
def test_metric_definitions(mock_training_job, modules_session):
711+
image_uri = DEFAULT_IMAGE
712+
role = DEFAULT_ROLE
713+
metric_definitions = [
714+
MetricDefinition(
715+
name="loss",
716+
regex="Loss: (.*?);",
717+
)
718+
]
719+
720+
model_trainer = ModelTrainer(
721+
training_image=image_uri, sagemaker_session=modules_session, role=role
722+
).with_metric_definitions(metric_definitions)
723+
724+
with patch("sagemaker.modules.train.model_trainer.Session.upload_data") as mock_upload_data:
725+
mock_upload_data.return_value = "s3://dummy-bucket/dummy-prefix"
726+
model_trainer.train()
727+
728+
mock_training_job.create.assert_called_once()
729+
assert (
730+
mock_training_job.create.call_args.kwargs["algorithm_specification"].metric_definitions
731+
== metric_definitions
732+
)
733+
734+
708735
@patch("sagemaker.modules.train.model_trainer._get_unique_name")
709736
@patch("sagemaker.modules.train.model_trainer.TrainingJob")
710737
def test_model_trainer_full_init(mock_training_job, mock_unique_name, modules_session):
@@ -822,6 +849,7 @@ def mock_upload_data(path, bucket, key_prefix):
822849
training_input_mode=training_input_mode,
823850
training_image=training_image,
824851
algorithm_name=None,
852+
metric_definitions=None,
825853
container_entrypoint=DEFAULT_ENTRYPOINT,
826854
container_arguments=DEFAULT_ARGUMENTS,
827855
training_image_config=training_image_config,

0 commit comments

Comments
 (0)