From dc5ac8f03ddeda82747b6fa7c3083d6ad8945d24 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Tue, 25 Feb 2025 13:37:20 -0800 Subject: [PATCH 01/14] Allow ModelTrainer to accept hyperparameter file and create Hyperparameter class --- src/sagemaker/modules/hyperparameters.py | 98 ++++++++++++++++++++ src/sagemaker/modules/train/model_trainer.py | 19 +++- 2 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 src/sagemaker/modules/hyperparameters.py diff --git a/src/sagemaker/modules/hyperparameters.py b/src/sagemaker/modules/hyperparameters.py new file mode 100644 index 0000000000..60288da2ec --- /dev/null +++ b/src/sagemaker/modules/hyperparameters.py @@ -0,0 +1,98 @@ +import os +import json +import dataclasses +from typing import Any, Type, TypeVar + +from sagemaker.modules import logger + +T = TypeVar("T") + + +class DictConfig: + """Class that supports both dict and dot notation access""" + + def __init__(self, **kwargs): + # Store the original dict + self._data = kwargs + + # Set all items as attributes for dot notation + for key, value in kwargs.items(): + # Recursively convert nested dicts to DictConfig + if isinstance(value, dict): + value = DictConfig(**value) + setattr(self, key, value) + + def __getitem__(self, key: str) -> Any: + """Enable dictionary-style access: config['key']""" + return self._data[key] + + def __setitem__(self, key: str, value: Any): + """Enable dictionary-style assignment: config['key'] = value""" + self._data[key] = value + setattr(self, key, value) + + def __str__(self) -> str: + """String representation""" + return str(self._data) + + def __repr__(self) -> str: + """Detailed string representation""" + return f"DictConfig({self._data})" + + +class Hyperparameters: + """Class to load hyperparameters in training container.""" + + @staticmethod + def load() -> DictConfig: + """Loads hyperparameters in training container + + Example: + + .. code:: python + from sagemaker.modules.hyperparameters import Hyperparameters + + hps = Hyperparameters.load() + print(hps.batch_size) + + Returns: + DictConfig: hyperparameters as a DictConfig object + """ + hps = json.loads(os.environ.get("SM_HPS", "{}")) + if not hps: + logger.warning("No hyperparameters found in SM_HPS environment variable.") + return DictConfig(**hps) + + @staticmethod + def load_structured(dataclass_type: Type[T]) -> T: + """Loads hyperparameters as a structured dataclass + + Example: + + .. code:: python + from sagemaker.modules.hyperparameters import Hyperparameters + + @dataclass + class TrainingConfig: + batch_size: int + learning_rate: float + + config = Hyperparameters.load_structured(TrainingConfig) + print(config.batch_size) # typed int + + Args: + dataclass_type: Dataclass type to structure the config + + Returns: + dataclass_type: Instance of provided dataclass type + """ + + if not dataclasses.is_dataclass(dataclass_type): + raise ValueError(f"{dataclass_type} is not a dataclass type.") + + hps = json.loads(os.environ.get("SM_HPS", "{}")) + if not hps: + logger.warning("No hyperparameters found in SM_HPS environment variable.") + + # Convert hyperparameters to dataclass + return dataclass_type(**hps) diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index a47d8f91ad..5a2b336fd1 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -17,6 +17,7 @@ import os import json import shutil +import yaml from tempfile import TemporaryDirectory from typing import Optional, List, Union, Dict, Any, ClassVar @@ -195,8 +196,9 @@ class ModelTrainer(BaseModel): Defaults to "File". environment (Optional[Dict[str, str]]): The environment variables for the training job. - hyperparameters (Optional[Dict[str, Any]]): - The hyperparameters for the training job. + hyperparameters (Optional[Union[Dict[str, Any], str]): + The hyperparameters for the training job. Can be a dictionary of hyperparameters + or a path to hyperparameters json/yaml file. tags (Optional[List[Tag]]): An array of key-value pairs. You can use tags to categorize your AWS resources in different ways, for example, by purpose, owner, or environment. @@ -226,7 +228,7 @@ class ModelTrainer(BaseModel): checkpoint_config: Optional[CheckpointConfig] = None training_input_mode: Optional[str] = "File" environment: Optional[Dict[str, str]] = {} - hyperparameters: Optional[Dict[str, Any]] = {} + hyperparameters: Optional[Union[Dict[str, Any], str]] = {} tags: Optional[List[Tag]] = None local_container_root: Optional[str] = os.getcwd() @@ -470,6 +472,17 @@ def model_post_init(self, __context: Any): f"StoppingCondition not provided. Using default:\n{self.stopping_condition}" ) + if self.hyperparameters and isinstance(self.hyperparameters, str): + if not os.path.exists(self.hyperparameters): + raise ValueError(f"Hyperparameters file not found: {self.hyperparameters}") + logger.info(f"Loading hyperparameters from file: {self.hyperparameters}") + if self.hyperparameters.endswith(".json"): + with open(self.hyperparameters, "r") as f: + self.hyperparameters = json.load(f) + elif self.hyperparameters.endswith(".yaml"): + with open(self.hyperparameters, "r") as f: + self.hyperparameters = yaml.safe_load(f) + if self.training_mode == Mode.SAGEMAKER_TRAINING_JOB and self.output_data_config is None: session = self.sagemaker_session base_job_name = self.base_job_name From e4474fa6be0199cc292656e5b9c7dc232df5d334 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Tue, 25 Feb 2025 13:54:55 -0800 Subject: [PATCH 02/14] pylint --- src/sagemaker/modules/hyperparameters.py | 15 +++++++++++++++ src/sagemaker/modules/train/model_trainer.py | 3 +-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/modules/hyperparameters.py b/src/sagemaker/modules/hyperparameters.py index 60288da2ec..6b8c99fd01 100644 --- a/src/sagemaker/modules/hyperparameters.py +++ b/src/sagemaker/modules/hyperparameters.py @@ -1,3 +1,18 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +"""Hyperparameters class module.""" +from __future__ import absolute_import + import os import json import dataclasses diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 5a2b336fd1..c1bdce0ecc 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -17,10 +17,9 @@ import os import json import shutil -import yaml from tempfile import TemporaryDirectory - from typing import Optional, List, Union, Dict, Any, ClassVar +import yaml from graphene.utils.str_converters import to_camel_case, to_snake_case From 1c45f1499258be4f0dd90ad9c7e51cd8dd78343f Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Fri, 28 Feb 2025 13:35:35 -0800 Subject: [PATCH 03/14] Detect hyperparameters from contents rather than file extension --- src/sagemaker/modules/train/model_trainer.py | 22 ++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index c1bdce0ecc..b5b17a0405 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -475,12 +475,22 @@ def model_post_init(self, __context: Any): if not os.path.exists(self.hyperparameters): raise ValueError(f"Hyperparameters file not found: {self.hyperparameters}") logger.info(f"Loading hyperparameters from file: {self.hyperparameters}") - if self.hyperparameters.endswith(".json"): - with open(self.hyperparameters, "r") as f: - self.hyperparameters = json.load(f) - elif self.hyperparameters.endswith(".yaml"): - with open(self.hyperparameters, "r") as f: - self.hyperparameters = yaml.safe_load(f) + with open(self.hyperparameters, "r") as f: + contents = f.read() + try: + self.hyperparameters = json.loads(contents) + logger.debug("Hyperparameters loaded as JSON") + except json.JSONDecodeError: + try: + self.hyperparameters = yaml.safe_load(contents) + if not isinstance(self.hyperparameters, dict): + raise ValueError("YAML content is not a valid mapping.") + logger.debug("Hyperparameters loaded as YAML") + except (yaml.YAMLError, ValueError) as e: + raise ValueError( + f"Invalid hyperparameters file: {self.hyperparameters}. " + "Must be a valid JSON or YAML file." + ) if self.training_mode == Mode.SAGEMAKER_TRAINING_JOB and self.output_data_config is None: session = self.sagemaker_session From f8f085470caa5f7a5352f903f5eb9c6916538848 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Fri, 28 Feb 2025 13:57:38 -0800 Subject: [PATCH 04/14] pylint --- src/sagemaker/modules/train/model_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index b5b17a0405..6c68afd918 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -486,7 +486,7 @@ def model_post_init(self, __context: Any): if not isinstance(self.hyperparameters, dict): raise ValueError("YAML content is not a valid mapping.") logger.debug("Hyperparameters loaded as YAML") - except (yaml.YAMLError, ValueError) as e: + except (yaml.YAMLError, ValueError): raise ValueError( f"Invalid hyperparameters file: {self.hyperparameters}. " "Must be a valid JSON or YAML file." From f8cc96187c7d5592fd8e6a9cdfac1ad8f3d5fab7 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Mon, 3 Mar 2025 10:12:19 -0800 Subject: [PATCH 05/14] change: add integs --- .../params_script/hyperparameters.json | 14 ++++++++ .../params_script/hyperparameters.yaml | 18 ++++++++++ .../modules/train/test_model_trainer.py | 34 +++++++++++++++++++ 3 files changed, 66 insertions(+) create mode 100644 tests/data/modules/params_script/hyperparameters.json create mode 100644 tests/data/modules/params_script/hyperparameters.yaml diff --git a/tests/data/modules/params_script/hyperparameters.json b/tests/data/modules/params_script/hyperparameters.json new file mode 100644 index 0000000000..a7effc4d4d --- /dev/null +++ b/tests/data/modules/params_script/hyperparameters.json @@ -0,0 +1,14 @@ +{ + "integer": 1, + "boolean": true, + "float": 3.14, + "string": "Hello World", + "list": [1, 2, 3], + "dict": { + "string": "value", + "integer": 3, + "list": [1, 2, 3], + "dict": {"key": "value"}, + "boolean": true + } +} \ No newline at end of file diff --git a/tests/data/modules/params_script/hyperparameters.yaml b/tests/data/modules/params_script/hyperparameters.yaml new file mode 100644 index 0000000000..249c3ead80 --- /dev/null +++ b/tests/data/modules/params_script/hyperparameters.yaml @@ -0,0 +1,18 @@ +integer: 1 +boolean: true +float: 3.14 +string: "Hello World" +list: + - 1 + - 2 + - 3 +dict: + string: value + integer: 3 + list: + - 1 + - 2 + - 3 + dict: + key: value + boolean: true \ No newline at end of file diff --git a/tests/integ/sagemaker/modules/train/test_model_trainer.py b/tests/integ/sagemaker/modules/train/test_model_trainer.py index cd298402b2..a0ba48e177 100644 --- a/tests/integ/sagemaker/modules/train/test_model_trainer.py +++ b/tests/integ/sagemaker/modules/train/test_model_trainer.py @@ -106,3 +106,37 @@ def test_hp_contract_torchrun_script(modules_sagemaker_session): ) model_trainer.train() + + +def test_hp_contract_hyperparameter_json(modules_sagemaker_session): + source_dir = f"{DATA_DIR}/modules/params_script" + source_code = SourceCode( + source_dir=source_dir, + entry_script="train.py", + ) + model_trainer = ModelTrainer( + sagemaker_session=modules_sagemaker_session, + training_image=DEFAULT_CPU_IMAGE, + hyperparameters_file=f"{source_dir}/hyperparameters.json", + source_code=source_code, + base_job_name="hp-contract-hyperparameter-json", + ) + assert model_trainer.hyperparameters == EXPECTED_HYPERPARAMETERS + model_trainer.train() + + +def test_hp_contract_hyperparameter_yaml(modules_sagemaker_session): + source_dir = f"{DATA_DIR}/modules/params_script" + source_code = SourceCode( + source_dir=source_dir, + entry_script="train.py", + ) + model_trainer = ModelTrainer( + sagemaker_session=modules_sagemaker_session, + training_image=DEFAULT_CPU_IMAGE, + hyperparameters_file=f"{source_dir}/hyperparameters.yaml", + source_code=source_code, + base_job_name="hp-contract-hyperparameter-yaml", + ) + assert model_trainer.hyperparameters == EXPECTED_HYPERPARAMETERS + model_trainer.train() From a5dba4b771bc71f229c2b32bf0bfdc4a7eec0b35 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Mon, 3 Mar 2025 10:13:36 -0800 Subject: [PATCH 06/14] change: add integs --- tests/integ/sagemaker/modules/train/test_model_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/sagemaker/modules/train/test_model_trainer.py b/tests/integ/sagemaker/modules/train/test_model_trainer.py index a0ba48e177..b093914657 100644 --- a/tests/integ/sagemaker/modules/train/test_model_trainer.py +++ b/tests/integ/sagemaker/modules/train/test_model_trainer.py @@ -117,7 +117,7 @@ def test_hp_contract_hyperparameter_json(modules_sagemaker_session): model_trainer = ModelTrainer( sagemaker_session=modules_sagemaker_session, training_image=DEFAULT_CPU_IMAGE, - hyperparameters_file=f"{source_dir}/hyperparameters.json", + hyperparameters=f"{source_dir}/hyperparameters.json", source_code=source_code, base_job_name="hp-contract-hyperparameter-json", ) @@ -134,7 +134,7 @@ def test_hp_contract_hyperparameter_yaml(modules_sagemaker_session): model_trainer = ModelTrainer( sagemaker_session=modules_sagemaker_session, training_image=DEFAULT_CPU_IMAGE, - hyperparameters_file=f"{source_dir}/hyperparameters.yaml", + hyperparameters=f"{source_dir}/hyperparameters.yaml", source_code=source_code, base_job_name="hp-contract-hyperparameter-yaml", ) From 337850e29ed1fdf880ac76ebfd4e03662c12d28d Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Mon, 3 Mar 2025 14:51:41 -0800 Subject: [PATCH 07/14] change: remove custom hyperparameter tooling --- src/sagemaker/modules/hyperparameters.py | 113 ----------------------- 1 file changed, 113 deletions(-) delete mode 100644 src/sagemaker/modules/hyperparameters.py diff --git a/src/sagemaker/modules/hyperparameters.py b/src/sagemaker/modules/hyperparameters.py deleted file mode 100644 index 6b8c99fd01..0000000000 --- a/src/sagemaker/modules/hyperparameters.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -"""Hyperparameters class module.""" -from __future__ import absolute_import - -import os -import json -import dataclasses -from typing import Any, Type, TypeVar - -from sagemaker.modules import logger - -T = TypeVar("T") - - -class DictConfig: - """Class that supports both dict and dot notation access""" - - def __init__(self, **kwargs): - # Store the original dict - self._data = kwargs - - # Set all items as attributes for dot notation - for key, value in kwargs.items(): - # Recursively convert nested dicts to DictConfig - if isinstance(value, dict): - value = DictConfig(**value) - setattr(self, key, value) - - def __getitem__(self, key: str) -> Any: - """Enable dictionary-style access: config['key']""" - return self._data[key] - - def __setitem__(self, key: str, value: Any): - """Enable dictionary-style assignment: config['key'] = value""" - self._data[key] = value - setattr(self, key, value) - - def __str__(self) -> str: - """String representation""" - return str(self._data) - - def __repr__(self) -> str: - """Detailed string representation""" - return f"DictConfig({self._data})" - - -class Hyperparameters: - """Class to load hyperparameters in training container.""" - - @staticmethod - def load() -> DictConfig: - """Loads hyperparameters in training container - - Example: - - .. code:: python - from sagemaker.modules.hyperparameters import Hyperparameters - - hps = Hyperparameters.load() - print(hps.batch_size) - - Returns: - DictConfig: hyperparameters as a DictConfig object - """ - hps = json.loads(os.environ.get("SM_HPS", "{}")) - if not hps: - logger.warning("No hyperparameters found in SM_HPS environment variable.") - return DictConfig(**hps) - - @staticmethod - def load_structured(dataclass_type: Type[T]) -> T: - """Loads hyperparameters as a structured dataclass - - Example: - - .. code:: python - from sagemaker.modules.hyperparameters import Hyperparameters - - @dataclass - class TrainingConfig: - batch_size: int - learning_rate: float - - config = Hyperparameters.load_structured(TrainingConfig) - print(config.batch_size) # typed int - - Args: - dataclass_type: Dataclass type to structure the config - - Returns: - dataclass_type: Instance of provided dataclass type - """ - - if not dataclasses.is_dataclass(dataclass_type): - raise ValueError(f"{dataclass_type} is not a dataclass type.") - - hps = json.loads(os.environ.get("SM_HPS", "{}")) - if not hps: - logger.warning("No hyperparameters found in SM_HPS environment variable.") - - # Convert hyperparameters to dataclass - return dataclass_type(**hps) From e42af65a2528f9049a54d55d5d49086d9c49ae82 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Mon, 3 Mar 2025 14:53:00 -0800 Subject: [PATCH 08/14] Add tests for hp contracts --- .../params_script/hyperparameters.json | 1 + .../params_script/hyperparameters.yaml | 1 + .../modules/params_script/requirements.txt | 1 + tests/data/modules/params_script/train.py | 97 ++++++++++++++++++- .../modules/train/test_model_trainer.py | 2 + 5 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 tests/data/modules/params_script/requirements.txt diff --git a/tests/data/modules/params_script/hyperparameters.json b/tests/data/modules/params_script/hyperparameters.json index a7effc4d4d..f637288dbe 100644 --- a/tests/data/modules/params_script/hyperparameters.json +++ b/tests/data/modules/params_script/hyperparameters.json @@ -7,6 +7,7 @@ "dict": { "string": "value", "integer": 3, + "float": 3.14, "list": [1, 2, 3], "dict": {"key": "value"}, "boolean": true diff --git a/tests/data/modules/params_script/hyperparameters.yaml b/tests/data/modules/params_script/hyperparameters.yaml index 249c3ead80..9e3011daf2 100644 --- a/tests/data/modules/params_script/hyperparameters.yaml +++ b/tests/data/modules/params_script/hyperparameters.yaml @@ -9,6 +9,7 @@ list: dict: string: value integer: 3 + float: 3.14 list: - 1 - 2 diff --git a/tests/data/modules/params_script/requirements.txt b/tests/data/modules/params_script/requirements.txt new file mode 100644 index 0000000000..d5e79c0b21 --- /dev/null +++ b/tests/data/modules/params_script/requirements.txt @@ -0,0 +1 @@ +omegaconf \ No newline at end of file diff --git a/tests/data/modules/params_script/train.py b/tests/data/modules/params_script/train.py index 8d3924a325..9b8cb2c82f 100644 --- a/tests/data/modules/params_script/train.py +++ b/tests/data/modules/params_script/train.py @@ -16,6 +16,9 @@ import argparse import json import os +from typing import List, Dict, Any +from dataclasses import dataclass +from omegaconf import OmegaConf EXPECTED_HYPERPARAMETERS = { "integer": 1, @@ -26,6 +29,7 @@ "dict": { "string": "value", "integer": 3, + "float": 3.14, "list": [1, 2, 3], "dict": {"key": "value"}, "boolean": True, @@ -117,7 +121,7 @@ def main(): assert isinstance(params["dict"], dict) params = json.loads(os.environ["SM_TRAINING_ENV"])["hyperparameters"] - print(params) + print(f"SM_TRAINING_ENV -> hyperparameters: {params}") assert params["string"] == EXPECTED_HYPERPARAMETERS["string"] assert params["integer"] == EXPECTED_HYPERPARAMETERS["integer"] assert params["boolean"] == EXPECTED_HYPERPARAMETERS["boolean"] @@ -132,9 +136,96 @@ def main(): assert isinstance(params["float"], float) assert isinstance(params["list"], list) assert isinstance(params["dict"], dict) - print(f"SM_TRAINING_ENV -> hyperparameters: {params}") - print("Test passed.") + # Local JSON - DictConfig OmegaConf + params = OmegaConf.load("hyperparameters.json") + + print(f"Local hyperparameters.json: {params}") + assert params.string == EXPECTED_HYPERPARAMETERS["string"] + assert params.integer == EXPECTED_HYPERPARAMETERS["integer"] + assert params.boolean == EXPECTED_HYPERPARAMETERS["boolean"] + assert params.float == EXPECTED_HYPERPARAMETERS["float"] + assert params.list == EXPECTED_HYPERPARAMETERS["list"] + assert params.dict == EXPECTED_HYPERPARAMETERS["dict"] + assert params.dict.string == EXPECTED_HYPERPARAMETERS["dict"]["string"] + assert params.dict.integer == EXPECTED_HYPERPARAMETERS["dict"]["integer"] + assert params.dict.boolean == EXPECTED_HYPERPARAMETERS["dict"]["boolean"] + assert params.dict.float == EXPECTED_HYPERPARAMETERS["dict"]["float"] + assert params.dict.list == EXPECTED_HYPERPARAMETERS["dict"]["list"] + assert params.dict.dict == EXPECTED_HYPERPARAMETERS["dict"]["dict"] + + @dataclass + class DictConfig: + string: str + integer: int + boolean: bool + float: float + list: List[int] + dict: Dict[str, Any] + + @dataclass + class HPConfig: + string: str + integer: int + boolean: bool + float: float + list: List[int] + dict: DictConfig + + # Local JSON - Structured OmegaConf + hp_config: HPConfig = OmegaConf.merge( + OmegaConf.structured(HPConfig), OmegaConf.load("hyperparameters.json") + ) + print(f"Local hyperparameters.json - Structured: {hp_config}") + assert hp_config.string == EXPECTED_HYPERPARAMETERS["string"] + assert hp_config.integer == EXPECTED_HYPERPARAMETERS["integer"] + assert hp_config.boolean == EXPECTED_HYPERPARAMETERS["boolean"] + assert hp_config.float == EXPECTED_HYPERPARAMETERS["float"] + assert hp_config.list == EXPECTED_HYPERPARAMETERS["list"] + assert hp_config.dict == EXPECTED_HYPERPARAMETERS["dict"] + assert hp_config.dict.string == EXPECTED_HYPERPARAMETERS["dict"]["string"] + assert hp_config.dict.integer == EXPECTED_HYPERPARAMETERS["dict"]["integer"] + assert hp_config.dict.boolean == EXPECTED_HYPERPARAMETERS["dict"]["boolean"] + assert hp_config.dict.float == EXPECTED_HYPERPARAMETERS["dict"]["float"] + assert hp_config.dict.list == EXPECTED_HYPERPARAMETERS["dict"]["list"] + assert hp_config.dict.dict == EXPECTED_HYPERPARAMETERS["dict"]["dict"] + + # Local YAML - Structured OmegaConf + hp_config: HPConfig = OmegaConf.merge( + OmegaConf.structured(HPConfig), OmegaConf.load("hyperparameters.yaml") + ) + print(f"Local hyperparameters.yaml - Structured: {hp_config}") + assert hp_config.string == EXPECTED_HYPERPARAMETERS["string"] + assert hp_config.integer == EXPECTED_HYPERPARAMETERS["integer"] + assert hp_config.boolean == EXPECTED_HYPERPARAMETERS["boolean"] + assert hp_config.float == EXPECTED_HYPERPARAMETERS["float"] + assert hp_config.list == EXPECTED_HYPERPARAMETERS["list"] + assert hp_config.dict == EXPECTED_HYPERPARAMETERS["dict"] + assert hp_config.dict.string == EXPECTED_HYPERPARAMETERS["dict"]["string"] + assert hp_config.dict.integer == EXPECTED_HYPERPARAMETERS["dict"]["integer"] + assert hp_config.dict.boolean == EXPECTED_HYPERPARAMETERS["dict"]["boolean"] + assert hp_config.dict.float == EXPECTED_HYPERPARAMETERS["dict"]["float"] + assert hp_config.dict.list == EXPECTED_HYPERPARAMETERS["dict"]["list"] + assert hp_config.dict.dict == EXPECTED_HYPERPARAMETERS["dict"]["dict"] + print(f"hyperparameters.yaml -> hyperparameters: {hp_config}") + + # HP Dict - Structured OmegaConf + hp_dict = json.loads(os.environ["SM_HPS"]) + hp_config: HPConfig = OmegaConf.merge(OmegaConf.structured(HPConfig), OmegaConf.create(hp_dict)) + print(f"SM_HPS - Structured: {hp_config}") + assert hp_config.string == EXPECTED_HYPERPARAMETERS["string"] + assert hp_config.integer == EXPECTED_HYPERPARAMETERS["integer"] + assert hp_config.boolean == EXPECTED_HYPERPARAMETERS["boolean"] + assert hp_config.float == EXPECTED_HYPERPARAMETERS["float"] + assert hp_config.list == EXPECTED_HYPERPARAMETERS["list"] + assert hp_config.dict == EXPECTED_HYPERPARAMETERS["dict"] + assert hp_config.dict.string == EXPECTED_HYPERPARAMETERS["dict"]["string"] + assert hp_config.dict.integer == EXPECTED_HYPERPARAMETERS["dict"]["integer"] + assert hp_config.dict.boolean == EXPECTED_HYPERPARAMETERS["dict"]["boolean"] + assert hp_config.dict.float == EXPECTED_HYPERPARAMETERS["dict"]["float"] + assert hp_config.dict.list == EXPECTED_HYPERPARAMETERS["dict"]["list"] + assert hp_config.dict.dict == EXPECTED_HYPERPARAMETERS["dict"]["dict"] + print(f"SM_HPS -> hyperparameters: {hp_config}") if __name__ == "__main__": diff --git a/tests/integ/sagemaker/modules/train/test_model_trainer.py b/tests/integ/sagemaker/modules/train/test_model_trainer.py index b093914657..bf6f1f149e 100644 --- a/tests/integ/sagemaker/modules/train/test_model_trainer.py +++ b/tests/integ/sagemaker/modules/train/test_model_trainer.py @@ -28,6 +28,7 @@ "dict": { "string": "value", "integer": 3, + "float": 3.14, "list": [1, 2, 3], "dict": {"key": "value"}, "boolean": True, @@ -40,6 +41,7 @@ def test_hp_contract_basic_py_script(modules_sagemaker_session): source_code = SourceCode( source_dir=f"{DATA_DIR}/modules/params_script", + requirements="requirements.txt", entry_script="train.py", ) From 155a6b832b53475d0dab891e4200562135f512cd Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Mon, 3 Mar 2025 15:41:52 -0800 Subject: [PATCH 09/14] change: add unit tests and remove unreachable condition --- src/sagemaker/modules/train/model_trainer.py | 2 - .../modules/train/test_model_trainer.py | 68 ++++++++++++++++++- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 6c68afd918..3a48a8fdad 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -483,8 +483,6 @@ def model_post_init(self, __context: Any): except json.JSONDecodeError: try: self.hyperparameters = yaml.safe_load(contents) - if not isinstance(self.hyperparameters, dict): - raise ValueError("YAML content is not a valid mapping.") logger.debug("Hyperparameters loaded as YAML") except (yaml.YAMLError, ValueError): raise ValueError( diff --git a/tests/unit/sagemaker/modules/train/test_model_trainer.py b/tests/unit/sagemaker/modules/train/test_model_trainer.py index 29da03bcd9..b8083bb82a 100644 --- a/tests/unit/sagemaker/modules/train/test_model_trainer.py +++ b/tests/unit/sagemaker/modules/train/test_model_trainer.py @@ -17,9 +17,10 @@ import tempfile import json import os +import yaml import pytest from pydantic import ValidationError -from unittest.mock import patch, MagicMock, ANY +from unittest.mock import patch, MagicMock, ANY, mock_open from sagemaker import image_uris from sagemaker_core.main.resources import TrainingJob @@ -1093,3 +1094,68 @@ def test_destructor_cleanup(mock_tmp_dir, modules_session): mock_tmp_dir.assert_not_called() del model_trainer mock_tmp_dir.cleanup.assert_called_once() + + +@patch("os.path.exists") +def test_hyperparameters_valid_json(mock_exists, modules_session): + mock_exists.return_value = True + expected_hyperparameters = {"param1": "value1", "param2": 2} + mock_file_open = mock_open(read_data=json.dumps(expected_hyperparameters)) + + with patch("builtins.open", mock_file_open): + model_trainer = ModelTrainer( + training_image=DEFAULT_IMAGE, + role=DEFAULT_ROLE, + sagemaker_session=modules_session, + compute=DEFAULT_COMPUTE_CONFIG, + hyperparameters="hyperparameters.json", + ) + assert model_trainer.hyperparameters == expected_hyperparameters + mock_file_open.assert_called_once_with("hyperparameters.json", "r") + mock_exists.assert_called_once_with("hyperparameters.json") + + +@patch("os.path.exists") +def test_hyperparameters_valid_yaml(mock_exists, modules_session): + mock_exists.return_value = True + expected_hyperparameters = {"param1": "value1", "param2": 2} + mock_file_open = mock_open(read_data=yaml.dump(expected_hyperparameters)) + + with patch("builtins.open", mock_file_open): + model_trainer = ModelTrainer( + training_image=DEFAULT_IMAGE, + role=DEFAULT_ROLE, + sagemaker_session=modules_session, + compute=DEFAULT_COMPUTE_CONFIG, + hyperparameters="hyperparameters.yaml", + ) + assert model_trainer.hyperparameters == expected_hyperparameters + mock_file_open.assert_called_once_with("hyperparameters.yaml", "r") + mock_exists.assert_called_once_with("hyperparameters.yaml") + + +def test_hyperparameters_not_exist(modules_session): + with pytest.raises(ValueError): + ModelTrainer( + training_image=DEFAULT_IMAGE, + role=DEFAULT_ROLE, + sagemaker_session=modules_session, + compute=DEFAULT_COMPUTE_CONFIG, + hyperparameters="nonexistent.json", + ) + + +@patch("os.path.exists") +def test_hyperparameters_invalid(mock_exists, modules_session): + mock_exists.return_value = True + # Must be valid YAML or JSON + mock_file_open = mock_open(read_data="invalid") + with patch("builtins.open", mock_file_open): + with pytest.raises(ValueError, match="Must be a valid JSON or YAML file."): + ModelTrainer( + training_image=DEFAULT_IMAGE, + role=DEFAULT_ROLE, + sagemaker_session=modules_session, + compute=DEFAULT_COMPUTE_CONFIG, + hyperparameters="hyperparameters.yaml", + ) From 3f0b85e6895cbc224fa14a4bc46d00b6a983dab3 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Mon, 3 Mar 2025 15:54:08 -0800 Subject: [PATCH 10/14] fix integs --- .../modules/train/test_model_trainer.py | 48 +++++++------------ 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/tests/integ/sagemaker/modules/train/test_model_trainer.py b/tests/integ/sagemaker/modules/train/test_model_trainer.py index bf6f1f149e..a92154b992 100644 --- a/tests/integ/sagemaker/modules/train/test_model_trainer.py +++ b/tests/integ/sagemaker/modules/train/test_model_trainer.py @@ -35,21 +35,22 @@ }, } -DEFAULT_CPU_IMAGE = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310" +PARAM_SCRIPT_SOURCE_DIR = f"{DATA_DIR}/modules/params_script" +PARAM_SCRIPT_SOURCE_CODE = SourceCode( + source_dir=PARAM_SCRIPT_SOURCE_DIR, + requirements="requirements.txt", + entry_script="train.py", +) +DEFAULT_CPU_IMAGE = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py31" -def test_hp_contract_basic_py_script(modules_sagemaker_session): - source_code = SourceCode( - source_dir=f"{DATA_DIR}/modules/params_script", - requirements="requirements.txt", - entry_script="train.py", - ) +def test_hp_contract_basic_py_script(modules_sagemaker_session): model_trainer = ModelTrainer( sagemaker_session=modules_sagemaker_session, training_image=DEFAULT_CPU_IMAGE, hyperparameters=EXPECTED_HYPERPARAMETERS, - source_code=source_code, + source_code=PARAM_SCRIPT_SOURCE_CODE, base_job_name="hp-contract-basic-py-script", ) @@ -59,6 +60,7 @@ def test_hp_contract_basic_py_script(modules_sagemaker_session): def test_hp_contract_basic_sh_script(modules_sagemaker_session): source_code = SourceCode( source_dir=f"{DATA_DIR}/modules/params_script", + requirements="requirements.txt", entry_script="train.sh", ) model_trainer = ModelTrainer( @@ -73,17 +75,13 @@ def test_hp_contract_basic_sh_script(modules_sagemaker_session): def test_hp_contract_mpi_script(modules_sagemaker_session): - source_code = SourceCode( - source_dir=f"{DATA_DIR}/modules/params_script", - entry_script="train.py", - ) compute = Compute(instance_type="ml.m5.xlarge", instance_count=2) model_trainer = ModelTrainer( sagemaker_session=modules_sagemaker_session, training_image=DEFAULT_CPU_IMAGE, compute=compute, hyperparameters=EXPECTED_HYPERPARAMETERS, - source_code=source_code, + source_code=PARAM_SCRIPT_SOURCE_CODE, distributed=MPI(), base_job_name="hp-contract-mpi-script", ) @@ -92,17 +90,13 @@ def test_hp_contract_mpi_script(modules_sagemaker_session): def test_hp_contract_torchrun_script(modules_sagemaker_session): - source_code = SourceCode( - source_dir=f"{DATA_DIR}/modules/params_script", - entry_script="train.py", - ) compute = Compute(instance_type="ml.m5.xlarge", instance_count=2) model_trainer = ModelTrainer( sagemaker_session=modules_sagemaker_session, training_image=DEFAULT_CPU_IMAGE, compute=compute, hyperparameters=EXPECTED_HYPERPARAMETERS, - source_code=source_code, + source_code=PARAM_SCRIPT_SOURCE_CODE, distributed=Torchrun(), base_job_name="hp-contract-torchrun-script", ) @@ -111,16 +105,11 @@ def test_hp_contract_torchrun_script(modules_sagemaker_session): def test_hp_contract_hyperparameter_json(modules_sagemaker_session): - source_dir = f"{DATA_DIR}/modules/params_script" - source_code = SourceCode( - source_dir=source_dir, - entry_script="train.py", - ) model_trainer = ModelTrainer( sagemaker_session=modules_sagemaker_session, training_image=DEFAULT_CPU_IMAGE, - hyperparameters=f"{source_dir}/hyperparameters.json", - source_code=source_code, + hyperparameters=f"{PARAM_SCRIPT_SOURCE_DIR}/hyperparameters.json", + source_code=PARAM_SCRIPT_SOURCE_CODE, base_job_name="hp-contract-hyperparameter-json", ) assert model_trainer.hyperparameters == EXPECTED_HYPERPARAMETERS @@ -128,16 +117,11 @@ def test_hp_contract_hyperparameter_json(modules_sagemaker_session): def test_hp_contract_hyperparameter_yaml(modules_sagemaker_session): - source_dir = f"{DATA_DIR}/modules/params_script" - source_code = SourceCode( - source_dir=source_dir, - entry_script="train.py", - ) model_trainer = ModelTrainer( sagemaker_session=modules_sagemaker_session, training_image=DEFAULT_CPU_IMAGE, - hyperparameters=f"{source_dir}/hyperparameters.yaml", - source_code=source_code, + hyperparameters=f"{PARAM_SCRIPT_SOURCE_DIR}/hyperparameters.yaml", + source_code=PARAM_SCRIPT_SOURCE_CODE, base_job_name="hp-contract-hyperparameter-yaml", ) assert model_trainer.hyperparameters == EXPECTED_HYPERPARAMETERS From 4643d81ba85c36ad9490898b626f1f3ce9cb5851 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Mon, 3 Mar 2025 16:29:34 -0800 Subject: [PATCH 11/14] doc check fix --- tests/data/modules/params_script/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/modules/params_script/requirements.txt b/tests/data/modules/params_script/requirements.txt index d5e79c0b21..3d2e72e354 100644 --- a/tests/data/modules/params_script/requirements.txt +++ b/tests/data/modules/params_script/requirements.txt @@ -1 +1 @@ -omegaconf \ No newline at end of file +omegaconf From cf4cb16ab522167a70a6868a27d1757c4b16e523 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Mon, 3 Mar 2025 18:37:22 -0800 Subject: [PATCH 12/14] fix tests --- src/sagemaker/modules/train/model_trainer.py | 4 ++++ .../modules/train/test_model_trainer.py | 2 +- .../modules/train/test_model_trainer.py | 17 +++++++++++++++-- tox.ini | 4 ++-- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 3a48a8fdad..844cb7c9ab 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -482,7 +482,11 @@ def model_post_init(self, __context: Any): logger.debug("Hyperparameters loaded as JSON") except json.JSONDecodeError: try: + logger.info(f"contents: {contents}") self.hyperparameters = yaml.safe_load(contents) + if not isinstance(self.hyperparameters, dict): + raise ValueError(f"YAML contents must be a valid mapping") + logger.info(f"hyperparameters: {self.hyperparameters}") logger.debug("Hyperparameters loaded as YAML") except (yaml.YAMLError, ValueError): raise ValueError( diff --git a/tests/integ/sagemaker/modules/train/test_model_trainer.py b/tests/integ/sagemaker/modules/train/test_model_trainer.py index a92154b992..a19f6d0e8b 100644 --- a/tests/integ/sagemaker/modules/train/test_model_trainer.py +++ b/tests/integ/sagemaker/modules/train/test_model_trainer.py @@ -42,7 +42,7 @@ entry_script="train.py", ) -DEFAULT_CPU_IMAGE = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py31" +DEFAULT_CPU_IMAGE = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310" def test_hp_contract_basic_py_script(modules_sagemaker_session): diff --git a/tests/unit/sagemaker/modules/train/test_model_trainer.py b/tests/unit/sagemaker/modules/train/test_model_trainer.py index b8083bb82a..12dc14b999 100644 --- a/tests/unit/sagemaker/modules/train/test_model_trainer.py +++ b/tests/unit/sagemaker/modules/train/test_model_trainer.py @@ -1148,8 +1148,21 @@ def test_hyperparameters_not_exist(modules_session): @patch("os.path.exists") def test_hyperparameters_invalid(mock_exists, modules_session): mock_exists.return_value = True - # Must be valid YAML or JSON - mock_file_open = mock_open(read_data="invalid") + + # YAML contents must be a valid mapping + mock_file_open = mock_open(read_data="- item1\n- item2") + with patch("builtins.open", mock_file_open): + with pytest.raises(ValueError, match="Must be a valid JSON or YAML file."): + ModelTrainer( + training_image=DEFAULT_IMAGE, + role=DEFAULT_ROLE, + sagemaker_session=modules_session, + compute=DEFAULT_COMPUTE_CONFIG, + hyperparameters="hyperparameters.yaml", + ) + + # Must be valid YAML + mock_file_open = mock_open(read_data="* invalid") with patch("builtins.open", mock_file_open): with pytest.raises(ValueError, match="Must be a valid JSON or YAML file."): ModelTrainer( diff --git a/tox.ini b/tox.ini index b16c0d2f0b..cfd0beedfd 100644 --- a/tox.ini +++ b/tox.ini @@ -83,8 +83,8 @@ passenv = commands = python -c "import os; os.system('install-custom-pkgs --install-boto-wheels')" pip install 'apache-airflow==2.9.3' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.8.txt" - pip install 'torch==2.0.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html' - pip install 'torchvision==0.15.2+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html' + pip install 'torch==2.0.1' -f 'https://download.pytorch.org/whl/torch_stable.html' + pip install 'torchvision==0.15.2' -f 'https://download.pytorch.org/whl/torch_stable.html' pip install 'dill>=0.3.8' pytest {posargs} From c86082fdeb61fc343ce94b8b096fa374446e5c63 Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Mon, 3 Mar 2025 19:00:36 -0800 Subject: [PATCH 13/14] fix tox.ini --- src/sagemaker/modules/train/model_trainer.py | 2 +- tox.ini | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 844cb7c9ab..bb7c4168e6 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -485,7 +485,7 @@ def model_post_init(self, __context: Any): logger.info(f"contents: {contents}") self.hyperparameters = yaml.safe_load(contents) if not isinstance(self.hyperparameters, dict): - raise ValueError(f"YAML contents must be a valid mapping") + raise ValueError("YAML contents must be a valid mapping") logger.info(f"hyperparameters: {self.hyperparameters}") logger.debug("Hyperparameters loaded as YAML") except (yaml.YAMLError, ValueError): diff --git a/tox.ini b/tox.ini index cfd0beedfd..b16c0d2f0b 100644 --- a/tox.ini +++ b/tox.ini @@ -83,8 +83,8 @@ passenv = commands = python -c "import os; os.system('install-custom-pkgs --install-boto-wheels')" pip install 'apache-airflow==2.9.3' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.8.txt" - pip install 'torch==2.0.1' -f 'https://download.pytorch.org/whl/torch_stable.html' - pip install 'torchvision==0.15.2' -f 'https://download.pytorch.org/whl/torch_stable.html' + pip install 'torch==2.0.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html' + pip install 'torchvision==0.15.2+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html' pip install 'dill>=0.3.8' pytest {posargs} From 979e10fef2435729e5faaf588a1037feb747d51f Mon Sep 17 00:00:00 2001 From: Erick Benitez-Ramos Date: Tue, 4 Mar 2025 09:06:39 -0800 Subject: [PATCH 14/14] add unit test --- .../sagemaker/modules/train/test_model_trainer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/unit/sagemaker/modules/train/test_model_trainer.py b/tests/unit/sagemaker/modules/train/test_model_trainer.py index 12dc14b999..194bb44988 100644 --- a/tests/unit/sagemaker/modules/train/test_model_trainer.py +++ b/tests/unit/sagemaker/modules/train/test_model_trainer.py @@ -1161,6 +1161,18 @@ def test_hyperparameters_invalid(mock_exists, modules_session): hyperparameters="hyperparameters.yaml", ) + # YAML contents must be a valid mapping + mock_file_open = mock_open(read_data="invalid") + with patch("builtins.open", mock_file_open): + with pytest.raises(ValueError, match="Must be a valid JSON or YAML file."): + ModelTrainer( + training_image=DEFAULT_IMAGE, + role=DEFAULT_ROLE, + sagemaker_session=modules_session, + compute=DEFAULT_COMPUTE_CONFIG, + hyperparameters="hyperparameters.yaml", + ) + # Must be valid YAML mock_file_open = mock_open(read_data="* invalid") with patch("builtins.open", mock_file_open):