Skip to content

Commit 98137da

Browse files
pintaoz-aws and pintaoz authored
Update cluster creation template url with versioning (#285)
* Update cluster creation template url with versioning
* update tests
* add cli parameter
* Update tests
* Fix unit test
* update custom s3 name
* update default_create
* Update storage parameter
* update defaults
---------
Co-authored-by: pintaoz <[email protected]>
1 parent 79b0342 commit 98137da

File tree

9 files changed

+27
-23
lines changed

9 files changed

+27
-23
lines changed

doc/cli/cluster_management/cli_cluster_management.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ The `config.yaml` file supports the following parameters:
358358
| `create_s3_endpoint_stack` | BOOLEAN | Create S3 Endpoint stack | true |
359359
| `enable_hp_inference_feature` | BOOLEAN | Enable inference operator | false |
360360
| `stage` | TEXT | Deployment stage ("gamma" or "prod") | "prod" |
361-
| `custom_bucket_name` | TEXT | S3 bucket name for templates | "sagemaker-hyperpod-cluster-stack-bucket" |
361+
| `custom_bucket_name` | TEXT | Custom S3 bucket name for templates | "" |
362362
| `create_life_cycle_script_stack` | BOOLEAN | Create Life Cycle Script Stack | true |
363363
| `create_s3_bucket_stack` | BOOLEAN | Create S3 Bucket Stack | true |
364364
| `s3_bucket_name` | TEXT | S3 bucket for cluster lifecycle scripts | "s3-bucket" |

hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ Parameters:
282282
Description: The path to the HyperPod Helm chart in the Helm repo.
283283
HelmOperators:
284284
Type: String
285-
Default: 'mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true'
285+
Default: 'mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true'
286286
Description: The configuration of HyperPod Helm chart
287287
Namespace:
288288
Type: String

hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class ClusterStackBase(BaseModel):
1313
namespace: Optional[str] = Field("kube-system", description="The namespace to deploy the HyperPod Helm chart")
1414
helm_repo_url: str = Field("https://github.com/aws/sagemaker-hyperpod-cli.git", description="The URL of the Helm repo containing the HyperPod Helm chart (fixed default)")
1515
helm_repo_path: str = Field("helm_chart/HyperPodHelmChart", description="The path to the HyperPod Helm chart in the Helm repo (fixed default)")
16-
helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart")
16+
helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart")
1717
helm_release: Optional[str] = Field("dependencies", description="The name used for Helm chart release")
1818
node_provisioning_mode: Optional[str] = Field("Continuous", description="Enable or disable the continuous provisioning mode. Valid values: \"Continuous\" or leave empty")
1919
node_recovery: Optional[str] = Field("Automatic", description="Specifies whether to enable or disable the automatic node recovery feature. Valid values: \"Automatic\", \"None\"")
@@ -35,7 +35,7 @@ class ClusterStackBase(BaseModel):
3535
create_s3_endpoint_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Endpoint stack")
3636
enable_hp_inference_feature: Optional[bool] = Field(False, description="Boolean to enable inference operator in Hyperpod cluster")
3737
stage: Optional[str] = Field("prod", description="Deployment stage used in S3 bucket naming for inference operator. Valid values: \"gamma\", \"prod\"")
38-
custom_bucket_name: str = Field("sagemaker-hyperpod-cluster-stack-bucket", description="S3 bucket name for templates")
38+
custom_bucket_name: str = Field("", description="Custom S3 bucket name for templates")
3939
create_life_cycle_script_stack: Optional[bool] = Field(True, description="Boolean to Create Life Cycle Script Stack")
4040
create_s3_bucket_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Bucket Stack")
4141
s3_bucket_name: Optional[str] = Field("s3-bucket", description="The name of the S3 bucket used to store the cluster lifecycle scripts")
@@ -120,7 +120,7 @@ def to_config(self, region: str = None):
120120

121121
# Set fixed defaults
122122
defaults = {
123-
'custom_bucket_name': 'sagemaker-hyperpod-cluster-stack-bucket',
123+
'custom_bucket_name': '',
124124
'github_raw_url': 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh',
125125
'helm_repo_url': 'https://github.com/aws/sagemaker-hyperpod-cli.git',
126126
'helm_repo_path': 'helm_chart/HyperPodHelmChart'

hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@
125125
"type": "null"
126126
}
127127
],
128-
"default": "mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true",
128+
"default": "mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true",
129129
"description": "The configuration of HyperPod Helm chart",
130130
"title": "Helm Operators"
131131
},
@@ -439,8 +439,8 @@
439439
"title": "Stage"
440440
},
441441
"custom_bucket_name": {
442-
"default": "sagemaker-hyperpod-cluster-stack-bucket",
443-
"description": "S3 bucket name for templates",
442+
"default": "",
443+
"description": "Custom S3 bucket name for templates",
444444
"title": "Custom Bucket Name",
445445
"type": "string"
446446
},

src/sagemaker/hyperpod/cli/commands/cluster_stack.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,9 @@ def parse_status_list(ctx, param, value):
5353
@click.argument("config-file", required=True)
5454
@click.argument("stack-name", required=True)
5555
@click.option("--region", help="AWS region")
56+
@click.option("--template-version", type=click.INT, help="Version number of cluster creation template")
5657
@click.option("--debug", is_flag=True, help="Enable debug logging")
57-
def create_cluster_stack(config_file, region, debug):
58+
def create_cluster_stack(config_file, region, template_version, debug):
5859
"""Create a new HyperPod cluster stack using the provided configuration.
5960
6061
Creates a CloudFormation stack for a HyperPod cluster using settings from a YAML configuration file.
@@ -66,7 +67,7 @@ def create_cluster_stack(config_file, region, debug):
6667
.. code-block:: bash
6768
6869
# Create cluster stack with config file
69-
hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2
70+
hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2 --template-version 1
7071
7172
# Create with debug logging
7273
hyp create hyp-cluster cluster-config.yaml my-stack-name --debug
@@ -95,7 +96,7 @@ def create_cluster_stack(config_file, region, debug):
9596
config = model_instance.to_config(region=region)
9697

9798
# Create the cluster stack
98-
stack_id = HpClusterStack(**config).create(region)
99+
stack_id = HpClusterStack(**config).create(region, template_version)
99100

100101
logger.info(f"Stack creation initiated successfully with ID: {stack_id}")
101102
logger.info("You can monitor the stack creation in the AWS CloudFormation console.")

src/sagemaker/hyperpod/cli/commands/init.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,9 @@ def validate():
272272

273273
@click.command(name="_default_create")
274274
@click.option("--region", "-r", default=None, help="Region to create cluster stack for, default to your region in aws configure. Not available for other templates.")
275+
@click.option("--template-version", type=click.INT, help="Version number of cluster creation template. Not available for other templates.")
275276
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_create_cli")
276-
def _default_create(region):
277+
def _default_create(region, template_version):
277278
"""
278279
Validate configuration and render template files for deployment.
279280
@@ -374,7 +375,7 @@ def _default_create(region):
374375
# Pass region to to_config for cluster stack template
375376
if template == "cluster-stack":
376377
config = template_model.to_config(region=region)
377-
HpClusterStack(**config).create(region)
378+
HpClusterStack(**config).create(region, template_version)
378379
else:
379380
# Create from k8s.yaml
380381
k8s_file = out_dir / 'k8s.yaml'

src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@
1515
from sagemaker.hyperpod.common.telemetry.constants import Feature
1616

1717
CAPABILITIES_FOR_STACK_CREATION = [
18-
'CAPABILITY_IAM',
19-
'CAPABILITY_NAMED_IAM'
18+
'CAPABILITY_AUTO_EXPAND',
19+
'CAPABILITY_IAM',
20+
'CAPABILITY_NAMED_IAM'
2021
]
2122
log = logging.getLogger()
2223

@@ -66,7 +67,8 @@ def get_template() -> str:
6667

6768
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_cluster_stack")
6869
def create(self,
69-
region: Optional[str] = None) -> str:
70+
region: Optional[str] = None,
71+
template_version: Optional[int] = 1) -> str:
7072
"""Creates a new HyperPod cluster CloudFormation stack.
7173
7274
**Parameters:**
@@ -111,12 +113,12 @@ def create(self,
111113

112114
stack_name = f"HyperpodClusterStack-{str(uuid.uuid4())[:5]}"
113115
# Base bucket name; the region and stage are appended when building the template URL
114-
bucket_name = self.custom_bucket_name
115-
template_key = f"1.1/main-stack-eks-based-template.yaml"
116+
bucket_name = "aws-sagemaker-hyperpod-cluster-setup"
117+
template_key = f"{template_version}/templates/main-stack-eks-based-template.yaml"
116118

117119
try:
118120
# Use TemplateURL for large templates (>51KB)
119-
template_url = f"https://{bucket_name}.s3.amazonaws.com/{template_key}"
121+
template_url = f"https://{bucket_name}-{region}-{self.stage}.s3.amazonaws.com/{template_key}"
120122
response = cf.create_stack(
121123
StackName=stack_name,
122124
TemplateURL=template_url,

test/integration_tests/cluster_management/test_hp_cluster_creation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def test_create_cluster(runner, cluster_name, create_time):
190190
# Record time before submission
191191
CREATE_TIME = datetime.now(timezone.utc)
192192

193-
result = runner.invoke(create, ["--region", REGION], catch_exceptions=False)
193+
result = runner.invoke(create, ["--region", REGION, "--template-version", "1"], catch_exceptions=False)
194194
assert_command_succeeded(result)
195195

196196
# Verify expected submission messages appear

test/unit_tests/cli/test_cluster_stack.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -330,14 +330,14 @@ def test_create_cluster_stack_success(self, mock_hp_cluster_stack_class, mock_lo
330330

331331
from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack
332332

333-
create_cluster_stack.callback('config.yaml', 'us-west-2', False)
333+
create_cluster_stack.callback('config.yaml', 'us-west-2', 1, False)
334334

335335
mock_load_config.assert_called_once()
336336
mock_filter.assert_called_once_with({'key': 'value'})
337337
mock_model_class.assert_called_once_with(**{'key': 'value'})
338338
mock_model_instance.to_config.assert_called_once_with(region='us-west-2')
339339
mock_hp_cluster_stack_class.assert_called_once_with(**{'transformed': 'config'})
340-
mock_sdk_instance.create.assert_called_once_with('us-west-2')
340+
mock_sdk_instance.create.assert_called_once_with('us-west-2', 1)
341341

342342
@patch('os.path.exists')
343343
def test_create_cluster_stack_file_not_found(self, mock_exists, mock_get_template, mock_read_text):
@@ -347,7 +347,7 @@ def test_create_cluster_stack_file_not_found(self, mock_exists, mock_get_templat
347347

348348
from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack
349349

350-
create_cluster_stack.callback('nonexistent.yaml', 'us-west-2', False)
350+
create_cluster_stack.callback('nonexistent.yaml', 'us-west-2', 1, False)
351351

352352
# Assert - function should return early without error
353353
mock_exists.assert_called_once_with('nonexistent.yaml')

0 commit comments

Comments
 (0)