From d00abff2e6d4e3c4693149b33a4b3b4526c2dd07 Mon Sep 17 00:00:00 2001 From: allela-roy Date: Thu, 11 Sep 2025 10:24:02 +0200 Subject: [PATCH 1/3] Update cluster_resource_gen.go Enabling Karpenter-based autoscaling for SageMaker HyperPod clusters with EKS orchestration. --- .../aws/sagemaker/cluster_resource_gen.go | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/internal/aws/sagemaker/cluster_resource_gen.go b/internal/aws/sagemaker/cluster_resource_gen.go index 329ce2e4c..88dccaa08 100644 --- a/internal/aws/sagemaker/cluster_resource_gen.go +++ b/internal/aws/sagemaker/cluster_resource_gen.go @@ -127,6 +127,109 @@ func clusterResource(ctx context.Context) (resource.Resource, error) { stringplanmodifier.UseStateForUnknown(), }, /*END PLAN MODIFIERS*/ }, /*END ATTRIBUTE*/ + // Property: AutoScaling + // CloudFormation resource type schema: + // + // { + // "additionalProperties": false, + // "description": "Configuration for autoscaling the SageMaker HyperPod cluster.", + // "properties": { + // "Mode": { + // "description": "The autoscaling mode for the cluster.", + // "enum": [ + // "Enable", + // "Disable" + // ], + // "type": "string" + // }, + // "AutoScalerType": { + // "description": "The type of autoscaler to use.", + // "enum": [ + // "Karpenter" + // ], + // "type": "string" + // }, + // "Status": { + // "description": "The status of the autoscaling configuration.", + // "enum": [ + // "InService", + // "Pending", + // "Failed" + // ], + // "type": "string" + // } + // }, + // "type": "object" + // } + "auto_scaling": schema.SingleNestedAttribute{ /*START ATTRIBUTE*/ + Attributes: map[string]schema.Attribute{ /*START SCHEMA*/ + // Property: Mode + "mode": schema.StringAttribute{ /*START ATTRIBUTE*/ + Description: "The autoscaling mode for the cluster.", + Optional: true, + Computed: true, + Validators: []validator.String{ /*START VALIDATORS*/ + stringvalidator.OneOf( + "Enable", + "Disable", + ), + }, /*END VALIDATORS*/ + 
PlanModifiers: []planmodifier.String{ /*START PLAN MODIFIERS*/ + stringplanmodifier.UseStateForUnknown(), + }, /*END PLAN MODIFIERS*/ + }, /*END ATTRIBUTE*/ + // Property: AutoScalerType + "auto_scaler_type": schema.StringAttribute{ /*START ATTRIBUTE*/ + Description: "The type of autoscaler to use.", + Optional: true, + Computed: true, + Validators: []validator.String{ /*START VALIDATORS*/ + stringvalidator.OneOf( + "Karpenter", + ), + }, /*END VALIDATORS*/ + PlanModifiers: []planmodifier.String{ /*START PLAN MODIFIERS*/ + stringplanmodifier.UseStateForUnknown(), + }, /*END PLAN MODIFIERS*/ + }, /*END ATTRIBUTE*/ + // Property: Status + "status": schema.StringAttribute{ /*START ATTRIBUTE*/ + Description: "The status of the autoscaling configuration.", + Computed: true, + PlanModifiers: []planmodifier.String{ /*START PLAN MODIFIERS*/ + stringplanmodifier.UseStateForUnknown(), + }, /*END PLAN MODIFIERS*/ + }, /*END ATTRIBUTE*/ + }, /*END SCHEMA*/ + Description: "Configuration for autoscaling the SageMaker HyperPod cluster.", + Optional: true, + Computed: true, + PlanModifiers: []planmodifier.Object{ /*START PLAN MODIFIERS*/ + objectplanmodifier.UseStateForUnknown(), + }, /*END PLAN MODIFIERS*/ + }, /*END ATTRIBUTE*/ + // Property: ClusterRole + // CloudFormation resource type schema: + // + // { + // "description": "The IAM role ARN for the cluster.", + // "maxLength": 2048, + // "minLength": 20, + // "pattern": "^arn:aws[a-z\\-]*:iam::\\d{12}:role/?[a-zA-Z_0-9+=,.@\\-_/]+$", + // "type": "string" + // } + "cluster_role": schema.StringAttribute{ /*START ATTRIBUTE*/ + Description: "The IAM role ARN for the cluster.", + Optional: true, + Computed: true, + Validators: []validator.String{ /*START VALIDATORS*/ + stringvalidator.LengthBetween(20, 2048), + stringvalidator.RegexMatches(regexp.MustCompile("^arn:aws[a-z\\-]*:iam::\\d{12}:role/?[a-zA-Z_0-9+=,.@\\-_/]+$"), ""), + }, /*END VALIDATORS*/ + PlanModifiers: []planmodifier.String{ /*START PLAN MODIFIERS*/ + 
stringplanmodifier.UseStateForUnknown(), + }, /*END PLAN MODIFIERS*/ + }, /*END ATTRIBUTE*/ // Property: InstanceGroups // CloudFormation resource type schema: // @@ -1571,7 +1674,12 @@ func clusterResource(ctx context.Context) (resource.Resource, error) { opts = opts.WithAttributeNameMap(map[string]string{ "alarm_name": "AlarmName", "auto_rollback_configuration": "AutoRollbackConfiguration", + "auto_scaling": "AutoScaling", + "auto_scaler_type": "AutoScalerType", + "mode": "Mode", + "status": "Status", "cluster_arn": "ClusterArn", + "cluster_role": "ClusterRole", "cluster_name": "ClusterName", "cluster_status": "ClusterStatus", "creation_time": "CreationTime", From c98a3307e3b4c122641de22bf6af04fd3c13b207 Mon Sep 17 00:00:00 2001 From: allela-roy Date: Thu, 11 Sep 2025 10:26:18 +0200 Subject: [PATCH 2/3] Update sagemaker_cluster.md Updating SageMaker cluster docs to showcase EKS with Karpenter Autoscaling --- docs/resources/sagemaker_cluster.md | 107 ++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/docs/resources/sagemaker_cluster.md b/docs/resources/sagemaker_cluster.md index f616a4a1b..651dd85f1 100644 --- a/docs/resources/sagemaker_cluster.md +++ b/docs/resources/sagemaker_cluster.md @@ -115,12 +115,106 @@ data "aws_region" "current" {} data "aws_partition" "current" {} ``` +### EKS with Karpenter Autoscaling +To create a SageMaker HyperPod Cluster with EKS orchestration and Karpenter-based autoscaling enabled. 
+ +```terraform +resource "awscc_sagemaker_cluster" "hyperpod_autoscaling" { + cluster_name = "hyperpod-eks-autoscaling" + + instance_groups = [ + { + execution_role = awscc_iam_role.execution.arn + instance_count = 1 + instance_type = "ml.c5.xlarge" + instance_group_name = "system" + life_cycle_config = { + source_s3_uri = "s3://${aws_s3_bucket.lifecycle.id}/config/" + on_create = "on_create.sh" + } + }, + { + execution_role = awscc_iam_role.execution.arn + instance_count = 0 + instance_type = "ml.c5.xlarge" + instance_group_name = "auto-c5-az1" + life_cycle_config = { + source_s3_uri = "s3://${aws_s3_bucket.lifecycle.id}/config/" + on_create = "on_create.sh" + } + } + ] + + orchestrator = { + eks = { + cluster_arn = var.eks_cluster_arn + } + } + + vpc_config = { + security_group_ids = [var.security_group_id] + subnets = [var.subnet_id] + } + + cluster_role = awscc_iam_role.cluster.arn + + auto_scaling = { + mode = "Enable" + auto_scaler_type = "Karpenter" + } + + node_provisioning_mode = "Continuous" + + tags = [{ + key = "AutoScaling" + value = "Enabled" + }] +} + +resource "awscc_iam_role" "cluster" { + role_name = "SageMakerHyperPodKarpenterRole" + assume_role_policy_document = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = ["hyperpod.sagemaker.amazonaws.com"] + } + Action = "sts:AssumeRole" + } + ] + }) + + policies = [ + { + policy_name = "SageMakerHyperPodKarpenterPolicy" + policy_document = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "sagemaker:BatchAddClusterNodes", + "sagemaker:BatchDeleteClusterNodes" + ] + Resource = "arn:aws:sagemaker:*:*:cluster/*" + } + ] + }) + } + ] +} +``` + ## Schema ### Optional +- `auto_scaling` (Attributes) Configuration for autoscaling the SageMaker HyperPod cluster. (see [below for nested schema](#nestedatt--auto_scaling)) - `cluster_name` (String) The name of the HyperPod Cluster. 
+- `cluster_role` (String) The IAM role ARN for the cluster. - `instance_groups` (Attributes List) The instance groups of the SageMaker HyperPod cluster. (see [below for nested schema](#nestedatt--instance_groups)) - `node_provisioning_mode` (String) Determines the scaling strategy for the SageMaker HyperPod cluster. When set to 'Continuous', enables continuous scaling which dynamically manages node provisioning. If the parameter is omitted, uses the standard scaling approach in previous release. - `node_recovery` (String) If node auto-recovery is set to true, faulty nodes will be replaced or rebooted when a failure is detected. If set to false, nodes will be labelled when a fault is detected. @@ -137,6 +231,19 @@ data "aws_partition" "current" {} - `failure_message` (String) The failure message of the HyperPod Cluster. - `id` (String) Uniquely identifies the resource. + +### Nested Schema for `auto_scaling` + +Optional: + +- `auto_scaler_type` (String) The type of autoscaler to use. Valid values: `Karpenter`. +- `mode` (String) The autoscaling mode for the cluster. Valid values: `Enable`, `Disable`. + +Read-Only: + +- `status` (String) The status of the autoscaling configuration. 
+ + ### Nested Schema for `instance_groups`
From 162c569edff446e5eb2f1b024f5ae2c5489053cd Mon Sep 17 00:00:00 2001
From: allela-roy
Date: Thu, 11 Sep 2025 10:30:32 +0200
Subject: [PATCH 3/3] Adding sagemaker_cluster_eks_autoscaling.tf example

Adding example showcasing how to create a cluster with EKS orchestration and Karpenter-based autoscaling enabled
---
 .../sagemaker_cluster_eks_autoscaling.tf | 195 ++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 examples/resources/awscc_sagemaker_cluster/sagemaker_cluster_eks_autoscaling.tf

diff --git a/examples/resources/awscc_sagemaker_cluster/sagemaker_cluster_eks_autoscaling.tf b/examples/resources/awscc_sagemaker_cluster/sagemaker_cluster_eks_autoscaling.tf
new file mode 100644
index 000000000..911b0bf85
--- /dev/null
+++ b/examples/resources/awscc_sagemaker_cluster/sagemaker_cluster_eks_autoscaling.tf
@@ -0,0 +1,195 @@
+# SageMaker HyperPod cluster with EKS orchestration and Karpenter autoscaling enabled.
+resource "awscc_sagemaker_cluster" "hyperpod_autoscaling" {
+  cluster_name = "hyperpod-eks-autoscaling"
+
+  instance_groups = [
+    {
+      execution_role      = awscc_iam_role.execution.arn
+      instance_count      = 1
+      instance_type       = "ml.c5.xlarge"
+      instance_group_name = "system"
+      # Reference the uploaded object rather than only the bucket so Terraform
+      # creates the lifecycle script before the cluster is provisioned.
+      life_cycle_config = {
+        source_s3_uri = "s3://${aws_s3_object.script.bucket}/config/"
+        on_create     = "on_create.sh"
+      }
+    },
+    {
+      execution_role      = awscc_iam_role.execution.arn
+      instance_count      = 0
+      instance_type       = "ml.c5.xlarge"
+      instance_group_name = "auto-c5-az1"
+      life_cycle_config = {
+        source_s3_uri = "s3://${aws_s3_object.script.bucket}/config/"
+        on_create     = "on_create.sh"
+      }
+    },
+    {
+      execution_role      = awscc_iam_role.execution.arn
+      instance_count      = 0
+      instance_type       = "ml.c5.4xlarge"
+      instance_group_name = "auto-c5-4xaz2"
+      life_cycle_config = {
+        source_s3_uri = "s3://${aws_s3_object.script.bucket}/config/"
+        on_create     = "on_create.sh"
+      }
+      override_vpc_config = {
+        security_group_ids = [var.security_group_id]
+        subnets            = [var.subnet_2]
+      }
+    }
+  ]
+
+  orchestrator = {
+    eks = {
+      cluster_arn = var.eks_cluster_arn
+    }
+  }
+
+  vpc_config = {
+    security_group_ids = [var.security_group_id]
+    subnets            = [var.subnet_1]
+  }
+
+  cluster_role = awscc_iam_role.cluster.arn
+
+  auto_scaling = {
+    mode             = "Enable"
+    auto_scaler_type = "Karpenter"
+  }
+
+  node_provisioning_mode = "Continuous"
+
+  tags = [{
+    key   = "Environment"
+    value = "Development"
+    }, {
+    key   = "AutoScaling"
+    value = "Enabled"
+  }]
+}
+
+# IAM Role for Karpenter Autoscaling
+resource "awscc_iam_role" "cluster" {
+  role_name = "SageMakerHyperPodKarpenterRole"
+  assume_role_policy_document = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Principal = {
+          Service = ["hyperpod.sagemaker.amazonaws.com"]
+        }
+        Action = "sts:AssumeRole"
+      }
+    ]
+  })
+
+  policies = [
+    {
+      policy_name = "SageMakerHyperPodKarpenterPolicy"
+      policy_document = jsonencode({
+        Version = "2012-10-17"
+        Statement = [
+          {
+            Effect = "Allow"
+            Action = [
+              "sagemaker:BatchAddClusterNodes",
+              "sagemaker:BatchDeleteClusterNodes"
+            ]
+            Resource = "arn:aws:sagemaker:*:*:cluster/*"
+            Condition = {
+              StringEquals = {
+                "aws:ResourceAccount" = "$${aws:PrincipalAccount}" # "$$" escapes Terraform interpolation; IAM receives the literal ${aws:PrincipalAccount}
+              }
+            }
+          },
+          {
+            Effect = "Allow"
+            Action = [
+              "kms:CreateGrant",
+              "kms:DescribeKey"
+            ]
+            Resource = "arn:aws:kms:*:*:key/*"
+            Condition = {
+              StringLike = {
+                "kms:ViaService" = "sagemaker.*.amazonaws.com"
+              }
+              Bool = {
+                "kms:GrantIsForAWSResource" = "true"
+              }
+              "ForAllValues:StringEquals" = {
+                "kms:GrantOperations" = [
+                  "CreateGrant",
+                  "Decrypt",
+                  "DescribeKey",
+                  "GenerateDataKeyWithoutPlaintext",
+                  "ReEncryptTo",
+                  "ReEncryptFrom",
+                  "RetireGrant"
+                ]
+              }
+            }
+          }
+        ]
+      })
+    }
+  ]
+}
+
+# IAM role used as execution_role by every instance group.
+resource "awscc_iam_role" "execution" {
+  role_name = "SageMakerHyperPodExecutionRole"
+  assume_role_policy_document = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Principal = {
+          Service = ["sagemaker.amazonaws.com"]
+        }
+        Action = "sts:AssumeRole"
+      }
+    ]
+  })
+
+  managed_policy_arns = [
+    "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess"
+  ]
+}
+
+# Bucket and lifecycle script referenced by each instance group's life_cycle_config.
+resource "aws_s3_bucket" "lifecycle" {
+  bucket = "sagemaker-hyperpod-lifecycle-${random_id.bucket_suffix.hex}"
+}
+
+resource "aws_s3_object" "script" {
+  bucket  = aws_s3_bucket.lifecycle.id
+  key     = "config/on_create.sh"
+  content = "#!/bin/bash\necho 'HyperPod node initialization complete'"
+}
+
+resource "random_id" "bucket_suffix" {
+  byte_length = 4
+}
+
+variable "eks_cluster_arn" {
+  description = "ARN of the EKS cluster"
+  type        = string
+}
+
+variable "security_group_id" {
+  description = "Security group ID for the cluster"
+  type        = string
+}
+
+variable "subnet_1" {
+  description = "First subnet ID"
+  type        = string
+}
+
+variable "subnet_2" {
+  description = "Second subnet ID"
+  type        = string
+}