Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions docs/resources/sagemaker_cluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,106 @@ data "aws_region" "current" {}
data "aws_partition" "current" {}
```

### EKS with Karpenter Autoscaling
The following example creates a SageMaker HyperPod cluster with EKS orchestration and Karpenter-based autoscaling enabled.

```terraform
# SageMaker HyperPod cluster orchestrated by EKS, with autoscaling enabled
# and Karpenter selected as the autoscaler.
resource "awscc_sagemaker_cluster" "hyperpod_autoscaling" {
  cluster_name = "hyperpod-eks-autoscaling"
  cluster_role = awscc_iam_role.cluster.arn

  # Set together with auto_scaling in this example.
  node_provisioning_mode = "Continuous"

  auto_scaling = {
    auto_scaler_type = "Karpenter"
    mode             = "Enable"
  }

  orchestrator = {
    eks = {
      cluster_arn = var.eks_cluster_arn
    }
  }

  vpc_config = {
    subnets            = [var.subnet_id]
    security_group_ids = [var.security_group_id]
  }

  instance_groups = [
    # Group that starts with a single instance.
    {
      instance_group_name = "system"
      instance_type       = "ml.c5.xlarge"
      instance_count      = 1
      execution_role      = awscc_iam_role.execution.arn
      life_cycle_config = {
        on_create     = "on_create.sh"
        source_s3_uri = "s3://${aws_s3_bucket.lifecycle.id}/config/"
      }
    },
    # Group that starts empty (instance_count = 0).
    {
      instance_group_name = "auto-c5-az1"
      instance_type       = "ml.c5.xlarge"
      instance_count      = 0
      execution_role      = awscc_iam_role.execution.arn
      life_cycle_config = {
        on_create     = "on_create.sh"
        source_s3_uri = "s3://${aws_s3_bucket.lifecycle.id}/config/"
      }
    }
  ]

  tags = [
    {
      key   = "AutoScaling"
      value = "Enabled"
    }
  ]
}

# IAM role passed to the cluster as `cluster_role`; assumable by the
# HyperPod service principal.
resource "awscc_iam_role" "cluster" {
  role_name = "SageMakerHyperPodKarpenterRole"

  assume_role_policy_document = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Principal = {
          Service = ["hyperpod.sagemaker.amazonaws.com"]
        }
        Action = "sts:AssumeRole"
      }
    ]
  })

  policies = [
    {
      policy_name = "SageMakerHyperPodKarpenterPolicy"
      policy_document = jsonencode({
        Version = "2012-10-17"
        Statement = [
          # Adding and removing cluster nodes, restricted to clusters owned by
          # the same account as the calling principal.
          {
            Effect = "Allow"
            Action = [
              "sagemaker:BatchAddClusterNodes",
              "sagemaker:BatchDeleteClusterNodes"
            ]
            Resource = "arn:aws:sagemaker:*:*:cluster/*"
            Condition = {
              StringEquals = {
                # `$${...}` is Terraform's escape for a literal `${...}`, so the
                # policy variable is resolved by IAM rather than by Terraform.
                "aws:ResourceAccount" = "$${aws:PrincipalAccount}"
              }
            }
          },
          # KMS grant creation, limited to requests made via SageMaker on
          # behalf of an AWS resource and to the listed grant operations.
          {
            Effect = "Allow"
            Action = [
              "kms:CreateGrant",
              "kms:DescribeKey"
            ]
            Resource = "arn:aws:kms:*:*:key/*"
            Condition = {
              StringLike = {
                "kms:ViaService" = "sagemaker.*.amazonaws.com"
              }
              Bool = {
                "kms:GrantIsForAWSResource" = "true"
              }
              "ForAllValues:StringEquals" = {
                "kms:GrantOperations" = [
                  "CreateGrant",
                  "Decrypt",
                  "DescribeKey",
                  "GenerateDataKeyWithoutPlaintext",
                  "ReEncryptTo",
                  "ReEncryptFrom",
                  "RetireGrant"
                ]
              }
            }
          }
        ]
      })
    }
  ]
}
```

<!-- schema generated by tfplugindocs -->
## Schema

### Optional

- `auto_scaling` (Attributes) Configuration for autoscaling the SageMaker HyperPod cluster. (see [below for nested schema](#nestedatt--auto_scaling))
- `cluster_name` (String) The name of the HyperPod Cluster.
- `cluster_role` (String) The IAM role ARN for the cluster.
- `instance_groups` (Attributes List) The instance groups of the SageMaker HyperPod cluster. (see [below for nested schema](#nestedatt--instance_groups))
- `node_provisioning_mode` (String) Determines the scaling strategy for the SageMaker HyperPod cluster. When set to 'Continuous', enables continuous scaling which dynamically manages node provisioning. If the parameter is omitted, uses the standard scaling approach in previous release.
- `node_recovery` (String) If node auto-recovery is set to true, faulty nodes will be replaced or rebooted when a failure is detected. If set to false, nodes will be labelled when a fault is detected.
Expand All @@ -137,6 +231,19 @@ data "aws_partition" "current" {}
- `failure_message` (String) The failure message of the HyperPod Cluster.
- `id` (String) Uniquely identifies the resource.

<a id="nestedatt--auto_scaling"></a>
### Nested Schema for `auto_scaling`

Optional:

- `auto_scaler_type` (String) The type of autoscaler to use. Valid values: `Karpenter`.
- `mode` (String) The autoscaling mode for the cluster. Valid values: `Enable`, `Disable`.

Read-Only:

- `status` (String) The status of the autoscaling configuration.


<a id="nestedatt--instance_groups"></a>
### Nested Schema for `instance_groups`

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# SageMaker HyperPod cluster orchestrated by EKS, with autoscaling enabled
# and Karpenter selected as the autoscaler.
resource "awscc_sagemaker_cluster" "hyperpod_autoscaling" {
  cluster_name = "hyperpod-eks-autoscaling"

  instance_groups = [
    # Group that starts with a single instance.
    {
      execution_role      = awscc_iam_role.execution.arn
      instance_count      = 1
      instance_type       = "ml.c5.xlarge"
      instance_group_name = "system"
      life_cycle_config = {
        source_s3_uri = "s3://${aws_s3_bucket.lifecycle.id}/config/"
        on_create     = "on_create.sh"
      }
    },
    # Group that starts empty and uses the cluster-level vpc_config.
    {
      execution_role      = awscc_iam_role.execution.arn
      instance_count      = 0
      instance_type       = "ml.c5.xlarge"
      instance_group_name = "auto-c5-az1"
      life_cycle_config = {
        source_s3_uri = "s3://${aws_s3_bucket.lifecycle.id}/config/"
        on_create     = "on_create.sh"
      }
    },
    # Group that starts empty and is placed in the second subnet through
    # override_vpc_config.
    {
      execution_role      = awscc_iam_role.execution.arn
      instance_count      = 0
      instance_type       = "ml.c5.4xlarge"
      instance_group_name = "auto-c5-4xaz2"
      life_cycle_config = {
        source_s3_uri = "s3://${aws_s3_bucket.lifecycle.id}/config/"
        on_create     = "on_create.sh"
      }
      override_vpc_config = {
        security_group_ids = [var.security_group_id]
        subnets            = [var.subnet_2]
      }
    }
  ]

  orchestrator = {
    eks = {
      cluster_arn = var.eks_cluster_arn
    }
  }

  vpc_config = {
    security_group_ids = [var.security_group_id]
    subnets            = [var.subnet_1]
  }

  cluster_role = awscc_iam_role.cluster.arn

  auto_scaling = {
    mode             = "Enable"
    auto_scaler_type = "Karpenter"
  }

  node_provisioning_mode = "Continuous"

  tags = [
    {
      key   = "Environment"
      value = "Development"
    },
    {
      key   = "AutoScaling"
      value = "Enabled"
    }
  ]

  # The cluster only references the bucket, so Terraform would otherwise be
  # free to create it before config/on_create.sh has been uploaded. Make the
  # ordering explicit so the lifecycle script exists when instances start.
  depends_on = [aws_s3_object.script]
}

# IAM role for Karpenter autoscaling. It is passed to the cluster as
# `cluster_role` and can be assumed by the HyperPod service principal.
resource "awscc_iam_role" "cluster" {
  role_name = "SageMakerHyperPodKarpenterRole"

  # Trust policy.
  assume_role_policy_document = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = ["hyperpod.sagemaker.amazonaws.com"]
        }
      }
    ]
  })

  # Inline permissions.
  policies = [
    {
      policy_name = "SageMakerHyperPodKarpenterPolicy"
      policy_document = jsonencode({
        Version = "2012-10-17"
        Statement = [
          # Adding and removing cluster nodes, restricted to clusters owned by
          # the same account as the calling principal.
          {
            Action = [
              "sagemaker:BatchAddClusterNodes",
              "sagemaker:BatchDeleteClusterNodes"
            ]
            Effect   = "Allow"
            Resource = "arn:aws:sagemaker:*:*:cluster/*"
            Condition = {
              StringEquals = {
                # `$${...}` is Terraform's escape for a literal `${...}`.
                "aws:ResourceAccount" = "$${aws:PrincipalAccount}"
              }
            }
          },
          # KMS grant creation, limited to requests made via SageMaker on
          # behalf of an AWS resource and to the listed grant operations.
          {
            Action = [
              "kms:CreateGrant",
              "kms:DescribeKey"
            ]
            Effect   = "Allow"
            Resource = "arn:aws:kms:*:*:key/*"
            Condition = {
              Bool = {
                "kms:GrantIsForAWSResource" = "true"
              }
              StringLike = {
                "kms:ViaService" = "sagemaker.*.amazonaws.com"
              }
              "ForAllValues:StringEquals" = {
                "kms:GrantOperations" = [
                  "CreateGrant",
                  "Decrypt",
                  "DescribeKey",
                  "GenerateDataKeyWithoutPlaintext",
                  "ReEncryptTo",
                  "ReEncryptFrom",
                  "RetireGrant"
                ]
              }
            }
          }
        ]
      })
    }
  ]
}

# Execution role referenced by every instance group of the cluster.
# Assumable by the SageMaker service principal.
resource "awscc_iam_role" "execution" {
  role_name = "SageMakerHyperPodExecutionRole"

  managed_policy_arns = ["arn:aws:iam::aws:policy/AmazonSageMakerFullAccess"]

  # Trust policy.
  assume_role_policy_document = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = ["sagemaker.amazonaws.com"]
        }
      }
    ]
  })
}

# Bucket referenced by the instance groups' life_cycle_config.source_s3_uri.
# The random hex suffix is appended to the fixed name prefix.
resource "aws_s3_bucket" "lifecycle" {
  bucket = "sagemaker-hyperpod-lifecycle-${random_id.bucket_suffix.hex}"
}

# Lifecycle script uploaded under the `config/` prefix that the instance
# groups point at, with the file name they use for `on_create`.
resource "aws_s3_object" "script" {
  key     = "config/on_create.sh"
  bucket  = aws_s3_bucket.lifecycle.id
  content = "#!/bin/bash\necho 'HyperPod node initialization complete'"
}

# Random suffix interpolated (as hex) into the lifecycle bucket name.
resource "random_id" "bucket_suffix" {
  byte_length = 4
}

# Consumed by orchestrator.eks.cluster_arn on the HyperPod cluster.
variable "eks_cluster_arn" {
  type        = string
  description = "ARN of the EKS cluster"
}

# Used in both the cluster-level vpc_config and the per-group override_vpc_config.
variable "security_group_id" {
  type        = string
  description = "Security group ID for the cluster"
}

# Subnet used by the cluster-level vpc_config.
variable "subnet_1" {
  type        = string
  description = "First subnet ID"
}

# Subnet used by the override_vpc_config of the "auto-c5-4xaz2" instance group.
variable "subnet_2" {
  type        = string
  description = "Second subnet ID"
}
Loading