diff --git a/README.md b/README.md index 530ecc1b..634f088e 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ Optionally, the module supports advanced security group management for the worke * [Cluster security group rules example](./examples/add_rules_to_sg) * [Cross account KMS encryption example](./examples/cross_kms_support) * [Financial Services compliant example](./examples/fscloud) + * [GPU Worker Pool Example](./examples/gpu) * [Contributing](#contributing) diff --git a/common-dev-assets b/common-dev-assets index 3cfd2b62..c4328778 160000 --- a/common-dev-assets +++ b/common-dev-assets @@ -1 +1 @@ -Subproject commit 3cfd2b622d864175e7979de6885a65b4fb4ab6db +Subproject commit c4328778ce1a62bc85f641d9249adaac0493cfc9 diff --git a/examples/gpu/README.md b/examples/gpu/README.md new file mode 100644 index 00000000..8d3190da --- /dev/null +++ b/examples/gpu/README.md @@ -0,0 +1,36 @@ +# GPU Worker Pool Example + +This example illustrates how to create an OpenShift cluster on IBM Cloud VPC with: +1. A default worker pool with basic machines (bx2.4x16) across 3 zones +2. A second worker pool with a single GPU machine (gx3.16x80.l4) in one zone + +This configuration is useful for workloads that require both general-purpose compute nodes and specialized GPU nodes for AI/ML workloads. + +As with all examples in the Terraform IBM Modules repositories, this code is provided to demonstrate working functionality. You can use it as-is to explore the module's capabilities, or adapt it as a starting point for your own infrastructure configuration. 
+
+## Architecture
+
+This example creates:
+- A VPC using the landing-zone-vpc module with:
+  - Subnets across 3 zones for the default worker pool
+  - A separate subnet in zone 1 for the GPU worker pool
+  - Public gateways in all 3 zones
+- An OpenShift cluster with:
+  - Default worker pool: 1 worker per zone (3 total) using bx2.4x16 machines
+  - GPU worker pool: 1 worker in zone 1 using gx3.16x80.l4 machine with NVIDIA L4 GPUs
+
+## Usage
+
+```bash
+terraform init
+terraform plan -var-file="input.tfvars"
+terraform apply -var-file="input.tfvars"
+```
+
+## Example input.tfvars file
+
+```hcl
+ibmcloud_api_key = "your_api_key_here" # pragma: allowlist secret
+prefix           = "gpu"
+region           = "us-south"
+```
diff --git a/examples/gpu/main.tf b/examples/gpu/main.tf
new file mode 100644
index 00000000..bf3d9239
--- /dev/null
+++ b/examples/gpu/main.tf
@@ -0,0 +1,163 @@
+########################################################################################################################
+# Resource Group
+########################################################################################################################
+
+module "resource_group" {
+  source  = "terraform-ibm-modules/resource-group/ibm"
+  version = "1.4.0"
+  # If var.resource_group is null, create a new group named "<prefix>-resource-group"; otherwise reuse the existing group
+  resource_group_name          = var.resource_group == null ? "${var.prefix}-resource-group" : null
+  existing_resource_group_name = var.resource_group
+}
+
+########################################################################################################################
+# VPC + Subnets + Public Gateways using landing-zone-vpc module
+########################################################################################################################
+
+module "vpc" {
+  source            = "terraform-ibm-modules/landing-zone-vpc/ibm"
+  version           = "8.8.0"
+  resource_group_id = module.resource_group.resource_group_id
+  region            = var.region
+  prefix            = var.prefix
+  tags              = var.resource_tags
+  name              = "${var.prefix}-vpc"
+
+  # Define one "subnet-default-N" subnet per zone (3 zones) for the default worker pool
+  # and a separate "subnet-gpu" subnet in zone 1 for the GPU worker pool
+  subnets = {
+    zone-1 = [
+      {
+        name           = "subnet-default-1"
+        cidr           = "10.10.10.0/24"
+        public_gateway = true
+        acl_name       = "vpc-acl"
+      },
+      {
+        name           = "subnet-gpu"
+        cidr           = "10.10.20.0/24"
+        public_gateway = true
+        acl_name       = "vpc-acl"
+      }
+    ],
+    zone-2 = [
+      {
+        name           = "subnet-default-2"
+        cidr           = "10.20.10.0/24"
+        public_gateway = true
+        acl_name       = "vpc-acl"
+      }
+    ],
+    zone-3 = [
+      {
+        name           = "subnet-default-3"
+        cidr           = "10.30.10.0/24"
+        public_gateway = true
+        acl_name       = "vpc-acl"
+      }
+    ]
+  }
+
+  # Enable public gateways in all zones (every subnet above sets public_gateway = true)
+  use_public_gateways = {
+    zone-1 = true
+    zone-2 = true
+    zone-3 = true
+  }
+
+  # Define the single network ACL ("vpc-acl") that every subnet above references through acl_name
+  network_acls = [
+    {
+      name                         = "vpc-acl"
+      add_ibm_cloud_internal_rules = true
+      add_vpc_connectivity_rules   = true
+      rules = [ # Allows all inbound and outbound traffic in this example. Restrict as needed for your scenario.
+        {
+          name        = "allow-all-inbound"
+          action      = "allow"
+          direction   = "inbound"
+          destination = "0.0.0.0/0"
+          source      = "0.0.0.0/0"
+        },
+        {
+          name        = "allow-all-outbound"
+          action      = "allow"
+          direction   = "outbound"
+          destination = "0.0.0.0/0"
+          source      = "0.0.0.0/0"
+        }
+      ]
+    }
+  ]
+}
+
+########################################################################################################################
+# OCP VPC cluster with default worker pool across 3 zones and a GPU worker pool in zone 1
+########################################################################################################################
+
+locals {
+  # All subnets reported by the VPC module; each entry is read below for its id, cidr, zone and name
+  all_subnets = module.vpc.subnet_zone_list
+
+  # Subnets for the default worker pool (across 3 zones): every subnet whose name contains "subnet-default"
+  default_vpc_subnets = {
+    default = [
+      for subnet in local.all_subnets :
+      {
+        id         = subnet.id
+        cidr_block = subnet.cidr
+        zone       = subnet.zone
+      }
+      if strcontains(subnet.name, "subnet-default")
+    ]
+  }
+
+  # Subnet for the GPU worker pool (single zone): every subnet whose name contains "subnet-gpu"
+  gpu_vpc_subnets = {
+    gpu = [
+      for subnet in local.all_subnets :
+      {
+        id         = subnet.id
+        cidr_block = subnet.cidr
+        zone       = subnet.zone
+      }
+      if strcontains(subnet.name, "subnet-gpu") # Use strcontains rather than == given that a prefix is added by landing zone vpc to subnet names
+    ]
+  }
+
+  # Combine both maps; the resulting "default" and "gpu" keys correspond to subnet_prefix in the worker pools below
+  cluster_vpc_subnets = merge(local.default_vpc_subnets, local.gpu_vpc_subnets)
+
+  # Define the two worker pools: a general-purpose default pool and a GPU pool
+  worker_pools = [
+    {
+      subnet_prefix    = "default"
+      pool_name        = "default" # ibm_container_vpc_cluster automatically names default pool "default"
+      machine_type     = var.default_worker_pool_machine_type
+      workers_per_zone = 1 # one worker in each of the three "subnet-default" zones
+      operating_system = "RHCOS"
+    },
+    {
+      subnet_prefix    = "gpu"
+      pool_name        = "gpu"
+      machine_type     = var.gpu_worker_pool_machine_type
+      workers_per_zone = 1 # only the zone 1 "subnet-gpu" subnet matches, so a single worker
+      operating_system = "RHCOS"
+    }
+  ]
+}
+
+module "ocp_base" {
+  source               = "../.."
+  resource_group_id    = module.resource_group.resource_group_id
+  region               = var.region
+  tags                 = var.resource_tags
+  cluster_name         = var.prefix
+  force_delete_storage = true
+  vpc_id               = module.vpc.vpc_id
+  vpc_subnets          = local.cluster_vpc_subnets
+  ocp_version          = var.ocp_version
+  worker_pools         = local.worker_pools
+  access_tags          = var.access_tags
+  ocp_entitlement      = var.ocp_entitlement
+}
diff --git a/examples/gpu/outputs.tf b/examples/gpu/outputs.tf
new file mode 100644
index 00000000..e7f634c5
--- /dev/null
+++ b/examples/gpu/outputs.tf
@@ -0,0 +1,13 @@
+########################################################################################################################
+# Outputs
+########################################################################################################################
+
+output "cluster_name" {
+  value       = module.ocp_base.cluster_name
+  description = "The name of the provisioned cluster."
+}
+
+output "workerpools" {
+  value       = module.ocp_base.workerpools
+  description = "Worker pools created in the cluster."
+}
diff --git a/examples/gpu/provider.tf b/examples/gpu/provider.tf
new file mode 100644
index 00000000..84b69850
--- /dev/null
+++ b/examples/gpu/provider.tf
@@ -0,0 +1,8 @@
+########################################################################################################################
+# Provider config
+########################################################################################################################
+
+provider "ibm" {
+  ibmcloud_api_key = var.ibmcloud_api_key
+  region           = var.region
+}
diff --git a/examples/gpu/variables.tf b/examples/gpu/variables.tf
new file mode 100644
index 00000000..66415363
--- /dev/null
+++ b/examples/gpu/variables.tf
@@ -0,0 +1,66 @@
+########################################################################################################################
+# Input variables
+########################################################################################################################
+
+variable "ibmcloud_api_key" {
+  type        = string
+  description = "The IBM Cloud api token"
+  sensitive   = true
+}
+
+variable "prefix" {
+  type        = string
+  description = "Prefix for the name of all resources created by this example"
+  validation {
+    # Accept a single lowercase letter, or a lowercase letter followed by lowercase letters, digits or hyphens that does not end with a hyphen
+    error_message = "Prefix must begin with a lowercase letter, end with a lowercase letter or number, and contain only lowercase letters, numbers, and - characters."
+    condition     = can(regex("^([a-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.prefix))
+  }
+}
+
+variable "region" {
+  type        = string
+  description = "Region where resources are created"
+}
+
+variable "resource_group" {
+  type        = string
+  description = "An existing resource group name to use for this example, if unset a new resource group will be created"
+  default     = null
+}
+
+variable "resource_tags" {
+  type        = list(string)
+  description = "Optional list of tags to be added to created resources"
+  default     = []
+}
+
+variable "ocp_version" {
+  type        = string
+  description = "Version of the OCP cluster to provision"
+  default     = null
+}
+
+variable "access_tags" {
+  type        = list(string)
+  description = "A list of access tags to apply to the resources created by the module."
+  default     = []
+}
+
+variable "ocp_entitlement" {
+  type        = string
+  description = "Value that is applied to the entitlements for OCP cluster provisioning"
+  default     = null
+}
+
+variable "default_worker_pool_machine_type" {
+  type        = string
+  description = "The machine type for the default worker pool"
+  default     = "bx2.4x16"
+}
+
+variable "gpu_worker_pool_machine_type" {
+  type        = string
+  description = "The machine type for the GPU worker pool"
+  default     = "gx3.16x80.l4"
+}
diff --git a/examples/gpu/version.tf b/examples/gpu/version.tf
new file mode 100644
index 00000000..c1d90e04
--- /dev/null
+++ b/examples/gpu/version.tf
@@ -0,0 +1,11 @@
+terraform {
+  required_version = ">=1.9.0"
+
+  # Require a recent minimum provider version (a lower bound, not a pin) to ensure GPU support
+  required_providers {
+    ibm = {
+      source  = "IBM-Cloud/ibm"
+      version = ">= 1.79.2"
+    }
+  }
+}
diff --git a/tests/other_test.go b/tests/other_test.go
index 1e62fef2..1b50854e 100644
--- a/tests/other_test.go
+++ b/tests/other_test.go
@@ -14,6 +14,7 @@ const advancedExampleDir = "examples/advanced"
 const basicExampleDir = "examples/basic"
 const fscloudExampleDir = "examples/fscloud"
 const crossKmsSupportExampleDir = "examples/cross_kms_support"
+const gpuExampleDir =
"examples/gpu"
 
 func setupOptions(t *testing.T, prefix string, terraformDir string, ocpVersion string) *testhelper.TestOptions {
 	options := testhelper.TestOptionsDefaultWithVars(&testhelper.TestOptions{
@@ -192,3 +193,38 @@ func TestFSCloudInSchematic(t *testing.T) {
 	err := options.RunSchematicTest()
 	assert.Nil(t, err, "This should not have errored")
 }
+
+// TestRunGpuExample runs the consistency test against the examples/gpu
+// configuration, using the general-purpose bx2.4x16 flavor for both worker
+// pools to keep the test inexpensive.
+func TestRunGpuExample(t *testing.T) {
+	t.Parallel()
+
+	const machineType = "bx2.4x16" // Also used for the GPU pool, instead of gx3.16x80.l4, to reduce cost
+
+	terraformVars := map[string]interface{}{
+		"ocp_version":                      ocpVersion4,
+		"default_worker_pool_machine_type": machineType,
+		"gpu_worker_pool_machine_type":     machineType,
+		"access_tags":                      permanentResources["accessTags"],
+		"ocp_entitlement":                  "cloud_pak",
+	}
+
+	gpuOptions := testhelper.TestOptionsDefaultWithVars(&testhelper.TestOptions{
+		Testing:       t,
+		TerraformDir:  gpuExampleDir,
+		Prefix:        "gpu-test",
+		ResourceGroup: resourceGroup,
+		ImplicitDestroy: []string{
+			"module.ocp_base.null_resource.confirm_network_healthy",
+			"module.ocp_base.null_resource.reset_api_key",
+		},
+		// A failing implicit destroy step must not hard fail the test, so that a full destroy of resources can still occur
+		ImplicitRequired: false,
+		TerraformVars:    terraformVars,
+	})
+
+	output, err := gpuOptions.RunTestConsistency()
+	assert.Nil(t, err, "This should not have errored")
+	assert.NotNil(t, output, "Expected some output")
+}