Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Optionally, the module supports advanced security group management for the worke
* [Cluster security group rules example](./examples/add_rules_to_sg)
* [Cross account KMS encryption example](./examples/cross_kms_support)
* [Financial Services compliant example](./examples/fscloud)
* [GPU Worker Pool Example](./examples/gpu)
* [Contributing](#contributing)
<!-- END OVERVIEW HOOK -->

Expand Down
34 changes: 34 additions & 0 deletions examples/gpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# GPU Worker Pool Example

This example illustrates how to create an OpenShift cluster on IBM Cloud VPC with:
1. A default worker pool with basic machines (bx2.4x16) across 3 zones
2. A second worker pool with a single GPU machine (gx3.16x80.l4) in one zone

This configuration is useful for workloads that require both general-purpose compute nodes and specialized GPU nodes for AI/ML workloads.

## Architecture

This example creates:
- A VPC using the landing-zone-vpc module with:
  - Subnets across 3 zones for the default worker pool
  - A separate subnet in zone 1 for the GPU worker pool
  - Public gateways in all 3 zones
- An OpenShift cluster with:
  - Default worker pool: 1 worker per zone (3 total) using bx2.4x16 machines
  - GPU worker pool: 1 worker in zone 1 using a gx3.16x80.l4 machine with an NVIDIA L4 GPU

## Usage

```bash
terraform init
terraform plan -var-file="input.tfvars"
terraform apply -var-file="input.tfvars"
```

## Example input.tfvars file

```hcl
ibmcloud_api_key = "your_api_key_here" # pragma: allowlist secret
prefix = "gpu"
region = "us-south"
```
6 changes: 6 additions & 0 deletions examples/gpu/catalogValidationValues.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"ibmcloud_api_key": $VALIDATION_APIKEY,
"region": "us-south",
"resource_tags": $TAGS,
"prefix": $PREFIX
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file is not needed since we no longer onboard to module registry

148 changes: 148 additions & 0 deletions examples/gpu/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
########################################################################################################################
# Resource Group
########################################################################################################################

# Resolves the resource group used by every other resource in this example.
module "resource_group" {
  source  = "terraform-ibm-modules/resource-group/ibm"
  version = "1.4.0"

  # When var.resource_group is null, ask for a new group named
  # "<prefix>-resource-group"; otherwise no new name is supplied and the
  # existing group name is passed through instead.
  resource_group_name          = var.resource_group == null ? "${var.prefix}-resource-group" : null
  existing_resource_group_name = var.resource_group
}

########################################################################################################################
# VPC + Subnets + Public Gateways using landing-zone-vpc module
########################################################################################################################

# VPC for the example: three "subnet-default-N" subnets (one per zone) plus one
# extra "subnet-gpu" subnet in zone 1, all attached to a single ACL.
module "vpc" {
  source  = "terraform-ibm-modules/landing-zone-vpc/ibm"
  version = "8.8.0"

  resource_group_id = module.resource_group.resource_group_id
  region            = var.region
  prefix            = var.prefix
  tags              = var.resource_tags
  name              = "${var.prefix}-vpc"

  # Subnet layout. The "subnet-default" / "subnet-gpu" name fragments are what
  # the locals block of this example matches on to split subnets between pools.
  subnets = {
    zone-1 = [
      {
        name           = "subnet-default-1"
        cidr           = "10.10.10.0/24"
        public_gateway = true
        acl_name       = "vpc-acl"
      },
      {
        name           = "subnet-gpu"
        cidr           = "10.10.20.0/24"
        public_gateway = true
        acl_name       = "vpc-acl"
      }
    ],
    zone-2 = [
      {
        name           = "subnet-default-2"
        cidr           = "10.20.10.0/24"
        public_gateway = true
        acl_name       = "vpc-acl"
      }
    ],
    zone-3 = [
      {
        name           = "subnet-default-3"
        cidr           = "10.30.10.0/24"
        public_gateway = true
        acl_name       = "vpc-acl"
      }
    ]
  }

  # A public gateway is requested in each of the three zones.
  use_public_gateways = {
    zone-1 = true
    zone-2 = true
    zone-3 = true
  }

  # Single ACL referenced by every subnet above; no custom rules are added
  # beyond the two built-in rule sets that are switched on.
  network_acls = [
    {
      name                         = "vpc-acl"
      add_ibm_cloud_internal_rules = true
      add_vpc_connectivity_rules   = true
      rules                        = []
    }
  ]
}

########################################################################################################################
# OCP VPC cluster with default worker pool across 3 zones and a GPU worker pool in zone 1
########################################################################################################################

# Derived values that map the VPC module's subnets onto the two worker pools.
locals {
  # Every subnet reported by the VPC module
  all_subnets = module.vpc.subnet_zone_list

  # Subnets for the default worker pool, keyed by the "default" subnet prefix.
  # Selected by name fragment, so all three "subnet-default-N" subnets match.
  default_vpc_subnets = {
    default = [
      for s in local.all_subnets :
      {
        id         = s.id
        cidr_block = s.cidr
        zone       = s.zone
      }
      if strcontains(s.name, "subnet-default")
    ]
  }

  # Subnet for the GPU worker pool, keyed by the "gpu" subnet prefix.
  # strcontains is used rather than == because landing zone vpc adds a prefix
  # to subnet names.
  gpu_vpc_subnets = {
    gpu = [
      for s in local.all_subnets :
      {
        id         = s.id
        cidr_block = s.cidr
        zone       = s.zone
      }
      if strcontains(s.name, "subnet-gpu")
    ]
  }

  # Single map holding both the "default" and "gpu" subnet lists
  cluster_vpc_subnets = merge(local.default_vpc_subnets, local.gpu_vpc_subnets)

  # Worker pool definitions; each subnet_prefix refers to a key of
  # cluster_vpc_subnets.
  worker_pools = [
    {
      subnet_prefix    = "default"
      pool_name        = "default" # ibm_container_vpc_cluster automatically names default pool "default"
      machine_type     = var.default_worker_pool_machine_type
      workers_per_zone = 1
      operating_system = "RHCOS"
    },
    {
      subnet_prefix    = "gpu"
      pool_name        = "gpu"
      machine_type     = var.gpu_worker_pool_machine_type
      workers_per_zone = 1
      operating_system = "RHCOS"
    }
  ]
}

# The cluster itself, built from the module at the repository root and wired
# to the VPC, subnets and worker pools defined in this example.
module "ocp_base" {
  source = "../.."

  resource_group_id    = module.resource_group.resource_group_id
  region               = var.region
  tags                 = var.resource_tags
  cluster_name         = var.prefix # the prefix is used directly as the cluster name
  force_delete_storage = true
  vpc_id               = module.vpc.vpc_id
  vpc_subnets          = local.cluster_vpc_subnets
  ocp_version          = var.ocp_version
  worker_pools         = local.worker_pools
  access_tags          = var.access_tags
  ocp_entitlement      = var.ocp_entitlement
}
13 changes: 13 additions & 0 deletions examples/gpu/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
########################################################################################################################
# Outputs
########################################################################################################################

# Passes through the cluster_name output of the ocp_base module.
output "cluster_name" {
  description = "The name of the provisioned cluster."
  value       = module.ocp_base.cluster_name
}

# Passes through the workerpools output of the ocp_base module.
output "workerpools" {
  description = "Worker pools created in the cluster."
  value       = module.ocp_base.workerpools
}
8 changes: 8 additions & 0 deletions examples/gpu/provider.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
########################################################################################################################
# Provider config
########################################################################################################################

# IBM Cloud provider, authenticated with the supplied API key and scoped to
# the region chosen for the example.
provider "ibm" {
  ibmcloud_api_key = var.ibmcloud_api_key
  region           = var.region
}
65 changes: 65 additions & 0 deletions examples/gpu/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
########################################################################################################################
# Input variables
########################################################################################################################

# Credential consumed by the provider block; marked sensitive so Terraform
# redacts it from plan/apply output.
variable "ibmcloud_api_key" {
  type        = string
  description = "The IBM Cloud api token"
  sensitive   = true
}

# Prefix used to name the resources in this example (resource group, VPC and
# cluster).
variable "prefix" {
  type        = string
  description = "Prefix for name of all resource created by this example"

  validation {
    # Accept either a single lowercase letter, or a lowercase letter followed
    # by lowercase letters, digits and hyphens and ending in a letter or digit.
    # Use [a-z], not [A-z]: the latter range also spans uppercase letters and
    # the punctuation characters that sit between 'Z' and 'a' in ASCII.
    condition     = can(regex("^([a-z]|[a-z][-a-z0-9]*[a-z0-9])$", var.prefix))
    error_message = "Prefix must begin with a lowercase letter, end with a lowercase letter or number, and contain only lowercase letters, numbers, and - characters."
  }
}

# Required: no default, so the caller must choose a region. Used by the
# provider, the VPC module and the cluster module.
variable "region" {
  type        = string
  description = "Region where resources are created"
}

# Optional: leaving this null makes the resource_group module create a new
# group named from the prefix.
variable "resource_group" {
  type        = string
  description = "An existing resource group name to use for this example, if unset a new resource group will be created"
  default     = null
}

# Optional tags, passed to both the VPC module and the cluster module.
variable "resource_tags" {
  type        = list(string)
  description = "Optional list of tags to be added to created resources"
  default     = []
}

# Optional: forwarded unchanged to the cluster module; null leaves the choice
# of version to that module.
variable "ocp_version" {
  type        = string
  description = "Version of the OCP cluster to provision"
  default     = null
}

# Optional access tags, forwarded unchanged to the cluster module.
variable "access_tags" {
  type        = list(string)
  description = "A list of access tags to apply to the resources created by the module."
  default     = []
}

# Optional entitlement value, forwarded unchanged to the cluster module.
variable "ocp_entitlement" {
  type        = string
  description = "Value that is applied to the entitlements for OCP cluster provisioning"
  default     = null
}

# Flavor used for the "default" worker pool entry in local.worker_pools.
variable "default_worker_pool_machine_type" {
  type        = string
  description = "The machine type for the default worker pool"
  default     = "bx2.4x16"
}

# Flavor used for the "gpu" worker pool entry in local.worker_pools.
variable "gpu_worker_pool_machine_type" {
  type        = string
  description = "The machine type for the GPU worker pool"
  default     = "gx3.16x80.l4"
}
11 changes: 11 additions & 0 deletions examples/gpu/version.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Terraform CLI and provider version requirements for this example.
terraform {
  required_version = ">=1.9.0"

  required_providers {
    # ">=" sets a minimum version, not a pin to the latest release; the floor
    # was chosen by the example's author to ensure GPU support.
    ibm = {
      source  = "IBM-Cloud/ibm"
      version = ">= 1.79.2"
    }
  }
}
28 changes: 28 additions & 0 deletions tests/other_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ const advancedExampleDir = "examples/advanced"
const basicExampleDir = "examples/basic"
const fscloudExampleDir = "examples/fscloud"
const crossKmsSupportExampleDir = "examples/cross_kms_support"
const gpuExampleDir = "examples/gpu"

func setupOptions(t *testing.T, prefix string, terraformDir string, ocpVersion string) *testhelper.TestOptions {
options := testhelper.TestOptionsDefaultWithVars(&testhelper.TestOptions{
Expand Down Expand Up @@ -192,3 +193,30 @@ func TestFSCloudInSchematic(t *testing.T) {
err := options.RunSchematicTest()
assert.Nil(t, err, "This should not have errored")
}

func TestRunGpuExample(t *testing.T) {
t.Parallel()

options := testhelper.TestOptionsDefaultWithVars(&testhelper.TestOptions{
Testing: t,
TerraformDir: gpuExampleDir,
Prefix: "gpu-test",
ResourceGroup: resourceGroup,
ImplicitDestroy: []string{
"module.ocp_base.null_resource.confirm_network_healthy",
"module.ocp_base.null_resource.reset_api_key",
},
// Do not hard fail the test if the implicit destroy steps fail to allow a full destroy of resource to occur
ImplicitRequired: false,
TerraformVars: map[string]interface{}{
"ocp_version": ocpVersion4,
"default_worker_pool_machine_type": "bx2.4x16",
"gpu_worker_pool_machine_type": "bx2.4x16", // Use bx2.4x16 instead of gx3.16x80.l4 to reduce cost
"access_tags": permanentResources["accessTags"],
"ocp_entitlement": "cloud_pak",
},
})
output, err := options.RunTestConsistency()
assert.Nil(t, err, "This should not have errored")
assert.NotNil(t, output, "Expected some output")
}