Skip to content

Commit 881ccb2

Browse files
feat: add example for GKE confidential nodes with GPU
1 parent 893ba59 commit 881ccb2

File tree

36 files changed

+1164
-39
lines changed

36 files changed

+1164
-39
lines changed

autogen/main/cluster.tf.tmpl

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ resource "google_container_cluster" "primary" {
8383
for_each = local.confidential_node_config
8484
content {
8585
enabled = confidential_nodes.value.enabled
86+
# Guard the index: var.node_pools may be an empty list, in which case var.node_pools[0] is an invalid index.
confidential_instance_type = length(var.node_pools) > 0 ? lookup(var.node_pools[0], "confidential_instance_type", null) : null
8687
}
8788
}
8889

@@ -594,13 +595,39 @@ resource "google_container_cluster" "primary" {
594595
min_cpu_platform = lookup(var.node_pools[0], "min_cpu_platform", "")
595596
enable_confidential_storage = lookup(var.node_pools[0], "enable_confidential_storage", false)
596597
disk_type = lookup(var.node_pools[0], "disk_type", null)
598+
preemptible = lookup(var.node_pools[0], "preemptible", false)
599+
spot = lookup(var.node_pools[0], "spot", false)
597600
dynamic "gcfs_config" {
598601
for_each = lookup(var.node_pools[0], "enable_gcfs", null) != null ? [var.node_pools[0].enable_gcfs] : []
599602
content {
600603
enabled = gcfs_config.value
601604
}
602605
}
603606

607+
dynamic "guest_accelerator" {
608+
for_each = lookup(var.node_pools[0], "accelerator_count", 0) > 0 ? [1] : []
609+
content {
610+
type = lookup(var.node_pools[0], "accelerator_type", "")
611+
count = lookup(var.node_pools[0], "accelerator_count", 0)
612+
gpu_partition_size = lookup(var.node_pools[0], "gpu_partition_size", null)
613+
614+
dynamic "gpu_driver_installation_config" {
615+
for_each = lookup(var.node_pools[0], "gpu_driver_version", "") != "" ? [1] : []
616+
content {
617+
gpu_driver_version = lookup(var.node_pools[0], "gpu_driver_version", "")
618+
}
619+
}
620+
621+
dynamic "gpu_sharing_config" {
622+
for_each = lookup(var.node_pools[0], "gpu_sharing_strategy", "") != "" ? [1] : []
623+
content {
624+
gpu_sharing_strategy = lookup(var.node_pools[0], "gpu_sharing_strategy", "")
625+
max_shared_clients_per_gpu = lookup(var.node_pools[0], "max_shared_clients_per_gpu", 2)
626+
}
627+
}
628+
}
629+
}
630+
604631
dynamic "gvnic" {
605632
for_each = lookup(var.node_pools[0], "enable_gvnic", false) ? [true] : []
606633
content {
@@ -1300,9 +1327,10 @@ resource "google_container_node_pool" "windows_pools" {
13001327
}
13011328

13021329
dynamic "confidential_nodes" {
1303-
for_each = lookup(each.value, "enable_confidential_nodes", null) != null ? [each.value.enable_confidential_nodes] : []
1330+
for_each = lookup(each.value, "enable_confidential_nodes", null) != null ? [{ enabled = each.value.enable_confidential_nodes, confidential_instance_type = lookup(each.value, "confidential_instance_type", null) }] : []
13041331
content {
1305-
enabled = confidential_nodes.value
1332+
# The dynamic block iterator exposes the current for_each element as `.value`; read the object's attributes through it.
enabled = confidential_nodes.value.enabled
1333+
confidential_instance_type = confidential_nodes.value.confidential_instance_type
13061334
}
13071335
}
13081336

cluster.tf

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ resource "google_container_cluster" "primary" {
7676
dynamic "confidential_nodes" {
7777
for_each = local.confidential_node_config
7878
content {
79-
enabled = confidential_nodes.value.enabled
79+
enabled = confidential_nodes.value.enabled
80+
# Guard the index: var.node_pools may be an empty list, in which case var.node_pools[0] is an invalid index.
confidential_instance_type = length(var.node_pools) > 0 ? lookup(var.node_pools[0], "confidential_instance_type", null) : null
8081
}
8182
}
8283

@@ -454,13 +455,39 @@ resource "google_container_cluster" "primary" {
454455
min_cpu_platform = lookup(var.node_pools[0], "min_cpu_platform", "")
455456
enable_confidential_storage = lookup(var.node_pools[0], "enable_confidential_storage", false)
456457
disk_type = lookup(var.node_pools[0], "disk_type", null)
458+
preemptible = lookup(var.node_pools[0], "preemptible", false)
459+
spot = lookup(var.node_pools[0], "spot", false)
457460
dynamic "gcfs_config" {
458461
for_each = lookup(var.node_pools[0], "enable_gcfs", null) != null ? [var.node_pools[0].enable_gcfs] : []
459462
content {
460463
enabled = gcfs_config.value
461464
}
462465
}
463466

467+
dynamic "guest_accelerator" {
468+
for_each = lookup(var.node_pools[0], "accelerator_count", 0) > 0 ? [1] : []
469+
content {
470+
type = lookup(var.node_pools[0], "accelerator_type", "")
471+
count = lookup(var.node_pools[0], "accelerator_count", 0)
472+
gpu_partition_size = lookup(var.node_pools[0], "gpu_partition_size", null)
473+
474+
dynamic "gpu_driver_installation_config" {
475+
for_each = lookup(var.node_pools[0], "gpu_driver_version", "") != "" ? [1] : []
476+
content {
477+
gpu_driver_version = lookup(var.node_pools[0], "gpu_driver_version", "")
478+
}
479+
}
480+
481+
dynamic "gpu_sharing_config" {
482+
for_each = lookup(var.node_pools[0], "gpu_sharing_strategy", "") != "" ? [1] : []
483+
content {
484+
gpu_sharing_strategy = lookup(var.node_pools[0], "gpu_sharing_strategy", "")
485+
max_shared_clients_per_gpu = lookup(var.node_pools[0], "max_shared_clients_per_gpu", 2)
486+
}
487+
}
488+
}
489+
}
490+
464491
dynamic "gvnic" {
465492
for_each = lookup(var.node_pools[0], "enable_gvnic", false) ? [true] : []
466493
content {
@@ -984,9 +1011,10 @@ resource "google_container_node_pool" "pools" {
9841011
}
9851012

9861013
dynamic "confidential_nodes" {
987-
for_each = lookup(each.value, "enable_confidential_nodes", null) != null ? [each.value.enable_confidential_nodes] : []
1014+
for_each = lookup(each.value, "enable_confidential_nodes", null) != null ? [{ enabled = each.value.enable_confidential_nodes, confidential_instance_type = lookup(each.value, "confidential_instance_type", null) }] : []
9881015
content {
989-
enabled = confidential_nodes.value
1016+
# The dynamic block iterator exposes the current for_each element as `.value`; read the object's attributes through it.
enabled = confidential_nodes.value.enabled
1017+
confidential_instance_type = confidential_nodes.value.confidential_instance_type
9901018
}
9911019
}
9921020

@@ -1317,9 +1345,10 @@ resource "google_container_node_pool" "windows_pools" {
13171345
}
13181346

13191347
dynamic "confidential_nodes" {
1320-
for_each = lookup(each.value, "enable_confidential_nodes", null) != null ? [each.value.enable_confidential_nodes] : []
1348+
for_each = lookup(each.value, "enable_confidential_nodes", null) != null ? [{ enabled = each.value.enable_confidential_nodes, confidential_instance_type = lookup(each.value, "confidential_instance_type", null) }] : []
13211349
content {
1322-
enabled = confidential_nodes.value
1350+
# The dynamic block iterator exposes the current for_each element as `.value`; read the object's attributes through it.
enabled = confidential_nodes.value.enabled
1351+
confidential_instance_type = confidential_nodes.value.confidential_instance_type
13231352
}
13241353
}
13251354

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Confidential GPU GKE Cluster
2+
3+
This example illustrates how to instantiate the Beta Public Cluster module
4+
with confidential nodes enabled, the database encrypted with a KMS key,
5+
and GPU workloads encrypted with NVIDIA Confidential Computing.
6+
This example also installs the NVIDIA drivers on the GPU nodes, so the cluster is
7+
ready to receive workloads.
8+
See more: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus-confidential-nodes.
9+
10+
<!-- BEGINNING OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
11+
## Inputs
12+
13+
| Name | Description | Type | Default | Required |
14+
|------|-------------|------|---------|:--------:|
15+
| project\_id | The project ID to host the cluster in. | `string` | n/a | yes |
16+
| region | The region to host the cluster in. | `string` | `"us-central1"` | no |
17+
| zones | The zones to host the nodes in. The nodes must be in a zone that supports NVIDIA Confidential Computing. For more information, [view supported zones](https://cloud.google.com/confidential-computing/confidential-vm/docs/supported-configurations#nvidia-confidential-computing_1). | `list(string)` | <pre>[<br> "us-central1-a"<br>]</pre> | no |
18+
19+
## Outputs
20+
21+
| Name | Description |
22+
|------|-------------|
23+
| ca\_certificate | The cluster ca certificate (base64 encoded). |
24+
| cluster\_name | Cluster name. |
25+
| keyring | The name of the keyring. |
26+
| kms\_key\_name | KMS Key Name. |
27+
| kubernetes\_endpoint | The cluster endpoint. |
28+
| location | n/a |
29+
| master\_kubernetes\_version | Kubernetes version of the master. |
30+
| network\_name | The name of the VPC being created. |
31+
| project\_id | The project ID the cluster is in. |
32+
| region | The region in which the cluster resides. |
33+
| service\_account | The service account to default running nodes as if not overridden in `node_pools`. |
34+
| subnet\_names | The names of the subnet being created. |
35+
| zones | List of zones in which the cluster resides. |
36+
37+
<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
38+
39+
To provision this example, run the following from within this directory:
40+
- `terraform init` to get the plugins
41+
- `terraform plan` to see the infrastructure plan
42+
- `terraform apply` to apply the infrastructure build
43+
- `terraform destroy` to destroy the built infrastructure
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/**
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
locals {
18+
key_name = "gke-key-${random_string.suffix.result}"
19+
}
20+
21+
module "kms" {
22+
source = "terraform-google-modules/kms/google"
23+
version = "~> 4.0"
24+
project_id = var.project_id
25+
location = var.region
26+
keyring = "gke-keyring-${random_string.suffix.result}"
27+
keys = [local.key_name]
28+
prevent_destroy = false
29+
}
30+
31+
resource "google_project_service_identity" "container_identity" {
32+
provider = google-beta
33+
project = var.project_id
34+
service = "container.googleapis.com"
35+
}
36+
37+
# Grants the Cloud KMS encrypter/decrypter role on the example's key (module.kms, keyed by local.key_name)
# to the service identity created for container.googleapis.com.
resource "google_kms_crypto_key_iam_member" "sm_sa_encrypter_decrypter" {
38+
role = "roles/cloudkms.cryptoKeyEncrypterDecrypter"
39+
member = google_project_service_identity.container_identity.member
40+
crypto_key_id = module.kms.keys[local.key_name]
41+
}
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/**
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
locals {
18+
cluster_type = "confidential-gpu"
19+
network_name = "confidential-gpu-network-${random_string.suffix.result}"
20+
subnet_name = "confidential-gpu-subnet"
21+
master_auth_subnetwork = "confidential-gpu-master-subnet"
22+
pods_range_name = "ip-range-pods-${random_string.suffix.result}"
23+
svc_range_name = "ip-range-svc-${random_string.suffix.result}"
24+
subnet_names = [for subnet_self_link in module.gcp-network.subnets_self_links : split("/", subnet_self_link)[length(split("/", subnet_self_link)) - 1]]
25+
}
26+
27+
resource "random_string" "suffix" {
28+
length = 4
29+
special = false
30+
upper = false
31+
}
32+
33+
data "google_project" "main" {
34+
project_id = var.project_id
35+
}
36+
37+
# Grants the Cloud KMS encrypter/decrypter role on the example's key to the project's
# compute-system service account, whose address is built from the project number.
# NOTE(review): presumably required for the node pool's boot_disk_kms_key — confirm.
resource "google_kms_crypto_key_iam_member" "main" {
38+
crypto_key_id = module.kms.keys[local.key_name]
39+
role = "roles/cloudkms.cryptoKeyEncrypterDecrypter"
40+
member = "serviceAccount:service-${data.google_project.main.number}@compute-system.iam.gserviceaccount.com"
41+
}
42+
43+
data "google_client_config" "default" {}
44+
45+
provider "kubernetes" {
46+
host = "https://${module.gke.endpoint}"
47+
token = data.google_client_config.default.access_token
48+
cluster_ca_certificate = base64decode(module.gke.ca_certificate)
49+
}
50+
51+
module "gke" {
52+
source = "terraform-google-modules/kubernetes-engine/google//modules/beta-public-cluster"
53+
version = "~> 37.0"
54+
55+
project_id = var.project_id
56+
name = "${local.cluster_type}-cluster-${random_string.suffix.result}"
57+
region = var.region
58+
zones = var.zones
59+
network = module.gcp-network.network_name
60+
subnetwork = local.subnet_names[index(module.gcp-network.subnets_names, local.subnet_name)]
61+
ip_range_pods = local.pods_range_name
62+
ip_range_services = local.svc_range_name
63+
create_service_account = false
64+
initial_node_count = 1
65+
remove_default_node_pool = true
66+
disable_legacy_metadata_endpoints = false
67+
deletion_protection = false
68+
service_account = "default"
69+
logging_variant = "MAX_THROUGHPUT"
70+
dns_allow_external_traffic = true
71+
72+
enable_confidential_nodes = true
73+
74+
database_encryption = [
75+
{
76+
"key_name" : module.kms.keys[local.key_name],
77+
"state" : "ENCRYPTED"
78+
}
79+
]
80+
81+
node_pools = [
82+
{
83+
name = "default"
84+
machine_type = "a3-highgpu-1g"
85+
confidential_instance_type = "TDX"
86+
spot = true
87+
disk_type = "hyperdisk-balanced"
88+
boot_disk_kms_key = module.kms.keys[local.key_name]
89+
enable_confidential_storage = true
90+
accelerator_count = 1
91+
accelerator_type = "nvidia-h100-80gb"
92+
gpu_driver_version = "INSTALLATION_DISABLED"
93+
node_locations = join(",", var.zones)
94+
local_ssd_ephemeral_storage_count = 2
95+
},
96+
]
97+
}
98+
99+
module "kubectl" {
100+
source = "terraform-google-modules/gcloud/google//modules/kubectl-wrapper"
101+
version = "~> 3.0"
102+
103+
project_id = var.project_id
104+
cluster_name = module.gke.name
105+
cluster_location = module.gke.location
106+
module_depends_on = [module.gke.endpoint]
107+
kubectl_create_command = "kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/refs/heads/master/nvidia-driver-installer/cos/daemonset-confidential.yaml"
108+
kubectl_destroy_command = "kubectl delete -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/refs/heads/master/nvidia-driver-installer/cos/daemonset-confidential.yaml"
109+
skip_download = true
110+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/**
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
module "gcp-network" {
18+
source = "terraform-google-modules/network/google"
19+
version = "~> 11.0"
20+
21+
project_id = var.project_id
22+
network_name = local.network_name
23+
routing_mode = "GLOBAL"
24+
25+
subnets = [
26+
{
27+
subnet_name = local.subnet_name
28+
subnet_ip = "10.0.0.0/17"
29+
subnet_region = var.region
30+
subnet_private_access = true
31+
},
32+
{
33+
subnet_name = local.master_auth_subnetwork
34+
subnet_ip = "10.60.0.0/17"
35+
subnet_region = var.region
36+
subnet_private_access = true
37+
},
38+
]
39+
40+
secondary_ranges = {
41+
(local.subnet_name) = [
42+
{
43+
range_name = local.pods_range_name
44+
ip_cidr_range = "192.168.0.0/18"
45+
},
46+
{
47+
range_name = local.svc_range_name
48+
ip_cidr_range = "192.168.64.0/18"
49+
},
50+
]
51+
}
52+
}

0 commit comments

Comments
 (0)