Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions community/examples/xpk-n2-filestore/xpk-n2-filestore.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,10 @@ deployment_groups:
- source: $(vars.storage_crd_path)
# Server-side applies avoid last-applied-configuration and associated annotation length issues
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_applicationprofiles.yaml
server_side_apply: true
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_jobtemplates.yaml
server_side_apply: true
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_rayclustertemplates.yaml
server_side_apply: true
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_rayjobtemplates.yaml
server_side_apply: true
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_volumebundles.yaml
server_side_apply: true

- id: homefs
source: modules/file-system/filestore
Expand Down
2 changes: 2 additions & 0 deletions community/modules/scheduler/slinky/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ resource "helm_release" "cert_manager" {
version = var.cert_manager_chart_version
namespace = "cert-manager"
create_namespace = true
wait = true

values = concat(
[yamlencode({
Expand All @@ -75,6 +76,7 @@ resource "helm_release" "slurm_operator" {
version = var.slurm_operator_chart_version
namespace = var.slurm_operator_namespace
create_namespace = true
wait = true

# The Cert Manager webhook deployment must be running to provision the Operator
depends_on = [
Expand Down
12 changes: 12 additions & 0 deletions community/modules/scheduler/slinky/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,21 @@
# Exposes the namespace configured for the slurm chart (passes var.slurm_namespace through unchanged).
output "slurm_namespace" {
description = "namespace for the slurm chart"
value = var.slurm_namespace
# The value itself does not reference any helm_release, so the dependency is
# declared explicitly: consumers of this output are ordered after these releases.
depends_on = [
helm_release.cert_manager,
helm_release.slurm_operator,
helm_release.slurm,
helm_release.prometheus
]
}

# Exposes the namespace configured for the slinky operator chart (passes var.slurm_operator_namespace through unchanged).
output "slurm_operator_namespace" {
description = "namespace for the slinky operator chart"
value = var.slurm_operator_namespace
# The value itself does not reference any helm_release, so the dependency is
# declared explicitly: consumers of this output are ordered after these releases.
depends_on = [
helm_release.cert_manager,
helm_release.slurm_operator,
helm_release.slurm,
helm_release.prometheus
]
}
1 change: 0 additions & 1 deletion examples/gke-h4d/gke-h4d.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ deployment_groups:
install: true
apply_manifests:
- source: "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.7.0/deploy/v2beta1/mpi-operator.yaml"
server_side_apply: true

# Filestore
- id: filestore
Expand Down
6 changes: 3 additions & 3 deletions modules/compute/gke-node-pool/gpu_direct.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ locals {
"a3-highgpu-8g" = {
# Manifest to be installed for enabling TCPX on a3-highgpu-8g machines
gpu_direct_manifests = [
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-tcpx-installer.yaml", # nccl_plugin v3.1.9 for tcpx
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-config.yaml", # nccl_configmap
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/204a57cb807ec5f440811607cead422e3c65d987/gpudirect-tcpx/nccl-tcpx-installer.yaml", # nccl_plugin v3.1.9 for tcpx
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/204a57cb807ec5f440811607cead422e3c65d987/gpudirect-tcpx/nccl-config.yaml", # nccl_configmap
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/204a57cb807ec5f440811607cead422e3c65d987/nri_device_injector/nri-device-injector.yaml", # nri_plugin
]
updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml")
rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9
Expand Down
37 changes: 19 additions & 18 deletions modules/management/kubectl-apply/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ This module simplifies the following functionality:
* A single URL to a manifest file. Ex.: `https://github.com/.../myrepo/manifest.yaml`.

> **Note:** Applying from a URL has important limitations. Please review the [Considerations & Callouts for Applying from URLs](#applying-manifests-from-urls-considerations--callouts) section below.
* A single local YAML manifest file (`.yaml`). Ex.: `./manifest.yaml`.
* A single local YAML manifest file (`.yaml` or `.yml`). Ex.: `./manifest.yaml`.
* A template file (`.tftpl`) to generate a manifest. Ex.: `./template.yaml.tftpl`. You can pass the variables to format the template file in `template_vars`.
* A directory containing multiple YAML or template files. Ex: `./manifests/`. You can pass the variables to format the template files in `template_vars`.
* A directory containing multiple YAML or template files. Ex.: `./manifests/` or `./manifests`. The module correctly identifies directories even if the trailing slash is omitted. For security and stability, the module only processes files with `.yaml`, `.yml`, or `.tftpl` extensions. Other files in the directory (such as `README.md`) are automatically ignored.

#### Manifest Example

Expand Down Expand Up @@ -115,6 +115,17 @@ The `path` field accepts a template file. You will need to provide variables for

## Callouts

### Helm-based Manifest Application

#### 1. Large Manifests and CRDs
Helm stores the entire release state (including the generated manifests) as a standard Kubernetes Secret in the release namespace. Before storing the state, Helm runs the YAML through [GZIP compression and base64 encoding](https://helm.sh/docs/topics/kubernetes_apis/#:~:text=The%20manifest%20is,of%20the%20release.). This effectively raises the limit to ~1MB or more, allowing for the deployment of very large manifests and complex CRDs without requiring Server-Side Apply (SSA). This behaviour is guaranteed because the [Terraform Helm Provider](https://github.com/hashicorp/terraform-provider-helm) directly imports the official [Helm Go SDK](https://github.com/helm/helm/tree/main/pkg/action).

#### 2. Release Suffixes
The module introduces a `random_id` to generate a unique 4-byte suffix for each Helm release (e.g., `manifest-apply-ceab0dfc-0`). This prevents name collisions when multiple module instances (e.g., `gke-cluster` and `gke-node-pool`) instantiate the `kubectl-apply` source simultaneously within the same blueprint. This ID is stored in the Terraform state, ensuring the release name remains stable across re-deployments.

#### 3. Re-deployment Conflicts
If a deployment fails, the `atomic = true` setting ensures that Helm automatically rolls back the release, preventing the cluster from being left in a "half-applied" state. If you encounter persistent conflicts during re-deployment due to immutable fields, you may need to manually delete the resource or the Helm release before re-applying.

### Applying Manifests from URLs: Considerations & Callouts

While this module supports applying manifests directly from remote `http://` or `https://` URLs, this method introduces complexities not present when using local files. For production environments, we recommend sourcing manifests from local paths or a version-controlled Git repository. This method will also be deprecated soon, so we recommend using other methods to source manifests.
Expand Down Expand Up @@ -142,7 +153,6 @@ To ensure a reliable deployment, you must manually enforce the correct order of
apply_manifests:
# This manifest contains the CRDs for Kueue
- source: "https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/develop/modules/management/kubectl-apply/manifests/kueue-v0.11.4.yaml"
server_side_apply: true
```

2. **Run the deployment** (`gcluster deploy` or `terraform apply`).
Expand All @@ -156,26 +166,14 @@ To ensure a reliable deployment, you must manually enforce the correct order of
apply_manifests:
# The CRD manifest is still present
- source: "https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/develop/modules/management/kubectl-apply/manifests/kueue-v0.11.4.yaml"
server_side_apply: true

# Now, add your configuration manifest
- source: "https://gist.githubusercontent.com/YourUser/..." # Your configuration URL
server_side_apply: true
```

4. **Run the deployment command again.** Since the CRDs are now guaranteed to exist in the cluster, this second apply will succeed reliably.

#### **2. Large Manifests (CRDs)**

* **Issue:** Applying very large manifests can fail with a `metadata.annotations: Too long` error.
* **Solution:** Enable Server-Side Apply by setting `server_side_apply: true` for the manifest entry.

#### **3. Conflicts on Re-application**

* **Issue:** Re-running a deployment after a partial failure can cause server-side apply field manager `conflicts`.
* **Solution:** Forcibly take ownership of the resource fields by setting `force_conflicts: true`.

#### **4. Terraform Template Files (`.tftpl`)**
#### **2. Terraform Template Files (`.tftpl`)**

* **Limitation:** This module **cannot** render a template file (`.tftpl`) when sourced from a remote URL.
* **Workaround:** You must render the template into a pure YAML file locally, host that rendered file at a URL, and provide the URL of the rendered file in your blueprint.
Expand Down Expand Up @@ -206,13 +204,15 @@ limitations under the License.
| <a name="requirement_helm"></a> [helm](#requirement\_helm) | ~> 2.17 |
| <a name="requirement_http"></a> [http](#requirement\_http) | ~> 3.0 |
| <a name="requirement_kubectl"></a> [kubectl](#requirement\_kubectl) | >= 1.7.0 |
| <a name="requirement_random"></a> [random](#requirement\_random) | >= 2.1 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | >= 7.2 |
| <a name="provider_http"></a> [http](#provider\_http) | ~> 3.0 |
| <a name="provider_random"></a> [random](#provider\_random) | >= 2.1 |
| <a name="provider_terraform"></a> [terraform](#provider\_terraform) | n/a |

## Modules
Expand All @@ -226,12 +226,13 @@ limitations under the License.
| <a name="module_install_jobset"></a> [install\_jobset](#module\_install\_jobset) | ./helm_install | n/a |
| <a name="module_install_kueue"></a> [install\_kueue](#module\_install\_kueue) | ./helm_install | n/a |
| <a name="module_install_nvidia_dra_driver"></a> [install\_nvidia\_dra\_driver](#module\_install\_nvidia\_dra\_driver) | ./helm_install | n/a |
| <a name="module_kubectl_apply_manifests"></a> [kubectl\_apply\_manifests](#module\_kubectl\_apply\_manifests) | ./kubectl | n/a |
| <a name="module_kubectl_apply_manifests"></a> [kubectl\_apply\_manifests](#module\_kubectl\_apply\_manifests) | ./helm_install | n/a |

## Resources

| Name | Type |
|------|------|
| [random_id.release_suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource |
| [terraform_data.gib_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
| [terraform_data.initial_gib_version](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
| [terraform_data.jobset_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
Expand All @@ -244,7 +245,7 @@ limitations under the License.

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_apply_manifests"></a> [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md).<br/> NOTE: The `enable` input acts as a FF to apply a manifest or not. By default it is always set to `true`. | <pre>list(object({<br/> enable = optional(bool, true)<br/> content = optional(string, null)<br/> source = optional(string, null)<br/> template_vars = optional(map(any), null)<br/> server_side_apply = optional(bool, false)<br/> wait_for_rollout = optional(bool, true)<br/> }))</pre> | `[]` | no |
| <a name="input_apply_manifests"></a> [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to the GKE cluster using helm\_install. For more details on the underlying deployment mechanism, see the [helm\_install module](helm\_install/README.md). The `enable` input acts as a FF to apply a manifest or not. By default it is always set to `true`. | <pre>list(object({<br/> enable = optional(bool, true)<br/> content = optional(string, null)<br/> source = optional(string, null)<br/> template_vars = optional(map(any), null)<br/> wait_for_rollout = optional(bool, true)<br/> namespace = optional(string, null)<br/> }))</pre> | `[]` | no |
| <a name="input_asapd_lite"></a> [asapd\_lite](#input\_asapd\_lite) | Install the asapd-lite daemonset for A4X-Max Bare Metal. | <pre>object({<br/> install = bool<br/> config_path = string<br/> })</pre> | <pre>{<br/> "config_path": "",<br/> "install": false<br/>}</pre> | no |
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects/<project\_id>/locations/<region>/clusters/<name>. | `string` | n/a | yes |
| <a name="input_gib"></a> [gib](#input\_gib) | Install the NCCL gIB plugin | <pre>object({<br/> install = bool<br/> path = string<br/> template_vars = object({<br/> image = optional(string, "us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib")<br/> version = string<br/> node_affinity = optional(any, {<br/> requiredDuringSchedulingIgnoredDuringExecution = {<br/> nodeSelectorTerms = [{<br/> matchExpressions = [{<br/> key = "cloud.google.com/gke-gpu",<br/> operator = "In",<br/> values = ["true"]<br/> }]<br/> }]<br/> }<br/> })<br/> accelerator_count = number<br/> max_unavailable = optional(string, "50%")<br/> })<br/> })</pre> | <pre>{<br/> "install": false,<br/> "path": "",<br/> "template_vars": {<br/> "accelerator_count": 0,<br/> "version": ""<br/> }<br/>}</pre> | no |
Expand Down
2 changes: 1 addition & 1 deletion modules/management/kubectl-apply/helm_install/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ No modules.
| <a name="input_keyring"></a> [keyring](#input\_keyring) | Location of public keys used for verification ('helm install --keyring'). Used if 'verify' is true. | `string` | `null` | no |
| <a name="input_lint"></a> [lint](#input\_lint) | Run the helm chart linter during the plan ('helm lint'). | `bool` | `false` | no |
| <a name="input_max_history"></a> [max\_history](#input\_max\_history) | Limit the maximum number of revisions saved per release ('helm upgrade --history-max'). 0 for no limit. | `number` | `null` | no |
| <a name="input_namespace"></a> [namespace](#input\_namespace) | Kubernetes namespace to install the Helm release into. | `string` | `"default"` | no |
| <a name="input_namespace"></a> [namespace](#input\_namespace) | Kubernetes namespace to install the Helm release into. | `string` | `null` | no |
| <a name="input_pass_credentials"></a> [pass\_credentials](#input\_pass\_credentials) | Pass credentials to all domains ('helm install --pass-credentials'). Use with caution. | `bool` | `false` | no |
| <a name="input_postrender"></a> [postrender](#input\_postrender) | Configuration for a post-rendering executable ('helm install --post-renderer'). Should be an object with 'binary\_path' attribute. | <pre>object({<br/> binary_path = string # Path to the post-renderer executable<br/> })</pre> | `null` | no |
| <a name="input_recreate_pods"></a> [recreate\_pods](#input\_recreate\_pods) | Perform pods restart for the resource if applicable ('helm upgrade --recreate-pods'). Note: This flag is deprecated in Helm CLI v3 itself. | `bool` | `false` | no |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ variable "devel" {
variable "namespace" {
description = "Kubernetes namespace to install the Helm release into."
type = string
default = "default"
default = null
}

variable "create_namespace" {
Expand Down
Loading
Loading