Skip to content

Commit cc16e8e

Browse files
Migrate kubectl_apply_manifest module to helm (#5282)
The failing tests are unrelated to the changes in this PR.
1 parent c85760d commit cc16e8e

File tree

12 files changed

+108
-71
lines changed

12 files changed

+108
-71
lines changed

community/examples/xpk-n2-filestore/xpk-n2-filestore.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,15 +101,10 @@ deployment_groups:
101101
- source: $(vars.storage_crd_path)
102102
# These CRDs are applied through Helm, which avoids the last-applied-configuration annotation length issues of client-side kubectl apply
103103
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_applicationprofiles.yaml
104-
server_side_apply: true
105104
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_jobtemplates.yaml
106-
server_side_apply: true
107105
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_rayclustertemplates.yaml
108-
server_side_apply: true
109106
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_rayjobtemplates.yaml
110-
server_side_apply: true
111107
- source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_volumebundles.yaml
112-
server_side_apply: true
113108

114109
- id: homefs
115110
source: modules/file-system/filestore

community/modules/scheduler/slinky/main.tf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ resource "helm_release" "cert_manager" {
4949
version = var.cert_manager_chart_version
5050
namespace = "cert-manager"
5151
create_namespace = true
52+
wait = true
5253

5354
values = concat(
5455
[yamlencode({
@@ -75,6 +76,7 @@ resource "helm_release" "slurm_operator" {
7576
version = var.slurm_operator_chart_version
7677
namespace = var.slurm_operator_namespace
7778
create_namespace = true
79+
wait = true
7880

7981
# The Cert Manager webhook deployment must be running to provision the Operator
8082
depends_on = [

community/modules/scheduler/slinky/outputs.tf

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,21 @@
1515
output "slurm_namespace" {
1616
description = "namespace for the slurm chart"
1717
value = var.slurm_namespace
18+
depends_on = [
19+
helm_release.cert_manager,
20+
helm_release.slurm_operator,
21+
helm_release.slurm,
22+
helm_release.prometheus
23+
]
1824
}
1925

2026
output "slurm_operator_namespace" {
2127
description = "namespace for the slinky operator chart"
2228
value = var.slurm_operator_namespace
29+
depends_on = [
30+
helm_release.cert_manager,
31+
helm_release.slurm_operator,
32+
helm_release.slurm,
33+
helm_release.prometheus
34+
]
2335
}

examples/gke-h4d/gke-h4d.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,6 @@ deployment_groups:
182182
install: true
183183
apply_manifests:
184184
- source: "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.7.0/deploy/v2beta1/mpi-operator.yaml"
185-
server_side_apply: true
186185

187186
# Filestore
188187
- id: filestore

modules/compute/gke-node-pool/gpu_direct.tf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ locals {
2626
"a3-highgpu-8g" = {
2727
# Manifest to be installed for enabling TCPX on a3-highgpu-8g machines
2828
gpu_direct_manifests = [
29-
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-tcpx-installer.yaml", # nccl_plugin v3.1.9 for tcpx
30-
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-config.yaml", # nccl_configmap
31-
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin
29+
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/204a57cb807ec5f440811607cead422e3c65d987/gpudirect-tcpx/nccl-tcpx-installer.yaml", # nccl_plugin v3.1.9 for tcpx
30+
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/204a57cb807ec5f440811607cead422e3c65d987/gpudirect-tcpx/nccl-config.yaml", # nccl_configmap
31+
"https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/204a57cb807ec5f440811607cead422e3c65d987/nri_device_injector/nri-device-injector.yaml", # nri_plugin
3232
]
3333
updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml")
3434
rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9

modules/management/kubectl-apply/README.md

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ This module simplifies the following functionality:
1515
* A single URL to a manifest file. Ex.: `https://github.com/.../myrepo/manifest.yaml`.
1616

1717
> **Note:** Applying from a URL has important limitations. Please review the [Considerations & Callouts for Applying from URLs](#applying-manifests-from-urls-considerations--callouts) section below.
18-
* A single local YAML manifest file (`.yaml`). Ex.: `./manifest.yaml`.
18+
* A single local YAML manifest file (`.yaml` or `.yml`). Ex.: `./manifest.yaml`.
1919
* A template file (`.tftpl`) to generate a manifest. Ex.: `./template.yaml.tftpl`. You can pass the variables to format the template file in `template_vars`.
20-
* A directory containing multiple YAML or template files. Ex: `./manifests/`. You can pass the variables to format the template files in `template_vars`.
20+
* A directory containing multiple YAML or template files. Ex.: `./manifests/` or `./manifests` (the trailing slash is optional). You can pass the variables to format the template files in `template_vars`. For security and stability, the module only processes files with `.yaml`, `.yml`, or `.tftpl` extensions; other files in the directory (such as `README.md`) are automatically ignored.
2121

2222
#### Manifest Example
2323

@@ -115,6 +115,17 @@ The `path` field accepts a template file. You will need to provide variables for
115115

116116
## Callouts
117117

118+
### Helm-based Manifest Application
119+
120+
#### 1. Large Manifests and CRDs
121+
Helm stores the entire release state (including the generated manifests) as a standard Kubernetes Secret in the release namespace. Before storing the state, Helm runs the YAML through [GZIP compression and base64 encoding](https://helm.sh/docs/topics/kubernetes_apis/#:~:text=The%20manifest%20is,of%20the%20release.). As a result, the practical size limit is that of a Kubernetes Secret (about 1 MiB for the compressed, encoded release) rather than the much smaller annotation limit that client-side `kubectl apply` runs into (the `metadata.annotations: Too long` error). This allows very large manifests and complex CRDs to be deployed without requiring Server-Side Apply (SSA). The same behaviour applies when deploying through Terraform because the [Terraform Helm Provider](https://github.com/hashicorp/terraform-provider-helm) directly imports the official [Helm Go SDK](https://github.com/helm/helm/tree/main/pkg/action).
122+
123+
#### 2. Release Suffixes
124+
The module introduces a `random_id` to generate a unique 4-byte suffix for each Helm release (e.g., `manifest-apply-ceab0dfc-0`). This prevents name collisions when multiple module instances (e.g., `gke-cluster` and `gke-node-pool`) instantiate the `kubectl-apply` source simultaneously within the same blueprint. This ID is stored in the Terraform state, ensuring the release name remains stable across re-deployments.
125+
126+
#### 3. Re-deployment Conflicts
127+
If a deployment fails, the `atomic = true` setting ensures that Helm automatically rolls back the release, preventing the cluster from being left in a "half-applied" state. If you encounter persistent conflicts during re-deployment due to immutable fields, you may need to manually delete the resource or the Helm release before re-applying.
128+
118129
### Applying Manifests from URLs: Considerations & Callouts
119130

120131
While this module supports applying manifests directly from remote `http://` or `https://` URLs, this method introduces complexities not present when using local files. For production environments, we recommend sourcing manifests from local paths or a version-controlled Git repository. Moreover, this method will be deprecated soon, so we recommend using other methods to source manifests.
@@ -142,7 +153,6 @@ To ensure a reliable deployment, you must manually enforce the correct order of
142153
apply_manifests:
143154
# This manifest contains the CRDs for Kueue
144155
- source: "https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/develop/modules/management/kubectl-apply/manifests/kueue-v0.11.4.yaml"
145-
server_side_apply: true
146156
```
147157

148158
2. **Run the deployment** (`gcluster deploy` or `terraform apply`).
@@ -156,26 +166,14 @@ To ensure a reliable deployment, you must manually enforce the correct order of
156166
apply_manifests:
157167
# The CRD manifest is still present
158168
- source: "https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/develop/modules/management/kubectl-apply/manifests/kueue-v0.11.4.yaml"
159-
server_side_apply: true
160169
161170
# Now, add your configuration manifest
162171
- source: "https://gist.githubusercontent.com/YourUser/..." # Your configuration URL
163-
server_side_apply: true
164172
```
165173

166174
4. **Run the deployment command again.** Since the CRDs are now guaranteed to exist in the cluster, this second apply will succeed reliably.
167175

168-
#### **2. Large Manifests (CRDs)**
169-
170-
* **Issue:** Applying very large manifests can fail with a `metadata.annotations: Too long` error.
171-
* **Solution:** Enable Server-Side Apply by setting `server_side_apply: true` for the manifest entry.
172-
173-
#### **3. Conflicts on Re-application**
174-
175-
* **Issue:** Re-running a deployment after a partial failure can cause server-side apply field manager `conflicts`.
176-
* **Solution:** Forcibly take ownership of the resource fields by setting `force_conflicts: true`.
177-
178-
#### **4. Terraform Template Files (`.tftpl`)**
176+
#### **2. Terraform Template Files (`.tftpl`)**
179177

180178
* **Limitation:** This module **cannot** render a template file (`.tftpl`) when sourced from a remote URL.
181179
* **Workaround:** You must render the template into a pure YAML file locally, host that rendered file at a URL, and provide the URL of the rendered file in your blueprint.
@@ -206,13 +204,15 @@ limitations under the License.
206204
| <a name="requirement_helm"></a> [helm](#requirement\_helm) | ~> 2.17 |
207205
| <a name="requirement_http"></a> [http](#requirement\_http) | ~> 3.0 |
208206
| <a name="requirement_kubectl"></a> [kubectl](#requirement\_kubectl) | >= 1.7.0 |
207+
| <a name="requirement_random"></a> [random](#requirement\_random) | >= 2.1 |
209208

210209
## Providers
211210

212211
| Name | Version |
213212
|------|---------|
214213
| <a name="provider_google"></a> [google](#provider\_google) | >= 7.2 |
215214
| <a name="provider_http"></a> [http](#provider\_http) | ~> 3.0 |
215+
| <a name="provider_random"></a> [random](#provider\_random) | >= 2.1 |
216216
| <a name="provider_terraform"></a> [terraform](#provider\_terraform) | n/a |
217217

218218
## Modules
@@ -226,12 +226,13 @@ limitations under the License.
226226
| <a name="module_install_jobset"></a> [install\_jobset](#module\_install\_jobset) | ./helm_install | n/a |
227227
| <a name="module_install_kueue"></a> [install\_kueue](#module\_install\_kueue) | ./helm_install | n/a |
228228
| <a name="module_install_nvidia_dra_driver"></a> [install\_nvidia\_dra\_driver](#module\_install\_nvidia\_dra\_driver) | ./helm_install | n/a |
229-
| <a name="module_kubectl_apply_manifests"></a> [kubectl\_apply\_manifests](#module\_kubectl\_apply\_manifests) | ./kubectl | n/a |
229+
| <a name="module_kubectl_apply_manifests"></a> [kubectl\_apply\_manifests](#module\_kubectl\_apply\_manifests) | ./helm_install | n/a |
230230

231231
## Resources
232232

233233
| Name | Type |
234234
|------|------|
235+
| [random_id.release_suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource |
235236
| [terraform_data.gib_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
236237
| [terraform_data.initial_gib_version](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
237238
| [terraform_data.jobset_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
@@ -244,7 +245,7 @@ limitations under the License.
244245

245246
| Name | Description | Type | Default | Required |
246247
|------|-------------|------|---------|:--------:|
247-
| <a name="input_apply_manifests"></a> [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md).<br/> NOTE: The `enable` input acts as a FF to apply a manifest or not. By default it is always set to `true`. | <pre>list(object({<br/> enable = optional(bool, true)<br/> content = optional(string, null)<br/> source = optional(string, null)<br/> template_vars = optional(map(any), null)<br/> server_side_apply = optional(bool, false)<br/> wait_for_rollout = optional(bool, true)<br/> }))</pre> | `[]` | no |
248+
| <a name="input_apply_manifests"></a> [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to the GKE cluster using helm\_install. For more details on the underlying deployment mechanism, see the [helm\_install module](helm\_install/README.md). The `enable` input acts as a feature flag that controls whether a manifest is applied; it defaults to `true`. | <pre>list(object({<br/> enable = optional(bool, true)<br/> content = optional(string, null)<br/> source = optional(string, null)<br/> template_vars = optional(map(any), null)<br/> wait_for_rollout = optional(bool, true)<br/> namespace = optional(string, null)<br/> }))</pre> | `[]` | no |
248249
| <a name="input_asapd_lite"></a> [asapd\_lite](#input\_asapd\_lite) | Install the asapd-lite daemonset for A4X-Max Bare Metal. | <pre>object({<br/> install = bool<br/> config_path = string<br/> })</pre> | <pre>{<br/> "config_path": "",<br/> "install": false<br/>}</pre> | no |
249250
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects/<project\_id>/locations/<region>/clusters/<name>. | `string` | n/a | yes |
250251
| <a name="input_gib"></a> [gib](#input\_gib) | Install the NCCL gIB plugin | <pre>object({<br/> install = bool<br/> path = string<br/> template_vars = object({<br/> image = optional(string, "us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib")<br/> version = string<br/> node_affinity = optional(any, {<br/> requiredDuringSchedulingIgnoredDuringExecution = {<br/> nodeSelectorTerms = [{<br/> matchExpressions = [{<br/> key = "cloud.google.com/gke-gpu",<br/> operator = "In",<br/> values = ["true"]<br/> }]<br/> }]<br/> }<br/> })<br/> accelerator_count = number<br/> max_unavailable = optional(string, "50%")<br/> })<br/> })</pre> | <pre>{<br/> "install": false,<br/> "path": "",<br/> "template_vars": {<br/> "accelerator_count": 0,<br/> "version": ""<br/> }<br/>}</pre> | no |

modules/management/kubectl-apply/helm_install/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ No modules.
4343
| <a name="input_keyring"></a> [keyring](#input\_keyring) | Location of public keys used for verification ('helm install --keyring'). Used if 'verify' is true. | `string` | `null` | no |
4444
| <a name="input_lint"></a> [lint](#input\_lint) | Run the helm chart linter during the plan ('helm lint'). | `bool` | `false` | no |
4545
| <a name="input_max_history"></a> [max\_history](#input\_max\_history) | Limit the maximum number of revisions saved per release ('helm upgrade --history-max'). 0 for no limit. | `number` | `null` | no |
46-
| <a name="input_namespace"></a> [namespace](#input\_namespace) | Kubernetes namespace to install the Helm release into. | `string` | `"default"` | no |
46+
| <a name="input_namespace"></a> [namespace](#input\_namespace) | Kubernetes namespace to install the Helm release into. | `string` | `null` | no |
4747
| <a name="input_pass_credentials"></a> [pass\_credentials](#input\_pass\_credentials) | Pass credentials to all domains ('helm install --pass-credentials'). Use with caution. | `bool` | `false` | no |
4848
| <a name="input_postrender"></a> [postrender](#input\_postrender) | Configuration for a post-rendering executable ('helm install --post-renderer'). Should be an object with 'binary\_path' attribute. | <pre>object({<br/> binary_path = string # Path to the post-renderer executable<br/> })</pre> | `null` | no |
4949
| <a name="input_recreate_pods"></a> [recreate\_pods](#input\_recreate\_pods) | Perform pods restart for the resource if applicable ('helm upgrade --recreate-pods'). Note: This flag is deprecated in Helm CLI v3 itself. | `bool` | `false` | no |

modules/management/kubectl-apply/helm_install/variables.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ variable "devel" {
4848
variable "namespace" {
4949
description = "Kubernetes namespace to install the Helm release into."
5050
type = string
51-
default = "default"
51+
default = null
5252
}
5353

5454
variable "create_namespace" {

0 commit comments

Comments
 (0)