Migrate kubectl_apply_manifest module to helm (#5282)

agrawalkhushi18 · web-flow · commit cc16e8e5e692 · 2026-03-31T11:54:01.000+05:30
The failing tests are unrelated to the changes in this PR.
diff --git a/community/examples/xpk-n2-filestore/xpk-n2-filestore.yaml b/community/examples/xpk-n2-filestore/xpk-n2-filestore.yaml
@@ -101,15 +101,10 @@ deployment_groups:
       - source: $(vars.storage_crd_path)
         # Server-side applies avoid last-applied-configuration and associated annotation length issues
       - source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_applicationprofiles.yaml
-        server_side_apply: true
       - source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_jobtemplates.yaml
-        server_side_apply: true
       - source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_rayclustertemplates.yaml
-        server_side_apply: true
       - source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_rayjobtemplates.yaml
-        server_side_apply: true
       - source: https://raw.githubusercontent.com/kubernetes-sigs/kjob/$(vars.kjob_version)/config/crd/bases/kjobctl.x-k8s.io_volumebundles.yaml
-        server_side_apply: true
 
   - id: homefs
     source: modules/file-system/filestore
diff --git a/community/modules/scheduler/slinky/main.tf b/community/modules/scheduler/slinky/main.tf
@@ -49,6 +49,7 @@ resource "helm_release" "cert_manager" {
   version          = var.cert_manager_chart_version
   namespace        = "cert-manager"
   create_namespace = true
+  wait             = true
 
   values = concat(
     [yamlencode({
@@ -75,6 +76,7 @@ resource "helm_release" "slurm_operator" {
   version          = var.slurm_operator_chart_version
   namespace        = var.slurm_operator_namespace
   create_namespace = true
+  wait             = true
 
   # The Cert Manager webhook deployment must be running to provision the Operator
   depends_on = [
diff --git a/community/modules/scheduler/slinky/outputs.tf b/community/modules/scheduler/slinky/outputs.tf
@@ -15,9 +15,21 @@
 output "slurm_namespace" {
   description = "namespace for the slurm chart"
   value       = var.slurm_namespace
+  depends_on = [
+    helm_release.cert_manager,
+    helm_release.slurm_operator,
+    helm_release.slurm,
+    helm_release.prometheus
+  ]
 }
 
 output "slurm_operator_namespace" {
   description = "namespace for the slinky operator chart"
   value       = var.slurm_operator_namespace
+  depends_on = [
+    helm_release.cert_manager,
+    helm_release.slurm_operator,
+    helm_release.slurm,
+    helm_release.prometheus
+  ]
 }
diff --git a/examples/gke-h4d/gke-h4d.yaml b/examples/gke-h4d/gke-h4d.yaml
@@ -182,7 +182,6 @@ deployment_groups:
         install: true
       apply_manifests:
       - source: "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.7.0/deploy/v2beta1/mpi-operator.yaml"
-        server_side_apply: true
 
   # Filestore
   - id: filestore
diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf
@@ -26,9 +26,9 @@ locals {
     "a3-highgpu-8g" = {
       # Manifest to be installed for enabling TCPX on a3-highgpu-8g machines
       gpu_direct_manifests = [
-        "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-tcpx-installer.yaml",      # nccl_plugin v3.1.9 for tcpx
-        "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-config.yaml",              # nccl_configmap
-        "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin
+        "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/204a57cb807ec5f440811607cead422e3c65d987/gpudirect-tcpx/nccl-tcpx-installer.yaml",      # nccl_plugin v3.1.9 for tcpx
+        "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/204a57cb807ec5f440811607cead422e3c65d987/gpudirect-tcpx/nccl-config.yaml",              # nccl_configmap
+        "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/204a57cb807ec5f440811607cead422e3c65d987/nri_device_injector/nri-device-injector.yaml", # nri_plugin
       ]
       updated_workload_path   = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml")
       rxdm_version            = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9
diff --git a/modules/management/kubectl-apply/README.md b/modules/management/kubectl-apply/README.md
@@ -15,9 +15,9 @@ This module simplifies the following functionality:
     * A single URL to a manifest file. Ex.: `https://github.com/.../myrepo/manifest.yaml`.
 
     > **Note:** Applying from a URL has important limitations. Please review the [Considerations & Callouts for Applying from URLs](#applying-manifests-from-urls-considerations--callouts) section below.
-    * A single local YAML manifest file (`.yaml`). Ex.: `./manifest.yaml`.
+    * A single local YAML manifest file (`.yaml` or `.yml`). Ex.: `./manifest.yaml`.
     * A template file (`.tftpl`) to generate a manifest. Ex.: `./template.yaml.tftpl`. You can pass the variables to format the template file in `template_vars`.
-    * A directory containing multiple YAML or template files. Ex: `./manifests/`. You can pass the variables to format the template files in `template_vars`.
+    * A directory containing multiple YAML or template files. Ex.: `./manifests/` or `./manifests`. The module correctly identifies directories even if the trailing slash is omitted. For security and stability, the module only processes files with `.yaml`, `.yml`, or `.tftpl` extensions. Other files in the directory (such as `README.md`) are automatically ignored.
 
 #### Manifest Example
 
@@ -115,6 +115,17 @@ The `path` field accepts a template file. You will need to provide variables for
 
 ## Callouts
 
+### Helm-based Manifest Application
+
+#### 1. Large Manifests and CRDs
+Helm stores the entire release state (including the generated manifests) as a standard Kubernetes Secret in the release namespace. Before storing the state, Helm runs the YAML through [GZIP compression and base64 encoding](https://helm.sh/docs/topics/kubernetes_apis/#:~:text=The%20manifest%20is,of%20the%20release.). This effectively raises the limit to ~1MB or more, allowing for the deployment of very large manifests and complex CRDs without requiring Server-Side Apply (SSA). This behaviour is guaranteed because the [Terraform Helm Provider](https://github.com/hashicorp/terraform-provider-helm) directly imports the official [Helm Go SDK](https://github.com/helm/helm/tree/main/pkg/action).
+
+#### 2. Release Suffixes
+The module introduces a `random_id` to generate a unique 4-byte suffix for each Helm release (e.g., `manifest-apply-ceab0dfc-0`). This prevents name collisions when multiple module instances (e.g., `gke-cluster` and `gke-node-pool`) instantiate the `kubectl-apply` source simultaneously within the same blueprint. This ID is stored in the Terraform state, ensuring the release name remains stable across re-deployments.
+
+#### 3. Re-deployment Conflicts
+If a deployment fails, the `atomic = true` setting ensures that Helm automatically rolls back the release, preventing the cluster from being left in a "half-applied" state. If you encounter persistent conflicts during re-deployment due to immutable fields, you may need to manually delete the resource or the Helm release before re-applying.
+
 ### Applying Manifests from URLs: Considerations & Callouts
 
 While this module supports applying manifests directly from remote `http://` or `https://` URLs, this method introduces complexities not present when using local files. For production environments, we recommend sourcing manifests from local paths or a version-controlled Git repository. Moreover, this method will be deprecated soon. Hence, we recommend using other methods to source manifests.
@@ -142,7 +153,6 @@ To ensure a reliable deployment, you must manually enforce the correct order of
       apply_manifests:
       # This manifest contains the CRDs for Kueue
       - source: "https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/develop/modules/management/kubectl-apply/manifests/kueue-v0.11.4.yaml"
-        server_side_apply: true
     ```
 
 2. **Run the deployment** (`gcluster deploy` or `terraform apply`).
@@ -156,26 +166,14 @@ To ensure a reliable deployment, you must manually enforce the correct order of
       apply_manifests:
       # The CRD manifest is still present
       - source: "https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/develop/modules/management/kubectl-apply/manifests/kueue-v0.11.4.yaml"
-        server_side_apply: true
 
       # Now, add your configuration manifest
       - source: "https://gist.githubusercontent.com/YourUser/..." # Your configuration URL
-        server_side_apply: true
     ```
 
 4. **Run the deployment command again.** Since the CRDs are now guaranteed to exist in the cluster, this second apply will succeed reliably.
 
-#### **2. Large Manifests (CRDs)**
-
-* **Issue:** Applying very large manifests can fail with a `metadata.annotations: Too long` error.
-* **Solution:** Enable Server-Side Apply by setting `server_side_apply: true` for the manifest entry.
-
-#### **3. Conflicts on Re-application**
-
-* **Issue:** Re-running a deployment after a partial failure can cause server-side apply field manager `conflicts`.
-* **Solution:** Forcibly take ownership of the resource fields by setting `force_conflicts: true`.
-
-#### **4. Terraform Template Files (`.tftpl`)**
+#### **2. Terraform Template Files (`.tftpl`)**
 
 * **Limitation:** This module **cannot** render a template file (`.tftpl`) when sourced from a remote URL.
 * **Workaround:** You must render the template into a pure YAML file locally, host that rendered file at a URL, and provide the URL of the rendered file in your blueprint.
@@ -206,13 +204,15 @@ limitations under the License.
 | <a name="requirement_helm"></a> [helm](#requirement\_helm) | ~> 2.17 |
 | <a name="requirement_http"></a> [http](#requirement\_http) | ~> 3.0 |
 | <a name="requirement_kubectl"></a> [kubectl](#requirement\_kubectl) | >= 1.7.0 |
+| <a name="requirement_random"></a> [random](#requirement\_random) | >= 2.1 |
 
 ## Providers
 
 | Name | Version |
 |------|---------|
 | <a name="provider_google"></a> [google](#provider\_google) | >= 7.2 |
 | <a name="provider_http"></a> [http](#provider\_http) | ~> 3.0 |
+| <a name="provider_random"></a> [random](#provider\_random) | >= 2.1 |
 | <a name="provider_terraform"></a> [terraform](#provider\_terraform) | n/a |
 
 ## Modules
@@ -226,12 +226,13 @@ limitations under the License.
 | <a name="module_install_jobset"></a> [install\_jobset](#module\_install\_jobset) | ./helm_install | n/a |
 | <a name="module_install_kueue"></a> [install\_kueue](#module\_install\_kueue) | ./helm_install | n/a |
 | <a name="module_install_nvidia_dra_driver"></a> [install\_nvidia\_dra\_driver](#module\_install\_nvidia\_dra\_driver) | ./helm_install | n/a |
-| <a name="module_kubectl_apply_manifests"></a> [kubectl\_apply\_manifests](#module\_kubectl\_apply\_manifests) | ./kubectl | n/a |
+| <a name="module_kubectl_apply_manifests"></a> [kubectl\_apply\_manifests](#module\_kubectl\_apply\_manifests) | ./helm_install | n/a |
 
 ## Resources
 
 | Name | Type |
 |------|------|
+| [random_id.release_suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource |
 | [terraform_data.gib_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
 | [terraform_data.initial_gib_version](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
 | [terraform_data.jobset_validations](https://registry.terraform.io/providers/hashicorp/terraform/latest/docs/resources/data) | resource |
@@ -244,7 +245,7 @@ limitations under the License.
 
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
-| <a name="input_apply_manifests"></a> [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md).<br/> NOTE: The `enable` input acts as a FF to apply a manifest or not. By default it is always set to `true`. | <pre>list(object({<br/>    enable            = optional(bool, true)<br/>    content           = optional(string, null)<br/>    source            = optional(string, null)<br/>    template_vars     = optional(map(any), null)<br/>    server_side_apply = optional(bool, false)<br/>    wait_for_rollout  = optional(bool, true)<br/>  }))</pre> | `[]` | no |
+| <a name="input_apply_manifests"></a> [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to the GKE cluster using helm\_install. For more details on the underlying deployment mechanism, see the [helm\_install module](helm\_install/README.md). The `enable` input acts as a FF to apply a manifest or not. By default it is always set to `true`. | <pre>list(object({<br/>    enable           = optional(bool, true)<br/>    content          = optional(string, null)<br/>    source           = optional(string, null)<br/>    template_vars    = optional(map(any), null)<br/>    wait_for_rollout = optional(bool, true)<br/>    namespace        = optional(string, null)<br/>  }))</pre> | `[]` | no |
 | <a name="input_asapd_lite"></a> [asapd\_lite](#input\_asapd\_lite) | Install the asapd-lite daemonset for A4X-Max Bare Metal. | <pre>object({<br/>    install     = bool<br/>    config_path = string<br/>  })</pre> | <pre>{<br/>  "config_path": "",<br/>  "install": false<br/>}</pre> | no |
 | <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects/<project\_id>/locations/<region>/clusters/<name>. | `string` | n/a | yes |
 | <a name="input_gib"></a> [gib](#input\_gib) | Install the NCCL gIB plugin | <pre>object({<br/>    install = bool<br/>    path    = string<br/>    template_vars = object({<br/>      image   = optional(string, "us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib")<br/>      version = string<br/>      node_affinity = optional(any, {<br/>        requiredDuringSchedulingIgnoredDuringExecution = {<br/>          nodeSelectorTerms = [{<br/>            matchExpressions = [{<br/>              key      = "cloud.google.com/gke-gpu",<br/>              operator = "In",<br/>              values   = ["true"]<br/>            }]<br/>          }]<br/>        }<br/>      })<br/>      accelerator_count = number<br/>      max_unavailable   = optional(string, "50%")<br/>    })<br/>  })</pre> | <pre>{<br/>  "install": false,<br/>  "path": "",<br/>  "template_vars": {<br/>    "accelerator_count": 0,<br/>    "version": ""<br/>  }<br/>}</pre> | no |
diff --git a/modules/management/kubectl-apply/helm_install/README.md b/modules/management/kubectl-apply/helm_install/README.md
@@ -43,7 +43,7 @@ No modules.
 | <a name="input_keyring"></a> [keyring](#input\_keyring) | Location of public keys used for verification ('helm install --keyring'). Used if 'verify' is true. | `string` | `null` | no |
 | <a name="input_lint"></a> [lint](#input\_lint) | Run the helm chart linter during the plan ('helm lint'). | `bool` | `false` | no |
 | <a name="input_max_history"></a> [max\_history](#input\_max\_history) | Limit the maximum number of revisions saved per release ('helm upgrade --history-max'). 0 for no limit. | `number` | `null` | no |
-| <a name="input_namespace"></a> [namespace](#input\_namespace) | Kubernetes namespace to install the Helm release into. | `string` | `"default"` | no |
+| <a name="input_namespace"></a> [namespace](#input\_namespace) | Kubernetes namespace to install the Helm release into. | `string` | `null` | no |
 | <a name="input_pass_credentials"></a> [pass\_credentials](#input\_pass\_credentials) | Pass credentials to all domains ('helm install --pass-credentials'). Use with caution. | `bool` | `false` | no |
 | <a name="input_postrender"></a> [postrender](#input\_postrender) | Configuration for a post-rendering executable ('helm install --post-renderer'). Should be an object with 'binary\_path' attribute. | <pre>object({<br/>    binary_path = string # Path to the post-renderer executable<br/>  })</pre> | `null` | no |
 | <a name="input_recreate_pods"></a> [recreate\_pods](#input\_recreate\_pods) | Perform pods restart for the resource if applicable ('helm upgrade --recreate-pods'). Note: This flag is deprecated in Helm CLI v3 itself. | `bool` | `false` | no |
diff --git a/modules/management/kubectl-apply/helm_install/variables.tf b/modules/management/kubectl-apply/helm_install/variables.tf
@@ -48,7 +48,7 @@ variable "devel" {
 variable "namespace" {
   description = "Kubernetes namespace to install the Helm release into."
   type        = string
-  default     = "default"
+  default     = null
 }
 
 variable "create_namespace" {
diff --git a/modules/management/kubectl-apply/main.tf b/modules/management/kubectl-apply/main.tf
diff --git a/modules/management/kubectl-apply/variables.tf b/modules/management/kubectl-apply/variables.tf
diff --git a/modules/management/kubectl-apply/versions.tf b/modules/management/kubectl-apply/versions.tf
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-a3-high.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-a3-high.yml

Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,7 @@ variable "devel" {`
`48`	`48`	`variable "namespace" {`
`49`	`49`	`description = "Kubernetes namespace to install the Helm release into."`
`50`	`50`	`type = string`
`51`		`- default = "default"`
	`51`	`+ default = null`
`52`	`52`	`}`
`53`	`53`
`54`	`54`	`variable "create_namespace" {`