diff --git a/examples/gke-a3-megagpu/chs-cronjob.yaml.tftpl b/examples/gke-a3-megagpu/chs-cronjob.yaml.tftpl deleted file mode 100644 index 413002bbf1..0000000000 --- a/examples/gke-a3-megagpu/chs-cronjob.yaml.tftpl +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: batch/v1 -kind: CronJob -metadata: - name: cluster-health-scanner-cronjob -spec: - schedule: "${cronjob_schedule}" - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 3 - failedJobsHistoryLimit: 1 - suspend: false - jobTemplate: - spec: - template: - spec: - serviceAccountName: workload-identity-k8s-sa - containers: - - name: chs-runner - image: python:3.11-slim-buster - imagePullPolicy: Always - command: - - /bin/bash - - -c - - | - set -ex - set -x - apt-get update && apt-get install -y git curl gnupg -y - git clone https://github.com/GoogleCloudPlatform/cluster-health-scanner - cd cluster-health-scanner - apt-get install -y apt-transport-https ca-certificates - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list - apt-get update - apt-get install -y google-cloud-cli kubectl - apt-get install -y google-cloud-cli-gke-gcloud-auth-plugin - curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - pip3 install -r cli/requirements.txt - gcloud container clusters get-credentials ${deployment_name} --region ${region} --project ${project_id} - OUTPUT_DIR="/mnt/output" - mkdir -p $OUTPUT_DIR - TIMESTAMP="`date "+%Y-%m-%d %H:%M:%S"`" - OUTPUT_FILENAME="${deployment_name}_healthscan_result_$TIMESTAMP.txt" - FULL_OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FILENAME" - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c gpu --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c nccl --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c straggler --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c neper --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c tinymax --run_only_on_available_nodes - #python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c status --run_only_on_available_nodes > "$FULL_OUTPUT_PATH" 2>&1 - kubectl get nodes -o custom-columns="NODE:.metadata.name,NCCL_MARK:.metadata.labels.aiinfra/nccl-healthcheck-test,NCCL_BANDWIDTH:.metadata.labels.aiinfra/nccl-healthcheck-bandwidth,NCCL_RESULT:.metadata.labels.aiinfra/nccl-healthcheck-result,NCCL_RUNTIME:.metadata.labels.aiinfra/nccl-healthcheck-runtime-sec,TINYMAX_MARK:.metadata.labels.aiinfra/tinymax-healthcheck-test,TINYMAX_RESULT:.metadata.labels.aiinfra/tinymax-healthcheck-result,TINYMAX_RUNTIME:.metadata.labels.aiinfra/tinymax-healthcheck-runtime-sec,GPU_MARK:.metadata.labels.aiinfra/gpu-healthcheck-test,GPU_RESULT:.metadata.labels.aiinfra/gpu-healthcheck-result,GPU_RUNTIME:.metadata.labels.aiinfra/gpu-healthcheck-runtime-sec" > "$FULL_OUTPUT_PATH" 2>&1 - echo "Health scan outputs saved to $OUTPUT_DIR" - echo "Final output file: $OUTPUT_FILENAME" - volumeMounts: - - name: ${gcs_bucket} - mountPath: /mnt/output - volumes: - - name: ${gcs_bucket} - persistentVolumeClaim: - claimName: ${gcs_pvc} - restartPolicy: Never - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "components.gke.io/gke-managed-components" - operator: "Exists" - effect: "NoSchedule" - backoffLimit: 0 diff --git a/examples/gke-a3-megagpu/chs-permissions.yaml.tftpl b/examples/gke-a3-megagpu/chs-permissions.yaml.tftpl deleted file mode 100644 index a9a9f0a170..0000000000 --- a/examples/gke-a3-megagpu/chs-permissions.yaml.tftpl +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: workload-identity-k8s-sa - namespace: default - annotations: - iam.gke.io/gcp-service-account: ${deployment_name}-gke-wl-sa@${project_id}.iam.gserviceaccount.com ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: cluster-health-scanner-job-role -rules: - - apiGroups: [""] - resources: - - "pods" - - "pods/log" - - "pods/exec" - - "nodes" - - "events" - - "services" - - "secrets" - - "configmaps" - - "serviceaccounts" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] - - - apiGroups: ["apps"] - resources: - - "daemonsets" - - "deployments" - - "replicasets" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] - - - apiGroups: ["batch"] - resources: - - "jobs" - - "jobs/status" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] - - - apiGroups: ["rbac.authorization.k8s.io"] - resources: - - "clusterrolebindings" - - "clusterroles" - - "roles" - - "rolebindings" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: cluster-health-scanner-job-binding -subjects: -- kind: ServiceAccount - name: workload-identity-k8s-sa - namespace: default -roleRef: - kind: ClusterRole - name: cluster-health-scanner-job-role - apiGroup: rbac.authorization.k8s.io diff --git a/examples/gke-a3-megagpu/chs-pvc.yaml.tftpl b/examples/gke-a3-megagpu/chs-pvc.yaml.tftpl deleted file mode 100644 index d9618f058c..0000000000 --- a/examples/gke-a3-megagpu/chs-pvc.yaml.tftpl +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: ${pvc_name} -spec: - accessModes: - - ${access_mode} - resources: - requests: - storage: ${capacity} - storageClassName: ${storage_class_name} - diff --git a/examples/gke-a3-megagpu/gke-a3-megagpu-deployment.yaml b/examples/gke-a3-megagpu/gke-a3-megagpu-deployment.yaml index 53a75766d5..dadce78f31 100644 --- a/examples/gke-a3-megagpu/gke-a3-megagpu-deployment.yaml +++ b/examples/gke-a3-megagpu/gke-a3-megagpu-deployment.yaml @@ -45,6 +45,3 @@ vars: # To target a BLOCK_NAME, the name of the extended reservation # can be inputted as /reservationBlocks/ reservation: RESERVATION_NAME - - enable_periodic_health_checks: false # Make this true to run CHS (healthchecks) - health_check_schedule: "0 0 * * 0" # Run the health check at 12:00 AM (midnight) every Sunday diff --git a/examples/gke-a3-megagpu/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu/gke-a3-megagpu.yaml index ca639e164c..6e3891a20e 100644 --- a/examples/gke-a3-megagpu/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu/gke-a3-megagpu.yaml @@ -50,15 +50,6 @@ vars: accelerator_type: nvidia-h100-mega-80gb version_prefix: "1.32." - enable_periodic_health_checks: false # Make this true to run CHS (healthchecks) - health_check_schedule: "0 0 * * 0" # Run the health check at 12:00 AM (midnight) every Sunday - - permissions_file_staged_path: $(ghpc_stage("./chs-permissions.yaml.tftpl")) - chs_output_bucket_name: chs-result - chs_pvc_claim_name: chs-output-pvc - chs_cronjob_rendered_path: $(ghpc_stage("./chs-cronjob.yaml.tftpl")) - chs_pvc_rendered_path: $(ghpc_stage("./chs-pvc.yaml.tftpl")) - deployment_groups: - group: primary modules: @@ -149,28 +140,6 @@ deployment_groups: use: [gke_cluster] settings: apply_manifests: - - source: $(vars.permissions_file_staged_path) - enable: $(vars.enable_periodic_health_checks) - template_vars: - project_id: $(vars.project_id) - deployment_name: $(vars.deployment_name) - - source: $(vars.chs_pvc_rendered_path) - enable: $(vars.enable_periodic_health_checks) - template_vars: - pvc_name: $(vars.chs_pvc_claim_name) - access_mode: ReadWriteOnce - capacity: 1Gi - storage_class_name: standard-rwo - - source: $(vars.chs_cronjob_rendered_path) - enable: $(vars.enable_periodic_health_checks) - template_vars: - project_id: $(vars.project_id) - deployment_name: $(vars.deployment_name) - region: $(vars.region) - machine_type: a3-megagpu-8g - gcs_bucket: $(vars.chs_output_bucket_name) - gcs_pvc: $(vars.chs_pvc_claim_name) - cronjob_schedule: $(vars.health_check_schedule) kueue: install: true wait: true diff --git a/examples/gke-a3-megagpu/read-chs-logs-job.yaml b/examples/gke-a3-megagpu/read-chs-logs-job.yaml deleted file mode 100644 index 85d40387ff..0000000000 --- a/examples/gke-a3-megagpu/read-chs-logs-job.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: batch/v1 -kind: Job -metadata: - name: chs-output-reader-job - namespace: default -spec: - template: - spec: - serviceAccountName: workload-identity-k8s-sa - containers: - - name: reader-container - image: busybox - imagePullPolicy: IfNotPresent - command: - - /bin/sh - - -c - - | - echo "--- Listing all contents of /mnt/output ---" - ls -la /mnt/output - - echo "" - echo "--- Attempting to read the latest health scan result file ---" - - # Find the latest .txt file in /mnt/output using ls -t (sort by time, newest first) - # and head -1 to get only the first (newest) result. - LATEST_FILE=$(ls -t /mnt/output/*_healthscan_result_*.txt 2>/dev/null | head -n 1) - - if [ -n "$LATEST_FILE" ]; then # Check if LATEST_FILE is not empty - echo "Found latest health scan file: $LATEST_FILE" - echo "" - cat "$LATEST_FILE" - else - echo "No 'healthscan_result_*.txt' files found in /mnt/output." - fi - - echo "" - echo "--- Reading complete. Pod will exit. ---" - volumeMounts: - - name: chs-result - mountPath: /mnt/output - volumes: - - name: chs-result - persistentVolumeClaim: - claimName: chs-output-pvc - restartPolicy: Never - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "components.gke.io/gke-managed-components" - operator: "Exists" - effect: "NoSchedule" - backoffLimit: 0 diff --git a/examples/gke-a3-ultragpu/chs-cronjob.yaml.tftpl b/examples/gke-a3-ultragpu/chs-cronjob.yaml.tftpl deleted file mode 100644 index 413002bbf1..0000000000 --- a/examples/gke-a3-ultragpu/chs-cronjob.yaml.tftpl +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: batch/v1 -kind: CronJob -metadata: - name: cluster-health-scanner-cronjob -spec: - schedule: "${cronjob_schedule}" - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 3 - failedJobsHistoryLimit: 1 - suspend: false - jobTemplate: - spec: - template: - spec: - serviceAccountName: workload-identity-k8s-sa - containers: - - name: chs-runner - image: python:3.11-slim-buster - imagePullPolicy: Always - command: - - /bin/bash - - -c - - | - set -ex - set -x - apt-get update && apt-get install -y git curl gnupg -y - git clone https://github.com/GoogleCloudPlatform/cluster-health-scanner - cd cluster-health-scanner - apt-get install -y apt-transport-https ca-certificates - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list - apt-get update - apt-get install -y google-cloud-cli kubectl - apt-get install -y google-cloud-cli-gke-gcloud-auth-plugin - curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - pip3 install -r cli/requirements.txt - gcloud container clusters get-credentials ${deployment_name} --region ${region} --project ${project_id} - OUTPUT_DIR="/mnt/output" - mkdir -p $OUTPUT_DIR - TIMESTAMP="`date "+%Y-%m-%d %H:%M:%S"`" - OUTPUT_FILENAME="${deployment_name}_healthscan_result_$TIMESTAMP.txt" - FULL_OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FILENAME" - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c gpu --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c nccl --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c straggler --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c neper --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c tinymax --run_only_on_available_nodes - #python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c status --run_only_on_available_nodes > "$FULL_OUTPUT_PATH" 2>&1 - kubectl get nodes -o custom-columns="NODE:.metadata.name,NCCL_MARK:.metadata.labels.aiinfra/nccl-healthcheck-test,NCCL_BANDWIDTH:.metadata.labels.aiinfra/nccl-healthcheck-bandwidth,NCCL_RESULT:.metadata.labels.aiinfra/nccl-healthcheck-result,NCCL_RUNTIME:.metadata.labels.aiinfra/nccl-healthcheck-runtime-sec,TINYMAX_MARK:.metadata.labels.aiinfra/tinymax-healthcheck-test,TINYMAX_RESULT:.metadata.labels.aiinfra/tinymax-healthcheck-result,TINYMAX_RUNTIME:.metadata.labels.aiinfra/tinymax-healthcheck-runtime-sec,GPU_MARK:.metadata.labels.aiinfra/gpu-healthcheck-test,GPU_RESULT:.metadata.labels.aiinfra/gpu-healthcheck-result,GPU_RUNTIME:.metadata.labels.aiinfra/gpu-healthcheck-runtime-sec" > "$FULL_OUTPUT_PATH" 2>&1 - echo "Health scan outputs saved to $OUTPUT_DIR" - echo "Final output file: $OUTPUT_FILENAME" - volumeMounts: - - name: ${gcs_bucket} - mountPath: /mnt/output - volumes: - - name: ${gcs_bucket} - persistentVolumeClaim: - claimName: ${gcs_pvc} - restartPolicy: Never - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "components.gke.io/gke-managed-components" - operator: "Exists" - effect: "NoSchedule" - backoffLimit: 0 diff --git a/examples/gke-a3-ultragpu/chs-permissions.yaml.tftpl b/examples/gke-a3-ultragpu/chs-permissions.yaml.tftpl deleted file mode 100644 index a9a9f0a170..0000000000 --- a/examples/gke-a3-ultragpu/chs-permissions.yaml.tftpl +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: workload-identity-k8s-sa - namespace: default - annotations: - iam.gke.io/gcp-service-account: ${deployment_name}-gke-wl-sa@${project_id}.iam.gserviceaccount.com ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: cluster-health-scanner-job-role -rules: - - apiGroups: [""] - resources: - - "pods" - - "pods/log" - - "pods/exec" - - "nodes" - - "events" - - "services" - - "secrets" - - "configmaps" - - "serviceaccounts" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] - - - apiGroups: ["apps"] - resources: - - "daemonsets" - - "deployments" - - "replicasets" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] - - - apiGroups: ["batch"] - resources: - - "jobs" - - "jobs/status" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] - - - apiGroups: ["rbac.authorization.k8s.io"] - resources: - - "clusterrolebindings" - - "clusterroles" - - "roles" - - "rolebindings" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: cluster-health-scanner-job-binding -subjects: -- kind: ServiceAccount - name: workload-identity-k8s-sa - namespace: default -roleRef: - kind: ClusterRole - name: cluster-health-scanner-job-role - apiGroup: rbac.authorization.k8s.io diff --git a/examples/gke-a3-ultragpu/chs-pvc.yaml.tftpl b/examples/gke-a3-ultragpu/chs-pvc.yaml.tftpl deleted file mode 100644 index d9618f058c..0000000000 --- a/examples/gke-a3-ultragpu/chs-pvc.yaml.tftpl +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: ${pvc_name} -spec: - accessModes: - - ${access_mode} - resources: - requests: - storage: ${capacity} - storageClassName: ${storage_class_name} - diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml index 322425a06f..f14629f29b 100644 --- a/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml @@ -36,5 +36,3 @@ vars: # disk for each node of the system node pool. a3ultra_node_pool_disk_size_gb: A3ULTRA_NODE_POOL_DISK_SIZE_GB # the size of # disk for each node. - enable_periodic_health_checks: # Make this true to run CHS (healthchecks) - health_check_schedule: # Run the health check at 12:00 AM (midnight) every Sunday diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml index e1a09334b4..f550310b57 100644 --- a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml +++ b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml @@ -35,15 +35,6 @@ vars: accelerator_type: nvidia-h200-141gb version_prefix: "1.33." - enable_periodic_health_checks: false # Make this true to run CHS (healthchecks) - health_check_schedule: "0 0 * * 0" # Run the health check at 12:00 AM (midnight) every Sunday - - permissions_file_staged_path: $(ghpc_stage("./chs-permissions.yaml.tftpl")) - chs_output_bucket_name: chs-result - chs_pvc_claim_name: chs-output-pvc - chs_cronjob_rendered_path: $(ghpc_stage("./chs-cronjob.yaml.tftpl")) - chs_pvc_rendered_path: $(ghpc_stage("./chs-pvc.yaml.tftpl")) - # # To enable Managed-Lustre please uncomment this section and fill out the settings. # # Additionally, please uncomment the private_service_access, lustre_firewall_rule, managed-lustre and lustre-pv modules. # # Managed Lustre is only supported in specific regions and zones @@ -286,28 +277,6 @@ deployment_groups: use: [a3-ultragpu-cluster] settings: apply_manifests: - - source: $(vars.permissions_file_staged_path) - enable: $(vars.enable_periodic_health_checks) - template_vars: - project_id: $(vars.project_id) - deployment_name: $(vars.deployment_name) - - source: $(vars.chs_pvc_rendered_path) - enable: $(vars.enable_periodic_health_checks) - template_vars: - pvc_name: $(vars.chs_pvc_claim_name) - access_mode: ReadWriteOnce - capacity: 1Gi - storage_class_name: standard-rwo - - source: $(vars.chs_cronjob_rendered_path) - enable: $(vars.enable_periodic_health_checks) - template_vars: - project_id: $(vars.project_id) - deployment_name: $(vars.deployment_name) - region: $(vars.region) - machine_type: a3-ultragpu-8g - gcs_bucket: $(vars.chs_output_bucket_name) - gcs_pvc: $(vars.chs_pvc_claim_name) - cronjob_schedule: $(vars.health_check_schedule) kueue: install: true config_path: $(vars.kueue_configuration_path) diff --git a/examples/gke-a3-ultragpu/read-chs-logs-job.yaml b/examples/gke-a3-ultragpu/read-chs-logs-job.yaml deleted file mode 100644 index 85d40387ff..0000000000 --- a/examples/gke-a3-ultragpu/read-chs-logs-job.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: batch/v1 -kind: Job -metadata: - name: chs-output-reader-job - namespace: default -spec: - template: - spec: - serviceAccountName: workload-identity-k8s-sa - containers: - - name: reader-container - image: busybox - imagePullPolicy: IfNotPresent - command: - - /bin/sh - - -c - - | - echo "--- Listing all contents of /mnt/output ---" - ls -la /mnt/output - - echo "" - echo "--- Attempting to read the latest health scan result file ---" - - # Find the latest .txt file in /mnt/output using ls -t (sort by time, newest first) - # and head -1 to get only the first (newest) result. - LATEST_FILE=$(ls -t /mnt/output/*_healthscan_result_*.txt 2>/dev/null | head -n 1) - - if [ -n "$LATEST_FILE" ]; then # Check if LATEST_FILE is not empty - echo "Found latest health scan file: $LATEST_FILE" - echo "" - cat "$LATEST_FILE" - else - echo "No 'healthscan_result_*.txt' files found in /mnt/output." - fi - - echo "" - echo "--- Reading complete. Pod will exit. ---" - volumeMounts: - - name: chs-result - mountPath: /mnt/output - volumes: - - name: chs-result - persistentVolumeClaim: - claimName: chs-output-pvc - restartPolicy: Never - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "components.gke.io/gke-managed-components" - operator: "Exists" - effect: "NoSchedule" - backoffLimit: 0 diff --git a/examples/gke-a4/chs-cronjob.yaml.tftpl b/examples/gke-a4/chs-cronjob.yaml.tftpl deleted file mode 100644 index 413002bbf1..0000000000 --- a/examples/gke-a4/chs-cronjob.yaml.tftpl +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: batch/v1 -kind: CronJob -metadata: - name: cluster-health-scanner-cronjob -spec: - schedule: "${cronjob_schedule}" - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 3 - failedJobsHistoryLimit: 1 - suspend: false - jobTemplate: - spec: - template: - spec: - serviceAccountName: workload-identity-k8s-sa - containers: - - name: chs-runner - image: python:3.11-slim-buster - imagePullPolicy: Always - command: - - /bin/bash - - -c - - | - set -ex - set -x - apt-get update && apt-get install -y git curl gnupg -y - git clone https://github.com/GoogleCloudPlatform/cluster-health-scanner - cd cluster-health-scanner - apt-get install -y apt-transport-https ca-certificates - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list - apt-get update - apt-get install -y google-cloud-cli kubectl - apt-get install -y google-cloud-cli-gke-gcloud-auth-plugin - curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - pip3 install -r cli/requirements.txt - gcloud container clusters get-credentials ${deployment_name} --region ${region} --project ${project_id} - OUTPUT_DIR="/mnt/output" - mkdir -p $OUTPUT_DIR - TIMESTAMP="`date "+%Y-%m-%d %H:%M:%S"`" - OUTPUT_FILENAME="${deployment_name}_healthscan_result_$TIMESTAMP.txt" - FULL_OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FILENAME" - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c gpu --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c nccl --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c straggler --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c neper --run_only_on_available_nodes - python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c tinymax --run_only_on_available_nodes - #python3 cli/cluster_diag.py -o gke healthscan ${machine_type} -c status --run_only_on_available_nodes > "$FULL_OUTPUT_PATH" 2>&1 - kubectl get nodes -o custom-columns="NODE:.metadata.name,NCCL_MARK:.metadata.labels.aiinfra/nccl-healthcheck-test,NCCL_BANDWIDTH:.metadata.labels.aiinfra/nccl-healthcheck-bandwidth,NCCL_RESULT:.metadata.labels.aiinfra/nccl-healthcheck-result,NCCL_RUNTIME:.metadata.labels.aiinfra/nccl-healthcheck-runtime-sec,TINYMAX_MARK:.metadata.labels.aiinfra/tinymax-healthcheck-test,TINYMAX_RESULT:.metadata.labels.aiinfra/tinymax-healthcheck-result,TINYMAX_RUNTIME:.metadata.labels.aiinfra/tinymax-healthcheck-runtime-sec,GPU_MARK:.metadata.labels.aiinfra/gpu-healthcheck-test,GPU_RESULT:.metadata.labels.aiinfra/gpu-healthcheck-result,GPU_RUNTIME:.metadata.labels.aiinfra/gpu-healthcheck-runtime-sec" > "$FULL_OUTPUT_PATH" 2>&1 - echo "Health scan outputs saved to $OUTPUT_DIR" - echo "Final output file: $OUTPUT_FILENAME" - volumeMounts: - - name: ${gcs_bucket} - mountPath: /mnt/output - volumes: - - name: ${gcs_bucket} - persistentVolumeClaim: - claimName: ${gcs_pvc} - restartPolicy: Never - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "components.gke.io/gke-managed-components" - operator: "Exists" - effect: "NoSchedule" - backoffLimit: 0 diff --git a/examples/gke-a4/chs-permissions.yaml.tftpl b/examples/gke-a4/chs-permissions.yaml.tftpl deleted file mode 100644 index a9a9f0a170..0000000000 --- a/examples/gke-a4/chs-permissions.yaml.tftpl +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: workload-identity-k8s-sa - namespace: default - annotations: - iam.gke.io/gcp-service-account: ${deployment_name}-gke-wl-sa@${project_id}.iam.gserviceaccount.com ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: cluster-health-scanner-job-role -rules: - - apiGroups: [""] - resources: - - "pods" - - "pods/log" - - "pods/exec" - - "nodes" - - "events" - - "services" - - "secrets" - - "configmaps" - - "serviceaccounts" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] - - - apiGroups: ["apps"] - resources: - - "daemonsets" - - "deployments" - - "replicasets" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] - - - apiGroups: ["batch"] - resources: - - "jobs" - - "jobs/status" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] - - - apiGroups: ["rbac.authorization.k8s.io"] - resources: - - "clusterrolebindings" - - "clusterroles" - - "roles" - - "rolebindings" - verbs: ["list", "get", "create", "delete", "watch", "patch", "update"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: cluster-health-scanner-job-binding -subjects: -- kind: ServiceAccount - name: workload-identity-k8s-sa - namespace: default -roleRef: - kind: ClusterRole - name: cluster-health-scanner-job-role - apiGroup: rbac.authorization.k8s.io diff --git a/examples/gke-a4/chs-pvc.yaml.tftpl b/examples/gke-a4/chs-pvc.yaml.tftpl deleted file mode 100644 index d9618f058c..0000000000 --- a/examples/gke-a4/chs-pvc.yaml.tftpl +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: ${pvc_name} -spec: - accessModes: - - ${access_mode} - resources: - requests: - storage: ${capacity} - storageClassName: ${storage_class_name} - diff --git a/examples/gke-a4/gke-a4-deployment.yaml b/examples/gke-a4/gke-a4-deployment.yaml index a3d31aa771..10bc4d7475 100644 --- a/examples/gke-a4/gke-a4-deployment.yaml +++ b/examples/gke-a4/gke-a4-deployment.yaml @@ -51,6 +51,3 @@ vars: # The disk size of a4 node pool for this deployment. a4_node_pool_disk_size_gb: - - enable_periodic_health_checks: # Make this true to run CHS (healthchecks) - health_check_schedule: # Run the health check at 12:00 AM (midnight) every Sunday diff --git a/examples/gke-a4/gke-a4.yaml b/examples/gke-a4/gke-a4.yaml index 8c033c7caa..38ae86eac5 100644 --- a/examples/gke-a4/gke-a4.yaml +++ b/examples/gke-a4/gke-a4.yaml @@ -50,15 +50,6 @@ vars: accelerator_type: nvidia-b200 version_prefix: "1.33." - enable_periodic_health_checks: false # Make this true to run CHS (healthchecks) - health_check_schedule: "0 0 * * 0" # Run the health check at 12:00 AM (midnight) every Sunday - - permissions_file_staged_path: $(ghpc_stage("./chs-permissions.yaml.tftpl")) - chs_output_bucket_name: chs-result - chs_pvc_claim_name: chs-output-pvc - chs_cronjob_rendered_path: $(ghpc_stage("./chs-cronjob.yaml.tftpl")) - chs_pvc_rendered_path: $(ghpc_stage("./chs-pvc.yaml.tftpl")) - # # To enable Managed-Lustre please uncomment this section and fill out the settings. # # Additionally, please uncomment the private_service_access, lustre_firewall_rule, managed-lustre and lustre-pv modules. # # Managed Lustre is only supported in specific regions and zones @@ -302,28 +293,6 @@ deployment_groups: use: [a4-cluster] settings: apply_manifests: - - source: $(vars.permissions_file_staged_path) - enable: $(vars.enable_periodic_health_checks) - template_vars: - project_id: $(vars.project_id) - deployment_name: $(vars.deployment_name) - - source: $(vars.chs_pvc_rendered_path) - enable: $(vars.enable_periodic_health_checks) - template_vars: - pvc_name: $(vars.chs_pvc_claim_name) - access_mode: ReadWriteOnce - capacity: 1Gi - storage_class_name: standard-rwo - - source: $(vars.chs_cronjob_rendered_path) - enable: $(vars.enable_periodic_health_checks) - template_vars: - project_id: $(vars.project_id) - deployment_name: $(vars.deployment_name) - region: $(vars.region) - machine_type: a4-highgpu-8g - gcs_bucket: $(vars.chs_output_bucket_name) - gcs_pvc: $(vars.chs_pvc_claim_name) - cronjob_schedule: $(vars.health_check_schedule) kueue: install: true config_path: $(vars.kueue_configuration_path) diff --git a/examples/gke-a4/read-chs-logs-job.yaml b/examples/gke-a4/read-chs-logs-job.yaml deleted file mode 100644 index 85d40387ff..0000000000 --- a/examples/gke-a4/read-chs-logs-job.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: batch/v1 -kind: Job -metadata: - name: chs-output-reader-job - namespace: default -spec: - template: - spec: - serviceAccountName: workload-identity-k8s-sa - containers: - - name: reader-container - image: busybox - imagePullPolicy: IfNotPresent - command: - - /bin/sh - - -c - - | - echo "--- Listing all contents of /mnt/output ---" - ls -la /mnt/output - - echo "" - echo "--- Attempting to read the latest health scan result file ---" - - # Find the latest .txt file in /mnt/output using ls -t (sort by time, newest first) - # and head -1 to get only the first (newest) result. - LATEST_FILE=$(ls -t /mnt/output/*_healthscan_result_*.txt 2>/dev/null | head -n 1) - - if [ -n "$LATEST_FILE" ]; then # Check if LATEST_FILE is not empty - echo "Found latest health scan file: $LATEST_FILE" - echo "" - cat "$LATEST_FILE" - else - echo "No 'healthscan_result_*.txt' files found in /mnt/output." - fi - - echo "" - echo "--- Reading complete. Pod will exit. ---" - volumeMounts: - - name: chs-result - mountPath: /mnt/output - volumes: - - name: chs-result - persistentVolumeClaim: - claimName: chs-output-pvc - restartPolicy: Never - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "components.gke.io/gke-managed-components" - operator: "Exists" - effect: "NoSchedule" - backoffLimit: 0