Neelabh94 · Neelabh94 · Oct 16, 2025 · Oct 28, 2025 · Oct 28, 2025 · Oct 28, 2025
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
@@ -276,7 +276,7 @@ limitations under the License.
 
 | Name | Source | Version |
 |------|--------|---------|
-| <a name="module_bucket"></a> [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 6.1 |
+| <a name="module_bucket"></a> [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | >= 6.1 |
 | <a name="module_daos_network_storage_scripts"></a> [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a |
 | <a name="module_gpu"></a> [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a |
 | <a name="module_login"></a> [login](#module\_login) | ../../internal/slurm-gcp/login | n/a |

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf
@@ -24,7 +24,7 @@ locals {
 
 module "bucket" {
   source  = "terraform-google-modules/cloud-storage/google"
-  version = "~> 6.1"
+  version = ">= 6.1"
 
   count = var.create_bucket ? 1 : 0
 

diff --git a/examples/gke-managed-hyperdisk.yaml b/examples/gke-managed-hyperdisk.yaml
@@ -122,106 +122,24 @@ deployment_groups:
     settings:
       name: sample-pool
       zones: [$(vars.zone)]
-      machine_type: c3-standard-88 # Hyperdisk-extreme required C3 machine with 88 or more vCPUs
+      machine_type: c3-standard-88 # Hyperdisk-extreme requires C3 machine with 88 or more vCPUs
       auto_upgrade: true
 
-  # Train a TensorFlow model with Keras and Hyperdisk Balanced on GKE
-  # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
-  - id: hyperdisk-balanced-job
+  # This is an example job that will install and run an `fio` benchmark against the hyperdisk volumes.
+  # For more FIO tests, see https://cloud.google.com/compute/docs/disks/benchmark-hyperdisk-performance
+  - id: fio-bench-job-template
     source: modules/compute/gke-job-template
     use:
     - gke_cluster
     - hyperdisk-balanced-setup
-    settings:
-      name: tensorflow
-      image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
-      security_context:  # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
-      - key: runAsUser
-        value: 1000
-      - key: runAsGroup
-        value: 100
-      - key: fsGroup
-        value: 100
-      command:
-      - bash
-      - -c
-      - |
-        pip install transformers datasets
-        python - <<EOF
-        from datasets import load_dataset
-        dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-balanced-pvc-0')
-        dataset = dataset["train"]
-        from transformers import AutoTokenizer
-        import numpy as np
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        sentences = [str(s) for s in dataset["sentence"]]
-        tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
-        tokenized_data = dict(tokenized_data)
-        labels = np.array(dataset["label"])
-        from transformers import TFAutoModelForSequenceClassification
-        from tensorflow.keras.optimizers import Adam
-        model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
-        model.compile(optimizer=Adam(3e-5))
-        model.fit(tokenized_data, labels)
-        EOF
-      node_count: 1
-    outputs: [instructions]
-
-  # Train a TensorFlow model with Keras and Hyperdisk Extreme on GKE
-  # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
-  - id: hyperdisk-extreme-job
-    source: modules/compute/gke-job-template
-    use:
-    - gke_cluster
     - hyperdisk-extreme-setup
-    settings:
-      name: tensorflow
-      image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
-      security_context:  # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
-      - key: runAsUser
-        value: 1000
-      - key: runAsGroup
-        value: 100
-      - key: fsGroup
-        value: 100
-      command:
-      - bash
-      - -c
-      - |
-        pip install transformers datasets
-        python - <<EOF
-        from datasets import load_dataset
-        dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-extreme-pvc-0')
-        dataset = dataset["train"]
-        from transformers import AutoTokenizer
-        import numpy as np
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        sentences = [str(s) for s in dataset["sentence"]]
-        tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
-        tokenized_data = dict(tokenized_data)
-        labels = np.array(dataset["label"])
-        from transformers import TFAutoModelForSequenceClassification
-        from tensorflow.keras.optimizers import Adam
-        model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
-        model.compile(optimizer=Adam(3e-5))
-        model.fit(tokenized_data, labels)
-        EOF
-      node_count: 1
-    outputs: [instructions]
-
-  # Train a TensorFlow model with Keras and Hyperdisk Throughput on GKE
-  # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
-  - id: hyperdisk-throughput-job
-    source: modules/compute/gke-job-template
-    use:
-    - gke_cluster
     - hyperdisk-throughput-setup
     settings:
-      name: tensorflow
-      image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
-      security_context:  # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
+      name: fio-benchmark  # NOTE(review): the daily test globs for "fio-benchmark*" job files; presumably derived from this name, keep in sync
+      image: ubuntu:latest  # NOTE(review): floating tag; consider a fixed release for reproducible runs
+      security_context:  # run as root (runAsUser 0) so the job has enough access to install the fio package
       - key: runAsUser
-        value: 1000
+        value: 0
       - key: runAsGroup
         value: 100
       - key: fsGroup
@@ -230,23 +148,56 @@ deployment_groups:
       - bash
       - -c
       - |
-        pip install transformers datasets
-        python - <<EOF
-        from datasets import load_dataset
-        dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-throughput-pvc-0')
-        dataset = dataset["train"]
-        from transformers import AutoTokenizer
-        import numpy as np
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        sentences = [str(s) for s in dataset["sentence"]]
-        tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
-        tokenized_data = dict(tokenized_data)
-        labels = np.array(dataset["label"])
-        from transformers import TFAutoModelForSequenceClassification
-        from tensorflow.keras.optimizers import Adam
-        model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
-        model.compile(optimizer=Adam(3e-5))
-        model.fit(tokenized_data, labels)
-        EOF
+
+        set -eux
+
+        cleanup() {
+          # Invoked by the EXIT trap below; the ${TAG:-} default keeps `set -u` quiet if TAG was never assigned.
+          if [ -n "${TAG:-}" ]; then
+            echo "--- Cleaning up temporary directories for tag ${TAG} ---"
+            rm -rf "/data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}"
+            rm -rf "/data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}"
+            rm -rf "/data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}"
+          fi
+        }
+        trap cleanup EXIT
+
+        export DEBIAN_FRONTEND=noninteractive
+
+        # Install fio (apt-get is the script-safe front end; plain apt warns about its unstable CLI)
+        apt-get update -y && apt-get install -y fio
+
+        # Per-run tag (epoch seconds) for unique paths. NOTE(review): keep the backticks; the dollar-paren form likely clashes with blueprint expressions.
+        TAG=`date +%s`
+
+        # Show disk usage, then abort (set -e) if any hyperdisk volume is not a mountpoint
+        df -h
+        mountpoint /data/hyperdisk-balanced-pvc-0
+        mountpoint /data/hyperdisk-extreme-pvc-0
+        mountpoint /data/hyperdisk-throughput-pvc-0
+
+        # Create temporary directory for fio benchmarks (removed again by cleanup on exit)
+        mkdir -p "/data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}"
+        mkdir -p "/data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}"
+        mkdir -p "/data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}"
+
+        # Hyperdisk Balanced: mixed random read/write IOPS test (4k blocks, 16 jobs, 300s after a 10s ramp)
+        fio --name=hyperdisk-balanced-iops --ioengine=libaio --iodepth=256 --rw=randrw \
+        --bs=4k --direct=1 --size=10G --numjobs=16 --group_reporting --time_based --runtime=300s \
+        --ramp_time=10s --iodepth_batch_submit=256 --iodepth_batch_complete_max=256 \
+        --directory="/data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}" --filename_format=fiotest-balanced-iops
+
+        # Hyperdisk Extreme: random-write IOPS test (4k blocks, 32 jobs, 300s after a 10s ramp)
+        fio --name=hyperdisk-extreme-iops --ioengine=libaio --iodepth=256 --rw=randwrite \
+        --bs=4k --direct=1 --size=10G --numjobs=32 --group_reporting --time_based --runtime=300s --ramp_time=10s \
+        --iodepth_batch_submit=256 --iodepth_batch_complete_max=256 \
+        --directory="/data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}" --filename_format=fiotest-extreme-iops
+
+        # Hyperdisk Throughput: write bandwidth test (1M blocks, 32 jobs, 300s after a 10s ramp)
+        fio --name=hyperdisk-throughput-bw --ioengine=libaio --iodepth=64 --rw=write --bs=1M \
+        --direct=1 --size=10G --numjobs=32 --group_reporting --time_based --runtime=300s --ramp_time=10s \
+        --iodepth_batch_submit=64 --iodepth_batch_complete_max=64 \
+        --directory="/data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}" --filename_format=fiotest-throughput-bw
       node_count: 1
+
     outputs: [instructions]
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-a3-high.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-a3-high.yml
@@ -35,6 +35,13 @@
   debug:
     msg: "{{nccl_test_file_contents.stdout}}"
 
+- name: Switch nccl-plugin-gpudirecttcpx-dev image from v3.1.9 to the latest tag
+  delegate_to: localhost
+  ansible.builtin.replace:
+    path: "{{ workspace }}/examples/nccl-test.yaml"
+    regexp: 'nccl-plugin-gpudirecttcpx-dev:v3\.1\.9'  # dots escaped so only the literal version string matches
+    replace: 'nccl-plugin-gpudirecttcpx-dev:latest'
+
 - name: Create NCCL config map and deploy NCCL test pods
   delegate_to: localhost
   ansible.builtin.shell: |

diff --git a/.../cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-hyperdisk.yml b/.../cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-hyperdisk.yml
@@ -12,30 +12,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Fail early, with one clear message, if any variable the tasks in this file rely on is missing.
+- name: Assert variables are defined
+  ansible.builtin.assert:
+    that:
+    - region is defined
+    - custom_vars.project is defined
+    # The tasks below read these directly (gcloud get-credentials and the job-file glob).
+    - cli_deployment_vars.region is defined
+    - deployment_name is defined
+    - workspace is defined
+    fail_msg: "region, custom_vars.project, cli_deployment_vars.region, deployment_name and workspace must all be defined"
+
 - name: Get cluster credentials for kubectl
   delegate_to: localhost
   ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} --verbosity=debug
 
-- name: Execute the job
+# Creates the Job from the first file matching primary/fio-benchmark* under the deployment
+# directory; kubectl prints the created Job's metadata.name, which is captured via register.
+- name: Run the FIO benchmark job and get its name
   delegate_to: localhost
   ansible.builtin.shell: |
-    jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*)
-    for job in "${jobs[@]}"; do
-      kubectl create -f "$job" -v=9
-    done
+    job_file=({{ workspace }}/{{ deployment_name }}/primary/fio-benchmark*)
+    # Assuming only one benchmark file matches
+    kubectl create -f "${job_file[0]}" -o=jsonpath='{.metadata.name}'
   args:
     executable: /bin/bash
-  changed_when: False
+  register: fio_job_create_output  # .stdout is read by the "Set FIO job name" task
+
+# Stores the Job name printed by the create task as a fact; the wait, log and cleanup tasks use it.
+- name: Set FIO job name
+  ansible.builtin.set_fact:
+    fio_job_name: "{{ fio_job_create_output.stdout }}"
 
-- name: Wait for job to complete
+- name: Wait for FIO Job to complete
+  # Blocks on `kubectl wait` until the Job reports condition=complete. The FIO job should take
+  # approximately 20 minutes; the wait gives up after at most 40 minutes.
+  # NOTE(review): a Job that fails presumably never reports "complete", so a failure would only surface at the timeout - confirm.
   delegate_to: localhost
-  ansible.builtin.command: |
-    kubectl get job --field-selector  status.successful=1 -v=9
-  register: job_completion
-  until: job_completion.stdout_lines | length > 3 # 3 jobs total
-  retries: 80
-  delay: 15
-
-- name: Print job_completion debug output
-  ansible.builtin.debug:
-    var: job_completion.stdout_lines
+  ansible.builtin.command: "kubectl wait --for=condition=complete --timeout=40m job/{{ fio_job_name }}"
+  changed_when: false
+
+# Writes the FIO pod logs to fio_pod_logs.txt (as before) and also captures them in the task
+# result, so printing them does not depend on where a relative path resolves for a file lookup.
+- name: Fetch logs from the FIO job pod and save to fio_pod_logs.txt
+  delegate_to: localhost
+  ansible.builtin.shell: |
+    # -e: stop if the pod lookup fails; pipefail: a kubectl logs failure is not masked by tee.
+    set -eo pipefail
+    pod_name="$(kubectl get pods -l job-name={{ fio_job_name }} -o jsonpath='{.items[0].metadata.name}')"
+    kubectl logs "$pod_name" | tee fio_pod_logs.txt
+  args:
+    executable: /bin/bash  # pipefail is a bash option
+  register: fio_pod_logs
+
+- name: Print the FIO test logs
+  ansible.builtin.debug:
+    msg: "{{ fio_pod_logs.stdout }}"
+
+# Deletes the benchmark Job created earlier in this file (verbose kubectl logging kept).
+- name: Clean up FIO job
+  delegate_to: localhost
+  ansible.builtin.command: "kubectl delete job {{ fio_job_name }} -v=9"
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-nccl.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-nccl.yml
@@ -0,0 +1,97 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+# Replaces any existing checkout with a fresh clone of cluster-toolkit under ansible_user_dir,
+# then, inside nccl_test_path, runs import_pytorch_container.sh and submits build-nccl-tests.sh
+# with `sbatch --wait`. `set -x -e` traces each command and stops at the first failure.
+- name: Run prerequisite NCCL scripts
+  shell: |
+    set -x -e
+    cd {{ ansible_user_dir }}
+    rm -rf cluster-toolkit
+    git clone https://github.com/GoogleCloudPlatform/cluster-toolkit.git
+    cd cluster-toolkit/{{ nccl_test_path }}
+    bash import_pytorch_container.sh
+    sbatch --wait build-nccl-tests.sh
+  args:
+    chdir: "{{ ansible_user_dir }}"
+    executable: /bin/bash
+  register: build_result  # NOTE(review): not referenced by any other task visible in this file
+
+# Polls /dev/aperture_devices on two a3mega nodes until the directory is non-empty, failing the
+# step if a node is still empty after the attempt budget. Runs without a pseudo terminal so the
+# progress and error messages of both nodes reach the task output.
+- name: Wait for aperture devices on compute nodes
+  shell: |
+    srun -N 2 --partition=a3mega bash -c '
+      max_attempts=60 # Wait for up to 5 minutes (60 attempts, 5 seconds apart)
+      attempt=0
+      while [ -z "$(ls -A /dev/aperture_devices 2>/dev/null)" ]; do
+        echo "Node $(hostname): Waiting for aperture devices... (Attempt $(($attempt + 1)))"
+        if [ $attempt -ge $max_attempts ]; then
+          echo "ERROR: Node $(hostname): Aperture devices not found after $max_attempts attempts." >&2
+          exit 1
+        fi
+        attempt=$(($attempt + 1))
+        sleep 5
+      done
+      echo "Node $(hostname): Aperture devices are ready."
+    '
+  args:
+    chdir: "{{ ansible_user_dir }}/cluster-toolkit/{{ nccl_test_path }}"
+    executable: /bin/bash
+
+# Submits run-nccl-tests.sh and leaves only the numeric job ID on stdout.
+# --parsable prints "jobid[;cluster]" instead of a sentence, and pipefail makes an sbatch
+# failure fail this task instead of silently producing an empty job ID.
+- name: Submit NCCL test job and get job ID
+  shell: |
+    set -o pipefail
+    sbatch --parsable run-nccl-tests.sh | cut -d ';' -f 1
+  args:
+    chdir: "{{ ansible_user_dir }}/cluster-toolkit/{{ nccl_test_path }}"
+    executable: /bin/bash
+  register: sbatch_output
+  changed_when: false
+
+# Keeps the whitespace-trimmed sbatch output as job_id for the sacct polling and log checks below.
+- name: Set job_id fact
+  set_fact:
+    job_id: "{{ sbatch_output.stdout | trim }}"
+
+# Polls the accounting state of the job until it reaches a terminal state.
+# -X limits sacct to the job allocation itself, so the state of individual steps cannot be
+# mistaken for the state of the job. Cancelled, node-failed and out-of-memory jobs also end
+# the polling instead of consuming the whole retry budget.
+- name: Wait for NCCL test job to complete
+  command: "sacct -j {{ job_id }} -X -n -o State"
+  register: job_status
+  until: job_status.stdout is search('COMPLETED|FAILED|TIMEOUT|CANCELLED|NODE_FAIL|OUT_OF_MEMORY')
+  retries: 60 # wait for 30 minutes
+  delay: 30
+  changed_when: false
+
+# Fails the play unless the state reported by the last sacct poll contains COMPLETED.
+- name: Check final job status
+  fail:
+    msg: "NCCL test job {{ job_id }} failed with status {{ job_status.stdout }}. Check slurm-{{ job_id }}.out on the login node."
+  when: "'COMPLETED' not in job_status.stdout"
+
+# Pulls the "# Avg bus bandwidth" summary line out of the job's Slurm output file; the task
+# fails when the line (or the file) is missing because grep then exits non-zero.
+- name: Verify NCCL test output
+  command:
+    chdir: "{{ ansible_user_dir }}/cluster-toolkit/{{ nccl_test_path }}"
+    argv:
+    - grep
+    - "# Avg bus bandwidth"
+    - "slurm-{{ job_id }}.out"
+  register: nccl_output
+  changed_when: false
+
+# Prints the matched summary line, but only when the grep task produced non-empty output.
+- name: Display NCCL Test Result
+  debug:
+    msg: "NCCL Test Result: {{ nccl_output.stdout }}"
+  when:
+  - nccl_output.stdout is defined
+  - nccl_output.stdout != ""
+
+# Takes the text after the last ':' of the summary line as the bandwidth value.
+# The line is fed to awk on stdin rather than pasted into the command string, so quotes or
+# other shell metacharacters in the job output cannot break or alter the command.
+- name: Extract average bus bandwidth
+  shell: "awk -F':' '{print $NF}'"
+  args:
+    stdin: "{{ nccl_output.stdout }}"
+  register: avg_bus_bandwidth
+  changed_when: false
+  when: nccl_output.stdout is defined and nccl_output.stdout != ""
+
+# Fails when the extracted bandwidth is below the minimum. A missing value counts as 0, and
+# the message uses a default too, so a skipped extraction yields this failure message rather
+# than an undefined-variable error while rendering it.
+- name: Ensure average bus bandwidth is sufficient
+  vars:
+    min_bus_bandwidth_gbps: 30
+  fail:
+    msg: "Average bus bandwidth is {{ avg_bus_bandwidth.stdout | default('unavailable') | trim }} GB/s, which is below the threshold of {{ min_bus_bandwidth_gbps }} GB/s."
+  when: (avg_bus_bandwidth.stdout | default('0') | trim | float) < min_bus_bandwidth_gbps
diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-ubuntu.yml b/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-ubuntu.yml
@@ -25,12 +25,14 @@ controller_node: "{{ slurm_cluster_name }}-controller"
 region: us-west4
 zone: us-west4-a
 network: "{{ deployment_name }}-net-0"
+nccl_test_path: "examples/machine-learning/a3-megagpu-8g/nccl-tests"
 sub_network: "{{ deployment_name }}-sub-net-0"
 post_deploy_tests:
 - test-validation/test-mounts.yml
 - test-validation/test-partitions.yml
 - test-validation/test-enroot.yml
 - test-validation/test-gpus-slurm.yml
+- test-validation/test-nccl.yml
 post_destroy_tasks:
 - post-destroy-tasks/delete-image.yml
 custom_vars: