Skip to content
Closed
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ limitations under the License.

| Name | Source | Version |
|------|--------|---------|
| <a name="module_bucket"></a> [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 6.1 |
| <a name="module_bucket"></a> [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | >= 6.1 |
| <a name="module_daos_network_storage_scripts"></a> [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a |
| <a name="module_gpu"></a> [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a |
| <a name="module_login"></a> [login](#module\_login) | ../../internal/slurm-gcp/login | n/a |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ locals {

module "bucket" {
source = "terraform-google-modules/cloud-storage/google"
version = "~> 6.1"
version = ">= 6.1"

count = var.create_bucket ? 1 : 0

Expand Down
167 changes: 59 additions & 108 deletions examples/gke-managed-hyperdisk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,106 +122,24 @@ deployment_groups:
settings:
name: sample-pool
zones: [$(vars.zone)]
machine_type: c3-standard-88 # Hyperdisk-extreme required C3 machine with 88 or more vCPUs
machine_type: c3-standard-88 # Hyperdisk-extreme requires C3 machine with 88 or more vCPUs
auto_upgrade: true

# Train a TensorFlow model with Keras and Hyperdisk Balanced on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
- id: hyperdisk-balanced-job
# This is an example job that will install and run an `fio` benchmark against the hyperdisk volumes.
# For more FIO tests, see https://cloud.google.com/compute/docs/disks/benchmark-hyperdisk-performance
- id: fio-bench-job-template
source: modules/compute/gke-job-template
use:
- gke_cluster
- hyperdisk-balanced-setup
settings:
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
- key: runAsUser
value: 1000
- key: runAsGroup
value: 100
- key: fsGroup
value: 100
command:
- bash
- -c
- |
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-balanced-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentences = [str(s) for s in dataset["sentence"]]
tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF
node_count: 1
outputs: [instructions]

# Train a TensorFlow model with Keras and Hyperdisk Extreme on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
- id: hyperdisk-extreme-job
source: modules/compute/gke-job-template
use:
- gke_cluster
- hyperdisk-extreme-setup
settings:
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
- key: runAsUser
value: 1000
- key: runAsGroup
value: 100
- key: fsGroup
value: 100
command:
- bash
- -c
- |
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-extreme-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentences = [str(s) for s in dataset["sentence"]]
tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF
node_count: 1
outputs: [instructions]

# Train a TensorFlow model with Keras and Hyperdisk Throughput on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
- id: hyperdisk-throughput-job
source: modules/compute/gke-job-template
use:
- gke_cluster
- hyperdisk-throughput-setup
settings:
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
security_context: # to make sure the job have enough access to execute the jobs and r/w from hyperdisk
name: fio-benchmark
image: ubuntu:latest
security_context: # ensures the job has enough access to install the fio packages
- key: runAsUser
value: 1000
value: 0
- key: runAsGroup
value: 100
- key: fsGroup
Expand All @@ -230,23 +148,56 @@ deployment_groups:
- bash
- -c
- |
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-throughput-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentences = [str(s) for s in dataset["sentence"]]
tokenized_data = tokenizer(sentences, return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF

set -eux

# Mount points of the three Hyperdisk volumes exercised by this job.
MOUNTS=(
  /data/hyperdisk-balanced-pvc-0
  /data/hyperdisk-extreme-pvc-0
  /data/hyperdisk-throughput-pvc-0
)

cleanup() {
  # Called on every script exit through the EXIT trap below.
  # TAG is only assigned after fio is installed, so an early failure leaves
  # it unset; ${TAG:-} keeps `set -u` from aborting the handler in that case.
  if [ -n "${TAG:-}" ]; then
    echo "--- Cleaning up temporary directories for tag ${TAG} ---"
    for mnt in "${MOUNTS[@]}"; do
      rm -rf "${mnt}/fio-benchmarks-${TAG}"
    done
  fi
}
trap cleanup EXIT

export DEBIAN_FRONTEND=noninteractive

# Install fio (apt-get rather than apt: apt's CLI is not meant for scripts)
apt-get update -y && apt-get install -y fio

# Use a tag to create a unique path for tests
TAG="$(date +%s)"

# Verify mountpoints; `mountpoint` exits non-zero (aborting via set -e) when
# a path is not a mount point
df -h
for mnt in "${MOUNTS[@]}"; do
  mountpoint "${mnt}"
done

# Create temporary directory for fio benchmarks
for mnt in "${MOUNTS[@]}"; do
  mkdir -p "${mnt}/fio-benchmarks-${TAG}"
done

# Perform hyperdisk balanced performance (Mixed IOPS) test
fio --name=hyperdisk-balanced-iops --ioengine=libaio --iodepth=256 --rw=randrw \
  --bs=4k --direct=1 --size=10G --numjobs=16 --group_reporting --time_based --runtime=300s \
  --ramp_time=10s --iodepth_batch_submit=256 --iodepth_batch_complete_max=256 \
  --directory="/data/hyperdisk-balanced-pvc-0/fio-benchmarks-${TAG}" --filename_format=fiotest-balanced-iops

# Perform hyperdisk extreme performance test (Max IOPS)
fio --name=hyperdisk-extreme-iops --ioengine=libaio --iodepth=256 --rw=randwrite \
  --bs=4k --direct=1 --size=10G --numjobs=32 --group_reporting --time_based --runtime=300s --ramp_time=10s \
  --iodepth_batch_submit=256 --iodepth_batch_complete_max=256 \
  --directory="/data/hyperdisk-extreme-pvc-0/fio-benchmarks-${TAG}" --filename_format=fiotest-extreme-iops

# Perform hyperdisk throughput performance test
fio --name=hyperdisk-throughput-bw --ioengine=libaio --iodepth=64 --rw=write --bs=1M \
  --direct=1 --size=10G --numjobs=32 --group_reporting --time_based --runtime=300s --ramp_time=10s \
  --iodepth_batch_submit=64 --iodepth_batch_complete_max=64 \
  --directory="/data/hyperdisk-throughput-pvc-0/fio-benchmarks-${TAG}" --filename_format=fiotest-throughput-bw
node_count: 1

outputs: [instructions]
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@
debug:
msg: "{{nccl_test_file_contents.stdout}}"

# Rewrites the plugin image reference in the example manifest from the
# v3.1.9 tag to the floating `latest` tag before the test pods are deployed.
- name: Pin nccl-plugin-gpudirecttcpx-dev to latest tag
  delegate_to: localhost
  ansible.builtin.replace:
    path: "{{ workspace }}/examples/nccl-test.yaml"
    # Dots are escaped so they match literally instead of "any character".
    regexp: 'nccl-plugin-gpudirecttcpx-dev:v3\.1\.9'
    replace: 'nccl-plugin-gpudirecttcpx-dev:latest'

- name: Create NCCL config map and deploy NCCL test pods
delegate_to: localhost
ansible.builtin.shell: |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,47 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Fail fast when required inputs were not supplied to the play.
- name: Assert variables are defined
  ansible.builtin.assert:
    that:
      - "region is defined"
      - "custom_vars.project is defined"

# Fetch credentials for the deployed cluster on the controller so the kubectl
# commands in the following tasks can reach it.
- name: Get cluster credentials for kubectl
  delegate_to: localhost
  ansible.builtin.command: >-
    gcloud container clusters get-credentials {{ deployment_name }}
    --region {{ cli_deployment_vars.region }}
    --project {{ custom_vars.project }}
    --verbosity=debug

- name: Execute the job
- name: Run the FIO benchmark job and get its name
delegate_to: localhost
ansible.builtin.shell: |
jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*)
for job in "${jobs[@]}"; do
kubectl create -f "$job" -v=9
done
job_file=({{ workspace }}/{{ deployment_name }}/primary/fio-benchmark*)
# Assuming only one benchmark file matches
kubectl create -f "${job_file[0]}" -o=jsonpath='{.metadata.name}'
args:
executable: /bin/bash
changed_when: False
register: fio_job_create_output

# Keep the name printed by `kubectl create` so later tasks can address the Job.
- name: Set FIO job name
  ansible.builtin.set_fact:
    fio_job_name: "{{ fio_job_create_output['stdout'] }}"

- name: Wait for job to complete
- name: Wait for FIO Job to complete
# The FIO job should take approximately 20 minutes; the wait times out after a maximum of 40 minutes.
delegate_to: localhost
ansible.builtin.command: |
kubectl get job --field-selector status.successful=1 -v=9
register: job_completion
until: job_completion.stdout_lines | length > 3 # 3 jobs total
retries: 80
delay: 15

- name: Print job_completion debug output
ansible.builtin.debug:
var: job_completion.stdout_lines
ansible.builtin.command: "kubectl wait --for=condition=complete --timeout=40m job/{{ fio_job_name }}"
changed_when: false

# Look up the first pod labelled with the Job's name and dump its logs into
# fio_pod_logs.txt in the task's working directory.
- name: Fetch logs from the FIO job pod and save to fio_pod_logs.txt
  delegate_to: localhost
  ansible.builtin.shell:
    cmd: |
      fio_pod="$(kubectl get pods -l job-name={{ fio_job_name }} -o jsonpath='{.items[0].metadata.name}')"
      kubectl logs "$fio_pod" > fio_pod_logs.txt

# Echo the saved fio output into the play log. The `file` lookup reads
# fio_pod_logs.txt on the Ansible controller.
- name: Print the FIO test logs
  ansible.builtin.debug:
    msg: "{{ lookup('file', 'fio_pod_logs.txt') }}"

# Delete the benchmark Job by the name captured when it was created.
- name: Clean up FIO job
  delegate_to: localhost
  ansible.builtin.shell:
    cmd: kubectl delete job {{ fio_job_name }} -v=9
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# Clone a fresh cluster-toolkit checkout into the user's home directory, then
# import the PyTorch container and build the NCCL tests. `sbatch --wait`
# blocks until the build job has finished.
- name: Run prerequisite NCCL scripts
  ansible.builtin.shell:
    cmd: |
      set -x -e
      cd {{ ansible_user_dir }}
      rm -rf cluster-toolkit
      git clone https://github.com/GoogleCloudPlatform/cluster-toolkit.git
      cd cluster-toolkit/{{ nccl_test_path }}
      bash import_pytorch_container.sh
      sbatch --wait build-nccl-tests.sh
    chdir: "{{ ansible_user_dir }}"
    executable: /bin/bash
  register: build_result

# Poll two nodes of the a3mega partition until /dev/aperture_devices is
# populated, failing the task if it stays empty.
# srun runs without --pty: with --pty only task zero keeps its output, so
# progress and error messages from the second node would be discarded.
- name: Wait for aperture devices on compute nodes
  ansible.builtin.shell: |
    srun -N 2 --partition=a3mega bash -c '
      max_attempts=60 # 60 attempts x 5 s sleep = wait for up to 5 minutes
      attempt=0
      while [ -z "$(ls -A /dev/aperture_devices 2>/dev/null)" ]; do
        echo "Node $(hostname): Waiting for aperture devices... (Attempt $(($attempt + 1)))"
        if [ $attempt -ge $max_attempts ]; then
          echo "ERROR: Node $(hostname): Aperture devices not found after $max_attempts attempts." >&2
          exit 1
        fi
        attempt=$(($attempt + 1))
        sleep 5
      done
      echo "Node $(hostname): Aperture devices are ready."
    '
  args:
    chdir: "{{ ansible_user_dir }}/cluster-toolkit/{{ nccl_test_path }}"
    executable: /bin/bash

# Submit the NCCL test batch job and capture its numeric job ID in
# sbatch_output.stdout.
# --parsable makes sbatch print "jobid[;cluster]" instead of a human-readable
# sentence; cut keeps only the ID. pipefail makes an sbatch failure fail the
# task instead of yielding an empty job ID.
- name: Submit NCCL test job and get job ID
  ansible.builtin.shell: |
    set -o pipefail
    sbatch --parsable run-nccl-tests.sh | cut -d ';' -f 1
  args:
    chdir: "{{ ansible_user_dir }}/cluster-toolkit/{{ nccl_test_path }}"
    executable: /bin/bash
  register: sbatch_output
  changed_when: false

# Store the submitted job's ID, stripped of surrounding whitespace.
- name: Set job_id fact
  ansible.builtin.set_fact:
    job_id: "{{ sbatch_output['stdout'] | trim }}"

# Poll Slurm accounting until the job reaches a terminal state.
# -X limits sacct to the job allocation itself, so the state of an individual
# step (for example an already COMPLETED step of a still-running job) cannot
# end the wait early.
- name: Wait for NCCL test job to complete
  ansible.builtin.command: "sacct -j {{ job_id }} -X -n -o State"
  register: job_status
  # Stop on any terminal state, not only success/failure/timeout, so a
  # cancelled or node-failed job does not consume the whole retry budget.
  until: >-
    job_status.stdout is
    search('COMPLETED|FAILED|TIMEOUT|CANCELLED|NODE_FAIL|OUT_OF_MEMORY|BOOT_FAIL|DEADLINE')
  retries: 60 # 60 retries x 30 s delay = wait for up to 30 minutes
  delay: 30
  changed_when: false

# Abort the play unless the accounting output reports COMPLETED.
- name: Check final job status
  ansible.builtin.fail:
    msg: >-
      NCCL test job {{ job_id }} failed with status {{ job_status.stdout }}.
      Check slurm-{{ job_id }}.out on the login node.
  when: "'COMPLETED' not in job_status.stdout"

# Pull the "# Avg bus bandwidth" summary line out of the job's output file.
# grep reads the file directly, so a missing file is reported as an error
# rather than looking like "no match"; the task fails when the line is absent.
- name: Verify NCCL test output
  ansible.builtin.shell: "grep '# Avg bus bandwidth' slurm-{{ job_id }}.out"
  args:
    chdir: "{{ ansible_user_dir }}/cluster-toolkit/{{ nccl_test_path }}"
    executable: /bin/bash
  register: nccl_output
  changed_when: false

# Show the matched summary line in the play output when one was captured.
- name: Display NCCL Test Result
  ansible.builtin.debug:
    msg: "NCCL Test Result: {{ nccl_output.stdout }}"
  when:
    - nccl_output.stdout is defined
    - nccl_output.stdout != ""

# Take the text after the last ':' of the summary line as the bandwidth value.
# The captured line is passed through the `quote` filter so any quote or shell
# metacharacter in the job output cannot break or alter the command.
- name: Extract average bus bandwidth
  ansible.builtin.shell: "echo {{ nccl_output.stdout | quote }} | awk -F':' '{print $NF}'"
  register: avg_bus_bandwidth
  changed_when: false
  when: nccl_output.stdout is defined and nccl_output.stdout != ""

# Fail the play when the measured bandwidth is below 30 GB/s. A missing value
# is treated as 0 in both the condition and the message, so the failure text
# renders even if the extraction task was skipped.
- name: Ensure average bus bandwidth is sufficient
  ansible.builtin.fail:
    msg: "Average bus bandwidth is {{ avg_bus_bandwidth.stdout | default('0') | trim }} GB/s, which is below the threshold of 30 GB/s."
  when: (avg_bus_bandwidth.stdout | default('0') | float) < 30
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@ controller_node: "{{ slurm_cluster_name }}-controller"
region: us-west4
zone: us-west4-a
network: "{{ deployment_name }}-net-0"
# Directory of the NCCL test scripts, relative to the root of a
# cluster-toolkit checkout (presumably consumed by the NCCL validation
# tasks — confirm against test-validation/test-nccl.yml).
nccl_test_path: "examples/machine-learning/a3-megagpu-8g/nccl-tests"
sub_network: "{{ deployment_name }}-sub-net-0"
# Validation task files for the deployed cluster
# (presumably run in the listed order — confirm against the test runner).
post_deploy_tests:
- test-validation/test-mounts.yml
- test-validation/test-partitions.yml
- test-validation/test-enroot.yml
- test-validation/test-gpus-slurm.yml
- test-validation/test-nccl.yml
# Task files listed for the post-destroy phase.
post_destroy_tasks:
- post-destroy-tasks/delete-image.yml
custom_vars:
Expand Down