Commit abc0e44 (parent: 44baf27)

feat: Update Llama 3.1 405B recipe for 64-GPU training

4 files changed: +15 −3 lines


training/a4x/llama3-1-405b/README.md

Lines changed: 9 additions & 2 deletions
@@ -22,6 +22,13 @@ This recipe has been optimized for and tested with the following configuration:
 Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4x)
 to create your a4x GKE cluster.
 
+> [!NOTE]
+> **GKE version and workload placement**
+>
+> For GKE cluster versions `1.34.0-gke.1502000` and later, workload placement is mandatory. You must provide your own placement policy name by editing `values.yaml` to set `workload.nodeSelector.cloud.google.com/placement-policy-name`.
+>
+> For GKE cluster versions before `1.34.0-gke.1502000`, you can remove the `nodeSelector` section in `values.yaml`.
+
 ## Training dataset
 
 This recipe uses a mock pretraining dataset provided by the NeMo framework.
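The new note asks for a user-supplied placement-policy name in `values.yaml`. If you would rather not edit the file, the same key can be overridden at install time; a minimal sketch, where `my-placement-policy` is a hypothetical policy name and the dots inside the label key are backslash-escaped so Helm treats it as a single map key:

```shell
helm install $WORKLOAD_NAME . -f values.yaml \
    --set-file workload_launcher=launcher.sh \
    --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpus64.py \
    --set workload.image=nvcr.io/nvidia/nemo:25.07 \
    --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
    --set volumes.gcsMounts[0].mountPath=/job-logs \
    --set 'workload.nodeSelector.cloud\.google\.com/placement-policy-name=my-placement-policy'
```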
@@ -92,7 +99,7 @@ your client:
 export WORKLOAD_NAME=$USER-a4x-llama3-1-405b
 helm install $WORKLOAD_NAME . -f values.yaml \
     --set-file workload_launcher=launcher.sh \
-    --set-file workload_config=llama3-1-405b-fp8cs-gbs2048.py \
+    --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpus64.py \
     --set workload.image=nvcr.io/nvidia/nemo:25.07 \
     --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
     --set volumes.gcsMounts[0].mountPath=/job-logs \
@@ -110,7 +117,7 @@ your client:
 export WORKLOAD_NAME=$USER-a4x-llama3-1-405b
 helm install $WORKLOAD_NAME . -f values.yaml \
     --set-file workload_launcher=launcher.sh \
-    --set-file workload_config=llama3-1-405b-fp8cs-gbs2048.py \
+    --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpus64.py \
     --set workload.image=nvcr.io/nvidia/nemo:25.07 \
     --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
     --set volumes.gcsMounts[0].mountPath=/job-logs \
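Before installing, the effect of the renamed config and the new `nodeSelector` can be checked offline with `helm template`, which renders the chart without touching the cluster; a quick sketch using the same flags as above:

```shell
helm template $WORKLOAD_NAME . -f values.yaml \
    --set-file workload_launcher=launcher.sh \
    --set-file workload_config=llama3-1-405b-fp8cs-gbs2048-gpus64.py \
    --set workload.image=nvcr.io/nvidia/nemo:25.07 \
  | grep -A 2 'nodeSelector:'
```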

training/a4x/llama3-1-405b/llama3-1-405b-fp8cs-gbs2048.py → training/a4x/llama3-1-405b/llama3-1-405b-fp8cs-gbs2048-gpus64.py

File renamed without changes.

training/a4x/llama3-1-405b/templates/workload-job.yaml

Lines changed: 2 additions & 0 deletions
@@ -96,6 +96,8 @@ spec:
 {{- end }}
 {{- end }}
           spec:
+            nodeSelector:
+            {{- toYaml .Values.workload.nodeSelector | nindent 14 }}
 {{- if $root.Values.network.hostNetwork }}
             hostNetwork: true
             dnsPolicy: ClusterFirstWithHostNet
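With the defaults this commit adds to `values.yaml`, that template fragment renders to something like the following (a sketch of the rendered Pod spec; surrounding fields and exact indentation depend on the rest of the template):

```yaml
spec:
  nodeSelector:
    cloud.google.com/gke-accelerator: nvidia-gb200
    cloud.google.com/placement-policy-name: a4x-workload-policy-95cbc61c
```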

training/a4x/llama3-1-405b/values.yaml

Lines changed: 4 additions & 1 deletion
@@ -30,6 +30,9 @@ volumes:
   psVolumes: false
   ssdMountPath: "/ssd"
 workload:
+  nodeSelector:
+    cloud.google.com/gke-accelerator: nvidia-gb200
+    cloud.google.com/placement-policy-name: a4x-workload-policy-95cbc61c
   arguments[]: null
   configFile: llama3-1-405b-fp8cs-gbs2048-gpus64.py
   configPath: /workload/configs/
@@ -41,7 +44,7 @@ workload:
     - --compute_dtype=fp8
     - --fp8_recipe=cs
     - --global_batch_size=2048
-    - --max_steps=5
+    - --max_steps=30
     - --micro_batch_size=1
     - --tensor_parallel_size=2
     - --context_parallel_size=1
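For orientation, the batch arguments above have to be consistent with the 64-GPU layout. A minimal sketch of the arithmetic, assuming a pipeline-parallel size of 8 (an assumption; only TP=2 and CP=1 appear in this diff):

```python
# Sketch of how the 64-GPU parallel layout relates to the batch arguments.
# Tensor/context parallel sizes and batch sizes come from values.yaml above;
# pipeline_parallel_size is an ASSUMPTION (it does not appear in this diff).
num_gpus = 64
tensor_parallel_size = 2    # --tensor_parallel_size=2
context_parallel_size = 1   # --context_parallel_size=1
pipeline_parallel_size = 8  # assumed, not in the diff
micro_batch_size = 1        # --micro_batch_size=1
global_batch_size = 2048    # --global_batch_size=2048

model_parallel = tensor_parallel_size * context_parallel_size * pipeline_parallel_size
data_parallel = num_gpus // model_parallel  # 64 // 16 = 4 replicas
# Gradient-accumulation steps needed to reach the global batch:
grad_accum = global_batch_size // (data_parallel * micro_batch_size)  # 512
print(f"data_parallel={data_parallel}, grad_accum_steps={grad_accum}")
```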
