Skip to content

Commit ff16a47

Browse files
Copybara
authored and committed
Copybara import of gpu-recipes:
- 2963f1f33f1822d384dc3a7167f990421bcf60ab Add aotc option. GitOrigin-RevId: 2963f1f33f1822d384dc3a7167f990421bcf60ab
1 parent 7fa4518 commit ff16a47

File tree

3 files changed

+51
-18
lines changed

3 files changed

+51
-18
lines changed

src/helm-charts/a3ultra/nemo-training/templates/nemo-launcher-job.yaml

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
{{ $nodes := div .Values.workload.gpus 8 | max 1 }}
2020
{{ $gpusPerNode := min .Values.workload.gpus 8 }}
21+
{{ $aotc := default 0 .Values.workload.aotc }}
2122

2223
{{- $root := . -}}
2324

@@ -152,6 +153,11 @@ spec:
152153
- name: SSD_MOUNT_PATH
153154
value: "{{ $root.Values.volumes.ssdMountPath }}"
154155

156+
{{- if $aotc }}
157+
- name: AOTC
158+
value: "{{ $aotc }}"
159+
{{- end }}
160+
155161
# The following settings are specific to the Torch distributed launcher:
156162
{{- range $gcs := $root.Values.volumes.gcsMounts }}
157163
- name: GCS_FUSE_BUCKET
@@ -253,9 +259,10 @@ spec:
253259
cat /etc/workload-configuration/nemo-configuration.yaml | sed 's/^/| /'
254260
echo ""
255261
262+
touch /workspace/workload_arguments.txt
256263
echo "Detected the following additional workload arguments:"
257264
{{- range $root.Values.workload.arguments }}
258-
echo "{{ . }}"
265+
echo "{{ . }}" | tee -a /workspace/workload_arguments.txt
259266
{{- end }}
260267
261268
@@ -275,22 +282,31 @@ spec:
275282
nvidia-smi dmon -d 20 -s pum &
276283
fi
277284
278-
OMP_NUM_THREADS=12 torchrun \
279-
--nproc-per-node="$GPUS_PER_NODE" \
280-
--nnodes="$NNODES" \
281-
--node_rank="$NODE_RANK" \
282-
--rdzv_id="$JOB_IDENTIFIER" \
283-
--master_addr="$MASTER_ADDR" \
284-
--master_port="$MASTER_PORT" \
285-
${TORCH_DISTRIBUTED_TARGET} \
286-
--config-path="/etc/workload-configuration" \
287-
--config-name="nemo-configuration.yaml" \
288-
+trainer.num_nodes="$NNODES" \
289-
+exp_manager.version="$JOB_IDENTIFIER" \
290-
+exp_manager.dllogger_logger_kwargs.json_file="/gcs/nemo-experiments/$JOB_IDENTIFIER/dllogger/rank-$NODE_RANK/dllogger.json" \
291-
{{- range $root.Values.workload.arguments }}
292-
{{ . }} \
293-
{{- end }}
285+
if [[ "{{ $aotc }}" == "0" ]]; then
286+
OMP_NUM_THREADS=12 torchrun \
287+
--nproc-per-node="$GPUS_PER_NODE" \
288+
--nnodes="$NNODES" \
289+
--node_rank="$NODE_RANK" \
290+
--rdzv_id="$JOB_IDENTIFIER" \
291+
--master_addr="$MASTER_ADDR" \
292+
--master_port="$MASTER_PORT" \
293+
${TORCH_DISTRIBUTED_TARGET} \
294+
--config-path="/etc/workload-configuration" \
295+
--config-name="nemo-configuration.yaml" \
296+
+trainer.num_nodes="$NNODES" \
297+
+exp_manager.version="$JOB_IDENTIFIER" \
298+
+exp_manager.dllogger_logger_kwargs.json_file="/gcs/nemo-experiments/$JOB_IDENTIFIER/dllogger/rank-$NODE_RANK/dllogger.json" \
299+
{{- range $root.Values.workload.arguments }}
300+
{{ . }} \
301+
{{- end }}
302+
303+
else
304+
echo "Using AOTC"
305+
export ENV_FILE=/workspace/env_vars.txt
306+
env > /workspace/env_vars.txt
307+
python -m aotc.executor
308+
309+
fi
294310
295311
echo "Copying log files"
296312
cp -r /workspace/nemo_experiments/megatron_gpt/$JOB_IDENTIFIER/* /gcs/nemo-experiments/$JOB_IDENTIFIER/

training/a3ultra/llama-3.1-70b/nemo-pretraining-gke/README.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
108108

109109
### Build and push a docker container image to Artifact Registry
110110

111+
Note: If you'd like to use the AotC-based [library](https://github.com/AI-Hypercomputer/aotc) image, you may skip this step.
112+
111113
To build the container, complete the following steps from your client:
112114

113115
1. Use Cloud Build to build and push the container image.
@@ -169,7 +171,21 @@ for this job. To do this, we can set the new arguments using `--set workload.arg
169171
--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
170172
--set workload.arguments="{trainer.max_steps=100}" \
171173
$USER-llama-3-1-70b-nemo-fp8 \
172-
$REPO_ROOT/src/helm-charts/a3mega/nemo-training
174+
$REPO_ROOT/src/helm-charts/a3ultra/nemo-training
175+
```
176+
177+
- To use the AotC-based image, run the following command from your client:
178+
179+
```bash
180+
cd $RECIPE_ROOT
181+
export IMAGE=us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-gpu-nemo@sha256:7a84264e71f82f225be639dd20fcf9104c80936c0f4f38f94b88dfb60303c70e
182+
helm install -f values.yaml \
183+
--set-file nemo_config=$REPO_ROOT/src/frameworks/a3ultra/nemo-configs/llama-3.1-70b-256gpus-a3ultra-fp8.yaml \
184+
--set workload.image=${IMAGE} \
185+
--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
186+
--set-string workload.aotc=true \
187+
$USER-llama-3-1-70b-nemo-fp8 \
188+
$REPO_ROOT/src/helm-charts/a3ultra/nemo-training
173189
```
174190

175191
### Monitor the job

training/a3ultra/llama-3.1-70b/nemo-pretraining-gke/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ volumes:
2525
workload:
2626
torchDistributedTarget: "/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
2727
gpus: 256 # This should be one of: {<= 8, multiple of 8}
28+
aotc: false
2829

2930
network:
3031
ncclSettings:

0 commit comments

Comments (0)