
Commit 1a3020b

Author: Copybara

Copybara import of gpu-recipes:

- e237cb042b5c8e27d3dcef9497276bac40f129fe Adding Llama3.1-405B NeMo pretraining recipe for a3ultra
- d3ea5606f29a49544aa116f71d5270ecde879ab1 Removed redundant artifact registry variable from README

GitOrigin-RevId: d3ea5606f29a49544aa116f71d5270ecde879ab1

1 parent 2db62ee commit 1a3020b

File tree

9 files changed: +398 -26 lines changed

src/frameworks/a3ultra/nemo-configs/llama-3.1-405b-576gpus-a3ultra-bf16.yaml

Lines changed: 2 additions & 7 deletions
@@ -10,7 +10,7 @@ trainer:
   enable_checkpointing: false
   use_distributed_sampler: false
   max_epochs: null
-  max_steps: 20
+  max_steps: 15
   max_time: 05:23:30:00
   log_every_n_steps: 1
   val_check_interval: null
@@ -44,6 +44,7 @@ exp_manager:
   seconds_to_sleep: 60
   explicit_log_dir: null
 model:
+  tp_only_amax_red: false
   mcore_gpt: true
   micro_batch_size: 1
   global_batch_size: 2016
@@ -123,12 +124,6 @@ model:
   deterministic_mode: false
   transformer_engine: true
   fp8: false
-  fp8_e4m3: false
-  fp8_hybrid: false
-  fp8_margin: 0
-  fp8_interval: 1
-  fp8_amax_history_len: 1024
-  fp8_amax_compute_algo: max
   ub_tp_comm_overlap: false
   use_flash_attention: true
   fsdp: false
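Taken together, these hunks leave a single fp8 toggle in the bf16 recipe and drop the six per-key fp8 tuning knobs. A reconstruction of the resulting model section, pieced together from the hunks above with intervening keys elided, would read:

model:
  tp_only_amax_red: false   # newly added for the bf16 recipe
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 2016
  # ... intervening keys unchanged ...
  deterministic_mode: false
  transformer_engine: true
  fp8: false                # the fp8_e4m3/fp8_hybrid/fp8_margin/... knobs are removed
  ub_tp_comm_overlap: false
  use_flash_attention: true
  fsdp: false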

src/frameworks/a3ultra/nemo-configs/llama-3.1-405b-576gpus-a3ultra-fp8.yaml

Lines changed: 7 additions & 6 deletions
@@ -44,6 +44,7 @@ exp_manager:
   seconds_to_sleep: 60
   explicit_log_dir: null
 model:
+  tp_only_amax_red: true
   mcore_gpt: true
   micro_batch_size: 1
   global_batch_size: 2016
@@ -84,7 +85,7 @@ model:
   apply_rope_fusion: true
   attention_type: multihead
   share_embeddings_and_output_weights: false
-  scale_positional_embedding: false
+  scale_positional_embedding: true
   tokenizer:
     library: megatron
     type: GPT2BPETokenizer
@@ -101,7 +102,7 @@ model:
   grad_allreduce_chunk_size_mb: 125
   grad_div_ar_fusion: true
   gradient_accumulation_fusion: true
-  cross_entropy_loss_fusion: false
+  cross_entropy_loss_fusion: true
   bias_activation_fusion: true
   bias_dropout_add_fusion: true
   masked_softmax_fusion: true
@@ -118,7 +119,7 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: true
-  defer_embedding_wgrad_compute: false
+  defer_embedding_wgrad_compute: true
   wgrad_deferral_limit: 50
   deterministic_mode: false
   transformer_engine: true
@@ -135,18 +136,18 @@ model:
   fsdp_sharding_strategy: full
   fsdp_grad_reduce_dtype: bf16
   fsdp_sharded_checkpoint: false
-  overlap_p2p_comm: false
+  overlap_p2p_comm: true
   batch_p2p_comm: false
   gc_interval: 100
   optim:
-    name: distributed_fused_adam
+    name: mcore_distributed_optim
     grad_sync_dtype: bf16
     lr: 0.00015
     weight_decay: 0.1
     betas:
     - 0.9
     - 0.95
-    bucket_cap_mb: 120
+    bucket_cap_mb: 125
     overlap_grad_sync: true
     overlap_param_sync: true
     contiguous_grad_buffer: true
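The optimizer hunk is the most consequential change in this file: the recipe switches from distributed_fused_adam to the Megatron-core distributed optimizer, and the gradient bucket size now matches the grad_allreduce_chunk_size_mb of 125 visible earlier in the same file. Reconstructed from the hunk above, the resulting optim block of the fp8 recipe would read:

model:
  optim:
    name: mcore_distributed_optim   # Megatron-core distributed optimizer (was distributed_fused_adam)
    grad_sync_dtype: bf16
    lr: 0.00015
    weight_decay: 0.1
    betas:
    - 0.9
    - 0.95
    bucket_cap_mb: 125              # now equal to grad_allreduce_chunk_size_mb above
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true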

src/helm-charts/a3ultra/nemo-training/templates/nemo-launcher-job.yaml

Lines changed: 7 additions & 3 deletions
@@ -42,9 +42,9 @@ spec:
   template:
     metadata:
       annotations:
-        {{- if $root.Values.queue }}
-        kueue.x-k8s.io/podset-preferred-topology: {{ $root.Values.podsetPreferredTopology }}
-        {{- end}}
+        {{- if and (eq $root.Values.tasSettings.useLegacyTAS false) $root.Values.queue $root.Values.tasSettings.topologyRequest }}
+        {{- toYaml .Values.tasSettings.topologyRequest | nindent 8 }}
+        {{- end }}
         kubectl.kubernetes.io/default-container: megatron
         {{- if $root.Values.volumes.gcsMounts }}
         gke-gcsfuse/volumes: "true"
@@ -72,6 +72,10 @@ spec:
         {{- end }}
       {{- end }}
     spec:
+      {{- if $root.Values.tasSettings.useLegacyTAS }}
+      schedulingGates:
+      - name: "gke.io/topology-aware-auto-scheduling"
+      {{- end }}
       {{- if $root.Values.network.hostNetwork }}
       hostNetwork: true
       dnsPolicy: ClusterFirstWithHostNet
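The template now branches on a tasSettings block in the chart values: with legacy TAS it adds a scheduling gate instead of an annotation, and otherwise it renders a caller-supplied topologyRequest map into the pod annotations. A minimal sketch of a matching values.yaml, assuming Kueue's podset-preferred-topology annotation as the topology request (the annotation value and queue name are illustrative placeholders, not taken from this commit):

queue: a3ultra-queue          # placeholder LocalQueue name; also gates the annotation path
tasSettings:
  # true: gate pods with gke.io/topology-aware-auto-scheduling (legacy TAS)
  useLegacyTAS: false
  # rendered verbatim into the pod template annotations when useLegacyTAS is false
  topologyRequest:
    kueue.x-k8s.io/podset-preferred-topology: "cloud.google.com/gce-topology-block"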
