
Commit 1a3020b

Author: Copybara

Copybara import of gpu-recipes:

- e237cb042b5c8e27d3dcef9497276bac40f129fe Adding Llama3.1-405B NeMo pretraining recipe for a3ultra
- d3ea5606f29a49544aa116f71d5270ecde879ab1 Removed redundant artifact registry variable from README

GitOrigin-RevId: d3ea5606f29a49544aa116f71d5270ecde879ab1

1 parent 2db62ee commit 1a3020b

File tree

9 files changed: +398 -26 lines changed

src/frameworks/a3ultra/nemo-configs/llama-3.1-405b-576gpus-a3ultra-bf16.yaml

Lines changed: 2 additions & 7 deletions
@@ -10,7 +10,7 @@ trainer:
   enable_checkpointing: false
   use_distributed_sampler: false
   max_epochs: null
-  max_steps: 20
+  max_steps: 15
   max_time: 05:23:30:00
   log_every_n_steps: 1
   val_check_interval: null
@@ -44,6 +44,7 @@ exp_manager:
   seconds_to_sleep: 60
   explicit_log_dir: null
 model:
+  tp_only_amax_red: false
   mcore_gpt: true
   micro_batch_size: 1
   global_batch_size: 2016
@@ -123,12 +124,6 @@ model:
   deterministic_mode: false
   transformer_engine: true
   fp8: false
-  fp8_e4m3: false
-  fp8_hybrid: false
-  fp8_margin: 0
-  fp8_interval: 1
-  fp8_amax_history_len: 1024
-  fp8_amax_compute_algo: max
   ub_tp_comm_overlap: false
   use_flash_attention: true
   fsdp: false
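Taken together, these hunks leave a single fp8 toggle in the bf16 recipe and drop the six per-key fp8 tuning knobs. A reconstruction of the resulting model section, pieced together from the hunks above with intervening keys elided, would read:

model:
  tp_only_amax_red: false   # newly added for the bf16 recipe
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 2016
  # ... intervening keys unchanged ...
  deterministic_mode: false
  transformer_engine: true
  fp8: false                # the fp8_e4m3/fp8_hybrid/fp8_margin/... knobs are removed
  ub_tp_comm_overlap: false
  use_flash_attention: true
  fsdp: false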

src/frameworks/a3ultra/nemo-configs/llama-3.1-405b-576gpus-a3ultra-fp8.yaml

Lines changed: 7 additions & 6 deletions
@@ -44,6 +44,7 @@ exp_manager:
   seconds_to_sleep: 60
   explicit_log_dir: null
 model:
+  tp_only_amax_red: true
   mcore_gpt: true
   micro_batch_size: 1
   global_batch_size: 2016
@@ -84,7 +85,7 @@ model:
   apply_rope_fusion: true
   attention_type: multihead
   share_embeddings_and_output_weights: false
-  scale_positional_embedding: false
+  scale_positional_embedding: true
   tokenizer:
     library: megatron
     type: GPT2BPETokenizer
@@ -101,7 +102,7 @@ model:
   grad_allreduce_chunk_size_mb: 125
   grad_div_ar_fusion: true
   gradient_accumulation_fusion: true
-  cross_entropy_loss_fusion: false
+  cross_entropy_loss_fusion: true
   bias_activation_fusion: true
   bias_dropout_add_fusion: true
   masked_softmax_fusion: true
@@ -118,7 +119,7 @@ model:
   num_micro_batches_with_partial_activation_checkpoints: null
   activations_checkpoint_layers_per_pipeline: null
   sequence_parallel: true
-  defer_embedding_wgrad_compute: false
+  defer_embedding_wgrad_compute: true
   wgrad_deferral_limit: 50
   deterministic_mode: false
   transformer_engine: true
@@ -135,18 +136,18 @@ model:
   fsdp_sharding_strategy: full
   fsdp_grad_reduce_dtype: bf16
   fsdp_sharded_checkpoint: false
-  overlap_p2p_comm: false
+  overlap_p2p_comm: true
   batch_p2p_comm: false
   gc_interval: 100
   optim:
-    name: distributed_fused_adam
+    name: mcore_distributed_optim
     grad_sync_dtype: bf16
     lr: 0.00015
     weight_decay: 0.1
     betas:
     - 0.9
     - 0.95
-    bucket_cap_mb: 120
+    bucket_cap_mb: 125
     overlap_grad_sync: true
     overlap_param_sync: true
     contiguous_grad_buffer: true
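The optimizer hunk is the most consequential change in this file: the recipe switches from distributed_fused_adam to the Megatron-core distributed optimizer, and the gradient bucket size now matches the grad_allreduce_chunk_size_mb of 125 visible earlier in the same file. Reconstructed from the hunk above, the resulting optim block of the fp8 recipe would read:

model:
  optim:
    name: mcore_distributed_optim   # Megatron-core distributed optimizer (was distributed_fused_adam)
    grad_sync_dtype: bf16
    lr: 0.00015
    weight_decay: 0.1
    betas:
    - 0.9
    - 0.95
    bucket_cap_mb: 125              # now equal to grad_allreduce_chunk_size_mb above
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true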

src/helm-charts/a3ultra/nemo-training/templates/nemo-launcher-job.yaml

Lines changed: 7 additions & 3 deletions
@@ -42,9 +42,9 @@ spec:
   template:
     metadata:
       annotations:
-        {{- if $root.Values.queue }}
-        kueue.x-k8s.io/podset-preferred-topology: {{ $root.Values.podsetPreferredTopology }}
-        {{- end}}
+        {{- if and (eq $root.Values.tasSettings.useLegacyTAS false) $root.Values.queue $root.Values.tasSettings.topologyRequest }}
+        {{- toYaml .Values.tasSettings.topologyRequest | nindent 8 }}
+        {{- end }}
         kubectl.kubernetes.io/default-container: megatron
         {{- if $root.Values.volumes.gcsMounts }}
         gke-gcsfuse/volumes: "true"
@@ -72,6 +72,10 @@ spec:
         {{- end }}
       {{- end }}
     spec:
+      {{- if $root.Values.tasSettings.useLegacyTAS }}
+      schedulingGates:
+      - name: "gke.io/topology-aware-auto-scheduling"
+      {{- end }}
       {{- if $root.Values.network.hostNetwork }}
       hostNetwork: true
       dnsPolicy: ClusterFirstWithHostNet
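The template now branches on a tasSettings block in the chart values: with legacy TAS it adds a scheduling gate instead of an annotation, and otherwise it renders a caller-supplied topologyRequest map into the pod annotations. A minimal sketch of a matching values.yaml, assuming Kueue's podset-preferred-topology annotation as the topology request (the annotation value and queue name are illustrative placeholders, not taken from this commit):

queue: a3ultra-queue          # placeholder LocalQueue name; also gates the annotation path
tasSettings:
  # true: gate pods with gke.io/topology-aware-auto-scheduling (legacy TAS)
  useLegacyTAS: false
  # rendered verbatim into the pod template annotations when useLegacyTAS is false
  topologyRequest:
    kueue.x-k8s.io/podset-preferred-topology: "cloud.google.com/gce-topology-block"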
