Skip to content

Commit d186184

Browse files
committed
Merge branch 'main' into helenn-dev-rl-training-graphs-test
2 parents b4f23e2 + e4b18f7 commit d186184

File tree

8 files changed

+172
-46
lines changed

8 files changed

+172
-46
lines changed

.github/actions/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ runs:
126126
IS_CI_WORKLOAD: ${{ inputs.is_ci_workload }}
127127
run: |
128128
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
129-
HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")')
129+
HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "$IS_CI_WORKLOAD"
130130
HAS_RUN_FUNCTIONAL_TESTS_LABEL=${HAS_RUN_FUNCTIONAL_TESTS_LABEL:-$IS_CI_WORKLOAD}
131131
echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
132132

.github/oncall_schedule.json

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
11
[
2-
{
3-
"user": "Phlip79",
4-
"date": "2026-01-07"
5-
},
62
{
73
"user": "BoxiangW",
84
"date": "2026-01-14"
@@ -26,5 +22,29 @@
2622
{
2723
"user": "janEbert",
2824
"date": "2026-02-18"
25+
},
26+
{
27+
"user": "maanug-nv",
28+
"date": "2026-02-25"
29+
},
30+
{
31+
"user": "BoxiangW",
32+
"date": "2026-03-04"
33+
},
34+
{
35+
"user": "Phlip79",
36+
"date": "2026-03-11"
37+
},
38+
{
39+
"user": "asolergi-nv",
40+
"date": "2026-03-18"
41+
},
42+
{
43+
"user": "dimapihtar",
44+
"date": "2026-03-25"
45+
},
46+
{
47+
"user": "gautham-kollu",
48+
"date": "2026-04-01"
2949
}
3050
]

.github/workflows/cicd-main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,7 @@ jobs:
406406
is_unit_test: "true"
407407
PAT: ${{ secrets.PAT }}
408408
container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
409+
is_ci_workload: ${{ needs.pre-flight.outputs.is_ci_workload }}
409410

410411
cicd-parse-integration-tests:
411412
runs-on: ubuntu-latest

megatron/core/transformer/attention.py

Lines changed: 79 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
import copy
3+
import inspect
34
from abc import ABC, abstractmethod
45
from dataclasses import dataclass
56
from typing import NoReturn, Optional, Tuple, Union
@@ -578,6 +579,74 @@ def flash_decode(
578579
)
579580
return out
580581

582+
def _flash_attention_3_forward_wrapper(
583+
self,
584+
q: Tensor,
585+
k: Tensor,
586+
v: Tensor,
587+
max_seqlen_q,
588+
max_seqlen_k,
589+
cu_seqlens_q,
590+
seqlens_k,
591+
block_table,
592+
softmax_scale,
593+
):
594+
"""
595+
Wrapper for calling the FA3 _flash_attn_forward function.
596+
Handles argument conversion for different versions of the _flash_attn_forward API.
597+
"""
598+
candidate_kwargs = {
599+
"q": q,
600+
"k": k,
601+
"v": v,
602+
"k_new": None,
603+
"v_new": None,
604+
"qv": None,
605+
"out": None,
606+
"out_": None,
607+
"cu_seqlens_q": cu_seqlens_q,
608+
"cu_seqlens_k": None,
609+
"cu_seqlens_k_new": None,
610+
"seqused_q": None,
611+
"seqused_k": seqlens_k,
612+
"max_seqlen_q": max_seqlen_q,
613+
"max_seqlen_k": max_seqlen_k,
614+
"page_table": block_table,
615+
"kv_batch_idx": None,
616+
"leftpad_k": None,
617+
"rotary_cos": None,
618+
"rotary_sin": None,
619+
"seqlens_rotary": None,
620+
"q_descale": None,
621+
"k_descale": None,
622+
"v_descale": None,
623+
"softmax_scale": softmax_scale,
624+
"causal": True,
625+
"attention_chunk": 0,
626+
"softcap": 0.0,
627+
"window_size": (-1, -1),
628+
"window_size_left": -1,
629+
"window_size_right": -1,
630+
"rotary_interleaved": True,
631+
"scheduler_metadata": None,
632+
"num_splits": 0 if not self.batch_invariant_mode else 1,
633+
"pack_gqa": None,
634+
"sm_margin": 0,
635+
}
636+
637+
# Parse the expected argument names from the function signature
638+
if inspect.isfunction(_flash_attn_forward):
639+
sig = inspect.signature(_flash_attn_forward)
640+
else:
641+
assert isinstance(_flash_attn_forward, torch._library.custom_ops.CustomOpDef)
642+
sig = inspect.signature(_flash_attn_forward._init_fn)
643+
valid_kwargs = set(sig.parameters.keys())
644+
final_kwargs = {k: candidate_kwargs[k] for k in valid_kwargs if k in candidate_kwargs}
645+
646+
output_total, *unused = _flash_attn_forward(**final_kwargs)
647+
648+
return output_total
649+
581650
def flash_decode_and_prefill(
582651
self,
583652
q: Tensor,
@@ -619,40 +688,16 @@ def flash_decode_and_prefill(
619688
if HAVE_FA3:
620689
# TODO(ksanthanam): Replace with call to flash_attn_varlen_func once
621690
# it accepts block_table
622-
output_total, *unused = _flash_attn_forward(
623-
q=q,
624-
k=k,
625-
v=v,
626-
k_new=None,
627-
v_new=None,
628-
qv=None,
629-
out=None,
630-
cu_seqlens_q=cu_seqlens_q,
631-
cu_seqlens_k=None,
632-
cu_seqlens_k_new=None,
633-
seqused_q=None,
634-
seqused_k=seqlens_k,
635-
max_seqlen_q=max_seqlen_q,
636-
max_seqlen_k=max_seqlen_k,
637-
page_table=block_table,
638-
kv_batch_idx=None,
639-
leftpad_k=None,
640-
rotary_cos=None,
641-
rotary_sin=None,
642-
seqlens_rotary=None,
643-
q_descale=None,
644-
k_descale=None,
645-
v_descale=None,
646-
softmax_scale=softmax_scale,
647-
causal=True,
648-
window_size=(-1, -1),
649-
attention_chunk=0,
650-
softcap=0.0,
651-
rotary_interleaved=True,
652-
scheduler_metadata=None,
653-
num_splits=0 if not self.batch_invariant_mode else 1,
654-
pack_gqa=None,
655-
sm_margin=0,
691+
output_total = self._flash_attention_3_forward_wrapper(
692+
q,
693+
k,
694+
v,
695+
max_seqlen_q,
696+
max_seqlen_k,
697+
cu_seqlens_q,
698+
seqlens_k,
699+
block_table,
700+
softmax_scale,
656701
)
657702
else:
658703
assert (

megatron/training/arguments.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,8 @@ def validate_args(args, defaults={}):
914914
if args.save_retain_interval is not None:
915915
assert args.save_retain_interval > 0
916916
assert args.save_retain_interval % args.save_interval == 0
917+
if args.log_memory_interval is not None:
918+
assert args.log_memory_interval % args.log_interval == 0
917919
# Mixed precision checks.
918920
if args.fp16_lm_cross_entropy:
919921
assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
@@ -2369,6 +2371,10 @@ def _add_training_args(parser):
23692371
'with larger models, sequences, and batch sizes.')
23702372
group.add_argument('--log-interval', type=int, default=100,
23712373
help='Report loss and timing interval.')
2374+
group.add_argument('--log-memory-interval', type=int, default=None,
2375+
help='Report memory interval.')
2376+
group.add_argument('--log-device-memory-used', action='store_true',
2377+
help='Log device memory used (as reported by nvidia-smi).')
23722378
group.add_argument('--tensorboard-dir', type=str, default=None,
23732379
help='Write TensorBoard logs to this directory.')
23742380
group.add_argument('--no-masked-softmax-fusion',

megatron/training/checkpointing.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
has_nvidia_modelopt = False
6565

6666
_CHECKPOINT_VERSION = None
67+
_LOADED_ITERATION = None
6768

6869
logger = getLogger(__name__)
6970
_NON_PERSISTENT_CKPT_SUBDIR = 'non_persistent'
@@ -81,6 +82,22 @@ def get_checkpoint_version():
8182
return _CHECKPOINT_VERSION
8283

8384

85+
def set_loaded_iteration(value):
86+
"""Set the iteration that was loaded from checkpoint.
87+
88+
This is stored separately from args to avoid polluting the checkpoint
89+
with runtime state (args is saved in checkpoints).
90+
"""
91+
global _LOADED_ITERATION
92+
_LOADED_ITERATION = value
93+
94+
95+
def get_loaded_iteration():
96+
"""Get the iteration that was loaded from checkpoint, or None if no checkpoint was loaded."""
97+
global _LOADED_ITERATION
98+
return _LOADED_ITERATION
99+
100+
84101
def check_checkpoint_args(checkpoint_args):
85102
"""Ensure fixed arguments for a model are the same for the input
86103
arguments and the one retrieved from checkpoint."""
@@ -1132,6 +1149,10 @@ def _load_base_checkpoint(
11321149
if getattr(args, "ckpt_step", None):
11331150
iteration = args.ckpt_step
11341151

1152+
# Record the iteration loaded (stored separately from args to avoid
1153+
# polluting checkpoints, since args is saved in checkpoints).
1154+
set_loaded_iteration(iteration)
1155+
11351156
if non_persistent_iteration != -1: # there is a non-persistent checkpoint
11361157
if non_persistent_iteration >= iteration:
11371158
return _load_non_persistent_base_checkpoint(

megatron/training/training.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ def set_startup_timestamps(program_start=None, main_entry=None):
102102
from megatron.training.checkpointing import load_checkpoint
103103
from megatron.training.checkpointing import save_checkpoint
104104
from megatron.training.checkpointing import checkpoint_exists
105+
from megatron.training.checkpointing import get_loaded_iteration
105106
from megatron.core.full_cuda_graph import FullCudaGraphWrapper
106107
from megatron.core.transformer.cuda_graphs import TECudaGraphHelper
107108
from megatron.core.transformer.enums import CudaGraphScope
@@ -1899,6 +1900,8 @@ def training_log(
18991900
writer.add_scalar('max_attention_logit', max_attention_logit, iteration)
19001901
if wandb_writer:
19011902
wandb_writer.log({'max_attention_logit': max_attention_logit}, iteration)
1903+
1904+
# Log MoE metrics.
19021905
if args.num_experts is not None:
19031906
moe_loss_scale = 1 / get_num_microbatches()
19041907
track_names = []
@@ -1930,12 +1933,15 @@ def training_log(
19301933
mtp_num_layers=args.mtp_num_layers,
19311934
pg_collection=pg_collection,
19321935
)
1936+
1937+
# Log MTP metrics.
19331938
if args.mtp_num_layers is not None:
19341939
mtp_loss_scale = 1 / get_num_microbatches()
19351940
MTPLossLoggingHelper.track_mtp_metrics(
19361941
mtp_loss_scale, iteration, writer, wandb_writer, total_loss_dict
19371942
)
1938-
# Track sparse attention indexer loss
1943+
1944+
# Track sparse attention indexer loss.
19391945
if args.dsa_indexer_loss_coeff is not None and args.dsa_indexer_loss_coeff > 0:
19401946
indexer_loss_scale = 1 / get_num_microbatches()
19411947
DSAIndexerLossLoggingHelper.track_indexer_metrics(
@@ -1945,6 +1951,8 @@ def training_log(
19451951
wandb_writer=wandb_writer,
19461952
total_loss_dict=total_loss_dict,
19471953
)
1954+
1955+
# Dump memory snapshot and print metrics to stdout.
19481956
if iteration % args.log_interval == 0 or is_first_iteration:
19491957
if args.record_memory_history and (is_last_rank() or torch.distributed.get_backend() == 'fake'):
19501958
snapshot = torch.cuda.memory._snapshot()
@@ -2026,16 +2034,22 @@ def training_log(
20262034
total_loss_dict[skipped_iters_key] = 0
20272035
total_loss_dict[nan_iters_key] = 0
20282036
print_rank_last(log_string)
2037+
reported_memory_in_this_iteration = False
20292038
if report_memory_flag:
20302039
# Report memory after optimizer state has been initialized.
20312040
if torch.distributed.get_rank() == 0:
20322041
num_microbatches = get_num_microbatches()
20332042
report_theoretical_memory(args, num_microbatches=num_microbatches, verbose=True)
20342043
report_memory(f'(after {iteration} iterations)')
2035-
if iteration > 1:
2044+
reported_memory_in_this_iteration = True
2045+
loaded_iteration = max(get_loaded_iteration() or 0, 0)
2046+
if iteration > (loaded_iteration + 1):
20362047
# Make sure the memory after the second iteration is reported to include optimizer state memory.
20372048
report_memory_flag = False
2038-
# Write timers to wandb, don't reset the counts
2049+
if args.log_memory_interval is not None and iteration % args.log_memory_interval == 0 and \
2050+
not reported_memory_in_this_iteration:
2051+
report_memory(f'(after {iteration} iterations)')
2052+
# Write timers to wandb, don't reset the counts.
20392053
if args.log_timers_to_tensorboard:
20402054
timers.write(timers_to_log, writer, iteration, normalizer=args.log_interval, reset=False)
20412055
timers.write(timers_to_log, wandb_writer, iteration, normalizer=args.log_interval, reset=False)
@@ -2095,6 +2109,9 @@ def force_param_sync(model_chunks: list[DDP]) -> None:
20952109
assert isinstance(model_chunk, DDP)
20962110
model_chunk.start_param_sync(force_sync=True)
20972111

2112+
# Only report memory for first 3 checkpoint saves.
2113+
num_checkpoints_memory_reported = 0
2114+
MAX_NUM_CHECKPOINTS_MEMORY_REPORTED = 3
20982115

20992116
def save_checkpoint_and_time(
21002117
iteration,
@@ -2122,6 +2139,14 @@ def save_checkpoint_and_time(
21222139
one_logger_utils.track_e2e_metrics()
21232140
if should_disable_forward_pre_hook(args):
21242141
force_param_sync(model)
2142+
2143+
global num_checkpoints_memory_reported, MAX_NUM_CHECKPOINTS_MEMORY_REPORTED
2144+
should_report_memory = num_checkpoints_memory_reported < MAX_NUM_CHECKPOINTS_MEMORY_REPORTED
2145+
2146+
if should_report_memory:
2147+
# Track memory before checkpoint save.
2148+
report_memory(f"(before save_checkpoint for iteration {iteration})")
2149+
# Save checkpoint.
21252150
save_checkpoint(
21262151
iteration,
21272152
model,
@@ -2133,6 +2158,11 @@ def save_checkpoint_and_time(
21332158
train_data_iterator=train_data_iterator,
21342159
preprocess_common_state_dict_fn=preprocess_common_state_dict,
21352160
)
2161+
if should_report_memory:
2162+
# Track memory after checkpoint save.
2163+
report_memory(f"(after save_checkpoint for iteration {iteration})")
2164+
num_checkpoints_memory_reported += 1
2165+
21362166
if args.fp8:
21372167
# Run garbage collection after checkpoint saving to free memory from
21382168
# dequantized bf16 tensors that were temporarily created during fp8

megatron/training/utils.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -276,12 +276,15 @@ def logical_and_across_model_parallel_group(input: bool) -> bool:
276276

277277
def report_memory(name):
278278
"""Simple GPU memory report."""
279+
args = get_args()
279280
mega_bytes = 1024.0 * 1024.0
280281
string = name + ' memory (MB)'
281-
string += ' | allocated: {}'.format(torch.cuda.memory_allocated() / mega_bytes)
282-
string += ' | max allocated: {}'.format(torch.cuda.max_memory_allocated() / mega_bytes)
283-
string += ' | reserved: {}'.format(torch.cuda.memory_reserved() / mega_bytes)
284-
string += ' | max reserved: {}'.format(torch.cuda.max_memory_reserved() / mega_bytes)
282+
string += f" | allocated: {torch.cuda.memory_allocated() / mega_bytes:.2f}"
283+
string += f" | max allocated: {torch.cuda.max_memory_allocated() / mega_bytes:.2f}"
284+
string += f" | reserved: {torch.cuda.memory_reserved() / mega_bytes:.2f}"
285+
string += f" | max reserved: {torch.cuda.max_memory_reserved() / mega_bytes:.2f}"
286+
if args.log_device_memory_used:
287+
string += f" | total device memory used: {torch.cuda.device_memory_used() / mega_bytes:.2f}"
285288
if mpu.get_data_parallel_rank() == 0:
286289
print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True)
287290

0 commit comments

Comments
 (0)