1616from typing import Any , Iterator , Optional , Tuple
1717
1818import torch
19-
2019from megatron .core .packed_seq_params import PackedSeqParams
2120from megatron .core .parallel_state import (
2221 get_context_parallel_rank ,
2322 get_context_parallel_world_size ,
2423)
24+ from megatron .core .utils import StragglerDetector
2525from megatron .training .utils import get_ltor_masks_and_position_ids
26- from nemo_rl . models . megatron . common import _round_up_to_multiple
26+
2727from nemo_rl .algorithms .interfaces import LossFunction , LossType
2828from nemo_rl .distributed .batched_data_dict import BatchedDataDict
2929from nemo_rl .distributed .model_utils import _get_tokens_on_this_cp_rank
30+ from nemo_rl .models .megatron .common import _round_up_to_multiple
3031
3132
3233@dataclass
@@ -45,6 +46,7 @@ class ProcessedMicrobatch:
4546 packed_seq_params: PackedSeqParams for sequence packing (None if not packing)
4647 cu_seqlens_padded: Padded cumulative sequence lengths (None if not packing)
4748 """
49+
4850 data_dict : BatchedDataDict [Any ]
4951 input_ids : torch .Tensor
5052 input_ids_cp_sharded : torch .Tensor
@@ -60,6 +62,7 @@ def make_processed_microbatch_iterator(
6062 seq_length_key : Optional [str ],
6163 pad_individual_seqs_to_multiple_of : int ,
6264 pad_packed_seq_to_multiple_of : int ,
65+ straggler_timer : StragglerDetector ,
6366 pad_full_seq_to : Optional [int ],
6467) -> Iterator [ProcessedMicrobatch ]:
6568 """Wrap a raw microbatch iterator to yield processed microbatches.
@@ -100,6 +103,7 @@ def make_processed_microbatch_iterator(
100103 pad_packed_seq_to_multiple_of = pad_packed_seq_to_multiple_of ,
101104 pad_full_seq_to = pad_full_seq_to ,
102105 pack_sequences = pack_sequences ,
106+ straggler_timer = straggler_timer ,
103107 )
104108
105109 yield ProcessedMicrobatch (
@@ -117,6 +121,7 @@ def get_microbatch_iterator(
117121 data : BatchedDataDict [Any ],
118122 cfg : dict [str , Any ],
119123 mbs : int ,
124+ straggler_timer : StragglerDetector ,
120125 seq_length_key : Optional [str ] = None ,
121126) -> Tuple [Iterator [ProcessedMicrobatch ], int , int , int , int ]:
122127 """Create a processed microbatch iterator from a batch of data.
@@ -179,6 +184,7 @@ def get_microbatch_iterator(
179184 pad_individual_seqs_to_multiple_of = pad_factor ,
180185 pad_packed_seq_to_multiple_of = pad_packed_seq_to_multiple_of ,
181186 pad_full_seq_to = pad_full_seq_to ,
187+ straggler_timer = straggler_timer ,
182188 )
183189
184190 # Compute padded sequence length for pipeline parallelism
@@ -192,70 +198,80 @@ def get_microbatch_iterator(
192198 padded_seq_length ,
193199 )
194200
201+
def process_microbatch(
    data_dict: BatchedDataDict[Any],
    seq_length_key: Optional[str] = None,
    pad_individual_seqs_to_multiple_of: int = 1,
    pad_packed_seq_to_multiple_of: int = 1,
    pad_full_seq_to: Optional[int] = None,
    pack_sequences: bool = False,
    straggler_timer: Optional[StragglerDetector] = None,
) -> tuple[
    torch.Tensor,
    torch.Tensor,
    Optional[torch.Tensor],
    Optional[torch.Tensor],
    Optional[PackedSeqParams],
    Optional[torch.Tensor],
]:
    """Process a microbatch for a Megatron model forward pass.

    When ``pack_sequences`` is True, the individual (padded) sequences in
    ``data_dict["input_ids"]`` are packed into a single sequence via
    ``_pack_sequences_for_megatron`` and all sequence bookkeeping is carried
    by the returned ``PackedSeqParams``. Otherwise the standard Megatron
    left-to-right attention mask and position ids are built.

    Args:
        data_dict: Batch data; must contain ``"input_ids"`` and, when
            ``pack_sequences`` is True, an entry under ``seq_length_key``.
        seq_length_key: Key holding per-sequence lengths; required when
            packing.
        pad_individual_seqs_to_multiple_of: Pad each individual sequence
            length up to a multiple of this value before packing.
        pad_packed_seq_to_multiple_of: Pad the final packed sequence length
            up to a multiple of this value.
        pad_full_seq_to: If set, pad the full packed sequence to exactly this
            length (used for pipeline parallelism).
        pack_sequences: Whether to pack sequences for Megatron.
        straggler_timer: Optional StragglerDetector context manager used to
            time data preprocessing. BUGFIX: the previous signature defaulted
            to ``None`` (with a non-Optional annotation) while the body
            unconditionally called ``straggler_timer(bdata=True)``, crashing
            with ``TypeError`` whenever the default was used. ``None`` now
            simply skips timing.

    Returns:
        Tuple of ``(input_ids, input_ids_cp_sharded, attention_mask,
        position_ids, packed_seq_params, cu_seqlens_padded)``. The last four
        entries are ``None`` when not applicable (mask/position ids are None
        when packing; packing artifacts are None when not packing).
    """
    # Local import keeps this fix self-contained; nullcontext is a no-op
    # stand-in when no straggler timer was supplied.
    from contextlib import nullcontext

    timing_ctx = (
        straggler_timer(bdata=True) if straggler_timer is not None else nullcontext()
    )
    with timing_ctx:
        input_ids = data_dict["input_ids"]
        attention_mask = None
        position_ids = None
        packed_seq_params = None

        seq_lengths = None  # Will be set if using packed sequences
        cu_seqlens = None
        cu_seqlens_padded = None

        if pack_sequences:
            # For packed sequences with padded input, we need sequence lengths
            assert seq_length_key is not None, (
                "seq_length_key must be provided for packed sequences"
            )
            assert seq_length_key in data_dict, (
                f"{seq_length_key} not found in data_dict"
            )

            # Get sequence lengths and context parallel size
            seq_lengths = data_dict[seq_length_key]

            # Pack sequences
            (
                input_ids,
                input_ids_cp_sharded,
                packed_seq_params,
                cu_seqlens,
                cu_seqlens_padded,
            ) = _pack_sequences_for_megatron(
                input_ids,
                seq_lengths,
                pad_individual_seqs_to_multiple_of,
                pad_packed_seq_to_multiple_of,
                pad_full_seq_to,
                cp_rank=get_context_parallel_rank(),
                cp_size=get_context_parallel_world_size(),
            )

            # For packed sequences, position_ids and attention_mask are typically None
            # The PackedSeqParams handles all necessary sequence information
            position_ids = None
            attention_mask = None
        else:
            input_ids_cp_sharded = input_ids
            attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
                data=input_ids,
                eod_token=0,  # used for loss_mask, which we don't use
                pad_token=0,  # used for loss_mask, which we don't use
                reset_position_ids=False,
                reset_attention_mask=False,
                eod_mask_loss=False,
                pad_mask_loss=False,
            )
    # NOTE(review): middle of the return tuple is reconstructed from the
    # 6-element return annotation — confirm against the repository.
    return (
        input_ids,
        input_ids_cp_sharded,
        attention_mask,
        position_ids,
        packed_seq_params,
        cu_seqlens_padded,
    )
267283
284+
268285def process_global_batch (
269286 data : BatchedDataDict [Any ],
270287 batch_idx : int ,
271288 batch_size : int ,
272289 loss_fn : LossFunction ,
273290 dp_group : torch .distributed .ProcessGroup ,
274- ) -> dict [str , Any ]:
291+ ) -> tuple [torch .Tensor , torch .Tensor , torch .Tensor ]:
292+ """Process a global batch for Megatron model forward pass."""
275293 batch = data .get_batch (batch_idx = batch_idx , batch_size = batch_size )
276294
277295 assert "sample_mask" in batch , "sample_mask must be present in the data!"
@@ -301,6 +319,7 @@ def process_global_batch(
301319 global_valid_toks ,
302320 )
303321
322+
304323def _pack_sequences_for_megatron (
305324 input_ids : torch .Tensor ,
306325 seq_lengths : torch .Tensor ,
@@ -605,13 +624,14 @@ def _unpack_sequences_from_megatron(
605624
606625 return unpacked_output
607626
627+
def check_sequence_dim(data: BatchedDataDict[Any]):
    """Sanity-check that dim 1 is the sequence dimension for every tensor.

    The expected sequence length is taken from ``data["input_ids"]``; every
    other tensor in ``data`` with rank > 1 must match it along dim 1.

    Returns:
        Tuple of ``(sequence_dim, seq_dim_size)`` where ``sequence_dim`` is
        always 1.
    """
    sequence_dim = 1
    seq_dim_size = data["input_ids"].shape[sequence_dim]
    for k, v in data.items():
        # Skip non-tensors and 1-D tensors — only rank > 1 carries a seq dim.
        if not torch.is_tensor(v) or v.dim() <= 1:
            continue
        assert v.shape[sequence_dim] == seq_dim_size, (
            f"Dim 1 must be the sequence dim, expected dim 1={seq_dim_size} but got shape {v.shape} for key {k}"
        )
    return sequence_dim, seq_dim_size
0 commit comments