Skip to content

Commit 669e0f3

Browse files
committed
need to modify dataiterator_wrapper UT
Signed-off-by: tailaim <tailaim@nvidia.com>
1 parent fdcd250 commit 669e0f3

File tree

4 files changed

+291
-100
lines changed

4 files changed

+291
-100
lines changed

megatron/core/datasets/data_schedule.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,6 +1108,8 @@ def check_require_sample_keys(self, batch: List[Dict]):
11081108
# we only fetch it once, rather than iterating num_micro_batches times.
11091109
for key in required_keys:
11101110
if key not in batch[0]:
1111+
#debugmtl
1112+
print(f"key {key} not in batch[0]: {batch[0]}")
11111113
return False
11121114
return True
11131115

megatron/core/pipeline_parallel/schedules.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,6 @@ def wrap_iterator_helper(
520520
):
521521
"""Warp data iterator for sequence packing if needed."""
522522
if config.sequence_packing:
523-
num_total_tokens_this_global_batch, sequence_square_sum_this_global_batch = None, None
524523
scheduler_type_map = {
525524
'default_hybrid_cp': PackingScheduler.DEFAULT_HYBRID_CP,
526525
'empty_scheduler_with_packing': PackingScheduler.EMPTY_PACKING,
@@ -707,7 +706,7 @@ def forward_backward_no_pipelining(
707706
):
708707
create_cudagraphs()
709708

710-
if config.sequence_packing:
709+
if config.sequence_packing and not forward_only:
711710
forward_data_store.append(
712711
[num_total_tokens_this_global_batch, sequence_square_sum_this_global_batch]
713712
)
@@ -2091,7 +2090,7 @@ def pp_post_backward(input_tensor_grad, vp_stage=None):
20912090
create_cudagraphs()
20922091
nvtx_range_pop(suffix="misc")
20932092

2094-
if config.sequence_packing:
2093+
if config.sequence_packing and not forward_only:
20952094
forward_data_store.append(
20962095
[num_total_tokens_this_global_batch, sequence_square_sum_this_global_batch]
20972096
)
@@ -2489,7 +2488,7 @@ def enable_grad_sync():
24892488
):
24902489
create_cudagraphs()
24912490

2492-
if config.sequence_packing:
2491+
if config.sequence_packing and not forward_only:
24932492
forward_data_store.append(
24942493
[num_total_tokens_this_global_batch, sequence_square_sum_this_global_batch]
24952494
)

megatron/training/training.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2839,10 +2839,6 @@ def evaluate(
28392839
decoder_seq_length=args.decoder_seq_length,
28402840
forward_only=True,
28412841
)
2842-
if args.sequence_packing:
2843-
# need to drop first two elements which are total_num_tokens and
2844-
# total_sequence_square_sum
2845-
loss_dicts = loss_dicts[2:]
28462842
ft_integration.on_eval_step_end()
28472843
config.timers = get_timers()
28482844

0 commit comments

Comments
 (0)