Skip to content

Commit ffe8f94

Browse files
committed
add test_wrap_dataloader UT
Signed-off-by: xiaoyao0115 <1804647152@qq.com>
1 parent 86581cd commit ffe8f94

File tree

8 files changed

+325
-129
lines changed

8 files changed

+325
-129
lines changed

megatron/core/datasets/data_schedule.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,8 @@ def _build_packed_microbatches(
437437
# When VPP is enabled, align num_micro_batches to this multiple.
438438
(
439439
None
440-
if config.virtual_pipeline_model_parallel_size is None
440+
if (config.virtual_pipeline_model_parallel_size is None or
441+
config.virtual_pipeline_model_parallel_size == 1)
441442
else config.microbatch_group_size_per_vp_stage
442443
),
443444
config.hybrid_context_parallel,
@@ -1009,7 +1010,14 @@ def get_groups_and_subsamples(self, sample_id_seqlens):
10091010
single_microbatch = []
10101011

10111012
for i in range(len(sample_id_seqlens)):
1012-
single_microbatch = [i]
1013+
if sum_seqlen + sample_id_seqlens[i][1] <= self.max_seq_len_all_ranks:
1014+
single_microbatch.append(i)
1015+
sum_seqlen += sample_id_seqlens[i][1]
1016+
else:
1017+
packed_id_groups.append(single_microbatch)
1018+
single_microbatch = [i]
1019+
sum_seqlen = sample_id_seqlens[i][1]
1020+
if len(single_microbatch) > 0:
10131021
packed_id_groups.append(single_microbatch)
10141022

10151023
# we want the number of packed sequences to be multiple of dp_size
@@ -1100,6 +1108,8 @@ def check_require_sample_keys(self, batch: List[Dict]):
11001108
# we only fetch it once, rather than iterating num_micro_batches times.
11011109
for key in required_keys:
11021110
if key not in batch[0]:
1111+
#debugmtl
1112+
print(f"key {key} not in batch[0]: {batch[0]}")
11031113
return False
11041114
return True
11051115

@@ -1631,13 +1641,22 @@ def fill_empty(sample_id_group):
16311641
sample_id_group = fill_empty(sample_id_group)
16321642
return sample_id_group
16331643

1644+
attempts_since_split = 0
16341645
while remainder > 0:
1635-
assert i >= 0, f'align_sample_id_groups: no tail microbatch has enough ids to split'
1646+
if i < 0:
1647+
if attempts_since_split >= len(sample_id_groups):
1648+
assert (
1649+
False
1650+
), f'align_sample_id_groups: no tail microbatch has enough ids to split'
1651+
i = len(sample_id_groups) - 1
16361652
group1, group2 = split_group(sample_id_groups[i])
16371653
if group1 is not None and group2 is not None:
16381654
sample_id_groups[i] = group1
16391655
sample_id_groups.append(group2)
16401656
remainder -= 1
1657+
attempts_since_split = 0
1658+
else:
1659+
attempts_since_split += 1
16411660
i -= 1
16421661

16431662
return sample_id_groups
@@ -1704,16 +1723,18 @@ def _broadcast_to_tp_group(item):
17041723

17051724
# data_iterator should return a batch including the following keys.
17061725
batch_keys = [
1707-
'tokens',
1708-
'position_ids',
1709-
'labels',
1710-
'loss_mask',
17111726
'cu_seqlens',
17121727
'cu_seqlens_padded',
17131728
'max_seqlen',
17141729
]
17151730
if hybrid_context_parallel:
17161731
batch_keys.append('local_cp_size')
1732+
if is_first_stage:
1733+
batch_keys.append('tokens')
1734+
batch_keys.append('position_ids')
1735+
if is_last_stage:
1736+
batch_keys.append('labels')
1737+
batch_keys.append('loss_mask')
17171738

17181739
# Get a batch from data_iterator or create an empty batch.
17191740
if is_tp_rank_0:

megatron/core/extensions/transformer_engine.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1329,21 +1329,17 @@ def forward(
13291329
"""Forward."""
13301330
if packed_seq_params is not None:
13311331
# If Dynamic CP group is provided, update TE DPA CP group
1332-
if packed_seq_params.cp_group is not None:
1333-
self.cp_group = packed_seq_params.cp_group
1334-
super().set_context_parallel_group(
1335-
self.cp_group,
1336-
torch.distributed.get_process_group_ranks(self.cp_group),
1337-
TEDotProductAttention.cp_stream,
1338-
self.cp_comm_type,
1339-
)
1340-
# If cp_group is None but local_cp_size is provided,
1341-
# Indicates to turn off CP dynamically
1342-
elif packed_seq_params.local_cp_size is not None:
1343-
assert (
1344-
packed_seq_params.local_cp_size == 1
1345-
), "local_cp_size must be == 1 if provided without cp_group"
1346-
super().set_context_parallel_group(None, None, None, self.cp_comm_type)
1332+
if packed_seq_params.local_cp_size is not None:
1333+
if packed_seq_params.local_cp_size == 1:
1334+
super().set_context_parallel_group(None, None, None, self.cp_comm_type)
1335+
else:
1336+
self.cp_group = packed_seq_params.cp_group
1337+
super().set_context_parallel_group(
1338+
self.cp_group,
1339+
torch.distributed.get_process_group_ranks(self.cp_group),
1340+
TEDotProductAttention.cp_stream,
1341+
self.cp_comm_type,
1342+
)
13471343
self.kept_packed_seq_params.discard("cp_group")
13481344
self.kept_packed_seq_params.discard("local_cp_size")
13491345

megatron/core/model_parallel_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,3 +478,8 @@ def __post_init__(self):
478478
"SFT sequence packing requires Transformer Engine >= 2.9.0 "
479479
f"but got {get_te_version()} (TE < 2.9.0 may have convergence issues)."
480480
)
481+
if self.sequence_packing_scheduler == None:
482+
if self.hybrid_context_parallel:
483+
self.sequence_packing_scheduler = "default_hybrid_cp"
484+
else:
485+
self.sequence_packing_scheduler = "naive_sequence_packing"

megatron/core/pipeline_parallel/schedules.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,6 @@ def wrap_iterator_helper(
520520
):
521521
"""Warp data iterator for sequence packing if needed."""
522522
if config.sequence_packing:
523-
num_total_tokens_this_global_batch, sequence_square_sum_this_global_batch = None, None
524523
scheduler_type_map = {
525524
'default_hybrid_cp': PackingScheduler.DEFAULT_HYBRID_CP,
526525
'empty_scheduler_with_packing': PackingScheduler.EMPTY_PACKING,
@@ -707,7 +706,7 @@ def forward_backward_no_pipelining(
707706
):
708707
create_cudagraphs()
709708

710-
if config.sequence_packing:
709+
if config.sequence_packing and not forward_only:
711710
forward_data_store.append(
712711
[num_total_tokens_this_global_batch, sequence_square_sum_this_global_batch]
713712
)
@@ -2091,7 +2090,7 @@ def pp_post_backward(input_tensor_grad, vp_stage=None):
20912090
create_cudagraphs()
20922091
nvtx_range_pop(suffix="misc")
20932092

2094-
if config.sequence_packing:
2093+
if config.sequence_packing and not forward_only:
20952094
forward_data_store.append(
20962095
[num_total_tokens_this_global_batch, sequence_square_sum_this_global_batch]
20972096
)
@@ -2489,7 +2488,7 @@ def enable_grad_sync():
24892488
):
24902489
create_cudagraphs()
24912490

2492-
if config.sequence_packing:
2491+
if config.sequence_packing and not forward_only:
24932492
forward_data_store.append(
24942493
[num_total_tokens_this_global_batch, sequence_square_sum_this_global_batch]
24952494
)

megatron/training/arguments.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -815,11 +815,6 @@ def validate_args(args, defaults={}):
815815
# TODO(tailaim): add support for other dispatcher types
816816
print(f"Setting moe_token_dispatcher_type to alltoall for sft sequence packing with pipeline parallelism")
817817
args.moe_token_dispatcher_type = "alltoall"
818-
if args.sequence_packing_scheduler is None:
819-
if args.hybrid_context_parallel:
820-
args.sequence_packing_scheduler = 'default_hybrid_cp'
821-
else:
822-
args.sequence_packing_scheduler = 'naive_sequence_packing'
823818
else:
824819
args.variable_seq_lengths = False
825820

@@ -983,6 +978,9 @@ def validate_args(args, defaults={}):
983978
assert args.context_parallel_size == 1, 'context parallel size must be 1 for hybrid context parallelism'
984979

985980
if args.sequence_packing:
981+
assert not args.create_attention_mask_in_dataloader, \
982+
'Sequence packing does not support create_attention_mask_in_dataloader. ' \
983+
'Please set --no-create-attention-mask-in-dataloader'
986984
# Validate that packed sequence buffer is large enough for single sequences
987985
if args.hybrid_context_parallel:
988986
# packed_buffer_size = hdp_size * max_seqlen_per_rank >= single_seq_max_len
@@ -2932,7 +2930,7 @@ def _add_distributed_args(parser):
29322930
'Requires --max-seqlen-per-dp-cp-rank to be set.')
29332931
group.add_argument('--min-hybrid-context-parallel-size', type=int, default=1,
29342932
help='Minimum size of the hybrid context parallel groups.')
2935-
group.add_argument('--sequence-packing-scheduler', type=str, default='default_hybrid_cp',
2933+
group.add_argument('--sequence-packing-scheduler', type=str, default=None,
29362934
choices=['default_hybrid_cp', 'empty_scheduler_with_packing', 'empty_scheduler_no_packing', 'naive_sequence_packing'],
29372935
help='Scheduler for sequence packing and hybrid context parallel. '
29382936
'naive_sequence_packing: default naive sequence packing scheduler(just THD, no Hybrid-CP, this '

megatron/training/training.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2839,10 +2839,6 @@ def evaluate(
28392839
decoder_seq_length=args.decoder_seq_length,
28402840
forward_only=True,
28412841
)
2842-
if args.sequence_packing:
2843-
# need to drop first two elements which are total_num_tokens and
2844-
# total_sequence_square_sum
2845-
loss_dicts = loss_dicts[2:]
28462842
ft_integration.on_eval_step_end()
28472843
config.timers = get_timers()
28482844

0 commit comments

Comments (0)