Skip to content

Commit 983e5f3

Browse files
committed
support tp
Signed-off-by: tailaim <tailaim@nvidia.com>
1 parent 48e91d2 commit 983e5f3

File tree

5 files changed

+44
-22
lines changed

5 files changed

+44
-22
lines changed

megatron/core/pipeline_parallel/data_schedule.py

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -307,10 +307,9 @@ def _broadcast(item):
307307
total_hdp_gpus = dp_cp_group.size()
308308
dev = torch.cuda.current_device()
309309

310-
# TODO(tailaim): handle the case when data_iterator is None
311310
if data_iterator is None:
312311
# TP-0 reads from data_iterator, others receive via broadcast.
313-
sample_id_groups, batch = None, None, None
312+
sample_id_groups, batch = None, None
314313
num_total_groups_broadcast = torch.tensor([0], dtype=torch.int32, device=dev)
315314
_broadcast(num_total_groups_broadcast)
316315
num_micro_batches = int(num_total_groups_broadcast.item())
@@ -352,6 +351,10 @@ def _broadcast(item):
352351
)
353352
_broadcast(num_total_groups_broadcast)
354353

354+
# TODO(tailaim): calculate these two values properly
355+
# num_total_tokens_this_GA = losses_reduced.pop(0)
356+
# sequence_square_sum_this_GA = losses_reduced.pop(0)
357+
355358
# pack sequences in the same group and create a new data iterator
356359
new_samples = []
357360
for i in range(num_micro_batches):
@@ -375,10 +378,10 @@ def _pack_tensors(tensors):
375378

376379
# TODO(tailaim): do we need attention_mask for sequence packing?
377380
new_sample = {}
378-
new_sample["tokens"] = tokens.unsqueeze(0)
379-
new_sample["labels"] = labels.unsqueeze(0)
380-
new_sample["loss_mask"] = loss_mask.unsqueeze(0)
381-
new_sample["position_ids"] = position_ids.unsqueeze(0)
381+
new_sample["tokens"] = tokens
382+
new_sample["labels"] = labels
383+
new_sample["loss_mask"] = loss_mask
384+
new_sample["position_ids"] = position_ids
382385
new_sample["local_cp_size"] = torch.tensor(
383386
partner_cp_size, dtype=torch.int32, device=dev
384387
)
@@ -442,6 +445,7 @@ def __init__(self, config):
442445
super().__init__(config)
443446
self.max_seq_len_all_ranks = config.max_seqlen_per_dp_cp_rank * config.context_parallel_size
444447
self.dp_size = parallel_state.get_data_parallel_world_size()
448+
self.cp_size = parallel_state.get_context_parallel_world_size()
445449

446450
def get_groups_and_subsamples(self, sample_id_seqlens, config):
447451
"""
@@ -451,35 +455,44 @@ def get_groups_and_subsamples(self, sample_id_seqlens, config):
451455
"""
452456
groups = []
453457
sample_id_groups = []
458+
packed_id_groups = []
454459
sum_seqlen = 0
455460
single_microbatch = []
456461

457462
for i in range(len(sample_id_seqlens)):
458-
if sum_seqlen + sample_id_seqlens[i] <= self.max_seq_len_all_ranks:
463+
if sum_seqlen + sample_id_seqlens[i][1] <= self.max_seq_len_all_ranks:
459464
single_microbatch.append(i)
460465
sum_seqlen += sample_id_seqlens[i][1]
461466
else:
462467
groups.append(single_microbatch)
463-
sample_id_groups.append(single_microbatch)
468+
packed_id_groups.append(single_microbatch)
464469
single_microbatch = [i]
465470
sum_seqlen = sample_id_seqlens[i][1]
466471

467-
# we want the number of microbatches to be multiple of dp_size
472+
# we want the number of packed sequences to be a multiple of dp_size
468473
# so we move a few samples from the previous microbatch
469474
# to the end of the microbatches if needed
470-
num_microbatches_before = len(sample_id_groups)
471-
if num_microbatches_before % self.dp_size != 0:
472-
remainder = num_microbatches_before % self.dp_size
475+
num_packed_sequence = len(packed_id_groups)
476+
if num_packed_sequence % self.dp_size != 0:
477+
remainder = num_packed_sequence % self.dp_size
473478
num_to_move = self.dp_size - remainder
474-
i = num_microbatches_before - 1
479+
i = num_packed_sequence - 1
475480
while num_to_move > 0:
476481
assert i > 0, "Not enough samples to move"
477-
if len(sample_id_groups[i]) > 1:
478-
seq_id = sample_id_groups[i].pop()
479-
sample_id_groups[i].append(seq_id)
482+
if len(packed_id_groups[i]) > 1:
483+
seq_id = packed_id_groups[i].pop()
484+
packed_id_groups[i].append(seq_id)
480485
num_to_move -= 1
481486
else:
482487
i -= 1
488+
489+
num_micro_batches = int(len(packed_id_groups) / self.dp_size)
490+
for i in range(num_micro_batches):
491+
sample_id_groups.append([])
492+
for j in range(self.cp_size * self.dp_size):
493+
seq_id = int(i * self.dp_size + j / self.cp_size)
494+
sample_id_groups[i].append(packed_id_groups[seq_id])
495+
483496
return groups, sample_id_groups
484497

485498

megatron/core/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1910,11 +1910,16 @@ def get_thd_batch_on_this_cp_rank(
19101910
cp_group=cp_group,
19111911
)
19121912

1913+
for key in ['tokens', 'position_ids', 'labels', 'loss_mask']:
1914+
if key in batch:
1915+
batch[key] = batch[key].unsqueeze(0)
1916+
19131917
if cp_size > 1: # slice batch along sequence dimension for context parallelism
19141918
assert tex is not None and is_te_min_version("1.10.0"), (
19151919
"Please update Transformer Engine to >= 1.10 to use "
19161920
"Context Parallel with THD format data"
19171921
)
1922+
# print(f"tokens shape before cp slice: {batch['tokens'].shape}")
19181923
index = tex.thd_get_partitioned_indices(
19191924
cu_seqlens_padded, batch['tokens'].size(1), cp_size, cp_rank
19201925
)

megatron/training/arguments.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -932,7 +932,6 @@ def validate_args(args, defaults={}):
932932
if args.hybrid_context_parallel:
933933
assert not args.pipeline_model_parallel_size > 1, 'Hybrid context parallelism not supported with pipeline parallelism'
934934
assert not args.enable_cuda_graph, 'Hybrid context parallelism not supported with CUDA Graph'
935-
assert not args.use_megatron_fsdp, 'Hybrid context parallelism not supported with Megatron FSDP'
936935
assert args.dataloader_type == 'single', 'Hybrid context parallelism only supported with single dataloader type'
937936
assert args.calculate_per_token_loss, 'Hybrid context parallelism must be used with --calculate-per-token-loss'
938937

megatron/training/datasets/sft_dataset.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,14 @@ def _calculate_padding_divisor(self) -> int:
8181
cp_pad = cp_pad * dp_size if hybrid_cp else cp_pad
8282
divisor = cp_pad * tp_pad
8383
"""
84-
cp_pad = self.config.context_parallel_size * 2 if self.config.context_parallel_size > 1 else 1
85-
cp_pad = cp_pad * self.config.data_parallel_size if self.config.hybrid_context_parallel else cp_pad
84+
if self.config.hybrid_context_parallel:
85+
# Hybrid CP: consider both CP and DP
86+
cp_pad = self.config.data_parallel_size * self.config.context_parallel_size * 2
87+
else:
88+
# Standard CP: only consider CP
89+
cp_pad = self.context_parallel_size * 2 if self.context_parallel_size > 1 else 1
8690
tp_pad = self.config.sequence_parallel_size if self.config.sequence_parallel_size > 0 else 1
8791
divisor = cp_pad * tp_pad
88-
8992
return divisor
9093

9194
def get_padding_size(

megatron/training/utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -563,8 +563,6 @@ def _broadcast_cu_seqlens(cu_seqlens):
563563
else:
564564
assert isinstance(cu_seqlens, torch.Tensor)
565565
assert cu_seqlens.dtype == torch.int32
566-
#TODO(tailaim): verify the shape for this tensor
567-
# assert cu_seqlens.shape[0] == 1, "micro-batch-size must be 1 for packing"
568566
buf = cu_seqlens.to(device=dev, non_blocking=True).contiguous()
569567
_broadcast(buf)
570568

@@ -732,6 +730,10 @@ def _broadcast_cu_seqlens():
732730
'local_cp_size': local_cp_size,
733731
}
734732

733+
if not args.sft_sequence_packing:
734+
keys_to_keep = ['tokens', 'labels', 'loss_mask', 'attention_mask', 'position_ids']
735+
batch = {k: v for k, v in batch.items() if k in keys_to_keep}
736+
735737
return batch
736738

737739

0 commit comments

Comments
 (0)