
Commit 1167117

debugging NaN issue when using FSDP+THD
Signed-off-by: tailaim <tailaim@nvidia.com>
1 parent be8d859 commit 1167117

File tree

14 files changed: +422 -20 lines changed


megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py

Lines changed: 3 additions & 0 deletions
@@ -3342,6 +3342,9 @@ def wait_bucket_ready(self, bucket_id, empty_ok=False):
         # Wait for asynchronous / overlapped NCCL operations to complete.
         param_gather_event, mark_bucket_ready_to_use = self.param_gather_event_map.pop(bucket_id)
         param_gather_event.wait()
+        # debugmtl: make the compute stream wait on the all-gather stream before the bucket is used.
+        if self.ag_stream is not None:
+            torch.cuda.current_stream().wait_stream(self.ag_stream)
         mark_bucket_ready_to_use()

     @torch.no_grad()
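For reference, the added wait_stream call is the standard PyTorch pattern for ordering work between a side stream and the current stream. The sketch below is illustrative only; `ag_stream` and the tensors are stand-ins, not the Megatron-FSDP buffer objects:

    # Minimal stream-ordering sketch (assumes a CUDA device is available).
    import torch

    ag_stream = torch.cuda.Stream()              # stand-in for the all-gather stream
    src = torch.randn(1024, device="cuda")
    gathered = torch.empty_like(src)

    with torch.cuda.stream(ag_stream):
        gathered.copy_(src)                      # stand-in for the overlapped all-gather

    # Make the default (compute) stream wait until work queued on ag_stream finishes,
    # so later kernels never read `gathered` too early.
    torch.cuda.current_stream().wait_stream(ag_stream)
    print(gathered.sum().item())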

megatron/core/models/gpt/gpt_model.py

Lines changed: 59 additions & 1 deletion
@@ -37,6 +37,65 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import WrappedTensor, deprecate_inference_params

+# #debugmtl
+# _ACT_GRAD_DUMP_COUNTERS = {}
+
+# def _sanitize_name(name: str) -> str:
+#     return str(name).replace('/', '_').replace('\\', '_').replace('.', '_').replace(' ', '_')
+
+# def _next_act_dump_index(rank: int, layer_name: str) -> int:
+#     key = (rank, layer_name)
+#     cnt = _ACT_GRAD_DUMP_COUNTERS.get(key, 0) + 1
+#     _ACT_GRAD_DUMP_COUNTERS[key] = cnt
+#     return cnt
+
+# def get_debug_hook(layer_name: str):
+#     """
+#     Tensor-level grad hook: save activation grad by (rank, layer_name, index).
+#     """
+#     import os
+#     def hook(grad: torch.Tensor):
+#         if grad is None:
+#             return
+
+#         rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
+
+#         # Change the base directory to wherever you want the dumps written.
+#         base_dir = "/home/tailaim/act_grad_dump"
+#         if not base_dir:
+#             return
+
+#         try:
+#             idx = _next_act_dump_index(rank, layer_name)
+#             layer_dir = os.path.join(
+#                 base_dir,
+#                 f"rank_{rank}",
+#                 _sanitize_name(layer_name),
+#             )
+#             os.makedirs(layer_dir, exist_ok=True)
+#             file_path = os.path.join(layer_dir, f"grad_{idx:06d}.pt")
+
+#             # Only write the first few grads to disk, to avoid too many files.
+#             if idx <= 16:
+#                 torch.save(grad.detach().cpu(), file_path)
+
+#             # Print a single log line only on the first write.
+#             if idx == 1:
+#                 try:
+#                     g_shape = tuple(grad.shape)
+#                     g_dtype = str(grad.dtype)
+#                 except Exception:
+#                     g_shape = "unknown"
+#                     g_dtype = "unknown"
+#                 print(
+#                     f"[Rank {rank}] Saved act grad: layer={layer_name}, "
+#                     f"idx={idx:06d}, shape={g_shape}, dtype={g_dtype}, path={file_path}"
+#                 )
+#         except Exception as e:
+#             print(f"[Rank {rank}] act grad dump failed for {layer_name}: {e}")
+
+#     return hook
+

 class GPTModel(LanguageModule):
     """GPT Transformer language model.
@@ -640,7 +699,6 @@ def _postprocess(
             hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
         )

-        # Restore sequence parallel execution to the output layer if necessary.
         if sequence_parallel_override:
             assert (
                 in_inference_mode
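If the commented-out helper above were re-enabled, it would normally be attached with PyTorch's Tensor.register_hook. The snippet below is only a hypothetical usage sketch; the tensor and the "decoder_output" label are placeholders, and the commit itself never registers the hook:

    # Hypothetical usage of get_debug_hook (assumes the block above is uncommented).
    import torch

    x = torch.randn(4, 8, requires_grad=True)
    h = x * 2.0                                        # stand-in for a decoder activation
    h.register_hook(get_debug_hook("decoder_output"))  # placeholder layer label
    h.sum().backward()                                 # backward fires the hook and dumps grad_000001.pt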

megatron/core/parallel_state.py

Lines changed: 11 additions & 0 deletions
@@ -970,6 +970,17 @@ def initialize_model_parallel(
             if rank in ranks:
                 _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS = hierarchical_groups

+    if hybrid_context_parallel:
+        # PyTorch initializes communicator groups lazily, so issue an NCCL call
+        # for each hybrid DP-CP group size to ensure the communicators are created.
+        group_sizes = [2**i for i in range(int(log2(data_parallel_size)))]
+        if group_sizes[-1] * 2 == data_parallel_size:
+            group_sizes.append(data_parallel_size)
+        for group_size in group_sizes:
+            group = get_hybrid_data_context_parallel_groups(group_size=group_size)
+            torch.distributed.barrier(group=group, device_ids=[torch.cuda.current_device()])
+        torch.cuda.synchronize()
+
     # Build the model-parallel groups.
     global _MODEL_PARALLEL_GROUP
     global _MODEL_PARALLEL_GLOBAL_RANKS
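As a quick worked example of the group-size enumeration in the block above (data_parallel_size = 8 is an assumed value, chosen only for illustration):

    from math import log2

    data_parallel_size = 8
    group_sizes = [2**i for i in range(int(log2(data_parallel_size)))]  # [1, 2, 4]
    if group_sizes[-1] * 2 == data_parallel_size:
        group_sizes.append(data_parallel_size)                          # [1, 2, 4, 8]
    print(group_sizes)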

megatron/core/pipeline_parallel/data_schedule.py

Lines changed: 51 additions & 7 deletions
@@ -10,6 +10,13 @@
 import torch

 from megatron.core import parallel_state
+
+# from megatron.core.pipeline_parallel.utils import (
+#     is_pp_first_stage,
+#     is_pp_last_stage,
+#     is_vp_first_stage,
+#     is_vp_last_stage,
+# )
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.rerun_state_machine import RerunDataIterator

@@ -293,17 +300,24 @@ def _broadcast(item):
         dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
         dp_group = parallel_state.get_data_parallel_group()
         tp_group = parallel_state.get_tensor_model_parallel_group()
+        pp_group = parallel_state.get_pipeline_model_parallel_group()
     else:
         dp_cp_group = pg_collection.dp_cp
         dp_group = pg_collection.dp
         tp_group = pg_collection.tp
+        pp_group = pg_collection.pp
     assert (
         dp_cp_group is not None and dp_group is not None and tp_group is not None
     ), "dp_cp_group, dp_group, tp_group must not be None when using hybrid context parallel"

     total_hdp_gpus = dp_cp_group.size()
     dev = torch.cuda.current_device()

+    # if is_pp_first_stage(pp_group) or is_pp_last_stage(pp_group) and tp_group.rank() == 0:
+    #     # do what data_iterator is doing
+
+    #     # first stage tp-0 broadcast num_micro_batches cu_seqlens to
+
     if data_iterator is None:
         # TP-0 reads from data_iterator, others receive via broadcast.
         sample_id_groups, batch = None, None
@@ -329,6 +343,16 @@ def _broadcast(item):

         groups, sample_id_groups = scheduler.get_groups_and_subsamples(global_id_seqlens, config)

+        # debugmtl: every global sample id must appear exactly once across the packed groups.
+        set_gbs = set()
+        for group in sample_id_groups:
+            for sub in group:
+                set_gbs.update(sub)
+        assert len(set_gbs) == len(
+            global_id_seqlens
+        ), f"set_gbs length: {len(set_gbs)} != global_id_seqlens length: {len(global_id_seqlens)}"
+
         batch = _unpack_batch(batch)
         samples_this_rank_with_id = _reroute_samples_to_hdp_ranks(
             batch,
@@ -384,9 +408,10 @@ def _pack_tensors(tensors):
         new_sample["labels"] = labels
         new_sample["loss_mask"] = loss_mask
         new_sample["position_ids"] = position_ids
-        new_sample["local_cp_size"] = torch.tensor(
-            partner_cp_size, dtype=torch.int32, device=dev
-        )
+        if scheduler_type is PackingScheduler.HYBRID_CP:
+            new_sample["local_cp_size"] = torch.tensor(
+                partner_cp_size, dtype=torch.int32, device=dev
+            )

         # create cu_seqlens_padded
         lengths_padding = np.fromiter(
@@ -415,7 +440,9 @@ def _pack_tensors(tensors):
         new_sample["cu_seqlens"] = cu_seqlens

         new_samples.append(new_sample)
-
+    # #debugmtl
+    # print(f"rank {parallel_state.get_data_parallel_rank(with_context_parallel=True)} "
+    #       f"new_samples length: {len(new_samples)}")
     new_data_iterator = RerunDataIterator(iter(new_samples))

     return (
@@ -460,15 +487,30 @@ def get_groups_and_subsamples(self, sample_id_seqlens, config):
         sum_seqlen = 0
         single_microbatch = []

+        # # debugmtl: pack exactly one sequence per microbatch instead.
+        # num_micro_batches = len(sample_id_seqlens) // self.dp_size
+        # for i in range(num_micro_batches):
+        #     for j in range(self.dp_size):
+        #         packed_id_groups.append([i + j * num_micro_batches])
+
         for i in range(len(sample_id_seqlens)):
             if sum_seqlen + sample_id_seqlens[i][1] <= self.max_seq_len_all_ranks:
                 single_microbatch.append(i)
                 sum_seqlen += sample_id_seqlens[i][1]
             else:
-                groups.append(single_microbatch)
                 packed_id_groups.append(single_microbatch)
                 single_microbatch = [i]
                 sum_seqlen = sample_id_seqlens[i][1]
+        # Flush the last, partially filled microbatch.
+        if len(single_microbatch) > 0:
+            packed_id_groups.append(single_microbatch)
+
+        # debugmtl: the packed groups must cover every sample exactly once.
+        gbs_sum = 0
+        for i in packed_id_groups:
+            gbs_sum += len(i)
+        assert gbs_sum == len(
+            sample_id_seqlens
+        ), f"gbs_sum: {gbs_sum} != sample_id_seqlens length: {len(sample_id_seqlens)}"

         # we want the number of packed sequences to be multiple of dp_size
         # so we move few samples from previous microbatch
@@ -482,7 +524,7 @@ def get_groups_and_subsamples(self, sample_id_seqlens, config):
             assert i > 0, "Not enough samples to move"
             if len(packed_id_groups[i]) > 1:
                 seq_id = packed_id_groups[i].pop()
-                packed_id_groups[i].append(seq_id)
+                packed_id_groups.append([seq_id])
                 num_to_move -= 1
             else:
                 i -= 1
@@ -493,7 +535,9 @@ def get_groups_and_subsamples(self, sample_id_seqlens, config):
             for j in range(self.cp_size * self.dp_size):
                 seq_id = int(i * self.dp_size + j / self.cp_size)
                 sample_id_groups[i].append(packed_id_groups[seq_id])
-
+        # debugmtl
+        # print(f"rank {parallel_state.get_data_parallel_rank(with_context_parallel=True)} "
+        #       f"sample_id_groups: {len(sample_id_groups)}")
         return groups, sample_id_groups

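For reference, the packing loop in get_groups_and_subsamples above is a greedy first-fit pass with the trailing flush added by this commit. The standalone sketch below just re-traces that logic on made-up (id, seqlen) pairs and an assumed max_seq_len_all_ranks of 5000; it is illustrative, not the scheduler itself:

    # Greedy packing + trailing flush, on illustrative values only.
    sample_id_seqlens = [(0, 3000), (1, 2000), (2, 4000), (3, 1000)]
    max_seq_len_all_ranks = 5000

    packed_id_groups, single_microbatch, sum_seqlen = [], [], 0
    for i in range(len(sample_id_seqlens)):
        if sum_seqlen + sample_id_seqlens[i][1] <= max_seq_len_all_ranks:
            single_microbatch.append(i)
            sum_seqlen += sample_id_seqlens[i][1]
        else:
            packed_id_groups.append(single_microbatch)
            single_microbatch = [i]
            sum_seqlen = sample_id_seqlens[i][1]
    if single_microbatch:
        packed_id_groups.append(single_microbatch)  # the flush this commit adds

    # Same invariant the new debugmtl assert checks: full, non-overlapping coverage.
    assert sum(len(g) for g in packed_id_groups) == len(sample_id_seqlens)
    print(packed_id_groups)  # [[0, 1], [2, 3]]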

megatron/core/pipeline_parallel/schedules.py

Lines changed: 2 additions & 1 deletion
@@ -147,7 +147,8 @@ def deallocate_output_tensor(out, deallocate_pipeline_outputs=False):
     if (out is None) or (not deallocate_pipeline_outputs):
         return
     assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__
-    assert out._base is None, "counter-productive to free a view of another tensor."
+    # debugmtl
+    # assert out._base is None, "counter-productive to free a view of another tensor."
     out.data = torch.empty((1,), device=out.device, dtype=out.dtype)
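For context on the assertion that this change comments out: rebinding .data on a tensor that is a view of another tensor does not release the underlying storage, because the base tensor still holds it, which is why freeing a view is called counter-productive. A small standalone illustration (assumed example, not from the commit):

    import torch

    base = torch.arange(8, dtype=torch.float32)
    view = base[:4]
    view.data = torch.empty((1,))   # rebinds only the view; does not free base's buffer
    print(base)                     # base still holds all 8 elements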

megatron/core/transformer/moe/moe_layer.py

Lines changed: 12 additions & 0 deletions
@@ -185,6 +185,10 @@ def route(self, hidden_states: torch.Tensor):
         producing routing probabilities and a mapping.
         """
         probs, routing_map = self.router(hidden_states)
+        # #debugmtl
+        # true_per_row = routing_map.sum(dim=1)  # tensor of shape [n]
+        # assert torch.all(true_per_row == 8), \
+        #     f"routing_map row true counts not all 8, got: {true_per_row}"
         return probs, routing_map

     @maybe_skip_or_early_return_by_cudagraph("preprocess")
@@ -290,6 +294,14 @@ def forward(self, hidden_states: torch.Tensor):
                 "During training, performance may degrade if MoE and tensor parallelism"
                 "are enabled without also enabling sequence parallelism."
             )
+        # # debugmtl
+        # if torch.isnan(hidden_states).any():
+        #     bad_mask = torch.isnan(hidden_states)
+        #     bad_idx = bad_mask.nonzero(as_tuple=False)[:10]
+        #     raise RuntimeError(
+        #         f"[MoE] hidden_states contains NaN, first indices: {bad_idx.tolist()}, "
+        #         f"shape={tuple(hidden_states.shape)}"
+        #     )

         # MoE forward: route -> dispatch -> compute -> combine
         def custom_forward(hidden_states):

megatron/core/transformer/moe/moe_utils.py

Lines changed: 6 additions & 0 deletions
@@ -618,6 +618,12 @@ def compute_topk(scores, topk, num_groups=None, group_topk=None):
             (rows, top_indices), torch.ones_like(probs, dtype=routing_map.dtype), accumulate=False
         )
         routing_map = routing_map.bool()
+        # debugmtl: with top-k routing, every token row must select exactly 8 experts.
+        true_per_row = routing_map.sum(dim=1)  # tensor of shape [n]
+        assert torch.all(
+            true_per_row == 8
+        ), f"in topk routing_with_score_function row true counts not all 8, " \
+           f"got: {true_per_row}, num_tokens: {num_tokens}, logits shape: {logits.shape}"
     else:
         # TODO Try using element-wise operations instead of scatter?
         routing_probs = torch.zeros_like(logits).scatter(1, top_indices, probs)

megatron/core/transformer/moe/router.py

Lines changed: 13 additions & 0 deletions
@@ -512,6 +512,19 @@ def routing(self, logits: torch.Tensor):
             fused=self.config.moe_router_fusion,
         )

+        # debugmtl
+        # true_per_row = routing_map.sum(dim=1)  # tensor of shape [n]
+        # torch.set_printoptions(threshold=torch.inf)
+        # assert torch.all(true_per_row == 8), \
+        #     f"in class topkrouter routing_map row true counts not all 8, "
+        #     f"got: {true_per_row}, logits is: {logits}, topk is: {self.topk}, "
+        #     f"use_pre_softmax is: {self.config.moe_router_pre_softmax}, "
+        #     f"num_groups is: {self.config.moe_router_num_groups}, "
+        #     f"group_topk is: {self.config.moe_router_group_topk}, "
+        #     f"scaling_factor is: {self.config.moe_router_topk_scaling_factor}, "
+        #     f"score_function is: {self.score_function}, "
+        #     f"expert_bias is: {self.expert_bias}, "
+        #     f"fused is: {self.config.moe_router_fusion}"
+
         # Apply token dropping to probs and routing_map.
         if self.config.moe_expert_capacity_factor is not None:
             probs, routing_map = apply_router_token_dropping(

megatron/core/transformer/moe/token_dispatcher.py

Lines changed: 46 additions & 6 deletions
@@ -664,12 +664,52 @@ def token_dispatch(self, permutated_local_input_tokens, permuted_probs):
         self.tokens_per_expert = self._maybe_dtoh_and_synchronize(
             "before_ep_alltoall", self.tokens_per_expert
         )
-        global_input_tokens = all_to_all(
-            self.ep_group, permutated_local_input_tokens, self.output_splits, self.input_splits
-        )
-        global_probs = all_to_all(
-            self.ep_group, permuted_probs, self.output_splits, self.input_splits
-        )
+        # debugmtl
+        # global_input_tokens = all_to_all(
+        #     self.ep_group, permutated_local_input_tokens,
+        #     self.output_splits, self.input_splits
+        # )
+        # global_probs = all_to_all(
+        #     self.ep_group, permuted_probs, self.output_splits,
+        #     self.input_splits
+        # )
+        try:
+            global_input_tokens = all_to_all(
+                self.ep_group, permutated_local_input_tokens, self.output_splits, self.input_splits
+            )
+            global_probs = all_to_all(
+                self.ep_group, permuted_probs, self.output_splits, self.input_splits
+            )
+        except RuntimeError as e:
+            # Get the rank within the EP group (guard against the group not being initialized yet).
+            try:
+                rank = torch.distributed.get_rank(self.ep_group)
+            except Exception:
+                rank = -1
+
+            print(f"[MoE all_to_all error] rank={rank}, err={e}")
+            print(
+                f"[MoE all_to_all debug] "
+                f"tokens_shape={getattr(permutated_local_input_tokens, 'shape', None)}, "
+                f"probs_shape={getattr(permuted_probs, 'shape', None)}"
+            )
+            print(
+                f"[MoE all_to_all debug] "
+                f"input_splits={self.input_splits}, "
+                f"sum={sum(self.input_splits) if self.input_splits is not None else None}, "
+                f"output_splits={self.output_splits}, "
+                f"sum={sum(self.output_splits) if self.output_splits is not None else None}"
+            )
+            print(
+                f"[MoE all_to_all debug] "
+                f"tokens_per_expert={self.tokens_per_expert}, "
+                f"sum={self.tokens_per_expert.sum() if hasattr(self.tokens_per_expert, 'sum') else None}"
+            )
+            torch.set_printoptions(profile="full")
+            print(f"hidden_states shape: {self.hidden_shape}")
+            print(f"routing_map: {self.routing_map}")
+            raise

         return global_input_tokens, global_probs

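A cheaper complement to the post-mortem logging above would be to validate the split metadata before the collective runs. The helper below is only a hypothetical sketch (check_a2a_splits is not part of the commit; it just mirrors the dispatcher's attribute names):

    # Hypothetical pre-check (not in the commit): fail fast with a clear message
    # if the all-to-all split metadata does not match the local token tensor.
    def check_a2a_splits(tokens, input_splits, output_splits, ep_group):
        if input_splits is not None:
            assert sum(input_splits) == tokens.shape[0], (
                f"sum(input_splits)={sum(input_splits)} != local token count {tokens.shape[0]}"
            )
        if input_splits is not None and output_splits is not None:
            assert len(input_splits) == len(output_splits) == ep_group.size(), (
                "input/output splits must have one entry per EP rank"
            )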
