Commit e0c90c5

debugging convergence issue caused by padding tokens
Signed-off-by: tailaim <tailaim@nvidia.com>
1 parent 39708c1 commit e0c90c5

File tree: 8 files changed, +237 -20 lines changed

megatron/core/models/gpt/gpt_model.py

Lines changed: 45 additions & 0 deletions

@@ -37,6 +37,43 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import WrappedTensor, deprecate_inference_params

+# #debugmtl
+# def get_debug_hook(layer_name):
+#     """
+#     This is a "hook factory".
+#     Calling it returns a hook function that has already captured layer_name.
+#     """
+#     def hook(grad_output):
+#         # Skip if there is no gradient or the gradient is empty
+#         if grad_output is None:
+#             print(f"[Rank {rank}] [BWD] {layer_name:25s} | grad_output is None")
+#             return
+#
+#         g = grad_output[0]
+#         if g is None:
+#             return
+#         if torch.distributed.is_initialized():
+#             rank = torch.distributed.get_rank()
+#             # if rank == 0:
+#             # Simple statistics
+#             g_float = g.float()
+#             g_max = g_float.max().item()
+#             g_min = g_float.min().item()
+#             g_mean = g_float.mean().item()
+#             g_norm = torch.linalg.vector_norm(g_float, ord=2).item()
+#             has_nan = torch.isnan(g_float).any().item()
+#
+#             # [Key] layer_name can be printed here directly
+#             print(f"[Rank {rank}] [BWD] {layer_name:25s} | "
+#                   f"Max: {g_max:.4e} | Min: {g_min:.4e} | Mean: {g_mean:.4e} | "
+#                   f"Norm: {g_norm:.4e} | NaN: {has_nan}")
+#
+#             # If a NaN shows up, add a breakpoint or raise an error here
+#             # if has_nan:
+#             #     raise RuntimeError(f"NaN found in {layer_name}")
+#
+#     return hook
+

 class GPTModel(LanguageModule):
     """GPT Transformer language model.

@@ -475,6 +512,9 @@ def forward(
             preproc_output[:5]
         )

+        # #debugmtl
+        # decoder_input.register_hook(get_debug_hook("Embedding_Output"))
+
         rotary_pos_cos_sin = preproc_output[5] if len(preproc_output) == 6 else None

         # Run decoder.

@@ -491,6 +531,9 @@ def forward(
             **(extra_block_kwargs or {}),
         )

+        # #debugmtl
+        # hidden_states.register_hook(get_debug_hook("Decoder_Output_Before_Head"))
+
         return self._postprocess(
             hidden_states=hidden_states,
             input_ids=input_ids,

@@ -633,6 +676,8 @@ def _postprocess(
             hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
         )

+        # #debugmtl
+        # logits.register_hook(get_debug_hook("Logits_Output"))
         # Restore sequence parallel execution to the output layer if necessary.
         if sequence_parallel_override:
             assert (

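Note: the commented-out #debugmtl code above relies on PyTorch's per-tensor gradient hooks. A minimal, self-contained sketch of that pattern outside Megatron-LM (the toy tensors and names below are illustrative only, not part of the commit):

import torch

def make_grad_logger(name):
    """Return a hook that prints simple statistics of the gradient flowing into `name`."""
    def hook(grad):
        g = grad.float()
        print(f"[BWD] {name:20s} | max {g.max():.3e} | min {g.min():.3e} | "
              f"norm {torch.linalg.vector_norm(g):.3e} | nan {torch.isnan(g).any().item()}")
    return hook

x = torch.randn(4, 8, requires_grad=True)
h = torch.tanh(x @ torch.randn(8, 8))
h.register_hook(make_grad_logger("hidden"))  # fires when d(loss)/d(h) is computed
h.sum().backward()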
megatron/core/parallel_state.py

Lines changed: 11 additions & 0 deletions

@@ -970,6 +970,17 @@ def initialize_model_parallel(
         if rank in ranks:
             _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS = hierarchical_groups

+    if hybrid_context_parallel:
+        # PyTorch is performing lazy initialization of the communicator group.
+        # Therefore, we need to perform a nccl call to ensure that the communicator group is created.
+        group_sizes = [2**i for i in range(int(log2(data_parallel_size)))]
+        if group_sizes[-1] * 2 == data_parallel_size:
+            group_sizes.append(data_parallel_size)
+        for group_size in group_sizes:
+            group = get_hybrid_data_context_parallel_groups(group_size=group_size)
+            torch.distributed.barrier(group=group, device_ids=[torch.cuda.current_device()])
+        torch.cuda.synchronize()
+
     # Build the model-parallel groups.
     global _MODEL_PARALLEL_GROUP
     global _MODEL_PARALLEL_GLOBAL_RANKS

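The comment in the hunk above states the intent: NCCL communicators are created lazily, so the loop issues a barrier on each hybrid data/context-parallel group to force communicator creation up front. get_hybrid_data_context_parallel_groups is Megatron-internal; a standalone sketch of the same eager-initialization idea with plain torch.distributed, assuming the default NCCL process group is already initialized:

import torch
import torch.distributed as dist

def eagerly_create_subgroup(ranks):
    """new_group only registers the subgroup; the first collective actually builds
    the NCCL communicator, so issue a cheap barrier on it right away."""
    group = dist.new_group(ranks=ranks)  # must be called by every rank
    if dist.get_rank() in ranks:
        dist.barrier(group=group, device_ids=[torch.cuda.current_device()])
    return group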
megatron/core/pipeline_parallel/data_schedule.py

Lines changed: 49 additions & 11 deletions

@@ -329,6 +329,27 @@ def _broadcast(item):

     groups, sample_id_groups = scheduler.get_groups_and_subsamples(global_id_seqlens, config)

+    # #debugmtl
+    # if parallel_state.get_data_parallel_rank(with_context_parallel=True) == 0:
+    #     k = 0
+    #     for group in sample_id_groups:
+    #         print(f"group {k}: ", end="")
+    #         for i in range(len(group)):
+    #             print(f"GPU-{i}: [", end="")
+    #             for j in range(len(group[i])):
+    #                 print(f"{group[i][j]}-{global_id_seqlens[group[i][j]][1]}, ", end=" ")
+    #             print(f"], ")
+    #         k += 1
+    #         print()
+
+    # debugmtl
+    # set_gbs = set()
+    # for group in sample_id_groups:
+    #     for sub in group:
+    #         set_gbs.update(sub)
+    # assert len(set_gbs) == len(global_id_seqlens), \
+    #     f"set_gbs length: {len(set_gbs)} != global_ids_this_rank length: {len(global_id_seqlens)}"
+
     batch = _unpack_batch(batch)
     samples_this_rank_with_id = _reroute_samples_to_hdp_ranks(
         batch,

@@ -415,7 +436,9 @@ def _pack_tensors(tensors):
         new_sample["cu_seqlens"] = cu_seqlens

         new_samples.append(new_sample)
-
+    # #debugmtl
+    # print(f"rank {parallel_state.get_data_parallel_rank
+    # (with_context_parallel=True)} new_samples length: {len(new_samples)}")
     new_data_iterator = RerunDataIterator(iter(new_samples))

     return (

@@ -460,15 +483,28 @@ def get_groups_and_subsamples(self, sample_id_seqlens, config):
         sum_seqlen = 0
         single_microbatch = []

+        # debugmtl use 1 seq per microbatch
         for i in range(len(sample_id_seqlens)):
-            if sum_seqlen + sample_id_seqlens[i][1] <= self.max_seq_len_all_ranks:
-                single_microbatch.append(i)
-                sum_seqlen += sample_id_seqlens[i][1]
-            else:
-                groups.append(single_microbatch)
-                packed_id_groups.append(single_microbatch)
-                single_microbatch = [i]
-                sum_seqlen = sample_id_seqlens[i][1]
+            packed_id_groups.append([i])
+
+        # for i in range(len(sample_id_seqlens)):
+        #     if sum_seqlen + sample_id_seqlens[i][1] <= self.max_seq_len_all_ranks:
+        #         single_microbatch.append(i)
+        #         sum_seqlen += sample_id_seqlens[i][1]
+        #     else:
+        #         packed_id_groups.append(single_microbatch)
+        #         single_microbatch = [i]
+        #         sum_seqlen = sample_id_seqlens[i][1]
+        # if len(single_microbatch) > 0:
+        #     packed_id_groups.append(single_microbatch)
+
+        # debugmtl
+        gbs_sum = 0
+        for i in packed_id_groups:
+            gbs_sum += len(i)
+        assert gbs_sum == len(
+            sample_id_seqlens
+        ), f"gbs_sum: {gbs_sum} != sample_id_seqlens length: {len(sample_id_seqlens)}"

         # we want the number of packed sequences to be multiple of dp_size
         # so we move few samples from previous microbatch

@@ -482,7 +518,7 @@ def get_groups_and_subsamples(self, sample_id_seqlens, config):
             assert i > 0, "Not enough samples to move"
             if len(packed_id_groups[i]) > 1:
                 seq_id = packed_id_groups[i].pop()
-                packed_id_groups[i].append(seq_id)
+                packed_id_groups.append([seq_id])
                 num_to_move -= 1
             else:
                 i -= 1

@@ -493,7 +529,9 @@ def get_groups_and_subsamples(self, sample_id_seqlens, config):
             for j in range(self.cp_size * self.dp_size):
                 seq_id = int(i * self.dp_size + j / self.cp_size)
                 sample_id_groups[i].append(packed_id_groups[seq_id])
-
+        # debugmtl
+        # print(f"rank {parallel_state.get_data_parallel_rank(with_context_parallel=True)} \
+        #     sample_id_groups: {len(sample_id_groups)}")
         return groups, sample_id_groups

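For reference, the loop that the debug change disables is a greedy first-fit packer: it keeps appending sample ids to the current microbatch until the next sample would exceed the per-rank sequence budget. A standalone sketch of that idea (function and variable names are illustrative, not the Megatron API):

def greedy_pack(seqlens, max_len):
    """Pack sample indices into groups whose total length stays within max_len.
    The #debugmtl change above bypasses this and emits one sample per group."""
    groups, current, total = [], [], 0
    for i, length in enumerate(seqlens):
        if current and total + length > max_len:
            groups.append(current)
            current, total = [], 0
        current.append(i)
        total += length
    if current:
        groups.append(current)
    return groups

# greedy_pack([512, 1024, 2048, 256], max_len=2048) -> [[0, 1], [2], [3]]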
megatron/core/pipeline_parallel/schedules.py

Lines changed: 2 additions & 1 deletion

@@ -146,7 +146,8 @@ def deallocate_output_tensor(out, deallocate_pipeline_outputs=False):
     if (out is None) or (not deallocate_pipeline_outputs):
         return
     assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__
-    assert out._base is None, "counter-productive to free a view of another tensor."
+    # debugmtl
+    # assert out._base is None, "counter-productive to free a view of another tensor."
     out.data = torch.empty((1,), device=out.device, dtype=out.dtype)

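Context for the assert that gets commented out above: deallocate_output_tensor frees a pipeline stage's output activation by swapping its .data for a 1-element buffer while keeping the Python object (and its grad_fn) alive; the assert exists because doing this to a view would not release the base tensor's storage. A minimal sketch of the trick itself:

import torch

def free_storage(t: torch.Tensor) -> None:
    """Release the tensor's large buffer but keep the object usable as a handle."""
    t.data = torch.empty((1,), device=t.device, dtype=t.dtype)

buf = torch.randn(1024, 1024)
free_storage(buf)
print(buf.shape)  # torch.Size([1]) -- the original ~4 MB of storage is gone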
megatron/core/transformer/moe/token_dispatcher.py

Lines changed: 43 additions & 6 deletions

@@ -672,12 +672,49 @@ def token_dispatch(self, permutated_local_input_tokens, permuted_probs):
         self.tokens_per_expert = self._maybe_dtoh_and_synchronize(
             "before_ep_alltoall", self.tokens_per_expert
         )
-        global_input_tokens = all_to_all(
-            self.ep_group, permutated_local_input_tokens, self.output_splits, self.input_splits
-        )
-        global_probs = all_to_all(
-            self.ep_group, permuted_probs, self.output_splits, self.input_splits
-        )
+        # debugmtl
+        # global_input_tokens = all_to_all(
+        #     self.ep_group, permutated_local_input_tokens,
+        #     self.output_splits, self.input_splits
+        # )
+        # global_probs = all_to_all(
+        #     self.ep_group, permuted_probs, self.output_splits,
+        #     self.input_splits
+        # )
+        try:
+            global_input_tokens = all_to_all(
+                self.ep_group, permutated_local_input_tokens, self.output_splits, self.input_splits
+            )
+            global_probs = all_to_all(
+                self.ep_group, permuted_probs, self.output_splits, self.input_splits
+            )
+        except RuntimeError as e:
+            # Get the rank within the EP group (guard against the group not being initialized yet)
+            try:
+                rank = torch.distributed.get_rank(self.ep_group)
+            except Exception:
+                rank = -1
+
+            print(f"[MoE all_to_all error] rank={rank}, err={e}")
+            print(
+                f"[MoE all_to_all debug] "
+                f"tokens_shape={getattr(permutated_local_input_tokens, 'shape', None)}, "
+                f"probs_shape={getattr(permuted_probs, 'shape', None)}"
+            )
+            print(
+                f"[MoE all_to_all debug] "
+                f"input_splits={self.input_splits}, "
+                f"sum={sum(self.input_splits) if self.input_splits is not None else None}, "
+                f"output_splits={self.output_splits}, "
+                f"sum={sum(self.output_splits) if self.output_splits is not None else None}"
+            )
+            print(
+                f"[MoE all_to_all debug] "
+                f"tokens_per_expert={self.tokens_per_expert}, "
+                f"sum={self.tokens_per_expert.sum() if hasattr(self.tokens_per_expert, 'sum') else None}"
+            )
+            raise

         return global_input_tokens, global_probs

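The try/except added above wraps Megatron's all_to_all helper so that a NCCL failure prints the split sizes before re-raising. The invariant it surfaces: sum(input_splits) must equal the number of local tokens, and each rank's output_splits must agree with what its peers actually send. A standalone sketch of the underlying variable-size exchange using plain torch.distributed (the helper itself is Megatron-internal):

import torch
import torch.distributed as dist

def splitwise_all_to_all(tokens, input_splits, output_splits, group=None):
    """Variable-size all-to-all: rank r sends input_splits[p] rows to rank p and
    receives output_splits[p] rows from it. Mismatched splits across ranks are
    exactly the kind of error the debug prints above are meant to expose."""
    out = tokens.new_empty((sum(output_splits),) + tuple(tokens.shape[1:]))
    dist.all_to_all_single(
        out,
        tokens,
        output_split_sizes=output_splits,
        input_split_sizes=input_splits,
        group=group,
    )
    return out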
megatron/training/datasets/sft_dataset.py

Lines changed: 2 additions & 0 deletions

@@ -124,6 +124,8 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
         num_tokens = len(tokens) + force_eod_length
         if sft_sequence_packing:
             padding_len = self.get_padding_size(num_tokens) - num_tokens
+            # debugmtl
+            # padding_len = max_seq_len - num_tokens
         else:
             padding_len = max_seq_len - num_tokens
         assert padding_len >= 0

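The toggle above is the heart of the padding experiment: with sequence packing, each sample is padded only up to get_padding_size(num_tokens), while the commented-out debug line pads every sample all the way to max_seq_len. A hypothetical sketch of the difference (the rounding rule below is an assumption for illustration, not the actual get_padding_size implementation):

def pad_to_multiple(num_tokens: int, multiple: int = 64) -> int:
    # Assumed behaviour for illustration only: round up to the next multiple of 64.
    return ((num_tokens + multiple - 1) // multiple) * multiple

num_tokens = 1000
packing_padding = pad_to_multiple(num_tokens) - num_tokens  # 24 pad tokens
debug_padding = 4096 - num_tokens                           # 3096 pad tokens if max_seq_len were 4096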
megatron/training/training.py

Lines changed: 69 additions & 1 deletion

@@ -476,7 +476,7 @@ def transformer_flops():
         )
         +
         # Self Attention
-        standard_self_attn_term
+        self_attn_term

     )
     return total_floating_point_operations

@@ -1460,6 +1460,8 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch
                     val,
                     group=mpu.get_data_parallel_group(with_context_parallel=True)
                 )
+                #debugmtl
+                print_rank_0(f"key: {key}, val: {val}")
                 loss_reduced[key] = val[0] / val[1]
             elif val[0].numel() == 1:
                 # legacy behavior, we average over the number of microbatches

@@ -1747,6 +1749,9 @@ def training_log(
             avg = total_loss_dict[key].item() / float(
                 max(1, total_loss_dict[advanced_iters_key])
             )
+            #debugmtl
+            print_rank_0(f"in training_log, key: {key}, avg: {total_loss_dict[key].item()}, \
+                advanced_iters_key: {total_loss_dict[advanced_iters_key]}")
             if avg > 0.0:
                 log_string += ' {}: {:.6E} |'.format(key, avg)
             total_loss_dict[key] = torch.tensor([0.0], dtype=torch.float, device='cuda')

@@ -2070,7 +2075,70 @@ def train(
     """Training function: run train_step desired number of times, run validation, checkpoint."""
     args = get_args()
     timers = get_timers()
+    # debugmtl
+    def get_debug_hook(layer_name):
+        """
+        This is a "hook factory".
+        Calling it returns a hook function that has already captured layer_name.
+        """
+        def hook(module, grad_input, grad_output):
+            # Skip if there is no gradient or the gradient is empty
+            if not grad_output:
+                return
+
+            g = grad_output[0]
+            if g is None:
+                return
+            if torch.distributed.is_initialized():
+                rank = torch.distributed.get_rank()
+                if rank == 0:
+                    # Simple statistics
+                    g_float = g.float()
+                    g_max = g_float.max().item()
+                    g_min = g_float.min().item()
+                    g_mean = g_float.mean().item()
+                    g_norm = torch.linalg.vector_norm(g_float, ord=2).item()
+                    has_nan = torch.isnan(g_float).any().item()
+
+                    # [Key] layer_name can be printed here directly
+                    print(f"[Rank {rank}] [BWD] {layer_name:25s} | "
+                          f"Max: {g_max:.4e} | Min: {g_min:.4e} | Mean: {g_mean:.4e} | "
+                          f"Norm: {g_norm:.4e} | NaN: {has_nan}")

+                    # If a NaN shows up, add a breakpoint or raise an error here
+                    # if has_nan:
+                    #     raise RuntimeError(f"NaN found in {layer_name}")
+
+        return hook
+
+    for chunk_id, model_chunk in enumerate(model):
+        prefix = f"Chunk{chunk_id}"
+        gpt_model = model_chunk.module.module
+        # --- Register the embedding ---
+        if hasattr(gpt_model, 'embedding'):
+            # Pass in the name "Embedding"
+            gpt_model.embedding.register_full_backward_hook(
+                get_debug_hook(f"{prefix}.Embedding")
+            )
+
+        if hasattr(gpt_model, 'output_layer'):
+            # Pass in the name "OutputLayer"
+            gpt_model.output_layer.register_full_backward_hook(
+                get_debug_hook(f"{prefix}.OutputLayer")
+            )

+        # --- Register the decoder layers ---
+        if hasattr(gpt_model, 'decoder') and hasattr(gpt_model.decoder, 'layers'):
+            for i, layer in enumerate(gpt_model.decoder.layers):
+                # Pass in names "Layer_0", "Layer_1", ...
+                layer.register_full_backward_hook(
+                    get_debug_hook(f"{prefix}.Layer_{i}")
+                )
+
+        print_rank_0(f">>> {prefix} backward debug hook registered")
+        print_rank_0(f"model chunk is: {model_chunk.module.module}")
+

     if getattr(args, 'perform_rl_step', False):
         assert has_rl_utils, "RL cannot run without the megatron.rl package"

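The active debug block above registers module-level backward hooks on the embedding, the output layer, and every decoder layer of each model chunk. A minimal standalone sketch of register_full_backward_hook on a toy model, outside Megatron (names are illustrative):

import torch
import torch.nn as nn

def make_module_grad_logger(name):
    """Module-level analogue of the tensor hooks: fires once the gradients
    leaving `name` have been computed during backward."""
    def hook(module, grad_input, grad_output):
        g = grad_output[0]
        if g is not None:
            print(f"[BWD] {name:10s} | norm {g.float().norm():.3e} | "
                  f"nan {torch.isnan(g).any().item()}")
    return hook

net = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 1))
for i, layer in enumerate(net):
    layer.register_full_backward_hook(make_module_grad_logger(f"layer_{i}"))
net(torch.randn(4, 16)).sum().backward()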
pretrain_gpt.py

Lines changed: 16 additions & 1 deletion

@@ -58,7 +58,6 @@ def get_batch(data_iterator, vp_stage: Optional[int] = None):
     """Generate a batch."""
     args = get_args()
     config = core_transformer_config_from_args(args)
-    args = get_args()

     # TODO: this is pretty hacky, find a better way
     if not is_first_or_last_pipeline_stage(vp_stage) and (

@@ -83,6 +82,22 @@ def get_batch(data_iterator, vp_stage: Optional[int] = None):
             cu_seqlens_padded, max_seqlen, local_cp_size=local_cp_size)

     else:
+        # #debugmtl
+        # sample_length = batch['tokens'].shape[1]
+        # if args.sft:
+        #     packed_seq_params = PackedSeqParams(
+        #         qkv_format="sbhd",
+        #         cu_seqlens_q=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
+        #         cu_seqlens_kv=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
+        #         cu_seqlens_q_padded=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
+        #         cu_seqlens_kv_padded=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
+        #         max_seqlen_q=sample_length,
+        #         max_seqlen_kv=sample_length,
+        #         local_cp_size=None,
+        #         cp_group=None,
+        #     )
+        # else:
+        #     packed_seq_params = None
         # slice batch along sequence dimension for context parallelism
         batch = get_batch_on_this_cp_rank(batch)  # The implementation of this function is in MCore
         packed_seq_params = None

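The commented-out block above builds a PackedSeqParams whose cu_seqlens tensors are just [0, sample_length], i.e. it appears to treat the whole (padded) batch row as a single packed segment. For reference, cu_seqlens are cumulative sequence offsets with a leading zero; a small sketch of how they are derived from per-sample lengths (plain PyTorch, illustrative only):

import torch

def build_cu_seqlens(seq_lengths, device="cpu"):
    """Cumulative offsets with a leading 0: sample i occupies [cu[i], cu[i+1])."""
    lengths = torch.as_tensor(seq_lengths, dtype=torch.int32, device=device)
    return torch.nn.functional.pad(torch.cumsum(lengths, dim=0), (1, 0))

print(build_cu_seqlens([4096]))       # -> [0, 4096], as in the debug code above
print(build_cu_seqlens([512, 1024]))  # -> [0, 512, 1536]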