Skip to content

Commit b34746e — "debug merge" (1 parent: 37c081a)

File tree: 3 files changed, +89 −0 lines

megatron/core/inference/contexts/dynamic_context.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1659,11 +1659,22 @@ def initialize_attention_state(
16591659
self.padded_active_request_count = self.padded_batch_dimensions.req_count
16601660
self.padding_slice = slice(self.active_token_count, self.padded_active_token_count)
16611661

1662+
import os, sys
1663+
_rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
1664+
_dbg_f = open(f"/tmp/tde_debug_rank{_rank}.log", "a")
1665+
def _dbg(msg):
1666+
_dbg_f.write(f"[ATTN] {msg}\n"); _dbg_f.flush()
1667+
print(f"[rank{_rank}] [ATTN] {msg}", flush=True, file=sys.stderr)
1668+
1669+
_dbg(f"build_active_slices start (padded_req={self.padded_active_request_count}, padded_tok={self.padded_active_token_count}, paused={self.paused_request_count}, total={self.total_request_count})")
16621670
self.build_active_slices(self.padded_active_request_count)
1671+
_dbg("build_active_slices done")
16631672
self.pad_active_slices()
1673+
_dbg("pad_active_slices done")
16641674

16651675
batch_size = self.total_request_count - self.paused_request_count
16661676
assert self.active_attn_metadata is not None
1677+
_dbg(f"mha_metadata.update start (batch_size={batch_size})")
16671678
self.active_attn_metadata["mha_metadata"].update(
16681679
request_query_lengths=self.active_request_query_lengths[:batch_size],
16691680
request_kv_length_offsets=self.active_request_kv_length_offsets[:batch_size],
@@ -1672,6 +1683,7 @@ def initialize_attention_state(
16721683
padded_batch_dimensions=self.padded_batch_dimensions,
16731684
num_speculative_tokens=self.num_speculative_tokens,
16741685
)
1686+
_dbg("mha_metadata.update done")
16751687

16761688
if self.is_hybrid_model:
16771689
active_slice = slice(self.paused_request_count, self.total_request_count)

megatron/core/inference/engines/dynamic_engine.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,16 @@ def create_cuda_graphs(self, reset_context: bool = True):
334334
reset_context (bool): Whether to reset the context after building cuda graphs.
335335
"""
336336

337+
import sys
338+
_rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
339+
_dbg_f = open(f"/tmp/tde_debug_rank{_rank}.log", "a")
340+
def _dbg(msg):
341+
_dbg_f.write(f"[CG] {msg}\n"); _dbg_f.flush()
342+
print(f"[rank{_rank}] [CG] {msg}", flush=True, file=sys.stderr)
343+
344+
_dbg(f"create_cuda_graphs start (impl={self.cuda_graph_impl})")
337345
if self.cuda_graph_impl != "local":
346+
_dbg("skipping (not local)")
338347
return
339348

340349
if (
@@ -393,12 +402,15 @@ def create_cuda_graphs(self, reset_context: bool = True):
393402
)
394403

395404
tbar = enumerate(context.cuda_graph_batch_dimensions_list)
405+
_dbg(f"warmup loop start ({len(context.cuda_graph_batch_dimensions_list)} graphs)")
396406
if HAVE_TQDM:
397407
tbar = tqdm(tbar, total=len(context.cuda_graph_batch_dimensions_list))
398408
for tbar_idx, cuda_graph_batch_dimension in tbar:
409+
_dbg(f"warmup iter {tbar_idx}: context_init start ({cuda_graph_batch_dimension})")
399410
input_ids, position_ids = self.controller._dynamic_step_context_init(
400411
construct_graph_dimensions=cuda_graph_batch_dimension
401412
)
413+
_dbg(f"warmup iter {tbar_idx}: context_init done")
402414
# Progress.
403415
tbar_str = f"cuda graph warmup - {cuda_graph_batch_dimension}"
404416
if HAVE_TQDM:
@@ -1630,12 +1642,23 @@ async def async_forward(self) -> Tuple[Dict, Dict, float]:
16301642
step_time (float): How long this step took.
16311643
"""
16321644

1645+
import sys
1646+
_rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
1647+
_dbg_f = open(f"/tmp/tde_debug_rank{_rank}.log", "a")
1648+
def _dbg(msg):
1649+
_dbg_f.write(f"[ENGINE] {msg}\n"); _dbg_f.flush()
1650+
print(f"[rank{_rank}] [ENGINE] {msg}", flush=True, file=sys.stderr)
1651+
1652+
_dbg("async_forward enter")
1653+
16331654
# If suspended, no stepping.
16341655
if self.state in (EngineState.SUSPENDED, EngineState.SUSPENDING):
16351656
raise EngineSuspendedError(self.context.step_count)
16361657

16371658
# schedule requests
1659+
_dbg("schedule_waiting_requests start")
16381660
self.schedule_waiting_requests()
1661+
_dbg(f"schedule_waiting_requests done (total={self.context.total_request_count}, paused={self.context.paused_request_count}, tokens={self.context.active_token_count})")
16391662

16401663
# Saving pre-step state, for printing output below.
16411664
is_decode_only = self.context.is_decode_only()
@@ -1654,7 +1677,9 @@ async def async_forward(self) -> Tuple[Dict, Dict, float]:
16541677
self.is_decode_only = is_decode_only
16551678

16561679
self.step_start_event.record()
1680+
_dbg("async_generate_output_tokens_dynamic_batch start")
16571681
result = await self.controller.async_generate_output_tokens_dynamic_batch()
1682+
_dbg("async_generate_output_tokens_dynamic_batch done")
16581683
self.step_end_event.record()
16591684
self.step_end_event.synchronize()
16601685
step_time = self.step_start_event.elapsed_time(self.step_end_event) / 1e3
@@ -2283,17 +2308,30 @@ async def run_engine_with_coordinator(
22832308
self._loop = get_asyncio_loop(loop)
22842309
self.use_coordinator = True
22852310

2311+
import sys
2312+
_rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
2313+
_dbg_f = open(f"/tmp/tde_debug_rank{_rank}.log", "a")
2314+
_iter = 0
2315+
def _dbg(msg):
2316+
_dbg_f.write(f"[COORD iter={_iter}] {msg}\n"); _dbg_f.flush()
2317+
print(f"[rank{_rank}] [COORD iter={_iter}] {msg}", flush=True, file=sys.stderr)
2318+
22862319
try:
22872320
while True:
2321+
_iter += 1
2322+
_dbg(f"loop top (state={self.state})")
22882323
self.schedule_requests()
2324+
_dbg(f"schedule done (active={self.context.get_active_request_count()}, waiting={len(self.waiting_request_ids)})")
22892325

22902326
if self.state in (EngineState.RUNNING, EngineState.PAUSING):
22912327
local_pending = self.context.get_active_request_count() + len(
22922328
self.waiting_request_ids
22932329
)
2330+
_dbg(f"ep_consensus start (local_pending={local_pending})")
22942331
global_work, all_pausing = await self._ep_establish_consensus(
22952332
local_pending, signal_consensus=(self.state == EngineState.PAUSING)
22962333
)
2334+
_dbg(f"ep_consensus done (global_work={global_work}, all_pausing={all_pausing})")
22972335

22982336
if all_pausing:
22992337
# All EP peers are PAUSING: pause immediately.
@@ -2303,15 +2341,19 @@ async def run_engine_with_coordinator(
23032341
elif global_work > 0:
23042342
# At least one EP peer has work: all must participate.
23052343
if local_pending > 0:
2344+
_dbg("async_step start")
23062345
await self.async_step()
2346+
_dbg("async_step done")
23072347
else:
23082348
# Dummy forward to participate in the EP collective.
2349+
_dbg("dummy_forward start")
23092350
self.step_start_event.record()
23102351
self.controller.dummy_forward()
23112352
self.step_end_event.record()
23122353
self.step_end_event.synchronize()
23132354
self.context.step_count += 1
23142355
self.context.prefix_cache_lru_clock += 1
2356+
_dbg("dummy_forward done")
23152357
else:
23162358
# No work, but not all pausing: idle.
23172359
await asyncio.sleep(0.02)

megatron/core/inference/text_generation_controllers/text_generation_controller.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,11 @@ def set_stop_word_finished_ids_callback(self, callback):
107107

108108
def _init_dynamic_sampling_tensors(self):
109109
"""Initialize tensors needed for dynamic sampling."""
110+
import sys
111+
_rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
112+
_dbg_f = open(f"/tmp/tde_debug_rank{_rank}.log", "a")
113+
_dbg_f.write("[INIT] _init_dynamic_sampling_tensors start\n"); _dbg_f.flush()
114+
print(f"[rank{_rank}] [INIT] _init_dynamic_sampling_tensors start", flush=True, file=sys.stderr)
110115
context = self.inference_wrapped_model.inference_context
111116
max_requests = context.max_requests
112117
if context.config.materialize_only_last_token_logits:
@@ -143,6 +148,8 @@ def _init_dynamic_sampling_tensors(self):
143148
self._torch_sampling_buckets: List[Tuple] = []
144149

145150
self._init_mtp_sampling_tensor()
151+
_dbg_f.write("[INIT] _init_dynamic_sampling_tensors done\n"); _dbg_f.flush()
152+
print(f"[rank{_rank}] [INIT] _init_dynamic_sampling_tensors done", flush=True, file=sys.stderr)
146153

147154
def _init_mtp_sampling_tensor(self):
148155
"""Initialize the MTP sampling tensor after num_speculative_tokens is set."""
@@ -626,11 +633,20 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor):
626633
else context.padded_active_token_count
627634
)
628635

636+
import os, sys
637+
_rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
638+
_dbg_f = open(f"/tmp/tde_debug_rank{_rank}.log", "a")
639+
def _dbg(msg):
640+
_dbg_f.write(f"[FWD] {msg}\n"); _dbg_f.flush()
641+
print(f"[rank{_rank}] [FWD] {msg}", flush=True, file=sys.stderr)
642+
643+
_dbg(f"run_one_forward_step start (logits_seq_len={logits_seq_len})")
629644
with torch.inference_mode():
630645
logits = self.inference_wrapped_model.run_one_forward_step(
631646
{"tokens": input_ids, "position_ids": position_ids, "attention_mask": None}
632647
)
633648
# logits shape: [1, seq_len, vocab_size]
649+
_dbg(f"run_one_forward_step done (logits={'None' if logits is None else tuple(logits.shape)})")
634650

635651
# Note: When speculative decoding is active (num_speculative_tokens > 0),
636652
# the model skips MTP computation during the forward pass. MTP logits
@@ -653,12 +669,14 @@ def _dynamic_step_forward_logits(self, input_ids: Tensor, position_ids: Tensor):
653669
if is_pipeline_last_stage(self.pp_group):
654670
assert logits is not None and torch.Size(logits_shape) == logits.shape
655671

672+
_dbg("broadcast_from_last_pipeline_stage start")
656673
logits = broadcast_from_last_pipeline_stage(
657674
logits_shape,
658675
dtype=self.model_config.params_dtype,
659676
tensor=logits,
660677
pp_group=self.pp_group,
661678
)
679+
_dbg("broadcast_from_last_pipeline_stage done")
662680

663681
# Copy logits to contiguous buffer.
664682
if self._enable_cuda_graph:
@@ -1754,11 +1772,20 @@ async def async_generate_output_tokens_dynamic_batch(
17541772
context = self.inference_wrapped_model.inference_context
17551773
active_request_count = context.total_request_count - context.paused_request_count
17561774

1775+
import os, sys
1776+
_rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
1777+
_dbg_f = open(f"/tmp/tde_debug_rank{_rank}.log", "a")
1778+
def _dbg(msg):
1779+
_dbg_f.write(f"[STEP] {msg}\n"); _dbg_f.flush()
1780+
print(f"[rank{_rank}] [STEP] {msg}", flush=True, file=sys.stderr)
1781+
17571782
# No tokens and no active requests?
17581783
if context.active_token_count == 0 and active_request_count == 0:
17591784
return None
17601785

1786+
_dbg(f"context_init start (tokens={context.active_token_count}, reqs={active_request_count})")
17611787
input_ids, position_ids = self._dynamic_step_context_init()
1788+
_dbg(f"context_init done (input_ids.shape={tuple(input_ids.shape)})")
17621789

17631790
cuda_graph_request_count = (
17641791
context.padded_active_request_count if context.is_decode_only() else None
@@ -1771,7 +1798,9 @@ async def async_generate_output_tokens_dynamic_batch(
17711798

17721799
# Forward pass produces only base logits. When speculative decoding is
17731800
# active, MTP logits are computed serially after verification.
1801+
_dbg("forward_logits start")
17741802
self._dynamic_step_forward_logits(input_ids, position_ids)
1803+
_dbg("forward_logits done")
17751804

17761805
# Commit Mamba intermediate states before update_requests, which
17771806
# may swap request indices. The Python lists tracking EOS block IDs
@@ -1790,10 +1819,13 @@ async def async_generate_output_tokens_dynamic_batch(
17901819
# asynchronous.
17911820
# Todo [Siddharth]: Can we condition the sleep on a cuda event?
17921821
# NOTE [TDE]: This will be moved once CPU and GPU methods are separated.
1822+
_dbg("yield start")
17931823
await asyncio.sleep(0)
1824+
_dbg("yield done")
17941825
return_log_probs, return_top_n_logprobs = self._dynamic_step_log_probs_bookkeeping()
17951826

17961827
self._dynamic_step_sample_bookkeeping()
1828+
_dbg("sample_logits start")
17971829

17981830
if self.num_speculative_tokens > 0:
17991831
# Phase 1: Verify speculative tokens using base logits only.
@@ -1810,6 +1842,7 @@ async def async_generate_output_tokens_dynamic_batch(
18101842
self._compute_serial_mtp_and_sample()
18111843
else:
18121844
self._dynamic_step_sample_logits()
1845+
_dbg("sample_logits done")
18131846

18141847
log_probs = None
18151848
top_n_logprobs = None
@@ -1825,10 +1858,12 @@ async def async_generate_output_tokens_dynamic_batch(
18251858
if return_top_n_logprobs:
18261859
top_n_logprobs = self._dynamic_step_calculate_top_n_logprobs(log_probs_tensor)
18271860

1861+
_dbg("bookkeeping start")
18281862
if skip_bookkeeping:
18291863
request_bookkeeping = {}
18301864
else:
18311865
request_bookkeeping = self._dynamic_step_context_bookkeeping()
1866+
_dbg("bookkeeping done")
18321867

18331868
ret = {
18341869
# Clone needed: _sampled_tokens_cuda is a reused buffer overwritten each step.

Comments (0)